/*
* parse-diff.c: functions for parsing diff files
*
* ====================================================================
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
* ====================================================================
*/
#include <stdlib.h>
#include <string.h>
#include "svn_types.h"
#include "svn_error.h"
#include "svn_io.h"
#include "svn_pools.h"
#include "svn_props.h"
#include "svn_string.h"
#include "svn_utf.h"
#include "svn_dirent_uri.h"
#include "svn_diff.h"
#include "private/svn_eol_private.h"
#include "private/svn_dep_compat.h"
/* Evaluate to non-zero when the C string STR begins with the C string
 * START, and to zero otherwise.  START is expanded twice, so it must be
 * free of side effects. */
#define starts_with(str, start) \
  (!strncmp((str), (start), strlen(start)))
/* Compile-time length of the string literal STR, not counting its
 * terminating NUL.  Only meaningful for literals and char arrays; given a
 * pointer it would yield the pointer size minus one. */
#define STRLEN_LITERAL(str) (sizeof(str) - sizeof(""))
/* This struct describes a range within a file, as well as the
* current cursor position within the range. All numbers are in bytes
* and are used as seek offsets into the patch file. */
struct svn_diff__hunk_range {
/* Offset of the first byte of the range. The reset functions rewind
* CURRENT to this value. */
apr_off_t start;
/* Offset at which the range stops. Readers report EOF as soon as
* CURRENT >= END, so the byte at END itself is never read. */
apr_off_t end;
/* Offset at which the next read from this range begins. */
apr_off_t current;
};
/* A single hunk of a patch. The hunk does not hold its text; it only
* remembers where the text lives inside the patch file. */
struct svn_diff_hunk_t {
/* The patch this hunk belongs to. Its REVERSE flag decides whether the
* accessors below swap the original and modified sides. */
svn_patch_t *patch;
/* APR file handle to the patch file this hunk came from. */
apr_file_t *apr_file;
/* Ranges used to keep track of this hunk's texts positions within
* the patch file. All three ranges start at the same offset; they may
* differ in where they end. */
struct svn_diff__hunk_range diff_text_range;
struct svn_diff__hunk_range original_text_range;
struct svn_diff__hunk_range modified_text_range;
/* Hunk ranges as they appeared in the patch file (i.e. not swapped
* for reverse patches). All numbers are lines, not bytes. */
svn_linenum_t original_start;
svn_linenum_t original_length;
svn_linenum_t modified_start;
svn_linenum_t modified_length;
/* Number of lines of leading and trailing hunk context. */
svn_linenum_t leading_context;
svn_linenum_t trailing_context;
};
/* Rewind the diff-text read cursor of HUNK to the start of its range. */
void
svn_diff_hunk_reset_diff_text(svn_diff_hunk_t *hunk)
{
  struct svn_diff__hunk_range *range = &hunk->diff_text_range;

  range->current = range->start;
}
/* Rewind the read cursor of HUNK's original text.  For a reversed patch
 * the "original" text is stored in the modified-text range, so that range
 * is the one rewound. */
void
svn_diff_hunk_reset_original_text(svn_diff_hunk_t *hunk)
{
  struct svn_diff__hunk_range *range;

  range = hunk->patch->reverse ? &hunk->modified_text_range
                               : &hunk->original_text_range;
  range->current = range->start;
}
/* Rewind the read cursor of HUNK's modified text.  For a reversed patch
 * the "modified" text is stored in the original-text range, so that range
 * is the one rewound. */
void
svn_diff_hunk_reset_modified_text(svn_diff_hunk_t *hunk)
{
  struct svn_diff__hunk_range *range;

  range = hunk->patch->reverse ? &hunk->original_text_range
                               : &hunk->modified_text_range;
  range->current = range->start;
}
/* Return the line number at which HUNK's original text starts.  For a
 * reversed patch this is the modified start recorded in the patch file. */
svn_linenum_t
svn_diff_hunk_get_original_start(const svn_diff_hunk_t *hunk)
{
  if (hunk->patch->reverse)
    return hunk->modified_start;

  return hunk->original_start;
}
/* Return the number of lines in HUNK's original text.  For a reversed
 * patch this is the modified length recorded in the patch file. */
svn_linenum_t
svn_diff_hunk_get_original_length(const svn_diff_hunk_t *hunk)
{
  if (hunk->patch->reverse)
    return hunk->modified_length;

  return hunk->original_length;
}
/* Return the line number at which HUNK's modified text starts.  For a
 * reversed patch this is the original start recorded in the patch file. */
svn_linenum_t
svn_diff_hunk_get_modified_start(const svn_diff_hunk_t *hunk)
{
  if (hunk->patch->reverse)
    return hunk->original_start;

  return hunk->modified_start;
}
/* Return the number of lines in HUNK's modified text.  For a reversed
 * patch this is the original length recorded in the patch file. */
svn_linenum_t
svn_diff_hunk_get_modified_length(const svn_diff_hunk_t *hunk)
{
  if (hunk->patch->reverse)
    return hunk->original_length;

  return hunk->modified_length;
}
/* Return the number of leading context lines recorded for HUNK. */
svn_linenum_t
svn_diff_hunk_get_leading_context(const svn_diff_hunk_t *hunk)
{
  const svn_linenum_t count = hunk->leading_context;

  return count;
}
/* Return the number of trailing context lines recorded for HUNK. */
svn_linenum_t
svn_diff_hunk_get_trailing_context(const svn_diff_hunk_t *hunk)
{
  const svn_linenum_t count = hunk->trailing_context;

  return count;
}
/* Parse the decimal string NUMBER as a line number in the range
 * 0 .. SVN_LINENUM_MAX_VALUE.  On success store the value in *OFFSET and
 * return TRUE.  On failure leave *OFFSET untouched, clear the conversion
 * error and return FALSE. */
static svn_boolean_t
parse_offset(svn_linenum_t *offset, const char *number)
{
  apr_uint64_t parsed;
  svn_error_t *failure = svn_cstring_strtoui64(&parsed, number, 0,
                                               SVN_LINENUM_MAX_VALUE, 10);

  if (! failure)
    {
      *offset = (svn_linenum_t)parsed;
      return TRUE;
    }

  /* The caller only wants a yes/no answer; the error must not leak. */
  svn_error_clear(failure);
  return FALSE;
}
/* Parse a hunk range of the form "START" or "START,LENGTH" from RANGE.
 * Store the results in *START and *LENGTH and return TRUE on success.
 * When no ",LENGTH" part is present, *LENGTH is set to 1.
 *
 * RANGE may be modified: a comma, if present, is overwritten with NUL.
 * *LENGTH may already have been written when FALSE is returned. */
static svn_boolean_t
parse_range(svn_linenum_t *start, svn_linenum_t *length, char *range)
{
  char *separator;

  if (range[0] == '\0')
    return FALSE;

  separator = strchr(range, ',');
  if (separator == NULL)
    {
      /* Bare start line: the length defaults to a single line. */
      *length = 1;
      return parse_offset(start, range);
    }

  /* A comma but no length? */
  if (separator[1] == '\0')
    return FALSE;

  if (! parse_offset(length, separator + 1))
    return FALSE;

  /* Terminate the string at the comma so that only the start line
   * remains to be parsed. */
  *separator = '\0';
  return parse_offset(start, range);
}
/* Try to parse a hunk header of the form
 *
 *   ATAT -START[,LENGTH] +START[,LENGTH] ATAT[anything]
 *
 * from the string HEADER, putting the parsed ranges into
 * HUNK->original_start/length and HUNK->modified_start/length.
 * ATAT is the character string used to delimit the hunk header
 * ("@@" for text hunks, "##" for property hunks).
 *
 * Return TRUE if the header parsed correctly.  Return FALSE otherwise,
 * including when HEADER does not begin with ATAT; in that case HUNK may
 * have been partially written (the original range is stored before the
 * modified range is validated).
 *
 * Do all allocations in POOL. */
static svn_boolean_t
parse_hunk_header(const char *header, svn_diff_hunk_t *hunk,
                  const char *atat, apr_pool_t *pool)
{
  const char *p;
  const char *start;
  svn_stringbuf_t *range;

  /* Not every caller pre-checks the prefix.  Without this test we would
   * skip strlen(ATAT) bytes of an arbitrary line -- reading past the
   * terminator of a short line, and accepting e.g. "+a -1,2 +3,4 @@" as
   * a header. */
  if (! starts_with(header, atat))
    return FALSE;

  p = header + strlen(atat);
  if (*p != ' ')
    /* No. */
    return FALSE;
  p++;
  if (*p != '-')
    /* Nah... */
    return FALSE;

  /* OK, this may be worth allocating some memory for... */
  range = svn_stringbuf_create_ensure(31, pool);

  /* Isolate the original range: everything up to the next space. */
  start = ++p;
  while (*p && *p != ' ')
    {
      p++;
    }
  if (*p != ' ')
    /* No no no... */
    return FALSE;
  svn_stringbuf_appendbytes(range, start, p - start);

  /* Try to parse the first range.  parse_range() may modify its input,
   * which is why it works on a private copy. */
  if (! parse_range(&hunk->original_start, &hunk->original_length,
                    range->data))
    return FALSE;

  /* Clear the stringbuf so we can reuse it for the second range. */
  svn_stringbuf_setempty(range);

  p++;
  if (*p != '+')
    /* Eeek! */
    return FALSE;

  /* Isolate the modified range in the same way. */
  start = ++p;
  while (*p && *p != ' ')
    {
      p++;
    }
  if (*p != ' ')
    /* No no no... */
    return FALSE;
  svn_stringbuf_appendbytes(range, start, p - start);

  /* Check for trailing ATAT. */
  p++;
  if (! starts_with(p, atat))
    return FALSE;

  /* There may be stuff like C-function names after the trailing ATAT,
   * but we ignore that. */

  /* Try to parse the second range. */
  if (! parse_range(&hunk->modified_start, &hunk->modified_length,
                    range->data))
    return FALSE;

  /* Hunk header is good. */
  return TRUE;
}
/* A helper for reading a line of text from a range in the patch file.
*
* Allocate *STRINGBUF in RESULT_POOL, and read into it one line from FILE,
* starting at FILE's current position and reading one byte at a time.
* Reading stops either after a line-terminator was found or after MAX_LEN
* bytes have been read. The line-terminator is not stored in *STRINGBUF.
*
* The line-terminator is detected automatically ("\n", "\r" or "\r\n")
* and stored in *EOL if EOL is not NULL. If EOF is reached and FILE does
* not end with a newline character, and EOL is not NULL, *EOL is set to
* NULL.
*
* If EOF is not NULL, *EOF is set to TRUE when reading stopped because a
* byte could not be read or because MAX_LEN bytes were consumed without
* finding a line-terminator; it may also be TRUE if the read that fetched
* the terminator itself reported end of file.
*
* SCRATCH_POOL is used for temporary allocations.
*/
static svn_error_t *
readline(apr_file_t *file,
svn_stringbuf_t **stringbuf,
const char **eol,
svn_boolean_t *eof,
apr_size_t max_len,
apr_pool_t *result_pool,
apr_pool_t *scratch_pool)
{
svn_stringbuf_t *str;
const char *eol_str;
apr_size_t numbytes;
char c;
apr_size_t len;
svn_boolean_t found_eof;
str = svn_stringbuf_create_ensure(80, result_pool);
/* Read bytes into STR up to and including, but not storing,
* the next EOL sequence. */
eol_str = NULL;
numbytes = 1;
len = 0;
found_eof = FALSE;
while (!found_eof)
{
/* Only touch the file while we are still within the MAX_LEN budget.
* Once the budget is used up, the check below ends the loop. */
if (len < max_len)
SVN_ERR(svn_io_file_read_full2(file, &c, sizeof(c), &numbytes,
&found_eof, scratch_pool));
len++;
/* Nothing was read, or we ran past MAX_LEN: treat both as EOF. */
if (numbytes != 1 || len > max_len)
{
found_eof = TRUE;
break;
}
if (c == '\n')
{
eol_str = "\n";
}
else if (c == '\r')
{
eol_str = "\r";
/* Peek only if another byte is available within the budget. */
if (!found_eof && len < max_len)
{
apr_off_t pos;
/* Check for "\r\n" by peeking at the next byte. */
pos = 0;
/* Seeking by 0 from the current position yields that position in
* POS, so we can return to it if the peek fails. */
SVN_ERR(svn_io_file_seek(file, APR_CUR, &pos, scratch_pool));
SVN_ERR(svn_io_file_read_full2(file, &c, sizeof(c), &numbytes,
&found_eof, scratch_pool));
if (numbytes == 1 && c == '\n')
{
eol_str = "\r\n";
len++;
}
else
{
/* Pretend we never peeked: restore the file position and undo
* whatever the peek did to the EOF and byte-count state. */
SVN_ERR(svn_io_file_seek(file, APR_SET, &pos, scratch_pool));
found_eof = FALSE;
numbytes = 1;
}
}
}
else
svn_stringbuf_appendbyte(str, c);
/* A terminator has been recognized; the line is complete. */
if (eol_str)
break;
}
if (eol)
*eol = eol_str;
if (eof)
*eof = found_eof;
*stringbuf = str;
return SVN_NO_ERROR;
}
/* Read the next line of original or modified hunk text from RANGE within
 * FILE, which is expected to contain unidiff text.
 *
 * Lines beginning with VERBOTEN ('+' when reading original text, '-' when
 * reading modified text) and lines beginning with a backslash are skipped.
 * A leading unidiff symbol ('+', '-' or ' ') is stripped from the line
 * that is returned; any other line is returned unchanged.  If only skipped
 * lines remained before the end of the range, an empty string is returned.
 *
 * RANGE->current is advanced past everything that was read, and FILE's
 * seek position is restored to what it was on entry.  EOF must not be
 * NULL; EOL may be.
 *
 * All other parameters are as in svn_diff_hunk_readline_original_text()
 * and svn_diff_hunk_readline_modified_text().
 */
static svn_error_t *
hunk_readline_original_or_modified(apr_file_t *file,
                                   struct svn_diff__hunk_range *range,
                                   svn_stringbuf_t **stringbuf,
                                   const char **eol,
                                   svn_boolean_t *eof,
                                   char verboten,
                                   apr_pool_t *result_pool,
                                   apr_pool_t *scratch_pool)
{
  apr_off_t saved_pos;
  svn_stringbuf_t *raw;
  svn_boolean_t skip;
  char first;

  if (range->current >= range->end)
    {
      /* We're past the range. Indicate that no bytes can be read. */
      *eof = TRUE;
      if (eol)
        *eol = NULL;
      *stringbuf = svn_stringbuf_create("", result_pool);
      return SVN_NO_ERROR;
    }

  /* Remember where the file cursor is, then jump to our own cursor. */
  saved_pos = 0;
  SVN_ERR(svn_io_file_seek(file, APR_CUR, &saved_pos, scratch_pool));
  SVN_ERR(svn_io_file_seek(file, APR_SET, &range->current, scratch_pool));

  for (;;)
    {
      apr_size_t remaining = range->end - range->current;

      SVN_ERR(readline(file, &raw, eol, eof, remaining,
                       result_pool, scratch_pool));

      /* Re-read the file position into the range cursor. */
      range->current = 0;
      SVN_ERR(svn_io_file_seek(file, APR_CUR, &range->current,
                               scratch_pool));

      first = raw->data[0];
      skip = (first == verboten || first == '\\');
      if (! skip || *eof)
        break;
    }

  if (skip)
    {
      /* EOF, return an empty string. */
      *stringbuf = svn_stringbuf_create_ensure(0, result_pool);
    }
  else if (first == '+' || first == '-' || first == ' ')
    {
      /* Shave off leading unidiff symbols. */
      *stringbuf = svn_stringbuf_create(raw->data + 1, result_pool);
    }
  else
    {
      /* Return the line as-is. */
      *stringbuf = svn_stringbuf_dup(raw, result_pool);
    }

  /* Put the file cursor back where the caller left it. */
  SVN_ERR(svn_io_file_seek(file, APR_SET, &saved_pos, scratch_pool));
  return SVN_NO_ERROR;
}
/* Read the next line of HUNK's original text into *STRINGBUF.  For a
 * reversed patch the modified-text range is read and '-' lines are
 * skipped; otherwise the original-text range is read and '+' lines are
 * skipped. */
svn_error_t *
svn_diff_hunk_readline_original_text(svn_diff_hunk_t *hunk,
                                     svn_stringbuf_t **stringbuf,
                                     const char **eol,
                                     svn_boolean_t *eof,
                                     apr_pool_t *result_pool,
                                     apr_pool_t *scratch_pool)
{
  struct svn_diff__hunk_range *range;
  char skipped_symbol;

  if (hunk->patch->reverse)
    {
      range = &hunk->modified_text_range;
      skipped_symbol = '-';
    }
  else
    {
      range = &hunk->original_text_range;
      skipped_symbol = '+';
    }

  return svn_error_trace(
    hunk_readline_original_or_modified(hunk->apr_file, range,
                                       stringbuf, eol, eof,
                                       skipped_symbol,
                                       result_pool, scratch_pool));
}
/* Read the next line of HUNK's modified text into *STRINGBUF.  For a
 * reversed patch the original-text range is read and '+' lines are
 * skipped; otherwise the modified-text range is read and '-' lines are
 * skipped. */
svn_error_t *
svn_diff_hunk_readline_modified_text(svn_diff_hunk_t *hunk,
                                     svn_stringbuf_t **stringbuf,
                                     const char **eol,
                                     svn_boolean_t *eof,
                                     apr_pool_t *result_pool,
                                     apr_pool_t *scratch_pool)
{
  struct svn_diff__hunk_range *range;
  char skipped_symbol;

  if (hunk->patch->reverse)
    {
      range = &hunk->original_text_range;
      skipped_symbol = '+';
    }
  else
    {
      range = &hunk->modified_text_range;
      skipped_symbol = '-';
    }

  return svn_error_trace(
    hunk_readline_original_or_modified(hunk->apr_file, range,
                                       stringbuf, eol, eof,
                                       skipped_symbol,
                                       result_pool, scratch_pool));
}
/* Read the next line of HUNK's raw diff text into *STRINGBUF, allocated
 * in RESULT_POOL.  The diff-text cursor of HUNK is advanced, and the seek
 * position of the patch file is restored before returning.
 *
 * For a reversed patch the line is rewritten: a line that parses as a
 * "@@" or "##" hunk header is replaced by a header with HUNK's ranges
 * swapped, and otherwise a leading '+' becomes '-' and vice versa.
 *
 * EOF must not be NULL; EOL may be. */
svn_error_t *
svn_diff_hunk_readline_diff_text(svn_diff_hunk_t *hunk,
                                 svn_stringbuf_t **stringbuf,
                                 const char **eol,
                                 svn_boolean_t *eof,
                                 apr_pool_t *result_pool,
                                 apr_pool_t *scratch_pool)
{
  struct svn_diff__hunk_range *range = &hunk->diff_text_range;
  svn_diff_hunk_t scratch_hunk;   /* receives parsed ranges; never read */
  svn_stringbuf_t *text;
  apr_size_t remaining;
  apr_off_t saved_pos;

  if (range->current >= range->end)
    {
      /* We're past the range. Indicate that no bytes can be read. */
      *eof = TRUE;
      if (eol)
        *eol = NULL;
      *stringbuf = svn_stringbuf_create("", result_pool);
      return SVN_NO_ERROR;
    }

  /* Save the file cursor, read one line at our own cursor, record how
   * far we got, and put the file cursor back. */
  saved_pos = 0;
  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, &saved_pos,
                           scratch_pool));
  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, &range->current,
                           scratch_pool));
  remaining = range->end - range->current;
  SVN_ERR(readline(hunk->apr_file, &text, eol, eof, remaining, result_pool,
                   scratch_pool));
  range->current = 0;
  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, &range->current,
                           scratch_pool));
  SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, &saved_pos,
                           scratch_pool));

  if (! hunk->patch->reverse)
    {
      *stringbuf = text;
      return SVN_NO_ERROR;
    }

  if (parse_hunk_header(text->data, &scratch_hunk, "@@", scratch_pool))
    {
      /* Line is a hunk header, reverse it. */
      text = svn_stringbuf_createf(result_pool,
                                   "@@ -%lu,%lu +%lu,%lu @@",
                                   hunk->modified_start,
                                   hunk->modified_length,
                                   hunk->original_start,
                                   hunk->original_length);
    }
  else if (parse_hunk_header(text->data, &scratch_hunk, "##", scratch_pool))
    {
      /* Line is a property hunk header, reverse it. */
      text = svn_stringbuf_createf(result_pool,
                                   "## -%lu,%lu +%lu,%lu ##",
                                   hunk->modified_start,
                                   hunk->modified_length,
                                   hunk->original_start,
                                   hunk->original_length);
    }
  else if (text->data[0] == '+')
    {
      text->data[0] = '-';
    }
  else if (text->data[0] == '-')
    {
      text->data[0] = '+';
    }

  *stringbuf = text;
  return SVN_NO_ERROR;
}
/* Extract a property name from HEADER, i.e. the text following the first
 * strlen(INDICATOR) bytes, converted to UTF-8 and allocated in
 * RESULT_POOL.
 *
 * If that text is not a valid property name as it stands, it is retried
 * with surrounding whitespace stripped.  *PROP_NAME is set to NULL if the
 * text is empty or no valid property name could be obtained. */
static svn_error_t *
parse_prop_name(const char **prop_name, const char *header,
                const char *indicator, apr_pool_t *result_pool)
{
  const char *name_text = header + strlen(indicator);
  svn_stringbuf_t *trimmed;

  SVN_ERR(svn_utf_cstring_to_utf8(prop_name, name_text, result_pool));

  if ((*prop_name)[0] == '\0')
    {
      *prop_name = NULL;
      return SVN_NO_ERROR;
    }

  if (svn_prop_name_is_valid(*prop_name))
    return SVN_NO_ERROR;

  /* Second chance: maybe only stray whitespace makes the name invalid. */
  trimmed = svn_stringbuf_create(*prop_name, result_pool);
  svn_stringbuf_strip_whitespace(trimmed);
  if (svn_prop_name_is_valid(trimmed->data))
    *prop_name = trimmed->data;
  else
    *prop_name = NULL;

  return SVN_NO_ERROR;
}
/* Return the next *HUNK from a PATCH in APR_FILE.
 * If no hunk can be found, set *HUNK to NULL.
 *
 * Set IS_PROPERTY to TRUE if we have a property hunk. If the returned HUNK
 * is the first belonging to a certain property, then PROP_NAME and
 * PROP_OPERATION will be set too. If we have a text hunk, PROP_NAME will be
 * NULL.
 *
 * If IGNORE_WHITESPACE is TRUE, non-empty lines without leading spaces
 * will be treated as context lines. (Empty lines are always tolerated as
 * context lines unless they are the empty "line" produced at end of file,
 * which terminates the hunk.)
 *
 * Reading starts at APR_FILE's current position. Unless end of file was
 * reached, APR_FILE is left positioned at the start of the line that
 * terminated the scan, so that a following patch or hunk header is not
 * skipped.
 *
 * Allocate results in RESULT_POOL.
 * Use SCRATCH_POOL for all other allocations. */
static svn_error_t *
parse_next_hunk(svn_diff_hunk_t **hunk,
                svn_boolean_t *is_property,
                const char **prop_name,
                svn_diff_operation_kind_t *prop_operation,
                svn_patch_t *patch,
                apr_file_t *apr_file,
                svn_boolean_t ignore_whitespace,
                apr_pool_t *result_pool,
                apr_pool_t *scratch_pool)
{
  static const char * const minus = "--- ";
  static const char * const text_atat = "@@";
  static const char * const prop_atat = "##";

  svn_stringbuf_t *line;
  svn_boolean_t eof, in_hunk, hunk_seen;
  apr_off_t pos, last_line;
  /* START and END are compared after the loop; give them defined values
   * so that a hunk whose end was never located is rejected instead of
   * being judged on indeterminate data. */
  apr_off_t start = 0;
  apr_off_t end = 0;
  apr_off_t original_end;
  apr_off_t modified_end;
  /* Lines still expected on each side, counted down from the header. */
  svn_linenum_t original_lines = 0;
  svn_linenum_t modified_lines = 0;
  svn_linenum_t leading_context;
  svn_linenum_t trailing_context;
  svn_boolean_t changed_line_seen;
  enum {
    noise_line,
    original_line,
    modified_line,
    context_line
  } last_line_type;
  apr_pool_t *iterpool;

  *prop_operation = svn_diff_op_unchanged;

  /* We only set this if we have a property hunk header. */
  *prop_name = NULL;
  *is_property = FALSE;

  if (apr_file_eof(apr_file) == APR_EOF)
    {
      /* No more hunks here. */
      *hunk = NULL;
      return SVN_NO_ERROR;
    }

  in_hunk = FALSE;
  hunk_seen = FALSE;
  leading_context = 0;
  trailing_context = 0;
  changed_line_seen = FALSE;
  original_end = 0;
  modified_end = 0;
  *hunk = apr_pcalloc(result_pool, sizeof(**hunk));

  /* Get current seek position -- APR has no ftell() :( */
  pos = 0;
  SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, scratch_pool));

  /* Start out assuming noise. */
  last_line_type = noise_line;

  iterpool = svn_pool_create(scratch_pool);
  do
    {
      svn_pool_clear(iterpool);

      /* Remember the current line's offset, and read the line. */
      last_line = pos;
      SVN_ERR(readline(apr_file, &line, NULL, &eof, APR_SIZE_MAX,
                       iterpool, iterpool));

      /* Update line offset for next iteration. */
      pos = 0;
      SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, iterpool));

      /* Lines starting with a backslash are comments, such as
       * "\ No newline at end of file". */
      if (line->data[0] == '\\')
        {
          if (in_hunk &&
              ((!*is_property &&
                strcmp(line->data, "\\ No newline at end of file") == 0) ||
               (*is_property &&
                strcmp(line->data, "\\ No newline at end of property") == 0)))
            {
              char eolbuf[2];
              apr_size_t len;
              apr_off_t off;
              apr_off_t hunk_text_end;

              /* Comment terminates the hunk text and says the hunk text
               * has no trailing EOL. Snip off trailing EOL which is part
               * of the patch file but not part of the hunk text. */
              off = last_line - 2;
              SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &off, iterpool));
              len = sizeof(eolbuf);
              SVN_ERR(svn_io_file_read_full2(apr_file, eolbuf, len, &len,
                                             &eof, iterpool));
              if (eolbuf[0] == '\r' && eolbuf[1] == '\n')
                hunk_text_end = last_line - 2;
              else if (eolbuf[1] == '\n' || eolbuf[1] == '\r')
                hunk_text_end = last_line - 1;
              else
                hunk_text_end = last_line;

              /* The comment applies to whichever side(s) the preceding
               * line belongs to; only the first such marker counts. */
              if (last_line_type == original_line && original_end == 0)
                original_end = hunk_text_end;
              else if (last_line_type == modified_line && modified_end == 0)
                modified_end = hunk_text_end;
              else if (last_line_type == context_line)
                {
                  if (original_end == 0)
                    original_end = hunk_text_end;
                  if (modified_end == 0)
                    modified_end = hunk_text_end;
                }

              /* Return to where the line-by-line scan left off. */
              SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &pos, iterpool));
            }

          continue;
        }

      if (in_hunk)
        {
          char c;
          static const char add = '+';
          static const char del = '-';

          if (! hunk_seen)
            {
              /* We're reading the first line of the hunk, so the start
               * of the line just read is the hunk text's byte offset. */
              start = last_line;
            }

          c = line->data[0];
          if (original_lines > 0 && modified_lines > 0 &&
              ((c == ' ')
               /* Tolerate chopped leading spaces on empty lines. */
               || (! eof && line->len == 0)
               /* Maybe tolerate chopped leading spaces on non-empty lines.
                * The length check keeps the empty line produced at EOF
                * from being counted as context; it has to fall through
                * to the branch below that records the end of the hunk. */
               || (ignore_whitespace && line->len > 0
                   && c != del && c != add)))
            {
              /* It's a "context" line in the hunk. */
              hunk_seen = TRUE;
              original_lines--;
              modified_lines--;
              if (changed_line_seen)
                trailing_context++;
              else
                leading_context++;
              last_line_type = context_line;
            }
          else if (original_lines > 0 && c == del)
            {
              /* It's a "deleted" line in the hunk. */
              hunk_seen = TRUE;
              changed_line_seen = TRUE;

              /* A hunk may have context in the middle. We only want
                 trailing lines of context. */
              if (trailing_context > 0)
                trailing_context = 0;

              original_lines--;
              last_line_type = original_line;
            }
          else if (modified_lines > 0 && c == add)
            {
              /* It's an "added" line in the hunk. */
              hunk_seen = TRUE;
              changed_line_seen = TRUE;

              /* A hunk may have context in the middle. We only want
                 trailing lines of context. */
              if (trailing_context > 0)
                trailing_context = 0;

              modified_lines--;
              last_line_type = modified_line;
            }
          else
            {
              if (eof)
                {
                  /* The hunk ends at EOF. */
                  end = pos;
                }
              else
                {
                  /* The start of the current line marks the first byte
                   * after the hunk text. */
                  end = last_line;
                }

              /* Sides not already cut short by a "\ No newline" comment
               * end where the hunk ends. */
              if (original_end == 0)
                original_end = end;
              if (modified_end == 0)
                modified_end = end;
              break; /* Hunk was empty or has been read. */
            }
        }
      else
        {
          if (starts_with(line->data, text_atat))
            {
              /* Looks like we have a hunk header, try to rip it apart. */
              in_hunk = parse_hunk_header(line->data, *hunk, text_atat,
                                          iterpool);
              if (in_hunk)
                {
                  original_lines = (*hunk)->original_length;
                  modified_lines = (*hunk)->modified_length;
                  *is_property = FALSE;
                }
            }
          else if (starts_with(line->data, prop_atat))
            {
              /* Looks like we have a property hunk header, try to rip it
               * apart. */
              in_hunk = parse_hunk_header(line->data, *hunk, prop_atat,
                                          iterpool);
              if (in_hunk)
                {
                  original_lines = (*hunk)->original_length;
                  modified_lines = (*hunk)->modified_length;
                  *is_property = TRUE;
                }
            }
          else if (starts_with(line->data, "Added: "))
            {
              SVN_ERR(parse_prop_name(prop_name, line->data, "Added: ",
                                      result_pool));
              if (*prop_name)
                *prop_operation = svn_diff_op_added;
            }
          else if (starts_with(line->data, "Deleted: "))
            {
              SVN_ERR(parse_prop_name(prop_name, line->data, "Deleted: ",
                                      result_pool));
              if (*prop_name)
                *prop_operation = svn_diff_op_deleted;
            }
          else if (starts_with(line->data, "Modified: "))
            {
              SVN_ERR(parse_prop_name(prop_name, line->data, "Modified: ",
                                      result_pool));
              if (*prop_name)
                *prop_operation = svn_diff_op_modified;
            }
          else if (starts_with(line->data, minus)
                   || starts_with(line->data, "diff --git "))
            /* This could be a header of another patch. Bail out. */
            break;
        }
    }
  /* Check for the line length since a file may not have a newline at the
   * end and we depend upon the last line to be an empty one. */
  while (! eof || line->len > 0);
  svn_pool_destroy(iterpool);

  if (! eof)
    /* Rewind to the start of the line just read, so subsequent calls
     * to this function or svn_diff_parse_next_patch() don't end
     * up skipping the line -- it may contain a patch or hunk header. */
    SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &last_line, scratch_pool));

  if (hunk_seen && start < end)
    {
      (*hunk)->patch = patch;
      (*hunk)->apr_file = apr_file;
      (*hunk)->leading_context = leading_context;
      (*hunk)->trailing_context = trailing_context;
      (*hunk)->diff_text_range.start = start;
      (*hunk)->diff_text_range.current = start;
      (*hunk)->diff_text_range.end = end;
      (*hunk)->original_text_range.start = start;
      (*hunk)->original_text_range.current = start;
      (*hunk)->original_text_range.end = original_end;
      (*hunk)->modified_text_range.start = start;
      (*hunk)->modified_text_range.current = start;
      (*hunk)->modified_text_range.end = modified_end;
    }
  else
    /* Something went wrong, just discard the result. */
    *hunk = NULL;

  return SVN_NO_ERROR;
}
/* Compare function for sorting hunks after parsing.
* We sort hunks by their original line offset. */
static int
compare_hunks(const void *a, const void *b)
{
const svn_diff_hunk_t *ha = *((const svn_diff_hunk_t *const *)a);
const svn_diff_hunk_t *hb = *((const svn_diff_hunk_t *const *)b);
if (ha->original_start < hb->original_start)
return -1;
if (ha->original_start > hb->original_start)
return 1;
return 0;
}
/* Possible states of the diff header parser. The trailing comment on
* each state names the input line that leads to it. */
enum parse_state
{
state_start, /* initial */
state_git_diff_seen, /* diff --git */
state_git_tree_seen, /* a tree operation, rather than content change */
state_git_minus_seen, /* --- /dev/null; or --- a/ */
state_git_plus_seen, /* +++ /dev/null; or +++ b/ */
state_move_from_seen, /* rename from foo.c */
state_copy_from_seen, /* copy from foo.c */
state_minus_seen, /* --- foo.c */
state_unidiff_found, /* valid start of a regular unidiff header */
state_git_header_found /* valid start of a --git diff header */
};
/* Data type describing a valid state transition of the parser. */
struct transition
{
/* Text identifying the input line this transition applies to.
* NOTE(review): the table entries look like line prefixes, but the code
* that matches them is not part of this section -- confirm there. */
const char *expected_input;
/* State the parser has to be in for this transition to apply. */
enum parse_state required_state;
/* A callback called upon each parser state transition. It receives the
* input line in INPUT (which it may modify), fills in PATCH as
* appropriate and stores the state to move to in *NEW_STATE. */
svn_error_t *(*fn)(enum parse_state *new_state, char *input,
svn_patch_t *patch, apr_pool_t *result_pool,
apr_pool_t *scratch_pool);
};
/* Set *FILE_NAME to the content of LINE, converted to UTF-8 and
 * canonicalized as a dirent.  The result is allocated in RESULT_POOL;
 * intermediate strings live in SCRATCH_POOL.
 *
 * TODO: Allow specifying the patch file's encoding.  For now the text is
 * handed to svn_utf_cstring_to_utf8() as-is.
 * ### This can fail if the filename cannot be represented in the current
 * ### locale's encoding. */
static svn_error_t *
grab_filename(const char **file_name, const char *line, apr_pool_t *result_pool,
              apr_pool_t *scratch_pool)
{
  const char *encoded;

  SVN_ERR(svn_utf_cstring_to_utf8(&encoded, line, scratch_pool));

  *file_name = apr_pstrdup(result_pool,
                           svn_dirent_canonicalize(encoded, scratch_pool));
  return SVN_NO_ERROR;
}
/* Handle the '--- ' line of a regular unidiff: store the file name that
 * follows the marker in PATCH->old_filename and move to
 * state_minus_seen.  LINE is truncated at its first tab, if any. */
static svn_error_t *
diff_minus(enum parse_state *new_state, char *line, svn_patch_t *patch,
           apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *name = line + STRLEN_LITERAL("--- ");
  char *cut = strchr(line, '\t');

  /* Anything after a tab is not part of the file name; discard it. */
  if (cut != NULL)
    *cut = '\0';

  SVN_ERR(grab_filename(&patch->old_filename, name,
                        result_pool, scratch_pool));

  *new_state = state_minus_seen;
  return SVN_NO_ERROR;
}
/* Handle the '+++ ' line of a regular unidiff: store the file name that
 * follows the marker in PATCH->new_filename and move to
 * state_unidiff_found.  LINE is truncated at its first tab, if any. */
static svn_error_t *
diff_plus(enum parse_state *new_state, char *line, svn_patch_t *patch,
          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *name = line + STRLEN_LITERAL("+++ ");
  char *cut = strchr(line, '\t');

  /* Anything after a tab is not part of the file name; discard it. */
  if (cut != NULL)
    *cut = '\0';

  SVN_ERR(grab_filename(&patch->new_filename, name,
                        result_pool, scratch_pool));

  *new_state = state_unidiff_found;
  return SVN_NO_ERROR;
}
/* Parse the first line of a git extended unidiff, which should look like
 * 'diff --git a/path b/path'.
 *
 * If the line lacks an " a/" marker followed by text, or a subsequent
 * " b/" marker followed by text, set *NEW_STATE to state_start and do
 * nothing else.  Otherwise set PATCH->operation to svn_diff_op_modified
 * and *NEW_STATE to state_git_diff_seen.
 *
 * PATCH->old_filename and PATCH->new_filename are only filled in when the
 * line can be split at a " b/" marker into two identical paths; in that
 * case LINE is modified (the marker's leading space is overwritten with
 * NUL).  Paths may themselves contain " b/", so every candidate split is
 * tried in turn. */
static svn_error_t *
git_start(enum parse_state *new_state, char *line, svn_patch_t *patch,
          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *old_path_start;
  char *old_path_end;
  const char *new_path_start;
  const char *new_path_end;
  char *new_path_marker;
  const char *old_path_marker;

  /* ### Add handling of escaped paths
   * http://www.kernel.org/pub/software/scm/git/docs/git-diff.html:
   *
   * TAB, LF, double quote and backslash characters in pathnames are
   * represented as \t, \n, \" and \\, respectively. If there is need for
   * such substitution then the whole pathname is put in double quotes.
   */

  /* Our line should look like this: 'diff --git a/path b/path'.
   *
   * If we find any deviations from that format, we return with state reset
   * to start.
   */
  old_path_marker = strstr(line, " a/");
  if (! old_path_marker)
    {
      *new_state = state_start;
      return SVN_NO_ERROR;
    }

  if (! *(old_path_marker + 3))
    {
      *new_state = state_start;
      return SVN_NO_ERROR;
    }

  new_path_marker = strstr(old_path_marker, " b/");
  if (! new_path_marker)
    {
      *new_state = state_start;
      return SVN_NO_ERROR;
    }

  if (! *(new_path_marker + 3))
    {
      *new_state = state_start;
      return SVN_NO_ERROR;
    }

  /* By now, we know that we have a line on the form 'diff --git a/.+ b/.+'
   * We only need the filenames when we have deleted or added empty
   * files. In those cases the old_path and new_path is identical on the
   * 'diff --git' line. For all other cases we fetch the filenames from
   * other header lines. */

  /* Start the old path right behind the marker we actually located.
   * A fixed offset from the start of LINE is only correct when the
   * marker immediately follows 'diff --git', which was never checked. */
  old_path_start = old_path_marker + STRLEN_LITERAL(" a/");
  new_path_end = line + strlen(line);
  new_path_start = old_path_start;

  while (TRUE)
    {
      /* Both differences are non-negative by construction; size_t avoids
       * truncating them (as 'int' would) before they reach strncmp(). */
      size_t len_old;
      size_t len_new;

      new_path_marker = strstr(new_path_start, " b/");

      /* No new path marker, bail out. */
      if (! new_path_marker)
        break;

      old_path_end = new_path_marker;
      new_path_start = new_path_marker + STRLEN_LITERAL(" b/");

      /* No path after the marker. */
      if (! *new_path_start)
        break;

      len_old = (size_t)(old_path_end - old_path_start);
      len_new = (size_t)(new_path_end - new_path_start);

      /* Are the paths before and after the " b/" marker the same? */
      if (len_old == len_new
          && ! strncmp(old_path_start, new_path_start, len_old))
        {
          *old_path_end = '\0';
          SVN_ERR(grab_filename(&patch->old_filename, old_path_start,
                                result_pool, scratch_pool));
          SVN_ERR(grab_filename(&patch->new_filename, new_path_start,
                                result_pool, scratch_pool));
          break;
        }
    }

  /* We assume that the path is only modified until we've found a 'tree'
   * header */
  patch->operation = svn_diff_op_modified;

  *new_state = state_git_diff_seen;
  return SVN_NO_ERROR;
}
/* Handle the '--- ' line of a git extended unidiff: store either
 * "/dev/null" or the path following '--- a/' in PATCH->old_filename and
 * move to state_git_minus_seen.  LINE is truncated at its first tab, if
 * any. */
static svn_error_t *
git_minus(enum parse_state *new_state, char *line, svn_patch_t *patch,
          apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *name;
  char *cut = strchr(line, '\t');

  /* Anything after a tab is not part of the file name; discard it. */
  if (cut != NULL)
    *cut = '\0';

  if (starts_with(line, "--- /dev/null"))
    name = "/dev/null";
  else
    name = line + STRLEN_LITERAL("--- a/");

  SVN_ERR(grab_filename(&patch->old_filename, name,
                        result_pool, scratch_pool));

  *new_state = state_git_minus_seen;
  return SVN_NO_ERROR;
}
/* Handle the '+++ ' line of a git extended unidiff: store either
 * "/dev/null" or the path following '+++ b/' in PATCH->new_filename and
 * move to state_git_header_found.  LINE is truncated at its first tab,
 * if any. */
static svn_error_t *
git_plus(enum parse_state *new_state, char *line, svn_patch_t *patch,
         apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *name;
  char *cut = strchr(line, '\t');

  /* Anything after a tab is not part of the file name; discard it. */
  if (cut != NULL)
    *cut = '\0';

  if (starts_with(line, "+++ /dev/null"))
    name = "/dev/null";
  else
    name = line + STRLEN_LITERAL("+++ b/");

  SVN_ERR(grab_filename(&patch->new_filename, name,
                        result_pool, scratch_pool));

  *new_state = state_git_header_found;
  return SVN_NO_ERROR;
}
/* Handle the 'rename from ' line of a git extended unidiff: store the
 * path that follows in PATCH->old_filename and move to
 * state_move_from_seen. */
static svn_error_t *
git_move_from(enum parse_state *new_state, char *line, svn_patch_t *patch,
              apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *source_path = line + STRLEN_LITERAL("rename from ");

  SVN_ERR(grab_filename(&patch->old_filename, source_path,
                        result_pool, scratch_pool));

  *new_state = state_move_from_seen;
  return SVN_NO_ERROR;
}
/* Handle the 'rename to ' line of a git extended unidiff: store the path
 * that follows in PATCH->new_filename, mark the patch as a move and go
 * to state_git_tree_seen. */
static svn_error_t *
git_move_to(enum parse_state *new_state, char *line, svn_patch_t *patch,
            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *target_path = line + STRLEN_LITERAL("rename to ");

  SVN_ERR(grab_filename(&patch->new_filename, target_path,
                        result_pool, scratch_pool));

  *new_state = state_git_tree_seen;
  patch->operation = svn_diff_op_moved;
  return SVN_NO_ERROR;
}
/* Handle the 'copy from ' line of a git extended unidiff: store the path
 * that follows in PATCH->old_filename and move to state_copy_from_seen. */
static svn_error_t *
git_copy_from(enum parse_state *new_state, char *line, svn_patch_t *patch,
              apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *source_path = line + STRLEN_LITERAL("copy from ");

  SVN_ERR(grab_filename(&patch->old_filename, source_path,
                        result_pool, scratch_pool));

  *new_state = state_copy_from_seen;
  return SVN_NO_ERROR;
}
/* Handle the 'copy to ' line of a git extended unidiff: store the path
 * that follows in PATCH->new_filename, mark the patch as a copy and go
 * to state_git_tree_seen. */
static svn_error_t *
git_copy_to(enum parse_state *new_state, char *line, svn_patch_t *patch,
            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *target_path = line + STRLEN_LITERAL("copy to ");

  SVN_ERR(grab_filename(&patch->new_filename, target_path,
                        result_pool, scratch_pool));

  *new_state = state_git_tree_seen;
  patch->operation = svn_diff_op_copied;
  return SVN_NO_ERROR;
}
/* Handle the 'new file ' line of a git extended unidiff: mark the patch
 * as an addition and go to state_git_tree_seen.  LINE and the pools are
 * unused; the filename was already retrieved from the 'diff --git'
 * header. */
static svn_error_t *
git_new_file(enum parse_state *new_state, char *line, svn_patch_t *patch,
             apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  *new_state = state_git_tree_seen;
  patch->operation = svn_diff_op_added;

  return SVN_NO_ERROR;
}
/* Handle the 'deleted file ' line of a git extended unidiff: mark the
 * patch as a deletion and go to state_git_tree_seen.  LINE and the pools
 * are unused; the filename was already retrieved from the 'diff --git'
 * header. */
static svn_error_t *
git_deleted_file(enum parse_state *new_state, char *line, svn_patch_t *patch,
                 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  *new_state = state_git_tree_seen;
  patch->operation = svn_diff_op_deleted;

  return SVN_NO_ERROR;
}
/* Append HUNK to the list of hunks kept for property PROP_NAME in
 * PATCH->prop_patches.  If PATCH has no entry for PROP_NAME yet, one is
 * created in RESULT_POOL with the given OPERATION; for an existing entry
 * OPERATION is ignored.  PROP_NAME is used as the hash key and is stored
 * by reference, not copied. */
static svn_error_t *
add_property_hunk(svn_patch_t *patch, const char *prop_name,
                  svn_diff_hunk_t *hunk, svn_diff_operation_kind_t operation,
                  apr_pool_t *result_pool)
{
  svn_prop_patch_t *entry = apr_hash_get(patch->prop_patches, prop_name,
                                         APR_HASH_KEY_STRING);

  if (entry == NULL)
    {
      /* First hunk for this property: set up its record. */
      entry = apr_palloc(result_pool, sizeof(*entry));
      entry->name = prop_name;
      entry->operation = operation;
      entry->hunks = apr_array_make(result_pool, 1,
                                    sizeof(svn_diff_hunk_t *));

      apr_hash_set(patch->prop_patches, prop_name, APR_HASH_KEY_STRING,
                   entry);
    }

  APR_ARRAY_PUSH(entry->hunks, svn_diff_hunk_t *) = hunk;

  return SVN_NO_ERROR;
}
/* State kept for an open patch file; created by
* svn_diff_open_patch_file(). */
struct svn_patch_file_t
{
/* The APR file handle to the patch file. */
apr_file_t *apr_file;
/* The file offset at which the next patch is expected.
* Starts out as 0 when the file is opened. */
apr_off_t next_patch_offset;
};
/* Open the patch file at LOCAL_ABSPATH for binary reading and return a
 * new patch file object for it in *PATCH_FILE, with the next patch
 * expected at offset 0.  Both the object and the file handle are
 * allocated in RESULT_POOL.  *PATCH_FILE is not written if opening
 * fails. */
svn_error_t *
svn_diff_open_patch_file(svn_patch_file_t **patch_file,
                         const char *local_abspath,
                         apr_pool_t *result_pool)
{
  svn_patch_file_t *opened = apr_palloc(result_pool,
                                        sizeof(svn_patch_file_t));

  opened->next_patch_offset = 0;
  SVN_ERR(svn_io_file_open(&opened->apr_file, local_abspath,
                           APR_READ | APR_BINARY, 0, result_pool));

  *patch_file = opened;
  return SVN_NO_ERROR;
}
/* Read hunks from APR_FILE until parse_next_hunk() yields no further
 * hunk, and attach them to PATCH.
 *
 * Text hunks are appended to PATCH->HUNKS.  Property hunks are filed in
 * PATCH->PROP_PATCHES under their property name; a property hunk that
 * comes without a name is attributed to the property named by the most
 * recent property hunk (a text hunk in between forgets that name).
 *
 * If IGNORE_WHITESPACE is TRUE, lines without leading spaces will be
 * treated as context lines.
 * Allocate results in RESULT_POOL.
 * Use SCRATCH_POOL for temporary allocations. */
static svn_error_t *
parse_hunks(svn_patch_t *patch, apr_file_t *apr_file,
            svn_boolean_t ignore_whitespace,
            apr_pool_t *result_pool, apr_pool_t *scratch_pool)
{
  const char *current_prop = NULL;
  apr_pool_t *iterpool;

  patch->hunks = apr_array_make(result_pool, 10, sizeof(svn_diff_hunk_t *));
  patch->prop_patches = apr_hash_make(result_pool);

  iterpool = svn_pool_create(scratch_pool);
  for (;;)
    {
      svn_diff_hunk_t *hunk;
      svn_boolean_t is_property;
      const char *prop_name;
      svn_diff_operation_kind_t prop_operation;

      svn_pool_clear(iterpool);

      SVN_ERR(parse_next_hunk(&hunk, &is_property, &prop_name,
                              &prop_operation, patch, apr_file,
                              ignore_whitespace, result_pool, iterpool));

      /* No further hunk could be parsed: we are done. */
      if (hunk == NULL)
        break;

      if (is_property)
        {
          /* Remember a freshly announced property name, or fall back to
           * the remembered one for a nameless follow-up hunk.
           * NOTE(review): if a nameless property hunk can arrive while
           * nothing is remembered, a NULL name reaches the hash lookup
           * in add_property_hunk() -- confirm against parse_next_hunk(). */
          if (prop_name)
            current_prop = prop_name;
          else
            prop_name = current_prop;

          SVN_ERR(add_property_hunk(patch, prop_name, hunk, prop_operation,
                                    result_pool));
        }
      else
        {
          APR_ARRAY_PUSH(patch->hunks, svn_diff_hunk_t *) = hunk;
          current_prop = NULL;
        }
    }
  svn_pool_destroy(iterpool);

  return SVN_NO_ERROR;
}
/* State machine for the diff header parser.
 *
 * For every line it reads, svn_diff_parse_next_patch() scans this table
 * from top to bottom.  The first entry whose expected input is a prefix
 * of the line AND whose required state equals the parser's current state
 * has its function called; that function receives a pointer to the
 * current state so it can move the parser on.  At most one entry fires
 * per line.
 *
 * Expected Input Required state Function to call */
static struct transition transitions[] =
{
{"--- ", state_start, diff_minus},
{"+++ ", state_minus_seen, diff_plus},
{"diff --git", state_start, git_start},
{"--- a/", state_git_diff_seen, git_minus},
{"--- a/", state_git_tree_seen, git_minus},
{"--- /dev/null", state_git_tree_seen, git_minus},
{"+++ b/", state_git_minus_seen, git_plus},
{"+++ /dev/null", state_git_minus_seen, git_plus},
{"rename from ", state_git_diff_seen, git_move_from},
{"rename to ", state_move_from_seen, git_move_to},
{"copy from ", state_git_diff_seen, git_copy_from},
{"copy to ", state_copy_from_seen, git_copy_to},
{"new file ", state_git_diff_seen, git_new_file},
{"deleted file ", state_git_diff_seen, git_deleted_file},
};
/* Parse the next patch from PATCH_FILE, starting at the offset recorded
 * in PATCH_FILE->NEXT_PATCH_OFFSET, and return it in *PATCH (allocated in
 * RESULT_POOL).
 *
 * *PATCH is set to NULL if the file's EOF condition is already flagged on
 * entry, or if no diff header providing both an old and a new filename
 * could be found.  If REVERSE is TRUE the old and new filenames of the
 * returned patch are swapped.  IGNORE_WHITESPACE is handed on to the hunk
 * parser.  The text hunks of a returned patch are sorted with
 * compare_hunks().
 *
 * Before returning successfully, PATCH_FILE->NEXT_PATCH_OFFSET is updated
 * to the current file position so the next call resumes from there.
 * Use SCRATCH_POOL for temporary allocations. */
svn_error_t *
svn_diff_parse_next_patch(svn_patch_t **patch,
                          svn_patch_file_t *patch_file,
                          svn_boolean_t reverse,
                          svn_boolean_t ignore_whitespace,
                          apr_pool_t *result_pool,
                          apr_pool_t *scratch_pool)
{
  apr_off_t pos, last_line;
  svn_boolean_t eof;
  svn_boolean_t line_after_tree_header_read = FALSE;
  apr_pool_t *iterpool;
  enum parse_state state = state_start;

  if (apr_file_eof(patch_file->apr_file) == APR_EOF)
    {
      /* No more patches here. */
      *patch = NULL;
      return SVN_NO_ERROR;
    }

  *patch = apr_pcalloc(result_pool, sizeof(**patch));

  pos = patch_file->next_patch_offset;
  SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &pos, scratch_pool));

  iterpool = svn_pool_create(scratch_pool);
  do
    {
      svn_stringbuf_t *line;
      svn_boolean_t valid_header_line = FALSE;
      int i;

      svn_pool_clear(iterpool);

      /* Remember the current line's offset, and read the line. */
      last_line = pos;
      SVN_ERR(readline(patch_file->apr_file, &line, NULL, &eof,
                       APR_SIZE_MAX, iterpool, iterpool));

      if (! eof)
        {
          /* Update line offset for next iteration. */
          pos = 0;
          SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR, &pos,
                                   iterpool));
        }

      /* Run the state machine. */
      for (i = 0; i < (sizeof(transitions) / sizeof(transitions[0])); i++)
        {
          if (starts_with(line->data, transitions[i].expected_input)
              && state == transitions[i].required_state)
            {
              SVN_ERR(transitions[i].fn(&state, line->data, *patch,
                                        result_pool, iterpool));
              valid_header_line = TRUE;
              break;
            }
        }

      if (state == state_unidiff_found || state == state_git_header_found)
        {
          /* We have a valid diff header, yay! */
          break;
        }
      else if (state == state_git_tree_seen && line_after_tree_header_read)
        {
          /* git patches can contain an index line after the file mode line */
          if (!starts_with(line->data, "index "))
            {
              /* We have a valid diff header for a patch with only tree
               * changes.  Rewind to the start of the line just read, so
               * subsequent calls to this function don't end up skipping
               * the line -- it may contain a patch. */
              SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET,
                                       &last_line, scratch_pool));
              break;
            }
        }
      else if (state == state_git_tree_seen)
        {
          line_after_tree_header_read = TRUE;
        }
      else if (! valid_header_line && state != state_start
               && !starts_with(line->data, "index "))
        {
          /* We've encountered an invalid diff header.
           *
           * Rewind to the start of the line just read - it may be a new
           * header that begins there. */
          SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line,
                                   scratch_pool));
          state = state_start;

          /* The abandoned header may already have set this flag.  Clear
           * it together with the state; otherwise the first tree line of
           * the next git header would at once be taken for the end of a
           * tree-changes-only patch. */
          line_after_tree_header_read = FALSE;
        }
    }
  while (! eof);

  (*patch)->reverse = reverse;
  if (reverse)
    {
      const char *temp;

      temp = (*patch)->old_filename;
      (*patch)->old_filename = (*patch)->new_filename;
      (*patch)->new_filename = temp;
    }

  if ((*patch)->old_filename == NULL || (*patch)->new_filename == NULL)
    {
      /* Something went wrong, just discard the result. */
      *patch = NULL;
    }
  else
    SVN_ERR(parse_hunks(*patch, patch_file->apr_file, ignore_whitespace,
                        result_pool, iterpool));

  svn_pool_destroy(iterpool);

  /* Record where we stopped so the next call picks up from here. */
  patch_file->next_patch_offset = 0;
  SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR,
                           &patch_file->next_patch_offset, scratch_pool));

  if (*patch)
    {
      /* Usually, hunks appear in the patch sorted by their original line
       * offset. But just in case they weren't parsed in this order for
       * some reason, we sort them so that our caller can assume that hunks
       * are sorted as if parsed from a usual patch. */
      qsort((*patch)->hunks->elts, (*patch)->hunks->nelts,
            (*patch)->hunks->elt_size, compare_hunks);
    }

  return SVN_NO_ERROR;
}
/* Close the APR file underlying PATCH_FILE, using SCRATCH_POOL for
 * temporary allocations.  Any error from closing the file is passed on
 * to the caller. */
svn_error_t *
svn_diff_close_patch_file(svn_patch_file_t *patch_file,
                          apr_pool_t *scratch_pool)
{
  svn_error_t *err = svn_io_file_close(patch_file->apr_file, scratch_pool);

  return svn_error_trace(err);
}