Skip to content

Commit e45063a

Browse files
committed
Implement a fuzzy diffing option for interdiff
This implements a --fuzzy option to make interdiff perform a fuzzy comparison between two diffs. This is very helpful, for example, for comparing a backport patch to its upstream source patch to assist a human reviewer in verifying the correctness of the backport. To achieve fuzzy diffing: Instead of creating a different dummy original file for p1 and p2 and applying p1 to p2's original file and p2 to p1's original file, use p1's original file for both cases when fuzzy diffing. This way, interdiff can rely upon the patch command to fuzz the line offset and context differences between hunks in p1 and p2. And then the final diff command at the end will show the remaining differences using the same base file, so it won't report hunks as differing just because of differing line offsets. The diff command can't do any fuzzy diffing, hence why we need to rely on the patch command for this capability. The fuzzy diffing option also handles rejected hunks by showing them in the output as a differing hunk. And it tries to avoid *additionally* showing the *reverse* of the rejected hunks by applying the rejected hunks with maximum fuzz, so that the final diff command doesn't show them.
1 parent d35163c commit e45063a

File tree

1 file changed

+172
-13
lines changed

1 file changed

+172
-13
lines changed

src/interdiff.c

Lines changed: 172 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,7 @@ static int no_revert_omitted = 0;
122122
static int use_colors = 0;
123123
static int color_option_specified = 0;
124124
static int debug = 0;
125+
static int fuzzy = 0;
125126

126127
static struct patlist *pat_drop_context = NULL;
127128

@@ -934,9 +935,9 @@ output_patch1_only (FILE *p1, FILE *out, int not_reverted)
934935
}
935936

936937
static int
937-
apply_patch (FILE *patch, const char *file, int reverted)
938+
apply_patch (FILE *patch, const char *file, int reverted, int max_fuzz_no_rej)
938939
{
939-
#define MAX_PATCH_ARGS 4
940+
#define MAX_PATCH_ARGS 7
940941
const char *argv[MAX_PATCH_ARGS];
941942
int argc = 0;
942943
const char *basename;
@@ -966,6 +967,17 @@ apply_patch (FILE *patch, const char *file, int reverted)
966967
argv[argc++] = PATCH;
967968
argv[argc++] = reverted ? (has_ignore_all_space ? "-Rlsp0" : "-Rsp0")
968969
: (has_ignore_all_space ? "-lsp0" : "-sp0");
970+
if (fuzzy) {
971+
/* Don't generate .orig files when we expect rejected hunks */
972+
argv[argc++] = "--no-backup-if-mismatch";
973+
974+
/* Ignore all context lines by setting fuzz to INT_MAX, and
975+
* don't generate a reject file even if any hunks fail. */
976+
if (max_fuzz_no_rej) {
977+
argv[argc++] = "--fuzz=2147483647";
978+
argv[argc++] = "--reject-file=-";
979+
}
980+
}
969981
argv[argc++] = file;
970982
argv[argc++] = NULL;
971983

@@ -1156,20 +1168,133 @@ trim_context (FILE *f /* positioned at start of @@ line */,
11561168
return 0;
11571169
}
11581170

1171+
/* Open the reject file that sits next to FILE, i.e. "<file>.rej", for
 * reading.
 *
 * The ".rej" suffix is written directly behind FILE's current contents and
 * cut off again before returning, so the buffer FILE points into must have
 * at least four writable bytes beyond its terminating NUL.  On return FILE
 * holds its original string again.
 *
 * Returns the opened stream, or NULL when fopen() fails. */
static FILE *
open_rej_file (char *file)
{
	static const char suffix[] = ".rej";
	const size_t base_len = strlen (file);
	FILE *stream;

	/* sizeof includes the suffix's own NUL terminator */
	memcpy (file + base_len, suffix, sizeof suffix);
	stream = fopen (file, "r");
	file[base_len] = '\0';

	return stream;
}
1183+
1184+
/* Returns a pointer to the .rej file and the line offset of the first hunk */
1185+
static FILE *
1186+
fuzz_rejected_hunks (char *tmpp2, unsigned long *rej_offset)
1187+
{
1188+
/* In fuzzy mode, emit the patch rejects separately instead of
1189+
* erroring out, since the fuzzy mode is meant for producing an
1190+
* interdiff for human eyes rather than a functional diff. */
1191+
char *line = NULL;
1192+
size_t line_len;
1193+
long atat_pos;
1194+
FILE *rej;
1195+
1196+
if (!fuzzy)
1197+
return NULL;
1198+
1199+
/* It's only ever patch2 that can fail in fuzzy mode because patch1 is
1200+
* applied to its own original file in fuzzy mode. */
1201+
rej = open_rej_file (tmpp2);
1202+
if (!rej)
1203+
return NULL;
1204+
1205+
/* Skip (the first two) lines to get to the start of the @@ line */
1206+
do {
1207+
atat_pos = ftell (rej);
1208+
if (getline (&line, &line_len, rej) < 0)
1209+
error (EXIT_FAILURE, errno,
1210+
"Failed to read line from .rej");
1211+
} while (strncmp (line, "@@ ", 3));
1212+
fseek (rej, atat_pos, SEEK_SET);
1213+
1214+
/* Export the line offset of the first rej hunk */
1215+
if (read_atatline (line, rej_offset, NULL, NULL, NULL))
1216+
error (EXIT_FAILURE, 0, "line not understood: %s", line);
1217+
1218+
/* Apply the rejected hunks with maximum fuzzing to tmpp2 to increase
1219+
* the odds of them getting applied. The goal is to get the rejects
1220+
* applied so that they are excluded from the diff execution. These
1221+
* rejected hunks will still be shown. It's fine if this fails. */
1222+
apply_patch (rej, tmpp2, 0, 1);
1223+
1224+
/* Go back to the @@ after apply_patch() moved the file cursor */
1225+
fseek (rej, atat_pos, SEEK_SET);
1226+
free (line);
1227+
return rej;
1228+
}
1229+
1230+
static void
1231+
output_rej_hunk (const char *diff_line, FILE *rej, unsigned long *rej_offset,
1232+
FILE *out)
1233+
{
1234+
unsigned long diff_offset;
1235+
int first_line_done = 0;
1236+
char *line = NULL;
1237+
size_t line_len;
1238+
long atat_pos;
1239+
ssize_t got;
1240+
1241+
/* Nothing to do if no rej file or the rej file is finished */
1242+
if (!rej || feof (rej))
1243+
return;
1244+
1245+
/* Wait until the current diff line is an @@ line */
1246+
if (strncmp (diff_line, "@@ ", 3))
1247+
return;
1248+
1249+
if (read_atatline (diff_line, &diff_offset, NULL, NULL, NULL))
1250+
error (EXIT_FAILURE, 0, "line not understood: %s", diff_line);
1251+
1252+
/* Write the rej hunk before the diff hunk if its offset comes before */
1253+
if (*rej_offset > diff_offset)
1254+
return;
1255+
1256+
/* Write the rej hunk until EOF or the next @@ line (i.e., next hunk).
1257+
* Note that rej starts at the current @@ line that we must write, so
1258+
* don't look for the next @@ until after the first line is written. */
1259+
for (;;) {
1260+
got = getline (&line, &line_len, rej);
1261+
if (got < 0) {
1262+
if (feof (rej))
1263+
goto rej_file_eof;
1264+
error (EXIT_FAILURE, errno,
1265+
"Failed to read line from .rej");
1266+
}
1267+
if (first_line_done && !strncmp (line, "@@ ", 3))
1268+
break;
1269+
first_line_done = 1;
1270+
fwrite (line, (size_t) got, 1, out);
1271+
atat_pos = ftell (rej);
1272+
}
1273+
1274+
/* Record the line offset of the next rej hunk, only if there is one */
1275+
if (read_atatline (line, rej_offset, NULL, NULL, NULL))
1276+
error (EXIT_FAILURE, 0, "line not understood: %s", line);
1277+
fseek (rej, atat_pos, SEEK_SET);
1278+
1279+
rej_file_eof:
1280+
free (line);
1281+
}
1282+
11591283
static int
11601284
output_delta (FILE *p1, FILE *p2, FILE *out)
11611285
{
11621286
const char *tmpdir = getenv ("TMPDIR");
11631287
unsigned int tmplen;
11641288
const char tail1[] = "/interdiff-1.XXXXXX";
1165-
const char tail2[] = "/interdiff-2.XXXXXX";
1289+
const char tail2[] = "/interdiff-2.XXXXXX\0rej"; /* Add room for .rej */
11661290
char *tmpp1, *tmpp2;
11671291
int tmpp1fd, tmpp2fd;
11681292
struct lines_info file = { NULL, 0, 0, NULL, NULL };
11691293
struct lines_info file2 = { NULL, 0, 0, NULL, NULL };
11701294
char *oldname = NULL, *newname = NULL;
11711295
pid_t child;
1172-
FILE *in;
1296+
FILE *in, *rej = NULL;
1297+
unsigned long rej_offset;
11731298
size_t namelen;
11741299
long pos1 = ftell (p1), pos2 = ftell (p2);
11751300
long pristine1, pristine2;
@@ -1240,7 +1365,7 @@ output_delta (FILE *p1, FILE *p2, FILE *out)
12401365
start2 = ftell (p2);
12411366
fseek (p1, pos1, SEEK_SET);
12421367
fseek (p2, pos2, SEEK_SET);
1243-
create_orig (p2, &file, 0, NULL);
1368+
create_orig (fuzzy ? p1 : p2, &file, 0, NULL);
12441369
fseek (p1, pos1, SEEK_SET);
12451370
fseek (p2, pos2, SEEK_SET);
12461371
create_orig (p1, &file2, mode == mode_combine, NULL);
@@ -1254,13 +1379,17 @@ output_delta (FILE *p1, FILE *p2, FILE *out)
12541379
fseek (p1, start1, SEEK_SET);
12551380
fseek (p2, start2, SEEK_SET);
12561381

1257-
if (apply_patch (p1, tmpp1, mode == mode_combine))
1382+
if (apply_patch (p1, tmpp1, mode == mode_combine, 0))
12581383
error (EXIT_FAILURE, 0,
12591384
"Error applying patch1 to reconstructed file");
12601385

1261-
if (apply_patch (p2, tmpp2, 0))
1262-
error (EXIT_FAILURE, 0,
1263-
"Error applying patch2 to reconstructed file");
1386+
if (apply_patch (p2, tmpp2, 0, 0)) {
1387+
/* Cope with a .rej in fuzzy mode */
1388+
rej = fuzz_rejected_hunks (tmpp2, &rej_offset);
1389+
if (!rej)
1390+
error (EXIT_FAILURE, 0,
1391+
"Error applying patch2 to reconstructed file");
1392+
}
12641393

12651394
fseek (p1, pos1, SEEK_SET);
12661395

@@ -1287,7 +1416,7 @@ output_delta (FILE *p1, FILE *p2, FILE *out)
12871416
break;
12881417
}
12891418

1290-
if (!diff_is_empty) {
1419+
if (!diff_is_empty || rej) {
12911420
/* ANOTHER temporary file! This is to catch the case
12921421
* where we just don't have enough context to generate
12931422
* a proper interdiff. */
@@ -1298,6 +1427,7 @@ output_delta (FILE *p1, FILE *p2, FILE *out)
12981427
ssize_t got = getline (&line, &linelen, in);
12991428
if (got < 0)
13001429
break;
1430+
output_rej_hunk (line, rej, &rej_offset, tmpdiff);
13011431
fwrite (line, (size_t) got, 1, tmpdiff);
13021432
if (*line != ' ' && !strcmp (line + 1, file.unline)) {
13031433
/* Uh-oh. We're trying to output a
@@ -1323,6 +1453,16 @@ output_delta (FILE *p1, FILE *p2, FILE *out)
13231453
}
13241454
free (line);
13251455

1456+
/* Output any remaining rej hunks */
1457+
if (rej && !feof (rej)) {
1458+
for (;;) {
1459+
int ch = fgetc (rej);
1460+
if (ch == EOF)
1461+
break;
1462+
fputc (ch, tmpdiff);
1463+
}
1464+
}
1465+
13261466
/* First character */
13271467
if (human_readable) {
13281468
char *p, *q, c, d;
@@ -1356,6 +1496,11 @@ output_delta (FILE *p1, FILE *p2, FILE *out)
13561496
else {
13571497
unlink (tmpp1);
13581498
unlink (tmpp2);
1499+
if (rej) {
1500+
/* Delete the .rej file generated in fuzzy mode */
1501+
strcat (tmpp2, ".rej");
1502+
unlink (tmpp2);
1503+
}
13591504
}
13601505
free (oldname);
13611506
free (newname);
@@ -1368,6 +1513,11 @@ output_delta (FILE *p1, FILE *p2, FILE *out)
13681513
else {
13691514
unlink (tmpp1);
13701515
unlink (tmpp2);
1516+
if (rej) {
1517+
/* Delete the .rej file generated in fuzzy mode */
1518+
strcat (tmpp2, ".rej");
1519+
unlink (tmpp2);
1520+
}
13711521
}
13721522
if (human_readable)
13731523
fprintf (out, "%s impossible; taking evasive action\n",
@@ -1816,7 +1966,7 @@ flipdiff (FILE *p1, FILE *p2, FILE *flip1, FILE *flip2)
18161966
tmpfd = xmkstemp (tmpp1);
18171967
write_file (&intermediate, tmpfd);
18181968
fsetpos (p1, &at1);
1819-
if (apply_patch (p1, tmpp1, 1))
1969+
if (apply_patch (p1, tmpp1, 1, 0))
18201970
error (EXIT_FAILURE, 0,
18211971
"Error reconstructing original file");
18221972

@@ -1825,7 +1975,7 @@ flipdiff (FILE *p1, FILE *p2, FILE *flip1, FILE *flip2)
18251975
tmpfd = xmkstemp (tmpp3);
18261976
write_file (&intermediate, tmpfd);
18271977
fsetpos (p2, &at2);
1828-
if (apply_patch (p2, tmpp3, 0))
1978+
if (apply_patch (p2, tmpp3, 0, 0))
18291979
error (EXIT_FAILURE, 0,
18301980
"Error reconstructing final file");
18311981

@@ -2227,7 +2377,10 @@ syntax (int err)
22272377
" (interdiff) When a patch from patch1 is not in patch2,\n"
22282378
" don't revert it\n"
22292379
" --in-place (flipdiff) Write the output to the original input\n"
2230-
" files\n";
2380+
" files\n"
2381+
" --fuzzy\n"
2382+
" (interdiff) Perform a fuzzy comparison, which filters\n"
2383+
" out hunks that the patch utility can apply with fuzz\n";
22312384

22322385
fprintf (err ? stderr : stdout, syntax_str, progname, progname);
22332386
exit (err);
@@ -2289,6 +2442,7 @@ main (int argc, char *argv[])
22892442
{"flip", 0, 0, 1000 + 'F' },
22902443
{"no-revert-omitted", 0, 0, 1000 + 'R' },
22912444
{"in-place", 0, 0, 1000 + 'i' },
2445+
{"fuzzy", 0, 0, 1000 + 'f' },
22922446
{"debug", 0, 0, 1000 + 'D' },
22932447
{"strip-match", 1, 0, 'p'},
22942448
{"unified", 1, 0, 'U'},
@@ -2376,6 +2530,11 @@ main (int argc, char *argv[])
23762530
syntax (1);
23772531
flipdiff_inplace = 1;
23782532
break;
2533+
case 1000 + 'f':
2534+
if (mode != mode_inter)
2535+
syntax (1);
2536+
fuzzy = 1;
2537+
break;
23792538
case 1000 + 'D':
23802539
debug = 1;
23812540
break;

0 commit comments

Comments
 (0)