Blob


1 /* $OpenBSD: diffreg.c,v 1.91 2016/03/01 20:57:35 natano Exp $ */
3 /*
4 * Copyright (C) Caldera International Inc. 2001-2002.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code and documentation must retain the above
11 * copyright notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * 3. All advertising materials mentioning features or use of this software
16 * must display the following acknowledgement:
17 * This product includes software developed or owned by Caldera
18 * International, Inc.
19 * 4. Neither the name of Caldera International, Inc. nor the names of other
20 * contributors may be used to endorse or promote products derived from
21 * this software without specific prior written permission.
22 *
23 * USE OF THE SOFTWARE PROVIDED FOR UNDER THIS LICENSE BY CALDERA
24 * INTERNATIONAL, INC. AND CONTRIBUTORS ``AS IS'' AND ANY EXPRESS OR
25 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
26 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
27 * IN NO EVENT SHALL CALDERA INTERNATIONAL, INC. BE LIABLE FOR ANY DIRECT,
28 * INDIRECT INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
29 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
30 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
31 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
32 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING
33 * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
34 * POSSIBILITY OF SUCH DAMAGE.
35 */
36 /*-
37 * Copyright (c) 1991, 1993
38 * The Regents of the University of California. All rights reserved.
39 *
40 * Redistribution and use in source and binary forms, with or without
41 * modification, are permitted provided that the following conditions
42 * are met:
43 * 1. Redistributions of source code must retain the above copyright
44 * notice, this list of conditions and the following disclaimer.
45 * 2. Redistributions in binary form must reproduce the above copyright
46 * notice, this list of conditions and the following disclaimer in the
47 * documentation and/or other materials provided with the distribution.
48 * 3. Neither the name of the University nor the names of its contributors
49 * may be used to endorse or promote products derived from this software
50 * without specific prior written permission.
51 *
52 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62 * SUCH DAMAGE.
63 *
64 * @(#)diffreg.c 8.1 (Berkeley) 6/6/93
65 */
67 #include <sys/types.h>
68 #include <sys/stat.h>
69 #include <sys/wait.h>
70 #include <sys/queue.h>
72 #include <ctype.h>
73 #include <err.h>
74 #include <errno.h>
75 #include <fcntl.h>
76 #include <paths.h>
77 #include <stdarg.h>
78 #include <stddef.h>
79 #include <stdint.h>
80 #include <stdio.h>
81 #include <stdlib.h>
82 #include <string.h>
83 #include <time.h>
84 #include <unistd.h>
85 #include <limits.h>
86 #include <sha1.h>
87 #include <zlib.h>
89 #include "got_error.h"
90 #include "got_object.h"
91 #include "got_diff.h"
93 #include "got_lib_diff.h"
95 #define MINIMUM(a, b) (((a) < (b)) ? (a) : (b))
96 #define MAXIMUM(a, b) (((a) > (b)) ? (a) : (b))
98 /*
99 * diff - compare two files.
100 */
102 /*
103 * Uses an algorithm due to Harold Stone, which finds
104 * a pair of longest identical subsequences in the two
105 * files.
107 * The major goal is to generate the match vector J.
108 * J[i] is the index of the line in file1 corresponding
109 * to line i file0. J[i] = 0 if there is no
110 * such line in file1.
112 * Lines are hashed so as to work in core. All potential
113 * matches are located by sorting the lines of each file
114 * on the hash (called ``value''). In particular, this
115 * collects the equivalence classes in file1 together.
116 * Subroutine equiv replaces the value of each line in
117 * file0 by the index of the first element of its
118 * matching equivalence in (the reordered) file1.
119 * To save space equiv squeezes file1 into a single
120 * array member in which the equivalence classes
121 * are simply concatenated, except that their first
122 * members are flagged by changing sign.
124 * Next the indices that point into member are unsorted into
125 * array class according to the original order of file0.
127 * The cleverness lies in routine stone. This marches
128 * through the lines of file0, developing a vector klist
129 * of "k-candidates". At step i a k-candidate is a matched
130 * pair of lines x,y (x in file0 y in file1) such that
131 * there is a common subsequence of length k
132 * between the first i lines of file0 and the first y
133 * lines of file1, but there is no such subsequence for
134 * any smaller y. x is the earliest possible mate to y
135 * that occurs in such a subsequence.
137 * Whenever any of the members of the equivalence class of
138 * lines in file1 matable to a line in file0 has serial number
139 * less than the y of some k-candidate, that k-candidate
140 * with the smallest such y is replaced. The new
141 * k-candidate is chained (via pred) to the current
142 * k-1 candidate so that the actual subsequence can
143 * be recovered. When a member has serial number greater
144 * that the y of all k-candidates, the klist is extended.
145 * At the end, the longest subsequence is pulled out
146 * and placed in the array J by unravel
148 * With J in hand, the matches there recorded are
149 * check'ed against reality to assure that no spurious
150 * matches have crept in due to hashing. If they have,
151 * they are broken, and "jackpot" is recorded--a harmless
152 * matter except that a true match for a spuriously
153 * mated line may now be unnecessarily reported as a change.
155 * Much of the complexity of the program comes simply
156 * from trying to minimize core utilization and
157 * maximize the range of doable problems by dynamically
158 * allocating what is needed and reusing what is not.
159 * The core requirements for problems larger than somewhat
160 * are (in words) 2*length(file0) + length(file1) +
161 * 3*(number of k-candidates installed), typically about
162 * 6n words for files of length n.
163 */
165 struct cand {
166 int x;
167 int y;
168 int pred;
169 };
171 struct line {
172 int serial;
173 int value;
174 };
176 static void diff_output(FILE *, const char *, ...);
177 static int output(FILE *, struct got_diff_changes *, struct got_diff_state *, struct got_diff_args *, const char *, FILE *, const char *, FILE *, int);
178 static void check(struct got_diff_state *, FILE *, FILE *, int);
179 static void range(FILE *, int, int, char *);
180 static void uni_range(FILE *, int, int);
181 static void dump_unified_vec(FILE *, struct got_diff_changes *, struct got_diff_state *, struct got_diff_args *, FILE *, FILE *, int);
182 static int prepare(struct got_diff_state *, int, FILE *, off_t, int);
183 static void prune(struct got_diff_state *);
184 static void equiv(struct line *, int, struct line *, int, int *);
185 static void unravel(struct got_diff_state *, int);
186 static int unsort(struct line *, int, int *);
187 static int change(FILE *, struct got_diff_changes *, struct got_diff_state *, struct got_diff_args *, const char *, FILE *, const char *, FILE *, int, int, int, int, int *);
188 static void sort(struct line *, int);
189 static void print_header(FILE *, struct got_diff_state *, struct got_diff_args *, const char *, const char *);
190 static int asciifile(FILE *);
191 static void fetch(FILE *, struct got_diff_state *, struct got_diff_args *, long *, int, int, FILE *, int, int);
192 static int newcand(struct got_diff_state *, int, int, int, int *);
193 static int search(struct got_diff_state *, int *, int, int);
194 static int skipline(FILE *);
195 static int isqrt(int);
196 static int stone(struct got_diff_state *, int *, int, int *, int *, int);
197 static int readhash(struct got_diff_state *, FILE *, int);
198 static int files_differ(struct got_diff_state *, FILE *, FILE *);
199 static char *match_function(struct got_diff_state *, const long *, int, FILE *);
201 /*
202 * chrtran points to one of 2 translation tables: cup2low if folding upper to
203 * lower case clow2low if not folding case
204 */
205 u_char clow2low[256] = {
206 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
207 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
208 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
209 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
210 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
211 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41,
212 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c,
213 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,
214 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
215 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
216 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
217 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
218 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
219 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
220 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
221 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
222 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
223 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
224 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
225 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
226 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
227 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
228 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
229 0xfd, 0xfe, 0xff
230 };
232 u_char cup2low[256] = {
233 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a,
234 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
235 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
236 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b,
237 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36,
238 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x60, 0x61,
239 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c,
240 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,
241 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x60, 0x61, 0x62,
242 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d,
243 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
244 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83,
245 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e,
246 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99,
247 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4,
248 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
249 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba,
250 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5,
251 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0,
252 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb,
253 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6,
254 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
255 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc,
256 0xfd, 0xfe, 0xff
257 };
259 static void
260 diff_output(FILE *outfile, const char *fmt, ...)
262 va_list ap;
264 if (outfile == NULL)
265 return;
267 va_start(ap, fmt);
268 vfprintf(outfile, fmt, ap);
269 va_end(ap);
272 void
273 got_diff_state_free(struct got_diff_state *ds)
275 free(ds->J);
276 free(ds->member);
277 free(ds->class);
278 free(ds->clist);
279 free(ds->klist);
280 free(ds->ixold);
281 free(ds->ixnew);
284 const struct got_error *
285 got_diffreg(int *rval, FILE *f1, FILE *f2, int flags,
286 struct got_diff_args *args, struct got_diff_state *ds, FILE *outfile,
287 struct got_diff_changes *changes)
289 const struct got_error *err = NULL;
290 int i, *p;
291 long *lp;
293 *rval = D_SAME;
294 ds->anychange = 0;
295 ds->lastline = 0;
296 ds->lastmatchline = 0;
297 ds->context_vec_ptr = ds->context_vec_start - 1;
298 ds->max_context = GOT_DIFF_MAX_CONTEXT;
299 if (flags & D_IGNORECASE)
300 ds->chrtran = cup2low;
301 else
302 ds->chrtran = clow2low;
303 if (S_ISDIR(ds->stb1.st_mode) != S_ISDIR(ds->stb2.st_mode)) {
304 *rval = (S_ISDIR(ds->stb1.st_mode) ? D_MISMATCH1 : D_MISMATCH2);
305 return NULL;
307 if (!(flags & D_EMPTY1) && f1 == NULL) {
308 args->status |= 2;
309 goto closem;
312 if (!(flags & D_EMPTY2) && f2 == NULL) {
313 args->status |= 2;
314 goto closem;
317 switch (files_differ(ds, f1, f2)) {
318 case 0:
319 goto closem;
320 case 1:
321 break;
322 default:
323 /* error */
324 args->status |= 2;
325 goto closem;
328 if ((flags & D_FORCEASCII) == 0 &&
329 (!asciifile(f1) || !asciifile(f2))) {
330 *rval = D_BINARY;
331 args->status |= 1;
332 goto closem;
334 if (prepare(ds, 0, f1, ds->stb1.st_size, flags)) {
335 err = got_error_from_errno("prepare");
336 goto closem;
338 if (prepare(ds, 1, f2, ds->stb2.st_size, flags)) {
339 err = got_error_from_errno("prepare");
340 goto closem;
343 prune(ds);
344 sort(ds->sfile[0], ds->slen[0]);
345 sort(ds->sfile[1], ds->slen[1]);
347 ds->member = (int *)ds->file[1];
348 equiv(ds->sfile[0], ds->slen[0], ds->sfile[1], ds->slen[1], ds->member);
349 p = reallocarray(ds->member, ds->slen[1] + 2, sizeof(*ds->member));
350 if (p == NULL) {
351 err = got_error_from_errno("reallocarray");
352 goto closem;
354 ds->member = p;
356 ds->class = (int *)ds->file[0];
357 if (unsort(ds->sfile[0], ds->slen[0], ds->class)) {
358 err = got_error_from_errno("unsort");
359 goto closem;
361 p = reallocarray(ds->class, ds->slen[0] + 2, sizeof(*ds->class));
362 if (p == NULL) {
363 err = got_error_from_errno("reallocarray");
364 goto closem;
366 ds->class = p;
368 ds->klist = calloc(ds->slen[0] + 2, sizeof(*ds->klist));
369 if (ds->klist == NULL) {
370 err = got_error_from_errno("calloc");
371 goto closem;
373 ds->clen = 0;
374 ds->clistlen = 100;
375 ds->clist = calloc(ds->clistlen, sizeof(*ds->clist));
376 if (ds->clist == NULL) {
377 err = got_error_from_errno("calloc");
378 goto closem;
380 i = stone(ds, ds->class, ds->slen[0], ds->member, ds->klist, flags);
381 if (i < 0) {
382 err = got_error_from_errno("stone");
383 goto closem;
386 p = reallocarray(ds->J, ds->len[0] + 2, sizeof(*ds->J));
387 if (p == NULL) {
388 err = got_error_from_errno("reallocarray");
389 goto closem;
391 ds->J = p;
392 unravel(ds, ds->klist[i]);
394 lp = reallocarray(ds->ixold, ds->len[0] + 2, sizeof(*ds->ixold));
395 if (lp == NULL) {
396 err = got_error_from_errno("reallocarray");
397 goto closem;
399 ds->ixold = lp;
400 lp = reallocarray(ds->ixnew, ds->len[1] + 2, sizeof(*ds->ixnew));
401 if (lp == NULL) {
402 err = got_error_from_errno("reallocarray");
403 goto closem;
405 ds->ixnew = lp;
406 check(ds, f1, f2, flags);
407 if (output(outfile, changes, ds, args, args->label[0], f1,
408 args->label[1], f2, flags))
409 err = got_error_from_errno("output");
410 closem:
411 if (ds->anychange) {
412 args->status |= 1;
413 if (*rval == D_SAME)
414 *rval = D_DIFFER;
416 return (err);
419 /*
420 * Check to see if the given files differ.
421 * Returns 0 if they are the same, 1 if different, and -1 on error.
422 * XXX - could use code from cmp(1) [faster]
423 */
424 static int
425 files_differ(struct got_diff_state *ds, FILE *f1, FILE *f2)
427 char buf1[BUFSIZ], buf2[BUFSIZ];
428 size_t i, j;
430 if (f1 == NULL || f2 == NULL || ds->stb1.st_size != ds->stb2.st_size ||
431 (ds->stb1.st_mode & S_IFMT) != (ds->stb2.st_mode & S_IFMT))
432 return (1);
433 for (;;) {
434 i = fread(buf1, 1, sizeof(buf1), f1);
435 j = fread(buf2, 1, sizeof(buf2), f2);
436 if ((!i && ferror(f1)) || (!j && ferror(f2)))
437 return (-1);
438 if (i != j)
439 return (1);
440 if (i == 0)
441 return (0);
442 if (memcmp(buf1, buf2, i) != 0)
443 return (1);
447 static int
448 prepare(struct got_diff_state *ds, int i, FILE *fd, off_t filesize, int flags)
450 struct line *p, *q;
451 int j, h;
452 size_t sz;
454 if (fd != NULL)
455 rewind(fd);
457 sz = (filesize <= SIZE_MAX ? filesize : SIZE_MAX) / 25;
458 if (sz < 100)
459 sz = 100;
461 p = calloc(sz + 3, sizeof(*p));
462 if (p == NULL)
463 return (-1);
464 for (j = 0; fd != NULL && (h = readhash(ds, fd, flags));) {
465 if (j == sz) {
466 sz = sz * 3 / 2;
467 q = reallocarray(p, sz + 3, sizeof(*p));
468 if (q == NULL) {
469 free(p);
470 return (-1);
472 p = q;
474 p[++j].value = h;
476 ds->len[i] = j;
477 ds->file[i] = p;
479 return (0);
482 static void
483 prune(struct got_diff_state *ds)
485 int i, j;
487 for (ds->pref = 0; ds->pref < ds->len[0] && ds->pref < ds->len[1] &&
488 ds->file[0][ds->pref + 1].value == ds->file[1][ds->pref + 1].value;
489 ds->pref++)
491 for (ds->suff = 0; ds->suff < ds->len[0] - ds->pref && ds->suff < ds->len[1] - ds->pref &&
492 ds->file[0][ds->len[0] - ds->suff].value == ds->file[1][ds->len[1] - ds->suff].value;
493 ds->suff++)
495 for (j = 0; j < 2; j++) {
496 ds->sfile[j] = ds->file[j] + ds->pref;
497 ds->slen[j] = ds->len[j] - ds->pref - ds->suff;
498 for (i = 0; i <= ds->slen[j]; i++)
499 ds->sfile[j][i].serial = i;
503 static void
504 equiv(struct line *a, int n, struct line *b, int m, int *c)
506 int i, j;
508 i = j = 1;
509 while (i <= n && j <= m) {
510 if (a[i].value < b[j].value)
511 a[i++].value = 0;
512 else if (a[i].value == b[j].value)
513 a[i++].value = j;
514 else
515 j++;
517 while (i <= n)
518 a[i++].value = 0;
519 b[m + 1].value = 0;
520 j = 0;
521 while (++j <= m) {
522 c[j] = -b[j].serial;
523 while (b[j + 1].value == b[j].value) {
524 j++;
525 c[j] = b[j].serial;
528 c[j] = -1;
531 /* Code taken from ping.c */
532 static int
533 isqrt(int n)
535 int y, x = 1;
537 if (n == 0)
538 return (0);
540 do { /* newton was a stinker */
541 y = x;
542 x = n / x;
543 x += y;
544 x /= 2;
545 } while ((x - y) > 1 || (x - y) < -1);
547 return (x);
550 static int
551 stone(struct got_diff_state *ds, int *a, int n, int *b, int *c, int flags)
553 int i, k, y, j, l;
554 int oldc, tc, oldl, sq;
555 u_int numtries, bound;
556 int error;
558 if (flags & D_MINIMAL)
559 bound = UINT_MAX;
560 else {
561 sq = isqrt(n);
562 bound = MAXIMUM(256, sq);
565 k = 0;
566 c[0] = newcand(ds, 0, 0, 0, &error);
567 if (error)
568 return -1;
569 for (i = 1; i <= n; i++) {
570 j = a[i];
571 if (j == 0)
572 continue;
573 y = -b[j];
574 oldl = 0;
575 oldc = c[0];
576 numtries = 0;
577 do {
578 if (y <= ds->clist[oldc].y)
579 continue;
580 l = search(ds, c, k, y);
581 if (l != oldl + 1)
582 oldc = c[l - 1];
583 if (l <= k) {
584 if (ds->clist[c[l]].y <= y)
585 continue;
586 tc = c[l];
587 c[l] = newcand(ds, i, y, oldc, &error);
588 if (error)
589 return -1;
590 oldc = tc;
591 oldl = l;
592 numtries++;
593 } else {
594 c[l] = newcand(ds, i, y, oldc, &error);
595 if (error)
596 return -1;
597 k++;
598 break;
600 } while ((y = b[++j]) > 0 && numtries < bound);
602 return (k);
605 static int
606 newcand(struct got_diff_state *ds, int x, int y, int pred, int *errorp)
608 struct cand *q;
610 if (ds->clen == ds->clistlen) {
611 ds->clistlen = ds->clistlen * 11 / 10;
612 q = reallocarray(ds->clist, ds->clistlen, sizeof(*ds->clist));
613 if (q == NULL) {
614 *errorp = -1;
615 free(ds->clist);
616 ds->clist = NULL;
617 return 0;
619 ds->clist = q;
621 q = ds->clist + ds->clen;
622 q->x = x;
623 q->y = y;
624 q->pred = pred;
625 *errorp = 0;
626 return (ds->clen++);
629 static int
630 search(struct got_diff_state *ds, int *c, int k, int y)
632 int i, j, l, t;
634 if (ds->clist[c[k]].y < y) /* quick look for typical case */
635 return (k + 1);
636 i = 0;
637 j = k + 1;
638 for (;;) {
639 l = (i + j) / 2;
640 if (l <= i)
641 break;
642 t = ds->clist[c[l]].y;
643 if (t > y)
644 j = l;
645 else if (t < y)
646 i = l;
647 else
648 return (l);
650 return (l + 1);
653 static void
654 unravel(struct got_diff_state *ds, int p)
656 struct cand *q;
657 int i;
659 for (i = 0; i <= ds->len[0]; i++)
660 ds->J[i] = i <= ds->pref ? i :
661 i > ds->len[0] - ds->suff ? i + ds->len[1] - ds->len[0] : 0;
662 for (q = ds->clist + p; q->y != 0; q = ds->clist + q->pred)
663 ds->J[q->x + ds->pref] = q->y + ds->pref;
666 /*
667 * Check does double duty:
668 * 1. ferret out any fortuitous correspondences due
669 * to confounding by hashing (which result in "jackpot")
670 * 2. collect random access indexes to the two files
671 */
672 static void
673 check(struct got_diff_state *ds, FILE *f1, FILE *f2, int flags)
675 int i, j, jackpot, c, d;
676 long ctold, ctnew;
678 if (f1 != NULL)
679 rewind(f1);
680 if (f2 != NULL)
681 rewind(f2);
682 j = 1;
683 ds->ixold[0] = ds->ixnew[0] = 0;
684 jackpot = 0;
685 ctold = ctnew = 0;
686 for (i = 1; i <= ds->len[0]; i++) {
687 if (ds->J[i] == 0) {
688 ds->ixold[i] = ctold += skipline(f1);
689 continue;
691 while (j < ds->J[i]) {
692 ds->ixnew[j] = ctnew += skipline(f2);
693 j++;
695 if (flags & (D_FOLDBLANKS|D_IGNOREBLANKS|D_IGNORECASE)) {
696 for (;;) {
697 c = (f1 == NULL ? EOF : getc(f1));
698 d = (f2 == NULL ? EOF : getc(f2));
699 /*
700 * GNU diff ignores a missing newline
701 * in one file for -b or -w.
702 */
703 if (flags & (D_FOLDBLANKS|D_IGNOREBLANKS)) {
704 if (c == EOF && d == '\n') {
705 ctnew++;
706 break;
707 } else if (c == '\n' && d == EOF) {
708 ctold++;
709 break;
712 ctold++;
713 ctnew++;
714 if ((flags & D_FOLDBLANKS) && isspace(c) &&
715 isspace(d)) {
716 do {
717 if (c == '\n')
718 break;
719 ctold++;
720 } while (f1 && isspace(c = getc(f1)));
721 do {
722 if (d == '\n')
723 break;
724 ctnew++;
725 } while (f2 && isspace(d = getc(f2)));
726 } else if ((flags & D_IGNOREBLANKS)) {
727 while (f1 && isspace(c) && c != '\n') {
728 c = getc(f1);
729 ctold++;
731 while (f2 && isspace(d) && d != '\n') {
732 d = getc(f2);
733 ctnew++;
736 if (ds->chrtran[c] != ds->chrtran[d]) {
737 jackpot++;
738 ds->J[i] = 0;
739 if (c != '\n' && c != EOF)
740 ctold += skipline(f1);
741 if (d != '\n' && c != EOF)
742 ctnew += skipline(f2);
743 break;
745 if (c == '\n' || c == EOF)
746 break;
748 } else {
749 for (;;) {
750 ctold++;
751 ctnew++;
752 c = (f1 == NULL ? EOF : getc(f1));
753 d = (f2 == NULL ? EOF : getc(f2));
754 if (c != d) {
755 /* jackpot++; */
756 ds->J[i] = 0;
757 if (c != '\n' && c != EOF)
758 ctold += skipline(f1);
759 if (d != '\n' && c != EOF)
760 ctnew += skipline(f2);
761 break;
763 if (c == '\n' || c == EOF)
764 break;
767 ds->ixold[i] = ctold;
768 ds->ixnew[j] = ctnew;
769 j++;
771 for (; j <= ds->len[1]; j++)
772 ds->ixnew[j] = ctnew += skipline(f2);
773 /*
774 * if (jackpot)
775 * fprintf(stderr, "jackpot\n");
776 */
779 /* shellsort CACM #201 */
780 static void
781 sort(struct line *a, int n)
783 struct line *ai, *aim, w;
784 int j, m = 0, k;
786 if (n == 0)
787 return;
788 for (j = 1; j <= n; j *= 2)
789 m = 2 * j - 1;
790 for (m /= 2; m != 0; m /= 2) {
791 k = n - m;
792 for (j = 1; j <= k; j++) {
793 for (ai = &a[j]; ai > a; ai -= m) {
794 aim = &ai[m];
795 if (aim < ai)
796 break; /* wraparound */
797 if (aim->value > ai[0].value ||
798 (aim->value == ai[0].value &&
799 aim->serial > ai[0].serial))
800 break;
801 w.value = ai[0].value;
802 ai[0].value = aim->value;
803 aim->value = w.value;
804 w.serial = ai[0].serial;
805 ai[0].serial = aim->serial;
806 aim->serial = w.serial;
812 static int
813 unsort(struct line *f, int l, int *b)
815 int *a, i;
817 a = calloc(l + 1, sizeof(*a));
818 if (a == NULL)
819 return (-1);
820 for (i = 1; i <= l; i++)
821 a[f[i].serial] = f[i].value;
822 for (i = 1; i <= l; i++)
823 b[i] = a[i];
824 free(a);
826 return (0);
829 static int
830 skipline(FILE *f)
832 int i, c;
834 for (i = 1; f != NULL && (c = getc(f)) != '\n' && c != EOF; i++)
835 continue;
836 return (i);
839 static int
840 output(FILE *outfile, struct got_diff_changes *changes,
841 struct got_diff_state *ds, struct got_diff_args *args,
842 const char *file1, FILE *f1, const char *file2, FILE *f2, int flags)
844 int m, i0, i1, j0, j1;
845 int error = 0;
847 if (f1 != NULL)
848 rewind(f1);
849 if (f2 != NULL)
850 rewind(f2);
851 m = ds->len[0];
852 ds->J[0] = 0;
853 ds->J[m + 1] = ds->len[1] + 1;
854 for (i0 = 1; i0 <= m; i0 = i1 + 1) {
855 while (i0 <= m && ds->J[i0] == ds->J[i0 - 1] + 1)
856 i0++;
857 j0 = ds->J[i0 - 1] + 1;
858 i1 = i0 - 1;
859 while (i1 < m && ds->J[i1 + 1] == 0)
860 i1++;
861 j1 = ds->J[i1 + 1] - 1;
862 ds->J[i1] = j1;
863 error = change(outfile, changes, ds, args, file1, f1, file2, f2,
864 i0, i1, j0, j1, &flags);
865 if (error)
866 return (error);
868 if (m == 0) {
869 error = change(outfile, changes, ds, args, file1, f1, file2, f2,
870 1, 0, 1, ds->len[1], &flags);
871 if (error)
872 return (error);
874 if (ds->anychange != 0 && args->diff_format == D_UNIFIED)
875 dump_unified_vec(outfile, changes, ds, args, f1, f2, flags);
877 return (0);
880 static void
881 range(FILE *outfile, int a, int b, char *separator)
883 diff_output(outfile, "%d", a > b ? b : a);
884 if (a < b)
885 diff_output(outfile, "%s%d", separator, b);
888 static void
889 uni_range(FILE *outfile, int a, int b)
891 if (a < b)
892 diff_output(outfile, "%d,%d", a, b - a + 1);
893 else if (a == b)
894 diff_output(outfile, "%d", b);
895 else
896 diff_output(outfile, "%d,0", b);
899 /*
900 * Indicate that there is a difference between lines a and b of the from file
901 * to get to lines c to d of the to file. If a is greater then b then there
902 * are no lines in the from file involved and this means that there were
903 * lines appended (beginning at b). If c is greater than d then there are
904 * lines missing from the to file.
905 */
906 static int
907 change(FILE *outfile, struct got_diff_changes *changes,
908 struct got_diff_state *ds, struct got_diff_args *args,
909 const char *file1, FILE *f1, const char *file2, FILE *f2,
910 int a, int b, int c, int d, int *pflags)
912 if (a > b && c > d)
913 return (0);
915 if (*pflags & D_HEADER) {
916 diff_output(outfile, "%s %s %s\n", args->diffargs, file1, file2);
917 *pflags &= ~D_HEADER;
919 if (args->diff_format == D_UNIFIED) {
920 /*
921 * Allocate change records as needed.
922 */
923 if (ds->context_vec_ptr == ds->context_vec_end - 1) {
924 struct context_vec *cvp;
925 ptrdiff_t offset;
926 offset = ds->context_vec_ptr - ds->context_vec_start;
927 ds->max_context <<= 1;
928 cvp = reallocarray(ds->context_vec_start,
929 ds->max_context, sizeof(*ds->context_vec_start));
930 if (cvp == NULL) {
931 free(ds->context_vec_start);
932 return (-1);
934 ds->context_vec_start = cvp;
935 ds->context_vec_end = ds->context_vec_start +
936 ds->max_context;
937 ds->context_vec_ptr = ds->context_vec_start + offset;
939 if (ds->anychange == 0) {
940 /*
941 * Print the context/unidiff header first time through.
942 */
943 print_header(outfile, ds, args, file1, file2);
944 ds->anychange = 1;
945 } else if (a > ds->context_vec_ptr->b + (2 * args->diff_context) + 1 &&
946 c > ds->context_vec_ptr->d + (2 * args->diff_context) + 1) {
947 /*
948 * If this change is more than 'diff_context' lines from the
949 * previous change, dump the record and reset it.
950 */
951 dump_unified_vec(outfile, changes, ds, args, f1, f2,
952 *pflags);
954 ds->context_vec_ptr++;
955 ds->context_vec_ptr->a = a;
956 ds->context_vec_ptr->b = b;
957 ds->context_vec_ptr->c = c;
958 ds->context_vec_ptr->d = d;
959 return (0);
961 if (ds->anychange == 0)
962 ds->anychange = 1;
963 if (args->diff_format == D_BRIEF)
964 return (0);
965 if (args->diff_format == D_NORMAL) {
966 range(outfile, a, b, ",");
967 diff_output(outfile, "%c", a > b ? 'a' : c > d ? 'd' : 'c');
968 range(outfile, c, d, ",");
969 diff_output(outfile, "\n");
970 fetch(outfile, ds, args, ds->ixold, a, b, f1, '<', *pflags);
971 if (a <= b && c <= d)
972 diff_output(outfile, "---\n");
974 fetch(outfile, ds, args, ds->ixnew, c, d, f2,
975 args->diff_format == D_NORMAL ? '>' : '\0', *pflags);
976 return (0);
979 static void
980 fetch(FILE *outfile, struct got_diff_state *ds, struct got_diff_args *args,
981 long *f, int a, int b, FILE *lb, int ch, int flags)
983 int i, j, c, col, nc;
985 if (lb == NULL || a > b)
986 return;
987 for (i = a; i <= b; i++) {
988 fseek(lb, f[i - 1], SEEK_SET);
989 nc = f[i] - f[i - 1];
990 if (ch != '\0') {
991 diff_output(outfile, "%c", ch);
992 if (args->Tflag && (args->diff_format == D_UNIFIED ||
993 args->diff_format == D_NORMAL))
994 diff_output(outfile, "\t");
995 else if (args->diff_format != D_UNIFIED)
996 diff_output(outfile, " ");
998 col = 0;
999 for (j = 0; j < nc; j++) {
1000 if ((c = getc(lb)) == EOF) {
1001 diff_output(outfile, "\n\\ No newline at end of "
1002 "file\n");
1003 return;
1005 if (c == '\t' && (flags & D_EXPANDTABS)) {
1006 do {
1007 diff_output(outfile, " ");
1008 } while (++col & 7);
1009 } else {
1010 diff_output(outfile, "%c", c);
1011 col++;
1018 * Hash function taken from Robert Sedgewick, Algorithms in C, 3d ed., p 578.
1020 static int
1021 readhash(struct got_diff_state *ds, FILE *f, int flags)
1023 int i, t, space;
1024 int sum;
1026 sum = 1;
1027 space = 0;
1028 if ((flags & (D_FOLDBLANKS|D_IGNOREBLANKS)) == 0) {
1029 if (flags & D_IGNORECASE)
1030 for (i = 0; (t = getc(f)) != '\n'; i++) {
1031 if (t == EOF) {
1032 if (i == 0)
1033 return (0);
1034 break;
1036 sum = sum * 127 + ds->chrtran[t];
1038 else
1039 for (i = 0; (t = getc(f)) != '\n'; i++) {
1040 if (t == EOF) {
1041 if (i == 0)
1042 return (0);
1043 break;
1045 sum = sum * 127 + t;
1047 } else {
1048 for (i = 0;;) {
1049 switch (t = getc(f)) {
1050 case '\t':
1051 case '\r':
1052 case '\v':
1053 case '\f':
1054 case ' ':
1055 space++;
1056 continue;
1057 default:
1058 if (space && (flags & D_IGNOREBLANKS) == 0) {
1059 i++;
1060 space = 0;
1062 sum = sum * 127 + ds->chrtran[t];
1063 i++;
1064 continue;
1065 case EOF:
1066 if (i == 0)
1067 return (0);
1068 /* FALLTHROUGH */
1069 case '\n':
1070 break;
1072 break;
1076 * There is a remote possibility that we end up with a zero sum.
1077 * Zero is used as an EOF marker, so return 1 instead.
1079 return (sum == 0 ? 1 : sum);
1082 static int
1083 asciifile(FILE *f)
1085 unsigned char buf[BUFSIZ];
1086 size_t cnt;
1088 if (f == NULL)
1089 return (1);
1091 rewind(f);
1092 cnt = fread(buf, 1, sizeof(buf), f);
1093 return (memchr(buf, '\0', cnt) == NULL);
1096 #define begins_with(s, pre) (strncmp(s, pre, sizeof(pre)-1) == 0)
1098 static char *
1099 match_function(struct got_diff_state *ds, const long *f, int pos, FILE *fp)
1101 unsigned char buf[FUNCTION_CONTEXT_SIZE];
1102 size_t nc;
1103 int last = ds->lastline;
1104 char *state = NULL;
1106 ds->lastline = pos;
1107 while (pos > last) {
1108 fseek(fp, f[pos - 1], SEEK_SET);
1109 nc = f[pos] - f[pos - 1];
1110 if (nc >= sizeof(buf))
1111 nc = sizeof(buf) - 1;
1112 nc = fread(buf, 1, nc, fp);
1113 if (nc > 0) {
1114 buf[nc] = '\0';
1115 buf[strcspn(buf, "\n")] = '\0';
1116 if (isalpha(buf[0]) || buf[0] == '_' || buf[0] == '$') {
1117 if (begins_with(buf, "private:")) {
1118 if (!state)
1119 state = " (private)";
1120 } else if (begins_with(buf, "protected:")) {
1121 if (!state)
1122 state = " (protected)";
1123 } else if (begins_with(buf, "public:")) {
1124 if (!state)
1125 state = " (public)";
1126 } else {
1127 strlcpy(ds->lastbuf, buf, sizeof ds->lastbuf);
1128 if (state)
1129 strlcat(ds->lastbuf, state,
1130 sizeof ds->lastbuf);
1131 ds->lastmatchline = pos;
1132 return ds->lastbuf;
1136 pos--;
1138 return ds->lastmatchline > 0 ? ds->lastbuf : NULL;
1141 /* dump accumulated "unified" diff changes */
1142 static void
1143 dump_unified_vec(FILE *outfile, struct got_diff_changes *changes,
1144 struct got_diff_state *ds, struct got_diff_args *args,
1145 FILE *f1, FILE *f2, int flags)
1147 struct context_vec *cvp = ds->context_vec_start;
1148 int lowa, upb, lowc, upd;
1149 int a, b, c, d;
1150 char ch, *f;
1152 if (ds->context_vec_start > ds->context_vec_ptr)
1153 return;
1155 b = d = 0; /* gcc */
1156 lowa = MAXIMUM(1, cvp->a - args->diff_context);
1157 upb = MINIMUM(ds->len[0], ds->context_vec_ptr->b + args->diff_context);
1158 lowc = MAXIMUM(1, cvp->c - args->diff_context);
1159 upd = MINIMUM(ds->len[1], ds->context_vec_ptr->d + args->diff_context);
1161 diff_output(outfile, "@@ -");
1162 uni_range(outfile, lowa, upb);
1163 diff_output(outfile, " +");
1164 uni_range(outfile, lowc, upd);
1165 diff_output(outfile, " @@");
1166 if (f1 != NULL && (flags & D_PROTOTYPE)) {
1167 f = match_function(ds, ds->ixold, lowa-1, f1);
1168 if (f != NULL)
1169 diff_output(outfile, " %s", f);
1171 diff_output(outfile, "\n");
1174 * Output changes in "unified" diff format--the old and new lines
1175 * are printed together.
1177 for (; cvp <= ds->context_vec_ptr; cvp++) {
1178 if (changes) {
1179 struct got_diff_change *change;
1180 change = calloc(1, sizeof(*change));
1181 if (change) {
1182 memcpy(&change->cv, cvp, sizeof(change->cv));
1183 SIMPLEQ_INSERT_TAIL(&changes->entries, change,
1184 entry);
1185 changes->nchanges++;
1189 a = cvp->a;
1190 b = cvp->b;
1191 c = cvp->c;
1192 d = cvp->d;
1195 * c: both new and old changes
1196 * d: only changes in the old file
1197 * a: only changes in the new file
1199 if (a <= b && c <= d)
1200 ch = 'c';
1201 else
1202 ch = (a <= b) ? 'd' : 'a';
1204 switch (ch) {
1205 case 'c':
1206 fetch(outfile, ds, args, ds->ixold, lowa, a - 1, f1, ' ', flags);
1207 fetch(outfile, ds, args, ds->ixold, a, b, f1, '-', flags);
1208 fetch(outfile, ds, args, ds->ixnew, c, d, f2, '+', flags);
1209 break;
1210 case 'd':
1211 fetch(outfile, ds, args, ds->ixold, lowa, a - 1, f1, ' ', flags);
1212 fetch(outfile, ds, args, ds->ixold, a, b, f1, '-', flags);
1213 break;
1214 case 'a':
1215 fetch(outfile, ds, args, ds->ixnew, lowc, c - 1, f2, ' ', flags);
1216 fetch(outfile, ds, args, ds->ixnew, c, d, f2, '+', flags);
1217 break;
1219 lowa = b + 1;
1220 lowc = d + 1;
1222 fetch(outfile, ds, args, ds->ixnew, d + 1, upd, f2, ' ', flags);
1224 ds->context_vec_ptr = ds->context_vec_start - 1;
1227 void
1228 got_diff_dump_change(FILE *outfile, struct got_diff_change *change,
1229 struct got_diff_state *ds, struct got_diff_args *args,
1230 FILE *f1, FILE *f2, int diff_flags)
1232 ds->context_vec_ptr = &change->cv;
1233 ds->context_vec_start = &change->cv;
1234 ds->context_vec_end = &change->cv;
1236 /* XXX TODO needs error checking */
1237 dump_unified_vec(outfile, NULL, ds, args, f1, f2, diff_flags);
1240 static void
1241 print_header(FILE *outfile, struct got_diff_state *ds, struct got_diff_args *args,
1242 const char *file1, const char *file2)
1244 if (args->label[0] != NULL)
1245 diff_output(outfile, "--- %s\n", args->label[0]);
1246 else
1247 diff_output(outfile, "--- %s\t%s", file1,
1248 ctime(&ds->stb1.st_mtime));
1249 if (args->label[1] != NULL)
1250 diff_output(outfile, "+++ %s\n", args->label[1]);
1251 else
1252 diff_output(outfile, "+++ %s\t%s", file2,
1253 ctime(&ds->stb2.st_mtime));