Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include "diff.h"
6 /* diff - differential file comparison
7 *
8 * Uses an algorithm due to Harold Stone, which finds
9 * a pair of longest identical subsequences in the two
10 * files.
11 *
12 * The major goal is to generate the match vector J.
13 * J[i] is the index of the line in file1 corresponding
14 * to line i file0. J[i] = 0 if there is no
15 * such line in file1.
16 *
17 * Lines are hashed so as to work in core. All potential
18 * matches are located by sorting the lines of each file
19 * on the hash (called value). In particular, this
20 * collects the equivalence classes in file1 together.
21 * Subroutine equiv replaces the value of each line in
22 * file0 by the index of the first element of its
23 * matching equivalence in (the reordered) file1.
24 * To save space equiv squeezes file1 into a single
25 * array member in which the equivalence classes
26 * are simply concatenated, except that their first
27 * members are flagged by changing sign.
28 *
29 * Next the indices that point into member are unsorted into
30 * array class according to the original order of file0.
31 *
32 * The cleverness lies in routine stone. This marches
33 * through the lines of file0, developing a vector klist
34 * of "k-candidates". At step i a k-candidate is a matched
35 * pair of lines x,y (x in file0 y in file1) such that
36 * there is a common subsequence of lenght k
37 * between the first i lines of file0 and the first y
38 * lines of file1, but there is no such subsequence for
39 * any smaller y. x is the earliest possible mate to y
40 * that occurs in such a subsequence.
41 *
42 * Whenever any of the members of the equivalence class of
43 * lines in file1 matable to a line in file0 has serial number
44 * less than the y of some k-candidate, that k-candidate
45 * with the smallest such y is replaced. The new
46 * k-candidate is chained (via pred) to the current
47 * k-1 candidate so that the actual subsequence can
48 * be recovered. When a member has serial number greater
49 * that the y of all k-candidates, the klist is extended.
50 * At the end, the longest subsequence is pulled out
51 * and placed in the array J by unravel.
52 *
53 * With J in hand, the matches there recorded are
54 * check'ed against reality to assure that no spurious
55 * matches have crept in due to hashing. If they have,
56 * they are broken, and "jackpot " is recorded--a harmless
57 * matter except that a true match for a spuriously
58 * mated line may now be unnecessarily reported as a change.
59 *
60 * Much of the complexity of the program comes simply
61 * from trying to minimize core utilization and
62 * maximize the range of doable problems by dynamically
63 * allocating what is needed and reusing what is not.
64 * The core requirements for problems larger than somewhat
65 * are (in words) 2*length(file0) + length(file1) +
66 * 3*(number of k-candidates installed), typically about
67 * 6n words for files of length n.
68 */
70 #define class diffclass
72 /* TIDY THIS UP */
73 struct cand {
74 int x;
75 int y;
76 int pred;
77 } cand;
78 struct line {
79 int serial;
80 int value;
81 } *file[2], line;
82 int len[2];
83 int binary;
84 struct line *sfile[2]; /*shortened by pruning common prefix and suffix*/
85 int slen[2];
86 int pref, suff; /*length of prefix and suffix*/
87 int *class; /*will be overlaid on file[0]*/
88 int *member; /*will be overlaid on file[1]*/
89 int *klist; /*will be overlaid on file[0] after class*/
90 struct cand *clist; /* merely a free storage pot for candidates */
91 int clen;
92 int *J; /*will be overlaid on class*/
93 long *ixold; /*will be overlaid on klist*/
94 long *ixnew; /*will be overlaid on file[1]*/
95 /* END OF SOME TIDYING */
97 static void
98 sort(struct line *a, int n) /*shellsort CACM #201*/
99 {
100 int m;
101 struct line *ai, *aim, *j, *k;
102 struct line w;
103 int i;
105 m = 0;
106 for (i = 1; i <= n; i *= 2)
107 m = 2*i - 1;
108 for (m /= 2; m != 0; m /= 2) {
109 k = a+(n-m);
110 for (j = a+1; j <= k; j++) {
111 ai = j;
112 aim = ai+m;
113 do {
114 if (aim->value > ai->value ||
115 aim->value == ai->value &&
116 aim->serial > ai->serial)
117 break;
118 w = *ai;
119 *ai = *aim;
120 *aim = w;
122 aim = ai;
123 ai -= m;
124 } while (ai > a && aim >= ai);
129 static void
130 unsort(struct line *f, int l, int *b)
132 int *a;
133 int i;
135 a = MALLOC(int, (l+1));
136 for(i=1;i<=l;i++)
137 a[f[i].serial] = f[i].value;
138 for(i=1;i<=l;i++)
139 b[i] = a[i];
140 FREE(a);
143 static void
144 prune(void)
146 int i,j;
148 for(pref=0;pref<len[0]&&pref<len[1]&&
149 file[0][pref+1].value==file[1][pref+1].value;
150 pref++ ) ;
151 for(suff=0;suff<len[0]-pref&&suff<len[1]-pref&&
152 file[0][len[0]-suff].value==file[1][len[1]-suff].value;
153 suff++) ;
154 for(j=0;j<2;j++) {
155 sfile[j] = file[j]+pref;
156 slen[j] = len[j]-pref-suff;
157 for(i=0;i<=slen[j];i++)
158 sfile[j][i].serial = i;
162 static void
163 equiv(struct line *a, int n, struct line *b, int m, int *c)
165 int i, j;
167 i = j = 1;
168 while(i<=n && j<=m) {
169 if(a[i].value < b[j].value)
170 a[i++].value = 0;
171 else if(a[i].value == b[j].value)
172 a[i++].value = j;
173 else
174 j++;
176 while(i <= n)
177 a[i++].value = 0;
178 b[m+1].value = 0;
179 j = 0;
180 while(++j <= m) {
181 c[j] = -b[j].serial;
182 while(b[j+1].value == b[j].value) {
183 j++;
184 c[j] = b[j].serial;
187 c[j] = -1;
190 static int
191 newcand(int x, int y, int pred)
193 struct cand *q;
195 clist = REALLOC(clist, struct cand, (clen+1));
196 q = clist + clen;
197 q->x = x;
198 q->y = y;
199 q->pred = pred;
200 return clen++;
203 static int
204 search(int *c, int k, int y)
206 int i, j, l;
207 int t;
209 if(clist[c[k]].y < y) /*quick look for typical case*/
210 return k+1;
211 i = 0;
212 j = k+1;
213 while((l=(i+j)/2) > i) {
214 t = clist[c[l]].y;
215 if(t > y)
216 j = l;
217 else if(t < y)
218 i = l;
219 else
220 return l;
222 return l+1;
225 static int
226 stone(int *a, int n, int *b, int *c)
228 int i, k,y;
229 int j, l;
230 int oldc, tc;
231 int oldl;
233 k = 0;
234 c[0] = newcand(0,0,0);
235 for(i=1; i<=n; i++) {
236 j = a[i];
237 if(j==0)
238 continue;
239 y = -b[j];
240 oldl = 0;
241 oldc = c[0];
242 do {
243 if(y <= clist[oldc].y)
244 continue;
245 l = search(c, k, y);
246 if(l!=oldl+1)
247 oldc = c[l-1];
248 if(l<=k) {
249 if(clist[c[l]].y <= y)
250 continue;
251 tc = c[l];
252 c[l] = newcand(i,y,oldc);
253 oldc = tc;
254 oldl = l;
255 } else {
256 c[l] = newcand(i,y,oldc);
257 k++;
258 break;
260 } while((y=b[++j]) > 0);
262 return k;
265 static void
266 unravel(int p)
268 int i;
269 struct cand *q;
271 for(i=0; i<=len[0]; i++) {
272 if (i <= pref)
273 J[i] = i;
274 else if (i > len[0]-suff)
275 J[i] = i+len[1]-len[0];
276 else
277 J[i] = 0;
279 for(q=clist+p;q->y!=0;q=clist+q->pred)
280 J[q->x+pref] = q->y+pref;
283 static void
284 output(void)
286 int m, i0, i1, j0, j1;
288 m = len[0];
289 J[0] = 0;
290 J[m+1] = len[1]+1;
291 if (mode != 'e') {
292 for (i0 = 1; i0 <= m; i0 = i1+1) {
293 while (i0 <= m && J[i0] == J[i0-1]+1)
294 i0++;
295 j0 = J[i0-1]+1;
296 i1 = i0-1;
297 while (i1 < m && J[i1+1] == 0)
298 i1++;
299 j1 = J[i1+1]-1;
300 J[i1] = j1;
301 change(i0, i1, j0, j1);
304 else {
305 for (i0 = m; i0 >= 1; i0 = i1-1) {
306 while (i0 >= 1 && J[i0] == J[i0+1]-1 && J[i0])
307 i0--;
308 j0 = J[i0+1]-1;
309 i1 = i0+1;
310 while (i1 > 1 && J[i1-1] == 0)
311 i1--;
312 j1 = J[i1-1]+1;
313 J[i1] = j1;
314 change(i1 , i0, j1, j0);
317 if (m == 0)
318 change(1, 0, 1, len[1]);
319 flushchanges();
322 #define BUF 4096
323 static int
324 cmp(Biobuf* b1, Biobuf* b2)
326 int n;
327 uchar buf1[BUF], buf2[BUF];
328 int f1, f2;
329 vlong nc = 1;
330 uchar *b1s, *b1e, *b2s, *b2e;
332 f1 = Bfildes(b1);
333 f2 = Bfildes(b2);
334 seek(f1, 0, 0);
335 seek(f2, 0, 0);
336 b1s = b1e = buf1;
337 b2s = b2e = buf2;
338 for(;;){
339 if(b1s >= b1e){
340 if(b1s >= &buf1[BUF])
341 b1s = buf1;
342 n = read(f1, b1s, &buf1[BUF] - b1s);
343 b1e = b1s + n;
345 if(b2s >= b2e){
346 if(b2s >= &buf2[BUF])
347 b2s = buf2;
348 n = read(f2, b2s, &buf2[BUF] - b2s);
349 b2e = b2s + n;
351 n = b2e - b2s;
352 if(n > b1e - b1s)
353 n = b1e - b1s;
354 if(n <= 0)
355 break;
356 if(memcmp((void *)b1s, (void *)b2s, n) != 0){
357 return 1;
359 nc += n;
360 b1s += n;
361 b2s += n;
363 if(b1e - b1s == b2e - b2s)
364 return 0;
365 return 1;
368 void
369 diffreg(char *f, char *t)
371 Biobuf *b0, *b1;
372 int k;
374 binary = 0;
375 b0 = prepare(0, f);
376 if (!b0)
377 return;
378 b1 = prepare(1, t);
379 if (!b1) {
380 FREE(file[0]);
381 Bterm(b0);
382 return;
384 if (binary){
385 /* could use b0 and b1 but this is simpler. */
386 if (cmp(b0, b1))
387 print("binary files %s %s differ\n", f, t);
388 Bterm(b0);
389 Bterm(b1);
390 return;
392 clen = 0;
393 prune();
394 sort(sfile[0], slen[0]);
395 sort(sfile[1], slen[1]);
397 member = (int *)file[1];
398 equiv(sfile[0], slen[0], sfile[1], slen[1], member);
399 member = REALLOC(member, int, slen[1]+2);
401 class = (int *)file[0];
402 unsort(sfile[0], slen[0], class);
403 class = REALLOC(class, int, slen[0]+2);
405 klist = MALLOC(int, slen[0]+2);
406 clist = MALLOC(struct cand, 1);
407 k = stone(class, slen[0], member, klist);
408 FREE(member);
409 FREE(class);
411 J = MALLOC(int, len[0]+2);
412 unravel(klist[k]);
413 FREE(clist);
414 FREE(klist);
416 ixold = MALLOC(long, len[0]+2);
417 ixnew = MALLOC(long, len[1]+2);
418 Bseek(b0, 0, 0); Bseek(b1, 0, 0);
419 check(b0, b1);
420 output();
421 FREE(J); FREE(ixold); FREE(ixnew);
422 Bterm(b0); Bterm(b1); /* ++++ */