Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
5 /*
6 bugs:
7 00/ff for end of file can conflict with 00/ff characters
8 */
10 enum
11 {
12 Nline = 500000, /* default max number of lines saved in memory */
13 Nmerge = 10, /* max number of temporary files merged */
14 Nfield = 20, /* max number of argument fields */
16 Bflag = 1<<0, /* flags per field */
17 B1flag = 1<<1,
19 Dflag = 1<<2,
20 Fflag = 1<<3,
21 Gflag = 1<<4,
22 Iflag = 1<<5,
23 Mflag = 1<<6,
24 Nflag = 1<<7,
25 Rflag = 1<<8,
26 Wflag = 1<<9,
28 NSstart = 0, /* states for number to key decoding */
29 NSsign,
30 NSzero,
31 NSdigit,
32 NSpoint,
33 NSfract,
34 NSzerofract,
35 NSexp,
36 NSexpsign,
37 NSexpdigit
38 };
40 typedef struct Line Line;
41 typedef struct Key Key;
42 typedef struct Merge Merge;
43 typedef struct Field Field;
45 struct Line
46 {
47 Key* key;
48 int llen; /* always >= 1 */
49 uchar line[1]; /* always ends in '\n' */
50 };
52 struct Merge
53 {
54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */
55 Line* line; /* line at the head of a merged temp file */
56 int fd; /* file descriptor */
57 Biobuf b; /* iobuf for reading a temp file */
58 };
60 struct Key
61 {
62 int klen;
63 uchar key[1];
64 };
66 struct Field
67 {
68 int beg1;
69 int beg2;
70 int end1;
71 int end2;
73 long flags;
74 uchar mapto[256];
76 void (*dokey)(Key*, uchar*, uchar*, Field*);
77 };
79 struct args
80 {
81 char* ofile;
82 char* tname;
83 Rune tabchar;
84 char cflag;
85 char uflag;
86 char vflag;
87 int nfield;
88 int nfile;
89 Field field[Nfield];
91 Line** linep;
92 long nline; /* number of lines in this temp file */
93 long lineno; /* overall ordinal for -s option */
94 int ntemp;
95 long mline; /* max lines per file */
96 } args;
98 extern int latinmap[];
99 extern Rune* month[12];
101 void buildkey(Line*);
102 void doargs(int, char*[]);
103 void dofield(char*, int*, int*, int, int);
104 void dofile(Biobuf*);
105 void dokey_(Key*, uchar*, uchar*, Field*);
106 void dokey_dfi(Key*, uchar*, uchar*, Field*);
107 void dokey_gn(Key*, uchar*, uchar*, Field*);
108 void dokey_m(Key*, uchar*, uchar*, Field*);
109 void dokey_r(Key*, uchar*, uchar*, Field*);
110 void done(char*);
111 int kcmp(Key*, Key*);
112 void makemapd(Field*);
113 void makemapm(Field*);
114 void mergefiles(int, int, Biobuf*);
115 void mergeout(Biobuf*);
116 void newfield(void);
117 Line* newline(Biobuf*);
118 void nomem(void);
119 void notifyf(void*, char*);
120 void printargs(void);
121 void printout(Biobuf*);
122 void setfield(int, int);
123 uchar* skip(uchar*, int, int, int, int);
124 void sort4(void*, ulong);
125 char* tempfile(int);
126 void tempout(void);
127 void lineout(Biobuf*, Line*);
129 void
130 main(int argc, char *argv[])
132 int i, f;
133 char *s;
134 Biobuf bbuf;
136 notify(notifyf); /**/
137 doargs(argc, argv);
138 if(args.vflag)
139 printargs();
141 for(i=1; i<argc; i++) {
142 s = argv[i];
143 if(s == 0)
144 continue;
145 if(strcmp(s, "-") == 0) {
146 Binit(&bbuf, 0, OREAD);
147 dofile(&bbuf);
148 Bterm(&bbuf);
149 continue;
151 f = open(s, OREAD);
152 if(f < 0) {
153 fprint(2, "sort: open %s: %r\n", s);
154 done("open");
156 Binit(&bbuf, f, OREAD);
157 dofile(&bbuf);
158 Bterm(&bbuf);
159 close(f);
161 if(args.nfile == 0) {
162 Binit(&bbuf, 0, OREAD);
163 dofile(&bbuf);
164 Bterm(&bbuf);
166 if(args.cflag)
167 done(0);
168 if(args.vflag)
169 fprint(2, "=========\n");
171 f = 1;
172 if(args.ofile) {
173 f = create(args.ofile, OWRITE, 0666);
174 if(f < 0) {
175 fprint(2, "sort: create %s: %r\n", args.ofile);
176 done("create");
180 Binit(&bbuf, f, OWRITE);
181 if(args.ntemp) {
182 tempout();
183 mergeout(&bbuf);
184 } else {
185 printout(&bbuf);
187 Bterm(&bbuf);
188 done(0);
191 void
192 dofile(Biobuf *b)
194 Line *l, *ol;
195 int n;
197 if(args.cflag) {
198 ol = newline(b);
199 if(ol == 0)
200 return;
201 for(;;) {
202 l = newline(b);
203 if(l == 0)
204 break;
205 n = kcmp(ol->key, l->key);
206 if(n > 0 || (n == 0 && args.uflag)) {
207 fprint(2, "sort: -c file not in sort\n"); /**/
208 done("order");
210 free(ol->key);
211 free(ol);
212 ol = l;
214 return;
217 if(args.linep == 0) {
218 args.linep = malloc(args.mline * sizeof(args.linep));
219 if(args.linep == 0)
220 nomem();
222 for(;;) {
223 l = newline(b);
224 if(l == 0)
225 break;
226 if(args.nline >= args.mline)
227 tempout();
228 args.linep[args.nline] = l;
229 args.nline++;
230 args.lineno++;
234 void
235 notifyf(void *a, char *s)
237 USED(a);
238 if(strcmp(s, "interrupt") == 0)
239 done(0);
240 if(strcmp(s, "hangup") == 0)
241 done(0);
242 if(strcmp(s, "kill") == 0)
243 done(0);
244 if(strncmp(s, "sys: write on closed pipe", 25) == 0)
245 done(0);
246 noted(NDFLT);
249 Line*
250 newline(Biobuf *b)
252 Line *l;
253 char *p;
254 int n, c;
256 p = Brdline(b, '\n');
257 n = Blinelen(b);
258 if(p == 0) {
259 if(n == 0)
260 return 0;
261 l = 0;
262 for(n=0;;) {
263 if((n & 31) == 0) {
264 l = realloc(l, sizeof(Line) +
265 (n+31)*sizeof(l->line[0]));
266 if(l == 0)
267 nomem();
269 c = Bgetc(b);
270 if(c < 0) {
271 fprint(2, "sort: newline added\n");
272 c = '\n';
274 l->line[n++] = c;
275 if(c == '\n')
276 break;
278 l->llen = n;
279 buildkey(l);
280 return l;
282 l = malloc(sizeof(Line) +
283 (n-1)*sizeof(l->line[0]));
284 if(l == 0)
285 nomem();
286 l->llen = n;
287 memmove(l->line, p, n);
288 buildkey(l);
289 return l;
292 void
293 lineout(Biobuf *b, Line *l)
295 int n, m;
297 n = l->llen;
298 m = Bwrite(b, l->line, n);
299 if(n != m)
300 exits("write");
303 void
304 tempout(void)
306 long n;
307 Line **lp, *l;
308 char *tf;
309 int f;
310 Biobuf tb;
312 sort4(args.linep, args.nline);
313 tf = tempfile(args.ntemp);
314 args.ntemp++;
315 f = create(tf, OWRITE, 0666);
316 if(f < 0) {
317 fprint(2, "sort: create %s: %r\n", tf);
318 done("create");
321 Binit(&tb, f, OWRITE);
322 lp = args.linep;
323 for(n=args.nline; n>0; n--) {
324 l = *lp++;
325 lineout(&tb, l);
326 free(l->key);
327 free(l);
329 args.nline = 0;
330 Bterm(&tb);
331 close(f);
334 void
335 done(char *xs)
337 int i;
339 for(i=0; i<args.ntemp; i++)
340 remove(tempfile(i));
341 exits(xs);
344 void
345 nomem(void)
347 fprint(2, "sort: out of memory\n");
348 done("mem");
351 char*
352 tempfile(int n)
354 static char file[100];
355 static uint pid;
356 char *dir;
358 dir = "/var/tmp";
359 if(args.tname)
360 dir = args.tname;
361 if(strlen(dir) >= nelem(file)-20) {
362 fprint(2, "temp file directory name is too long: %s\n", dir);
363 done("tdir");
366 if(pid == 0) {
367 pid = getpid();
368 if(pid == 0) {
369 pid = time(0);
370 if(pid == 0)
371 pid = 1;
375 sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n);
376 return file;
379 void
380 mergeout(Biobuf *b)
382 int n, i, f;
383 char *tf;
384 Biobuf tb;
386 for(i=0; i<args.ntemp; i+=n) {
387 n = args.ntemp - i;
388 if(n > Nmerge) {
389 tf = tempfile(args.ntemp);
390 args.ntemp++;
391 f = create(tf, OWRITE, 0666);
392 if(f < 0) {
393 fprint(2, "sort: create %s: %r\n", tf);
394 done("create");
396 Binit(&tb, f, OWRITE);
398 n = Nmerge;
399 mergefiles(i, n, &tb);
401 Bterm(&tb);
402 close(f);
403 } else
404 mergefiles(i, n, b);
408 void
409 mergefiles(int t, int n, Biobuf *b)
411 Merge *m, *mp, **mmp;
412 Key *ok;
413 Line *l;
414 char *tf;
415 int i, f, nn;
417 mmp = malloc(n*sizeof(*mmp));
418 mp = malloc(n*sizeof(*mp));
419 if(mmp == 0 || mp == 0)
420 nomem();
422 nn = 0;
423 m = mp;
424 for(i=0; i<n; i++,m++) {
425 tf = tempfile(t+i);
426 f = open(tf, OREAD);
427 if(f < 0) {
428 fprint(2, "sort: reopen %s: %r\n", tf);
429 done("open");
431 m->fd = f;
432 Binit(&m->b, f, OREAD);
433 mmp[nn] = m;
435 l = newline(&m->b);
436 if(l == 0)
437 continue;
438 nn++;
439 m->line = l;
440 m->key = l->key;
443 ok = 0;
444 for(;;) {
445 sort4(mmp, nn);
446 m = *mmp;
447 if(nn == 0)
448 break;
449 for(;;) {
450 l = m->line;
451 if(args.uflag && ok && kcmp(ok, l->key) == 0) {
452 free(l->key);
453 free(l);
454 } else {
455 lineout(b, l);
456 if(ok)
457 free(ok);
458 ok = l->key;
459 free(l);
462 l = newline(&m->b);
463 if(l == 0) {
464 nn--;
465 mmp[0] = mmp[nn];
466 break;
468 m->line = l;
469 m->key = l->key;
470 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0)
471 break;
474 if(ok)
475 free(ok);
477 m = mp;
478 for(i=0; i<n; i++,m++) {
479 Bterm(&m->b);
480 close(m->fd);
483 free(mp);
484 free(mmp);
487 int
488 kcmp(Key *ka, Key *kb)
490 int n, m;
492 /*
493 * set n to length of smaller key
494 */
495 n = ka->klen;
496 m = kb->klen;
497 if(n > m)
498 n = m;
499 return memcmp(ka->key, kb->key, n);
502 void
503 printout(Biobuf *b)
505 long n;
506 Line **lp, *l;
507 Key *ok;
509 sort4(args.linep, args.nline);
510 lp = args.linep;
511 ok = 0;
512 for(n=args.nline; n>0; n--) {
513 l = *lp++;
514 if(args.uflag && ok && kcmp(ok, l->key) == 0)
515 continue;
516 lineout(b, l);
517 ok = l->key;
521 void
522 setfield(int n, int c)
524 Field *f;
526 f = &args.field[n];
527 switch(c) {
528 default:
529 fprint(2, "sort: unknown option: field.%C\n", c);
530 done("option");
531 case 'b': /* skip blanks */
532 f->flags |= Bflag;
533 break;
534 case 'd': /* directory order */
535 f->flags |= Dflag;
536 break;
537 case 'f': /* fold case */
538 f->flags |= Fflag;
539 break;
540 case 'g': /* floating point -n case */
541 f->flags |= Gflag;
542 break;
543 case 'i': /* ignore non-ascii */
544 f->flags |= Iflag;
545 break;
546 case 'M': /* month */
547 f->flags |= Mflag;
548 break;
549 case 'n': /* numbers */
550 f->flags |= Nflag;
551 break;
552 case 'r': /* reverse */
553 f->flags |= Rflag;
554 break;
555 case 'w': /* ignore white */
556 f->flags |= Wflag;
557 break;
561 void
562 dofield(char *s, int *n1, int *n2, int off1, int off2)
564 int c, n;
566 c = *s++;
567 if(c >= '0' && c <= '9') {
568 n = 0;
569 while(c >= '0' && c <= '9') {
570 n = n*10 + (c-'0');
571 c = *s++;
573 n -= off1; /* posix committee: rot in hell */
574 if(n < 0) {
575 fprint(2, "sort: field offset must be positive\n");
576 done("option");
578 *n1 = n;
580 if(c == '.') {
581 c = *s++;
582 if(c >= '0' && c <= '9') {
583 n = 0;
584 while(c >= '0' && c <= '9') {
585 n = n*10 + (c-'0');
586 c = *s++;
588 n -= off2;
589 if(n < 0) {
590 fprint(2, "sort: character offset must be positive\n");
591 done("option");
593 *n2 = n;
596 while(c != 0) {
597 setfield(args.nfield, c);
598 c = *s++;
602 void
603 printargs(void)
605 int i, n;
606 Field *f;
607 char *prefix;
609 fprint(2, "sort");
610 for(i=0; i<=args.nfield; i++) {
611 f = &args.field[i];
612 prefix = " -";
613 if(i) {
614 n = f->beg1;
615 if(n >= 0)
616 fprint(2, " +%d", n);
617 else
618 fprint(2, " +*");
619 n = f->beg2;
620 if(n >= 0)
621 fprint(2, ".%d", n);
622 else
623 fprint(2, ".*");
625 if(f->flags & B1flag)
626 fprint(2, "b");
628 n = f->end1;
629 if(n >= 0)
630 fprint(2, " -%d", n);
631 else
632 fprint(2, " -*");
633 n = f->end2;
634 if(n >= 0)
635 fprint(2, ".%d", n);
636 else
637 fprint(2, ".*");
638 prefix = "";
640 if(f->flags & Bflag)
641 fprint(2, "%sb", prefix);
642 if(f->flags & Dflag)
643 fprint(2, "%sd", prefix);
644 if(f->flags & Fflag)
645 fprint(2, "%sf", prefix);
646 if(f->flags & Gflag)
647 fprint(2, "%sg", prefix);
648 if(f->flags & Iflag)
649 fprint(2, "%si", prefix);
650 if(f->flags & Mflag)
651 fprint(2, "%sM", prefix);
652 if(f->flags & Nflag)
653 fprint(2, "%sn", prefix);
654 if(f->flags & Rflag)
655 fprint(2, "%sr", prefix);
656 if(f->flags & Wflag)
657 fprint(2, "%sw", prefix);
659 if(args.cflag)
660 fprint(2, " -c");
661 if(args.uflag)
662 fprint(2, " -u");
663 if(args.ofile)
664 fprint(2, " -o %s", args.ofile);
665 if(args.mline != Nline)
666 fprint(2, " -l %ld", args.mline);
667 fprint(2, "\n");
670 void
671 newfield(void)
673 int n;
674 Field *f;
676 n = args.nfield + 1;
677 if(n >= Nfield) {
678 fprint(2, "sort: too many fields specified\n");
679 done("option");
681 args.nfield = n;
682 f = &args.field[n];
683 f->beg1 = -1;
684 f->beg2 = -1;
685 f->end1 = -1;
686 f->end2 = -1;
689 void
690 doargs(int argc, char *argv[])
692 int i, c, hadplus;
693 char *s, *p, *q;
694 Field *f;
696 hadplus = 0;
697 args.mline = Nline;
698 for(i=1; i<argc; i++) {
699 s = argv[i];
700 c = *s++;
701 if(c == '-') {
702 c = *s;
703 if(c == 0) /* forced end of arg marker */
704 break;
705 argv[i] = 0; /* clobber args processed */
706 if(c == '.' || (c >= '0' && c <= '9')) {
707 if(!hadplus)
708 newfield();
709 f = &args.field[args.nfield];
710 dofield(s, &f->end1, &f->end2, 0, 0);
711 hadplus = 0;
712 continue;
715 while(c = *s++)
716 switch(c) {
717 case '-': /* end of options */
718 i = argc;
719 continue;
720 case 'T': /* temp directory */
721 if(*s == 0) {
722 i++;
723 if(i < argc) {
724 args.tname = argv[i];
725 argv[i] = 0;
727 } else
728 args.tname = s;
729 s = strchr(s, 0);
730 break;
731 case 'o': /* output file */
732 if(*s == 0) {
733 i++;
734 if(i < argc) {
735 args.ofile = argv[i];
736 argv[i] = 0;
738 } else
739 args.ofile = s;
740 s = strchr(s, 0);
741 break;
742 case 'k': /* posix key (what were they thinking?) */
743 p = 0;
744 if(*s == 0) {
745 i++;
746 if(i < argc) {
747 p = argv[i];
748 argv[i] = 0;
750 } else
751 p = s;
752 s = strchr(s, 0);
753 if(p == 0)
754 break;
756 newfield();
757 q = strchr(p, ',');
758 if(q)
759 *q++ = 0;
760 f = &args.field[args.nfield];
761 dofield(p, &f->beg1, &f->beg2, 1, 1);
762 if(f->flags & Bflag) {
763 f->flags |= B1flag;
764 f->flags &= ~Bflag;
766 if(q) {
767 dofield(q, &f->end1, &f->end2, 1, 0);
768 if(f->end2 <= 0)
769 f->end1++;
771 hadplus = 0;
772 break;
773 case 't': /* tab character */
774 if(*s == 0) {
775 i++;
776 if(i < argc) {
777 chartorune(&args.tabchar, argv[i]);
778 argv[i] = 0;
780 } else
781 s += chartorune(&args.tabchar, s);
782 if(args.tabchar == '\n') {
783 fprint(2, "aw come on, rob\n");
784 done("rob");
786 break;
787 case 'c': /* check order */
788 args.cflag = 1;
789 break;
790 case 'u': /* unique */
791 args.uflag = 1;
792 break;
793 case 'v': /* debugging noise */
794 args.vflag = 1;
795 break;
796 case 'l':
797 if(*s == 0) {
798 i++;
799 if(i < argc) {
800 args.mline = atol(argv[i]);
801 argv[i] = 0;
803 } else
804 args.mline = atol(s);
805 s = strchr(s, 0);
806 break;
808 case 'M': /* month */
809 case 'b': /* skip blanks */
810 case 'd': /* directory order */
811 case 'f': /* fold case */
812 case 'g': /* floating numbers */
813 case 'i': /* ignore non-ascii */
814 case 'n': /* numbers */
815 case 'r': /* reverse */
816 case 'w': /* ignore white */
817 if(args.nfield > 0)
818 fprint(2, "sort: global field set after -k\n");
819 setfield(0, c);
820 break;
821 case 'm':
822 /* option m silently ignored but required by posix */
823 break;
824 default:
825 fprint(2, "sort: unknown option: -%C\n", c);
826 done("option");
828 continue;
830 if(c == '+') {
831 argv[i] = 0; /* clobber args processed */
832 c = *s;
833 if(c == '.' || (c >= '0' && c <= '9')) {
834 newfield();
835 f = &args.field[args.nfield];
836 dofield(s, &f->beg1, &f->beg2, 0, 0);
837 if(f->flags & Bflag) {
838 f->flags |= B1flag;
839 f->flags &= ~Bflag;
841 hadplus = 1;
842 continue;
844 fprint(2, "sort: unknown option: +%C\n", c);
845 done("option");
847 args.nfile++;
850 for(i=0; i<=args.nfield; i++) {
851 f = &args.field[i];
853 /*
854 * global options apply to fields that
855 * specify no options
856 */
857 if(f->flags == 0) {
858 f->flags = args.field[0].flags;
859 if(args.field[0].flags & Bflag)
860 f->flags |= B1flag;
864 /*
865 * build buildkey specification
866 */
867 switch(f->flags & ~(Bflag|B1flag)) {
868 default:
869 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags);
870 done("option");
871 case 0:
872 f->dokey = dokey_;
873 break;
874 case Rflag:
875 f->dokey = dokey_r;
876 break;
877 case Gflag:
878 case Nflag:
879 case Gflag|Nflag:
880 case Gflag|Rflag:
881 case Nflag|Rflag:
882 case Gflag|Nflag|Rflag:
883 f->dokey = dokey_gn;
884 break;
885 case Mflag:
886 case Mflag|Rflag:
887 f->dokey = dokey_m;
888 makemapm(f);
889 break;
890 case Dflag:
891 case Dflag|Fflag:
892 case Dflag|Fflag|Iflag:
893 case Dflag|Fflag|Iflag|Rflag:
894 case Dflag|Fflag|Iflag|Rflag|Wflag:
895 case Dflag|Fflag|Iflag|Wflag:
896 case Dflag|Fflag|Rflag:
897 case Dflag|Fflag|Rflag|Wflag:
898 case Dflag|Fflag|Wflag:
899 case Dflag|Iflag:
900 case Dflag|Iflag|Rflag:
901 case Dflag|Iflag|Rflag|Wflag:
902 case Dflag|Iflag|Wflag:
903 case Dflag|Rflag:
904 case Dflag|Rflag|Wflag:
905 case Dflag|Wflag:
906 case Fflag:
907 case Fflag|Iflag:
908 case Fflag|Iflag|Rflag:
909 case Fflag|Iflag|Rflag|Wflag:
910 case Fflag|Iflag|Wflag:
911 case Fflag|Rflag:
912 case Fflag|Rflag|Wflag:
913 case Fflag|Wflag:
914 case Iflag:
915 case Iflag|Rflag:
916 case Iflag|Rflag|Wflag:
917 case Iflag|Wflag:
918 case Wflag:
919 f->dokey = dokey_dfi;
920 makemapd(f);
921 break;
925 /*
926 * random spot checks
927 */
928 if(args.nfile > 1 && args.cflag) {
929 fprint(2, "sort: -c can have at most one input file\n");
930 done("option");
932 return;
935 uchar*
936 skip(uchar *l, int n1, int n2, int bflag, int endfield)
938 int i, c, tc;
939 Rune r;
941 if(endfield && n1 < 0)
942 return 0;
944 c = *l++;
945 tc = args.tabchar;
946 if(tc) {
947 if(tc < Runeself) {
948 for(i=n1; i>0; i--) {
949 while(c != tc) {
950 if(c == '\n')
951 return 0;
952 c = *l++;
954 if(!(endfield && i == 1))
955 c = *l++;
957 } else {
958 l--;
959 l += chartorune(&r, (char*)l);
960 for(i=n1; i>0; i--) {
961 while(r != tc) {
962 if(r == '\n')
963 return 0;
964 l += chartorune(&r, (char*)l);
966 if(!(endfield && i == 1))
967 l += chartorune(&r, (char*)l);
969 c = r;
971 } else {
972 for(i=n1; i>0; i--) {
973 while(c == ' ' || c == '\t')
974 c = *l++;
975 while(c != ' ' && c != '\t') {
976 if(c == '\n')
977 return 0;
978 c = *l++;
983 if(bflag)
984 while(c == ' ' || c == '\t')
985 c = *l++;
987 l--;
988 for(i=n2; i>0; i--) {
989 c = *l;
990 if(c < Runeself) {
991 if(c == '\n')
992 return 0;
993 l++;
994 continue;
996 l += chartorune(&r, (char*)l);
998 return l;
1001 void
1002 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f)
1004 uchar *kp;
1005 int c, cl, dp;
1006 int state, nzero, exp, expsign, rflag;
1008 cl = k->klen + 3;
1009 kp = k->key + cl; /* skip place for sign, exponent[2] */
1011 nzero = 0; /* number of trailing zeros */
1012 exp = 0; /* value of the exponent */
1013 expsign = 0; /* sign of the exponent */
1014 dp = 0x4040; /* location of decimal point */
1015 rflag = f->flags&Rflag; /* xor of rflag and - sign */
1016 state = NSstart;
1018 for(;; lp++) {
1019 if(lp >= lpe)
1020 break;
1021 c = *lp;
1023 if(c == ' ' || c == '\t') {
1024 switch(state) {
1025 case NSstart:
1026 case NSsign:
1027 continue;
1029 break;
1031 if(c == '+' || c == '-') {
1032 switch(state) {
1033 case NSstart:
1034 state = NSsign;
1035 if(c == '-')
1036 rflag = !rflag;
1037 continue;
1038 case NSexp:
1039 state = NSexpsign;
1040 if(c == '-')
1041 expsign = 1;
1042 continue;
1044 break;
1046 if(c == '0') {
1047 switch(state) {
1048 case NSdigit:
1049 if(rflag)
1050 c = ~c;
1051 *kp++ = c;
1052 cl++;
1053 nzero++;
1054 dp++;
1055 state = NSdigit;
1056 continue;
1057 case NSfract:
1058 if(rflag)
1059 c = ~c;
1060 *kp++ = c;
1061 cl++;
1062 nzero++;
1063 state = NSfract;
1064 continue;
1065 case NSstart:
1066 case NSsign:
1067 case NSzero:
1068 state = NSzero;
1069 continue;
1070 case NSzerofract:
1071 case NSpoint:
1072 dp--;
1073 state = NSzerofract;
1074 continue;
1075 case NSexpsign:
1076 case NSexp:
1077 case NSexpdigit:
1078 exp = exp*10 + (c - '0');
1079 state = NSexpdigit;
1080 continue;
1082 break;
1084 if(c >= '1' && c <= '9') {
1085 switch(state) {
1086 case NSzero:
1087 case NSstart:
1088 case NSsign:
1089 case NSdigit:
1090 if(rflag)
1091 c = ~c;
1092 *kp++ = c;
1093 cl++;
1094 nzero = 0;
1095 dp++;
1096 state = NSdigit;
1097 continue;
1098 case NSzerofract:
1099 case NSpoint:
1100 case NSfract:
1101 if(rflag)
1102 c = ~c;
1103 *kp++ = c;
1104 cl++;
1105 nzero = 0;
1106 state = NSfract;
1107 continue;
1108 case NSexpsign:
1109 case NSexp:
1110 case NSexpdigit:
1111 exp = exp*10 + (c - '0');
1112 state = NSexpdigit;
1113 continue;
1115 break;
1117 if(c == '.') {
1118 switch(state) {
1119 case NSstart:
1120 case NSsign:
1121 state = NSpoint;
1122 continue;
1123 case NSzero:
1124 state = NSzerofract;
1125 continue;
1126 case NSdigit:
1127 state = NSfract;
1128 continue;
1130 break;
1132 if((f->flags & Gflag) && (c == 'e' || c == 'E')) {
1133 switch(state) {
1134 case NSdigit:
1135 case NSfract:
1136 state = NSexp;
1137 continue;
1139 break;
1141 break;
1144 switch(state) {
1146 * result is zero
1148 case NSstart:
1149 case NSsign:
1150 case NSzero:
1151 case NSzerofract:
1152 case NSpoint:
1153 kp = k->key + k->klen;
1154 k->klen += 2;
1155 kp[0] = 0x20; /* between + and - */
1156 kp[1] = 0;
1157 return;
1159 * result has exponent
1161 case NSexpsign:
1162 case NSexp:
1163 case NSexpdigit:
1164 if(expsign)
1165 exp = -exp;
1166 dp += exp;
1169 * result is fixed point number
1171 case NSdigit:
1172 case NSfract:
1173 kp -= nzero;
1174 cl -= nzero;
1175 break;
1179 * end of number
1181 c = 0;
1182 if(rflag)
1183 c = ~c;
1184 *kp = c;
1187 * sign and exponent
1189 c = 0x30;
1190 if(rflag) {
1191 c = 0x10;
1192 dp = ~dp;
1194 kp = k->key + k->klen;
1195 kp[0] = c;
1196 kp[1] = (dp >> 8);
1197 kp[2] = dp;
1198 k->klen = cl+1;
1201 void
1202 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f)
1204 uchar *kp;
1205 Rune r, place[3];
1206 int c, cl, pc;
1207 int rflag;
1209 rflag = f->flags&Rflag;
1210 pc = 0;
1212 cl = k->klen;
1213 kp = k->key + cl;
1215 for(;;) {
1217 * get the character
1219 if(lp >= lpe)
1220 break;
1221 c = *lp;
1222 if(c >= Runeself) {
1223 lp += chartorune(&r, (char*)lp);
1224 c = r;
1225 } else
1226 lp++;
1228 if(c < nelem(f->mapto)) {
1229 c = f->mapto[c];
1230 if(c == 0)
1231 continue;
1233 place[pc++] = c;
1234 if(pc < 3)
1235 continue;
1236 for(c=11; c>=0; c--)
1237 if(memcmp(month[c], place, sizeof(place)) == 0)
1238 break;
1239 c += 10;
1240 if(rflag)
1241 c = ~c;
1242 *kp++ = c;
1243 cl++;
1244 break;
1247 c = 0;
1248 if(rflag)
1249 c = ~c;
1250 *kp = c;
1251 k->klen = cl+1;
1254 void
1255 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f)
1257 uchar *kp;
1258 Rune r;
1259 int c, cl, n, rflag;
1261 cl = k->klen;
1262 kp = k->key + cl;
1263 rflag = f->flags & Rflag;
1265 for(;;) {
1267 * get the character
1269 if(lp >= lpe)
1270 break;
1271 c = *lp;
1272 if(c >= Runeself) {
1273 lp += chartorune(&r, (char*)lp);
1274 c = r;
1275 } else
1276 lp++;
1279 * do the various mappings.
1280 * the common case is handled
1281 * completely by the table.
1283 if(c != 0 && c < Runeself) {
1284 c = f->mapto[c];
1285 if(c) {
1286 *kp++ = c;
1287 cl++;
1289 continue;
1293 * for characters out of range,
1294 * the table does not do Rflag.
1295 * ignore is based on mapto[255]
1297 if(c != 0 && c < nelem(f->mapto)) {
1298 c = f->mapto[c];
1299 if(c == 0)
1300 continue;
1301 } else
1302 if(f->mapto[nelem(f->mapto)-1] == 0)
1303 continue;
1306 * put it in the key
1308 r = c;
1309 n = runetochar((char*)kp, &r);
1310 kp += n;
1311 cl += n;
1312 if(rflag)
1313 while(n > 0) {
1314 kp[-n] = ~kp[-n];
1315 n--;
1320 * end of key
1322 k->klen = cl+1;
1323 if(rflag) {
1324 *kp = ~0;
1325 return;
1327 *kp = 0;
1330 void
1331 dokey_r(Key *k, uchar *lp, uchar *lpe, Field *f)
1333 int cl, n;
1334 uchar *kp;
1336 USED(f);
1337 n = lpe - lp;
1338 if(n < 0)
1339 n = 0;
1340 cl = k->klen;
1341 kp = k->key + cl;
1342 k->klen = cl+n+1;
1344 lpe -= 3;
1345 while(lp < lpe) {
1346 kp[0] = ~lp[0];
1347 kp[1] = ~lp[1];
1348 kp[2] = ~lp[2];
1349 kp[3] = ~lp[3];
1350 kp += 4;
1351 lp += 4;
1354 lpe += 3;
1355 while(lp < lpe)
1356 *kp++ = ~*lp++;
1357 *kp = ~0;
1360 void
1361 dokey_(Key *k, uchar *lp, uchar *lpe, Field *f)
1363 int n, cl;
1364 uchar *kp;
1366 USED(f);
1367 n = lpe - lp;
1368 if(n < 0)
1369 n = 0;
1370 cl = k->klen;
1371 kp = k->key + cl;
1372 k->klen = cl+n+1;
1373 memmove(kp, lp, n);
1374 kp[n] = 0;
1377 void
1378 buildkey(Line *l)
1380 Key *k;
1381 uchar *lp, *lpe;
1382 int ll, kl, cl, i, n;
1383 Field *f;
1385 ll = l->llen - 1;
1386 kl = 0; /* allocated length */
1387 cl = 0; /* current length */
1388 k = 0;
1390 for(i=1; i<=args.nfield; i++) {
1391 f = &args.field[i];
1392 lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0);
1393 if(lp == 0)
1394 lp = l->line + ll;
1395 lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1);
1396 if(lpe == 0)
1397 lpe = l->line + ll;
1398 n = (lpe - lp) + 1;
1399 if(n <= 0)
1400 n = 1;
1401 if(cl+(n+4) > kl) {
1402 kl = cl+(n+4);
1403 k = realloc(k, sizeof(Key) +
1404 (kl-1)*sizeof(k->key[0]));
1405 if(k == 0)
1406 nomem();
1408 k->klen = cl;
1409 (*f->dokey)(k, lp, lpe, f);
1410 cl = k->klen;
1414 * global comparisons
1416 if(!(args.uflag && cl > 0)) {
1417 f = &args.field[0];
1418 if(cl+(ll+4) > kl) {
1419 kl = cl+(ll+4);
1420 k = realloc(k, sizeof(Key) +
1421 (kl-1)*sizeof(k->key[0]));
1422 if(k == 0)
1423 nomem();
1425 k->klen = cl;
1426 (*f->dokey)(k, l->line, l->line+ll, f);
1427 cl = k->klen;
1430 l->key = k;
1431 k->klen = cl;
1433 if(args.vflag) {
1434 write(2, l->line, l->llen);
1435 for(i=0; i<k->klen; i++) {
1436 fprint(2, " %.2x", k->key[i]);
1437 if(k->key[i] == 0x00 || k->key[i] == 0xff)
1438 fprint(2, "\n");
1443 void
1444 makemapm(Field *f)
1446 int i, c;
1448 for(i=0; i<nelem(f->mapto); i++) {
1449 c = 1;
1450 if(i == ' ' || i == '\t')
1451 c = 0;
1452 if(i >= 'a' && i <= 'z')
1453 c = i + ('A' - 'a');
1454 if(i >= 'A' && i <= 'Z')
1455 c = i;
1456 f->mapto[i] = c;
1457 if(args.vflag) {
1458 if((i & 15) == 0)
1459 fprint(2, " ");
1460 fprint(2, " %.2x", c);
1461 if((i & 15) == 15)
1462 fprint(2, "\n");
1467 void
1468 makemapd(Field *f)
1470 int i, j, c;
1472 for(i=0; i<nelem(f->mapto); i++) {
1473 c = i;
1474 if(f->flags & Iflag)
1475 if(c < 040 || c > 0176)
1476 c = -1;
1477 if((f->flags & Wflag) && c >= 0)
1478 if(c == ' ' || c == '\t')
1479 c = -1;
1480 if((f->flags & Dflag) && c >= 0)
1481 if(!(c == ' ' || c == '\t' ||
1482 (c >= 'a' && c <= 'z') ||
1483 (c >= 'A' && c <= 'Z') ||
1484 (c >= '0' && c <= '9'))) {
1485 for(j=0; latinmap[j]; j+=3)
1486 if(c == latinmap[j+0] ||
1487 c == latinmap[j+1])
1488 break;
1489 if(latinmap[j] == 0)
1490 c = -1;
1492 if((f->flags & Fflag) && c >= 0) {
1493 if(c >= 'a' && c <= 'z')
1494 c += 'A' - 'a';
1495 for(j=0; latinmap[j]; j+=3)
1496 if(c == latinmap[j+0] ||
1497 c == latinmap[j+1]) {
1498 c = latinmap[j+2];
1499 break;
1502 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself)
1503 c = ~c & 0xff;
1504 if(c < 0)
1505 c = 0;
1506 f->mapto[i] = c;
1507 if(args.vflag) {
1508 if((i & 15) == 0)
1509 fprint(2, " ");
1510 fprint(2, " %.2x", c);
1511 if((i & 15) == 15)
1512 fprint(2, "\n");
1517 int latinmap[] =
1519 /* lcase ucase fold */
1520 0xe0, 0xc0, 0x41, /* L'à', L'À', L'A', */
1521 0xe1, 0xc1, 0x41, /* L'á', L'Á', L'A', */
1522 0xe2, 0xc2, 0x41, /* L'â', L'Â', L'A', */
1523 0xe4, 0xc4, 0x41, /* L'ä', L'Ä', L'A', */
1524 0xe3, 0xc3, 0x41, /* L'ã', L'Ã', L'A', */
1525 0xe5, 0xc5, 0x41, /* L'å', L'Å', L'A', */
1526 0xe8, 0xc8, 0x45, /* L'è', L'È', L'E', */
1527 0xe9, 0xc9, 0x45, /* L'é', L'É', L'E', */
1528 0xea, 0xca, 0x45, /* L'ê', L'Ê', L'E', */
1529 0xeb, 0xcb, 0x45, /* L'ë', L'Ë', L'E', */
1530 0xec, 0xcc, 0x49, /* L'ì', L'Ì', L'I', */
1531 0xed, 0xcd, 0x49, /* L'í', L'Í', L'I', */
1532 0xee, 0xce, 0x49, /* L'î', L'Î', L'I', */
1533 0xef, 0xcf, 0x49, /* L'ï', L'Ï', L'I', */
1534 0xf2, 0xd2, 0x4f, /* L'ò', L'Ò', L'O', */
1535 0xf3, 0xd3, 0x4f, /* L'ó', L'Ó', L'O', */
1536 0xf4, 0xd4, 0x4f, /* L'ô', L'Ô', L'O', */
1537 0xf6, 0xd6, 0x4f, /* L'ö', L'Ö', L'O', */
1538 0xf5, 0xd5, 0x4f, /* L'õ', L'Õ', L'O', */
1539 0xf8, 0xd8, 0x4f, /* L'ø', L'Ø', L'O', */
1540 0xf9, 0xd9, 0x55, /* L'ù', L'Ù', L'U', */
1541 0xfa, 0xda, 0x55, /* L'ú', L'Ú', L'U', */
1542 0xfb, 0xdb, 0x55, /* L'û', L'Û', L'U', */
1543 0xfc, 0xdc, 0x55, /* L'ü', L'Ü', L'U', */
1544 0xe6, 0xc6, 0x41, /* L'æ', L'Æ', L'A', */
1545 0xf0, 0xd0, 0x44, /* L'ð', L'Ð', L'D', */
1546 0xf1, 0xd1, 0x4e, /* L'ñ', L'Ñ', L'N', */
1547 0xfd, 0xdd, 0x59, /* L'ý', L'Ý', L'Y', */
1548 0xe7, 0xc7, 0x43, /* L'ç', L'Ç', L'C', */
1552 Rune LJAN[] = { 'J', 'A', 'N', 0 };
1553 Rune LFEB[] = { 'F', 'E', 'B', 0 };
1554 Rune LMAR[] = { 'M', 'A', 'R', 0 };
1555 Rune LAPR[] = { 'A', 'P', 'R', 0 };
1556 Rune LMAY[] = { 'M', 'A', 'Y', 0 };
1557 Rune LJUN[] = { 'J', 'U', 'N', 0 };
1558 Rune LJUL[] = { 'J', 'U', 'L', 0 };
1559 Rune LAUG[] = { 'A', 'U', 'G', 0 };
1560 Rune LSEP[] = { 'S', 'E', 'P', 0 };
1561 Rune LOCT[] = { 'O', 'C', 'T', 0 };
1562 Rune LNOV[] = { 'N', 'O', 'V', 0 };
1563 Rune LDEC[] = { 'D', 'E', 'C', 0 };
1565 Rune* month[12] =
1567 LJAN,
1568 LFEB,
1569 LMAR,
1570 LAPR,
1571 LMAY,
1572 LJUN,
1573 LJUL,
1574 LAUG,
1575 LSEP,
1576 LOCT,
1577 LNOV,
1578 LDEC,
1581 /************** radix sort ***********/
1583 enum
1585 Threshold = 14
1588 void rsort4(Key***, ulong, int);
1589 void bsort4(Key***, ulong, int);
1591 void
1592 sort4(void *a, ulong n)
1594 if(n > Threshold)
1595 rsort4((Key***)a, n, 0);
1596 else
1597 bsort4((Key***)a, n, 0);
1600 void
1601 rsort4(Key ***a, ulong n, int b)
1603 Key ***ea, ***t, ***u, **t1, **u1, *k;
1604 Key ***part[257];
1605 static long count[257];
1606 long clist[257+257], *cp, *cp1;
1607 int c, lowc, higc;
1610 * pass 1 over all keys,
1611 * count the number of each key[b].
1612 * find low count and high count.
1614 lowc = 256;
1615 higc = 0;
1616 ea = a+n;
1617 for(t=a; t<ea; t++) {
1618 k = **t;
1619 n = k->klen;
1620 if(b >= n) {
1621 count[256]++;
1622 continue;
1624 c = k->key[b];
1625 n = count[c]++;
1626 if(n == 0) {
1627 if(c < lowc)
1628 lowc = c;
1629 if(c > higc)
1630 higc = c;
1635 * pass 2 over all counts,
1636 * put partition pointers in part[c].
1637 * save compacted indexes and counts
1638 * in clist[].
1640 t = a;
1641 n = count[256];
1642 clist[0] = n;
1643 part[256] = t;
1644 t += n;
1646 cp1 = clist+1;
1647 cp = count+lowc;
1648 for(c=lowc; c<=higc; c++,cp++) {
1649 n = *cp;
1650 if(n) {
1651 cp1[0] = n;
1652 cp1[1] = c;
1653 cp1 += 2;
1654 part[c] = t;
1655 t += n;
1658 *cp1 = 0;
1661 * pass 3 over all counts.
1662 * chase lowest pointer in each partition
1663 * around a permutation until it comes
1664 * back and is stored where it started.
1665 * static array, count[], should be
1666 * reduced to zero entries except maybe
1667 * count[256].
1669 for(cp1=clist+1; cp1[0]; cp1+=2) {
1670 c = cp1[1];
1671 cp = count+c;
1672 while(*cp) {
1673 t1 = *part[c];
1674 for(;;) {
1675 k = *t1;
1676 n = 256;
1677 if(b < k->klen)
1678 n = k->key[b];
1679 u = part[n]++;
1680 count[n]--;
1681 u1 = *u;
1682 *u = t1;
1683 if(n == c)
1684 break;
1685 t1 = u1;
1691 * pass 4 over all partitions.
1692 * call recursively.
1694 b++;
1695 t = a + clist[0];
1696 count[256] = 0;
1697 for(cp1=clist+1; n=cp1[0]; cp1+=2) {
1698 if(n > Threshold)
1699 rsort4(t, n, b);
1700 else
1701 if(n > 1)
1702 bsort4(t, n, b);
1703 t += n;
1708 * bubble sort to pick up
1709 * the pieces.
1711 void
1712 bsort4(Key ***a, ulong n, int b)
1714 Key ***i, ***j, ***k, ***l, **t;
1715 Key *ka, *kb;
1716 int n1, n2;
1718 l = a+n;
1719 j = a;
1721 loop:
1722 i = j;
1723 j++;
1724 if(j >= l)
1725 return;
1727 ka = **i;
1728 kb = **j;
1729 n1 = ka->klen - b;
1730 n2 = kb->klen - b;
1731 if(n1 > n2)
1732 n1 = n2;
1733 if(n1 <= 0)
1734 goto loop;
1735 n2 = ka->key[b] - kb->key[b];
1736 if(n2 == 0)
1737 n2 = memcmp(ka->key+b, kb->key+b, n1);
1738 if(n2 <= 0)
1739 goto loop;
1741 for(;;) {
1742 k = i+1;
1744 t = *k;
1745 *k = *i;
1746 *i = t;
1748 if(i <= a)
1749 goto loop;
1751 i--;
1752 ka = **i;
1753 kb = *t;
1754 n1 = ka->klen - b;
1755 n2 = kb->klen - b;
1756 if(n1 > n2)
1757 n1 = n2;
1758 if(n1 <= 0)
1759 goto loop;
1760 n2 = ka->key[b] - kb->key[b];
1761 if(n2 == 0)
1762 n2 = memcmp(ka->key+b, kb->key+b, n1);
1763 if(n2 <= 0)
1764 goto loop;