Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
5 /*
6 bugs:
7 00/ff for end of file can conflict with 00/ff characters
8 */
10 enum
11 {
12 Nline = 500000, /* default max number of lines saved in memory */
13 Nmerge = 10, /* max number of temporary files merged */
14 Nfield = 20, /* max number of argument fields */
16 Bflag = 1<<0, /* flags per field */
17 B1flag = 1<<1,
19 Dflag = 1<<2,
20 Fflag = 1<<3,
21 Gflag = 1<<4,
22 Iflag = 1<<5,
23 Mflag = 1<<6,
24 Nflag = 1<<7,
25 Rflag = 1<<8,
26 Wflag = 1<<9,
28 NSstart = 0, /* states for number to key decoding */
29 NSsign,
30 NSzero,
31 NSdigit,
32 NSpoint,
33 NSfract,
34 NSzerofract,
35 NSexp,
36 NSexpsign,
37 NSexpdigit,
38 };
40 typedef struct Line Line;
41 typedef struct Key Key;
42 typedef struct Merge Merge;
43 typedef struct Field Field;
45 struct Line
46 {
47 Key* key;
48 int llen; /* always >= 1 */
49 uchar line[1]; /* always ends in '\n' */
50 };
52 struct Merge
53 {
54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */
55 Line* line; /* line at the head of a merged temp file */
56 int fd; /* file descriptor */
57 Biobuf b; /* iobuf for reading a temp file */
58 };
60 struct Key
61 {
62 int klen;
63 uchar key[1];
64 };
66 struct Field
67 {
68 int beg1;
69 int beg2;
70 int end1;
71 int end2;
73 long flags;
74 uchar mapto[256];
76 void (*dokey)(Key*, uchar*, uchar*, Field*);
77 };
79 struct args
80 {
81 char* ofile;
82 char* tname;
83 Rune tabchar;
84 char cflag;
85 char uflag;
86 char vflag;
87 int nfield;
88 int nfile;
89 Field field[Nfield];
91 Line** linep;
92 long nline; /* number of lines in this temp file */
93 long lineno; /* overall ordinal for -s option */
94 int ntemp;
95 long mline; /* max lines per file */
96 } args;
98 extern int latinmap[];
99 extern Rune* month[12];
101 void buildkey(Line*);
102 void doargs(int, char*[]);
103 void dofield(char*, int*, int*, int, int);
104 void dofile(Biobuf*);
105 void dokey_(Key*, uchar*, uchar*, Field*);
106 void dokey_dfi(Key*, uchar*, uchar*, Field*);
107 void dokey_gn(Key*, uchar*, uchar*, Field*);
108 void dokey_m(Key*, uchar*, uchar*, Field*);
109 void dokey_r(Key*, uchar*, uchar*, Field*);
110 void done(char*);
111 int kcmp(Key*, Key*);
112 void makemapd(Field*);
113 void makemapm(Field*);
114 void mergefiles(int, int, Biobuf*);
115 void mergeout(Biobuf*);
116 void newfield(void);
117 Line* newline(Biobuf*);
118 void nomem(void);
119 void notifyf(void*, char*);
120 void printargs(void);
121 void printout(Biobuf*);
122 void setfield(int, int);
123 uchar* skip(uchar*, int, int, int, int);
124 void sort4(void*, ulong);
125 char* tempfile(int);
126 void tempout(void);
127 void lineout(Biobuf*, Line*);
129 void
130 main(int argc, char *argv[])
132 int i, f;
133 char *s;
134 Biobuf bbuf;
136 notify(notifyf); /**/
137 doargs(argc, argv);
138 if(args.vflag)
139 printargs();
141 for(i=1; i<argc; i++) {
142 s = argv[i];
143 if(s == 0)
144 continue;
145 if(strcmp(s, "-") == 0) {
146 Binit(&bbuf, 0, OREAD);
147 dofile(&bbuf);
148 Bterm(&bbuf);
149 continue;
151 f = open(s, OREAD);
152 if(f < 0) {
153 fprint(2, "sort: open %s: %r\n", s);
154 done("open");
156 Binit(&bbuf, f, OREAD);
157 dofile(&bbuf);
158 Bterm(&bbuf);
159 close(f);
161 if(args.nfile == 0) {
162 Binit(&bbuf, 0, OREAD);
163 dofile(&bbuf);
164 Bterm(&bbuf);
166 if(args.cflag)
167 done(0);
168 if(args.vflag)
169 fprint(2, "=========\n");
171 f = 1;
172 if(args.ofile) {
173 f = create(args.ofile, OWRITE, 0666);
174 if(f < 0) {
175 fprint(2, "sort: create %s: %r\n", args.ofile);
176 done("create");
180 Binit(&bbuf, f, OWRITE);
181 if(args.ntemp) {
182 tempout();
183 mergeout(&bbuf);
184 } else {
185 printout(&bbuf);
187 Bterm(&bbuf);
188 done(0);
191 void
192 dofile(Biobuf *b)
194 Line *l, *ol;
195 int n;
197 if(args.cflag) {
198 ol = newline(b);
199 if(ol == 0)
200 return;
201 for(;;) {
202 l = newline(b);
203 if(l == 0)
204 break;
205 n = kcmp(ol->key, l->key);
206 if(n > 0 || (n == 0 && args.uflag)) {
207 fprint(2, "sort: -c file not in sort\n"); /**/
208 done("order");
210 free(ol->key);
211 free(ol);
212 ol = l;
214 return;
217 if(args.linep == 0) {
218 args.linep = malloc(args.mline * sizeof(args.linep));
219 if(args.linep == 0)
220 nomem();
222 for(;;) {
223 l = newline(b);
224 if(l == 0)
225 break;
226 if(args.nline >= args.mline)
227 tempout();
228 args.linep[args.nline] = l;
229 args.nline++;
230 args.lineno++;
234 void
235 notifyf(void *a, char *s)
237 USED(a);
238 if(strcmp(s, "interrupt") == 0)
239 done(0);
240 if(strcmp(s, "hangup") == 0)
241 done(0);
242 if(strcmp(s, "kill") == 0)
243 done(0);
244 if(strncmp(s, "sys: write on closed pipe", 25) == 0)
245 done(0);
246 fprint(2, "sort: note: %s\n", s);
247 abort();
250 Line*
251 newline(Biobuf *b)
253 Line *l;
254 char *p;
255 int n, c;
257 p = Brdline(b, '\n');
258 n = Blinelen(b);
259 if(p == 0) {
260 if(n == 0)
261 return 0;
262 l = 0;
263 for(n=0;;) {
264 if((n & 31) == 0) {
265 l = realloc(l, sizeof(Line) +
266 (n+31)*sizeof(l->line[0]));
267 if(l == 0)
268 nomem();
270 c = Bgetc(b);
271 if(c < 0) {
272 fprint(2, "sort: newline added\n");
273 c = '\n';
275 l->line[n++] = c;
276 if(c == '\n')
277 break;
279 l->llen = n;
280 buildkey(l);
281 return l;
283 l = malloc(sizeof(Line) +
284 (n-1)*sizeof(l->line[0]));
285 if(l == 0)
286 nomem();
287 l->llen = n;
288 memmove(l->line, p, n);
289 buildkey(l);
290 return l;
293 void
294 lineout(Biobuf *b, Line *l)
296 int n, m;
298 n = l->llen;
299 m = Bwrite(b, l->line, n);
300 if(n != m)
301 exits("write");
304 void
305 tempout(void)
307 long n;
308 Line **lp, *l;
309 char *tf;
310 int f;
311 Biobuf tb;
313 sort4(args.linep, args.nline);
314 tf = tempfile(args.ntemp);
315 args.ntemp++;
316 f = create(tf, OWRITE, 0666);
317 if(f < 0) {
318 fprint(2, "sort: create %s: %r\n", tf);
319 done("create");
322 Binit(&tb, f, OWRITE);
323 lp = args.linep;
324 for(n=args.nline; n>0; n--) {
325 l = *lp++;
326 lineout(&tb, l);
327 free(l->key);
328 free(l);
330 args.nline = 0;
331 Bterm(&tb);
332 close(f);
335 void
336 done(char *xs)
338 int i;
340 for(i=0; i<args.ntemp; i++)
341 remove(tempfile(i));
342 exits(xs);
345 void
346 nomem(void)
348 fprint(2, "sort: out of memory\n");
349 done("mem");
352 char*
353 tempfile(int n)
355 static char file[100];
356 static uint pid;
357 char *dir;
359 dir = "/var/tmp";
360 if(args.tname)
361 dir = args.tname;
362 if(strlen(dir) >= nelem(file)-20) {
363 fprint(2, "temp file directory name is too long: %s\n", dir);
364 done("tdir");
367 if(pid == 0) {
368 pid = getpid();
369 if(pid == 0) {
370 pid = time(0);
371 if(pid == 0)
372 pid = 1;
376 sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n);
377 return file;
380 void
381 mergeout(Biobuf *b)
383 int n, i, f;
384 char *tf;
385 Biobuf tb;
387 for(i=0; i<args.ntemp; i+=n) {
388 n = args.ntemp - i;
389 if(n > Nmerge) {
390 tf = tempfile(args.ntemp);
391 args.ntemp++;
392 f = create(tf, OWRITE, 0666);
393 if(f < 0) {
394 fprint(2, "sort: create %s: %r\n", tf);
395 done("create");
397 Binit(&tb, f, OWRITE);
399 n = Nmerge;
400 mergefiles(i, n, &tb);
402 Bterm(&tb);
403 close(f);
404 } else
405 mergefiles(i, n, b);
409 void
410 mergefiles(int t, int n, Biobuf *b)
412 Merge *m, *mp, **mmp;
413 Key *ok;
414 Line *l;
415 char *tf;
416 int i, f, nn;
418 mmp = malloc(n*sizeof(*mmp));
419 mp = malloc(n*sizeof(*mp));
420 if(mmp == 0 || mp == 0)
421 nomem();
423 nn = 0;
424 m = mp;
425 for(i=0; i<n; i++,m++) {
426 tf = tempfile(t+i);
427 f = open(tf, OREAD);
428 if(f < 0) {
429 fprint(2, "sort: reopen %s: %r\n", tf);
430 done("open");
432 m->fd = f;
433 Binit(&m->b, f, OREAD);
434 mmp[nn] = m;
436 l = newline(&m->b);
437 if(l == 0)
438 continue;
439 nn++;
440 m->line = l;
441 m->key = l->key;
444 ok = 0;
445 for(;;) {
446 sort4(mmp, nn);
447 m = *mmp;
448 if(nn == 0)
449 break;
450 for(;;) {
451 l = m->line;
452 if(args.uflag && ok && kcmp(ok, l->key) == 0) {
453 free(l->key);
454 free(l);
455 } else {
456 lineout(b, l);
457 if(ok)
458 free(ok);
459 ok = l->key;
460 free(l);
463 l = newline(&m->b);
464 if(l == 0) {
465 nn--;
466 mmp[0] = mmp[nn];
467 break;
469 m->line = l;
470 m->key = l->key;
471 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0)
472 break;
475 if(ok)
476 free(ok);
478 m = mp;
479 for(i=0; i<n; i++,m++) {
480 Bterm(&m->b);
481 close(m->fd);
484 free(mp);
485 free(mmp);
488 int
489 kcmp(Key *ka, Key *kb)
491 int n, m;
493 /*
494 * set n to length of smaller key
495 */
496 n = ka->klen;
497 m = kb->klen;
498 if(n > m)
499 n = m;
500 return memcmp(ka->key, kb->key, n);
503 void
504 printout(Biobuf *b)
506 long n;
507 Line **lp, *l;
508 Key *ok;
510 sort4(args.linep, args.nline);
511 lp = args.linep;
512 ok = 0;
513 for(n=args.nline; n>0; n--) {
514 l = *lp++;
515 if(args.uflag && ok && kcmp(ok, l->key) == 0)
516 continue;
517 lineout(b, l);
518 ok = l->key;
522 void
523 setfield(int n, int c)
525 Field *f;
527 f = &args.field[n];
528 switch(c) {
529 default:
530 fprint(2, "sort: unknown option: field.%C\n", c);
531 done("option");
532 case 'b': /* skip blanks */
533 f->flags |= Bflag;
534 break;
535 case 'd': /* directory order */
536 f->flags |= Dflag;
537 break;
538 case 'f': /* fold case */
539 f->flags |= Fflag;
540 break;
541 case 'g': /* floating point -n case */
542 f->flags |= Gflag;
543 break;
544 case 'i': /* ignore non-ascii */
545 f->flags |= Iflag;
546 break;
547 case 'M': /* month */
548 f->flags |= Mflag;
549 break;
550 case 'n': /* numbers */
551 f->flags |= Nflag;
552 break;
553 case 'r': /* reverse */
554 f->flags |= Rflag;
555 break;
556 case 'w': /* ignore white */
557 f->flags |= Wflag;
558 break;
562 void
563 dofield(char *s, int *n1, int *n2, int off1, int off2)
565 int c, n;
567 c = *s++;
568 if(c >= '0' && c <= '9') {
569 n = 0;
570 while(c >= '0' && c <= '9') {
571 n = n*10 + (c-'0');
572 c = *s++;
574 n -= off1; /* posix committee: rot in hell */
575 if(n < 0) {
576 fprint(2, "sort: field offset must be positive\n");
577 done("option");
579 *n1 = n;
581 if(c == '.') {
582 c = *s++;
583 if(c >= '0' && c <= '9') {
584 n = 0;
585 while(c >= '0' && c <= '9') {
586 n = n*10 + (c-'0');
587 c = *s++;
589 n -= off2;
590 if(n < 0) {
591 fprint(2, "sort: character offset must be positive\n");
592 done("option");
594 *n2 = n;
597 while(c != 0) {
598 setfield(args.nfield, c);
599 c = *s++;
603 void
604 printargs(void)
606 int i, n;
607 Field *f;
608 char *prefix;
610 fprint(2, "sort");
611 for(i=0; i<=args.nfield; i++) {
612 f = &args.field[i];
613 prefix = " -";
614 if(i) {
615 n = f->beg1;
616 if(n >= 0)
617 fprint(2, " +%d", n);
618 else
619 fprint(2, " +*");
620 n = f->beg2;
621 if(n >= 0)
622 fprint(2, ".%d", n);
623 else
624 fprint(2, ".*");
626 if(f->flags & B1flag)
627 fprint(2, "b");
629 n = f->end1;
630 if(n >= 0)
631 fprint(2, " -%d", n);
632 else
633 fprint(2, " -*");
634 n = f->end2;
635 if(n >= 0)
636 fprint(2, ".%d", n);
637 else
638 fprint(2, ".*");
639 prefix = "";
641 if(f->flags & Bflag)
642 fprint(2, "%sb", prefix);
643 if(f->flags & Dflag)
644 fprint(2, "%sd", prefix);
645 if(f->flags & Fflag)
646 fprint(2, "%sf", prefix);
647 if(f->flags & Gflag)
648 fprint(2, "%sg", prefix);
649 if(f->flags & Iflag)
650 fprint(2, "%si", prefix);
651 if(f->flags & Mflag)
652 fprint(2, "%sM", prefix);
653 if(f->flags & Nflag)
654 fprint(2, "%sn", prefix);
655 if(f->flags & Rflag)
656 fprint(2, "%sr", prefix);
657 if(f->flags & Wflag)
658 fprint(2, "%sw", prefix);
660 if(args.cflag)
661 fprint(2, " -c");
662 if(args.uflag)
663 fprint(2, " -u");
664 if(args.ofile)
665 fprint(2, " -o %s", args.ofile);
666 if(args.mline != Nline)
667 fprint(2, " -l %ld", args.mline);
668 fprint(2, "\n");
671 void
672 newfield(void)
674 int n;
675 Field *f;
677 n = args.nfield + 1;
678 if(n >= Nfield) {
679 fprint(2, "sort: too many fields specified\n");
680 done("option");
682 args.nfield = n;
683 f = &args.field[n];
684 f->beg1 = -1;
685 f->beg2 = -1;
686 f->end1 = -1;
687 f->end2 = -1;
690 void
691 doargs(int argc, char *argv[])
693 int i, c, hadplus;
694 char *s, *p, *q;
695 Field *f;
697 hadplus = 0;
698 args.mline = Nline;
699 for(i=1; i<argc; i++) {
700 s = argv[i];
701 c = *s++;
702 if(c == '-') {
703 c = *s;
704 if(c == 0) /* forced end of arg marker */
705 break;
706 argv[i] = 0; /* clobber args processed */
707 if(c == '.' || (c >= '0' && c <= '9')) {
708 if(!hadplus)
709 newfield();
710 f = &args.field[args.nfield];
711 dofield(s, &f->end1, &f->end2, 0, 0);
712 hadplus = 0;
713 continue;
716 while(c = *s++)
717 switch(c) {
718 case '-': /* end of options */
719 i = argc;
720 continue;
721 case 'T': /* temp directory */
722 if(*s == 0) {
723 i++;
724 if(i < argc) {
725 args.tname = argv[i];
726 argv[i] = 0;
728 } else
729 args.tname = s;
730 s = strchr(s, 0);
731 break;
732 case 'o': /* output file */
733 if(*s == 0) {
734 i++;
735 if(i < argc) {
736 args.ofile = argv[i];
737 argv[i] = 0;
739 } else
740 args.ofile = s;
741 s = strchr(s, 0);
742 break;
743 case 'k': /* posix key (what were they thinking?) */
744 p = 0;
745 if(*s == 0) {
746 i++;
747 if(i < argc) {
748 p = argv[i];
749 argv[i] = 0;
751 } else
752 p = s;
753 s = strchr(s, 0);
754 if(p == 0)
755 break;
757 newfield();
758 q = strchr(p, ',');
759 if(q)
760 *q++ = 0;
761 f = &args.field[args.nfield];
762 dofield(p, &f->beg1, &f->beg2, 1, 1);
763 if(f->flags & Bflag) {
764 f->flags |= B1flag;
765 f->flags &= ~Bflag;
767 if(q) {
768 dofield(q, &f->end1, &f->end2, 1, 0);
769 if(f->end2 <= 0)
770 f->end1++;
772 hadplus = 0;
773 break;
774 case 't': /* tab character */
775 if(*s == 0) {
776 i++;
777 if(i < argc) {
778 chartorune(&args.tabchar, argv[i]);
779 argv[i] = 0;
781 } else
782 s += chartorune(&args.tabchar, s);
783 if(args.tabchar == '\n') {
784 fprint(2, "aw come on, rob\n");
785 done("rob");
787 break;
788 case 'c': /* check order */
789 args.cflag = 1;
790 break;
791 case 'u': /* unique */
792 args.uflag = 1;
793 break;
794 case 'v': /* debugging noise */
795 args.vflag = 1;
796 break;
797 case 'l':
798 if(*s == 0) {
799 i++;
800 if(i < argc) {
801 args.mline = atol(argv[i]);
802 argv[i] = 0;
804 } else
805 args.mline = atol(s);
806 s = strchr(s, 0);
807 break;
809 case 'M': /* month */
810 case 'b': /* skip blanks */
811 case 'd': /* directory order */
812 case 'f': /* fold case */
813 case 'g': /* floating numbers */
814 case 'i': /* ignore non-ascii */
815 case 'n': /* numbers */
816 case 'r': /* reverse */
817 case 'w': /* ignore white */
818 if(args.nfield > 0)
819 fprint(2, "sort: global field set after -k\n");
820 setfield(0, c);
821 break;
822 case 'm':
823 /* option m silently ignored but required by posix */
824 break;
825 default:
826 fprint(2, "sort: unknown option: -%C\n", c);
827 done("option");
829 continue;
831 if(c == '+') {
832 argv[i] = 0; /* clobber args processed */
833 c = *s;
834 if(c == '.' || (c >= '0' && c <= '9')) {
835 newfield();
836 f = &args.field[args.nfield];
837 dofield(s, &f->beg1, &f->beg2, 0, 0);
838 if(f->flags & Bflag) {
839 f->flags |= B1flag;
840 f->flags &= ~Bflag;
842 hadplus = 1;
843 continue;
845 fprint(2, "sort: unknown option: +%C\n", c);
846 done("option");
848 args.nfile++;
851 for(i=0; i<=args.nfield; i++) {
852 f = &args.field[i];
854 /*
855 * global options apply to fields that
856 * specify no options
857 */
858 if(f->flags == 0) {
859 f->flags = args.field[0].flags;
860 if(args.field[0].flags & Bflag)
861 f->flags |= B1flag;
865 /*
866 * build buildkey specification
867 */
868 switch(f->flags & ~(Bflag|B1flag)) {
869 default:
870 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags);
871 done("option");
872 case 0:
873 f->dokey = dokey_;
874 break;
875 case Rflag:
876 f->dokey = dokey_r;
877 break;
878 case Gflag:
879 case Nflag:
880 case Gflag|Nflag:
881 case Gflag|Rflag:
882 case Nflag|Rflag:
883 case Gflag|Nflag|Rflag:
884 f->dokey = dokey_gn;
885 break;
886 case Mflag:
887 case Mflag|Rflag:
888 f->dokey = dokey_m;
889 makemapm(f);
890 break;
891 case Dflag:
892 case Dflag|Fflag:
893 case Dflag|Fflag|Iflag:
894 case Dflag|Fflag|Iflag|Rflag:
895 case Dflag|Fflag|Iflag|Rflag|Wflag:
896 case Dflag|Fflag|Iflag|Wflag:
897 case Dflag|Fflag|Rflag:
898 case Dflag|Fflag|Rflag|Wflag:
899 case Dflag|Fflag|Wflag:
900 case Dflag|Iflag:
901 case Dflag|Iflag|Rflag:
902 case Dflag|Iflag|Rflag|Wflag:
903 case Dflag|Iflag|Wflag:
904 case Dflag|Rflag:
905 case Dflag|Rflag|Wflag:
906 case Dflag|Wflag:
907 case Fflag:
908 case Fflag|Iflag:
909 case Fflag|Iflag|Rflag:
910 case Fflag|Iflag|Rflag|Wflag:
911 case Fflag|Iflag|Wflag:
912 case Fflag|Rflag:
913 case Fflag|Rflag|Wflag:
914 case Fflag|Wflag:
915 case Iflag:
916 case Iflag|Rflag:
917 case Iflag|Rflag|Wflag:
918 case Iflag|Wflag:
919 case Wflag:
920 f->dokey = dokey_dfi;
921 makemapd(f);
922 break;
926 /*
927 * random spot checks
928 */
929 if(args.nfile > 1 && args.cflag) {
930 fprint(2, "sort: -c can have at most one input file\n");
931 done("option");
933 return;
936 uchar*
937 skip(uchar *l, int n1, int n2, int bflag, int endfield)
939 int i, c, tc;
940 Rune r;
942 if(endfield && n1 < 0)
943 return 0;
945 c = *l++;
946 tc = args.tabchar;
947 if(tc) {
948 if(tc < Runeself) {
949 for(i=n1; i>0; i--) {
950 while(c != tc) {
951 if(c == '\n')
952 return 0;
953 c = *l++;
955 if(!(endfield && i == 1))
956 c = *l++;
958 } else {
959 l--;
960 l += chartorune(&r, (char*)l);
961 for(i=n1; i>0; i--) {
962 while(r != tc) {
963 if(r == '\n')
964 return 0;
965 l += chartorune(&r, (char*)l);
967 if(!(endfield && i == 1))
968 l += chartorune(&r, (char*)l);
970 c = r;
972 } else {
973 for(i=n1; i>0; i--) {
974 while(c == ' ' || c == '\t')
975 c = *l++;
976 while(c != ' ' && c != '\t') {
977 if(c == '\n')
978 return 0;
979 c = *l++;
984 if(bflag)
985 while(c == ' ' || c == '\t')
986 c = *l++;
988 l--;
989 for(i=n2; i>0; i--) {
990 c = *l;
991 if(c < Runeself) {
992 if(c == '\n')
993 return 0;
994 l++;
995 continue;
997 l += chartorune(&r, (char*)l);
999 return l;
1002 void
1003 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f)
1005 uchar *kp;
1006 int c, cl, dp;
1007 int state, nzero, exp, expsign, rflag;
1009 cl = k->klen + 3;
1010 kp = k->key + cl; /* skip place for sign, exponent[2] */
1012 nzero = 0; /* number of trailing zeros */
1013 exp = 0; /* value of the exponent */
1014 expsign = 0; /* sign of the exponent */
1015 dp = 0x4040; /* location of decimal point */
1016 rflag = f->flags&Rflag; /* xor of rflag and - sign */
1017 state = NSstart;
1019 for(;; lp++) {
1020 if(lp >= lpe)
1021 break;
1022 c = *lp;
1024 if(c == ' ' || c == '\t') {
1025 switch(state) {
1026 case NSstart:
1027 case NSsign:
1028 continue;
1030 break;
1032 if(c == '+' || c == '-') {
1033 switch(state) {
1034 case NSstart:
1035 state = NSsign;
1036 if(c == '-')
1037 rflag = !rflag;
1038 continue;
1039 case NSexp:
1040 state = NSexpsign;
1041 if(c == '-')
1042 expsign = 1;
1043 continue;
1045 break;
1047 if(c == '0') {
1048 switch(state) {
1049 case NSdigit:
1050 if(rflag)
1051 c = ~c;
1052 *kp++ = c;
1053 cl++;
1054 nzero++;
1055 dp++;
1056 state = NSdigit;
1057 continue;
1058 case NSfract:
1059 if(rflag)
1060 c = ~c;
1061 *kp++ = c;
1062 cl++;
1063 nzero++;
1064 state = NSfract;
1065 continue;
1066 case NSstart:
1067 case NSsign:
1068 case NSzero:
1069 state = NSzero;
1070 continue;
1071 case NSzerofract:
1072 case NSpoint:
1073 dp--;
1074 state = NSzerofract;
1075 continue;
1076 case NSexpsign:
1077 case NSexp:
1078 case NSexpdigit:
1079 exp = exp*10 + (c - '0');
1080 state = NSexpdigit;
1081 continue;
1083 break;
1085 if(c >= '1' && c <= '9') {
1086 switch(state) {
1087 case NSzero:
1088 case NSstart:
1089 case NSsign:
1090 case NSdigit:
1091 if(rflag)
1092 c = ~c;
1093 *kp++ = c;
1094 cl++;
1095 nzero = 0;
1096 dp++;
1097 state = NSdigit;
1098 continue;
1099 case NSzerofract:
1100 case NSpoint:
1101 case NSfract:
1102 if(rflag)
1103 c = ~c;
1104 *kp++ = c;
1105 cl++;
1106 nzero = 0;
1107 state = NSfract;
1108 continue;
1109 case NSexpsign:
1110 case NSexp:
1111 case NSexpdigit:
1112 exp = exp*10 + (c - '0');
1113 state = NSexpdigit;
1114 continue;
1116 break;
1118 if(c == '.') {
1119 switch(state) {
1120 case NSstart:
1121 case NSsign:
1122 state = NSpoint;
1123 continue;
1124 case NSzero:
1125 state = NSzerofract;
1126 continue;
1127 case NSdigit:
1128 state = NSfract;
1129 continue;
1131 break;
1133 if((f->flags & Gflag) && (c == 'e' || c == 'E')) {
1134 switch(state) {
1135 case NSdigit:
1136 case NSfract:
1137 state = NSexp;
1138 continue;
1140 break;
1142 break;
1145 switch(state) {
1147 * result is zero
1149 case NSstart:
1150 case NSsign:
1151 case NSzero:
1152 case NSzerofract:
1153 case NSpoint:
1154 kp = k->key + k->klen;
1155 k->klen += 2;
1156 kp[0] = 0x20; /* between + and - */
1157 kp[1] = 0;
1158 return;
1160 * result has exponent
1162 case NSexpsign:
1163 case NSexp:
1164 case NSexpdigit:
1165 if(expsign)
1166 exp = -exp;
1167 dp += exp;
1170 * result is fixed point number
1172 case NSdigit:
1173 case NSfract:
1174 kp -= nzero;
1175 cl -= nzero;
1176 break;
1180 * end of number
1182 c = 0;
1183 if(rflag)
1184 c = ~c;
1185 *kp = c;
1188 * sign and exponent
1190 c = 0x30;
1191 if(rflag) {
1192 c = 0x10;
1193 dp = ~dp;
1195 kp = k->key + k->klen;
1196 kp[0] = c;
1197 kp[1] = (dp >> 8);
1198 kp[2] = dp;
1199 k->klen = cl+1;
1202 void
1203 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f)
1205 uchar *kp;
1206 Rune r, place[3];
1207 int c, cl, pc;
1208 int rflag;
1210 rflag = f->flags&Rflag;
1211 pc = 0;
1213 cl = k->klen;
1214 kp = k->key + cl;
1216 for(;;) {
1218 * get the character
1220 if(lp >= lpe)
1221 break;
1222 c = *lp;
1223 if(c >= Runeself) {
1224 lp += chartorune(&r, (char*)lp);
1225 c = r;
1226 } else
1227 lp++;
1229 if(c < nelem(f->mapto)) {
1230 c = f->mapto[c];
1231 if(c == 0)
1232 continue;
1234 place[pc++] = c;
1235 if(pc < 3)
1236 continue;
1237 for(c=11; c>=0; c--)
1238 if(memcmp(month[c], place, sizeof(place)) == 0)
1239 break;
1240 c += 10;
1241 if(rflag)
1242 c = ~c;
1243 *kp++ = c;
1244 cl++;
1245 break;
1248 c = 0;
1249 if(rflag)
1250 c = ~c;
1251 *kp = c;
1252 k->klen = cl+1;
1255 void
1256 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f)
1258 uchar *kp;
1259 Rune r;
1260 int c, cl, n, rflag;
1262 cl = k->klen;
1263 kp = k->key + cl;
1264 rflag = f->flags & Rflag;
1266 for(;;) {
1268 * get the character
1270 if(lp >= lpe)
1271 break;
1272 c = *lp;
1273 if(c >= Runeself) {
1274 lp += chartorune(&r, (char*)lp);
1275 c = r;
1276 } else
1277 lp++;
1280 * do the various mappings.
1281 * the common case is handled
1282 * completely by the table.
1284 if(c != 0 && c < Runeself) {
1285 c = f->mapto[c];
1286 if(c) {
1287 *kp++ = c;
1288 cl++;
1290 continue;
1294 * for characters out of range,
1295 * the table does not do Rflag.
1296 * ignore is based on mapto[255]
1298 if(c != 0 && c < nelem(f->mapto)) {
1299 c = f->mapto[c];
1300 if(c == 0)
1301 continue;
1302 } else
1303 if(f->mapto[nelem(f->mapto)-1] == 0)
1304 continue;
1307 * put it in the key
1309 r = c;
1310 n = runetochar((char*)kp, &r);
1311 kp += n;
1312 cl += n;
1313 if(rflag)
1314 while(n > 0) {
1315 kp[-n] = ~kp[-n];
1316 n--;
1321 * end of key
1323 k->klen = cl+1;
1324 if(rflag) {
1325 *kp = ~0;
1326 return;
1328 *kp = 0;
1331 void
1332 dokey_r(Key *k, uchar *lp, uchar *lpe, Field *f)
1334 int cl, n;
1335 uchar *kp;
1337 USED(f);
1338 n = lpe - lp;
1339 if(n < 0)
1340 n = 0;
1341 cl = k->klen;
1342 kp = k->key + cl;
1343 k->klen = cl+n+1;
1345 lpe -= 3;
1346 while(lp < lpe) {
1347 kp[0] = ~lp[0];
1348 kp[1] = ~lp[1];
1349 kp[2] = ~lp[2];
1350 kp[3] = ~lp[3];
1351 kp += 4;
1352 lp += 4;
1355 lpe += 3;
1356 while(lp < lpe)
1357 *kp++ = ~*lp++;
1358 *kp = ~0;
1361 void
1362 dokey_(Key *k, uchar *lp, uchar *lpe, Field *f)
1364 int n, cl;
1365 uchar *kp;
1367 USED(f);
1368 n = lpe - lp;
1369 if(n < 0)
1370 n = 0;
1371 cl = k->klen;
1372 kp = k->key + cl;
1373 k->klen = cl+n+1;
1374 memmove(kp, lp, n);
1375 kp[n] = 0;
1378 void
1379 buildkey(Line *l)
1381 Key *k;
1382 uchar *lp, *lpe;
1383 int ll, kl, cl, i, n;
1384 Field *f;
1386 ll = l->llen - 1;
1387 kl = 0; /* allocated length */
1388 cl = 0; /* current length */
1389 k = 0;
1391 for(i=1; i<=args.nfield; i++) {
1392 f = &args.field[i];
1393 lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0);
1394 if(lp == 0)
1395 lp = l->line + ll;
1396 lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1);
1397 if(lpe == 0)
1398 lpe = l->line + ll;
1399 n = (lpe - lp) + 1;
1400 if(n <= 0)
1401 n = 1;
1402 if(cl+(n+4) > kl) {
1403 kl = cl+(n+4);
1404 k = realloc(k, sizeof(Key) +
1405 (kl-1)*sizeof(k->key[0]));
1406 if(k == 0)
1407 nomem();
1409 k->klen = cl;
1410 (*f->dokey)(k, lp, lpe, f);
1411 cl = k->klen;
1415 * global comparisons
1417 if(!(args.uflag && cl > 0)) {
1418 f = &args.field[0];
1419 if(cl+(ll+4) > kl) {
1420 kl = cl+(ll+4);
1421 k = realloc(k, sizeof(Key) +
1422 (kl-1)*sizeof(k->key[0]));
1423 if(k == 0)
1424 nomem();
1426 k->klen = cl;
1427 (*f->dokey)(k, l->line, l->line+ll, f);
1428 cl = k->klen;
1431 l->key = k;
1432 k->klen = cl;
1434 if(args.vflag) {
1435 write(2, l->line, l->llen);
1436 for(i=0; i<k->klen; i++) {
1437 fprint(2, " %.2x", k->key[i]);
1438 if(k->key[i] == 0x00 || k->key[i] == 0xff)
1439 fprint(2, "\n");
1444 void
1445 makemapm(Field *f)
1447 int i, c;
1449 for(i=0; i<nelem(f->mapto); i++) {
1450 c = 1;
1451 if(i == ' ' || i == '\t')
1452 c = 0;
1453 if(i >= 'a' && i <= 'z')
1454 c = i + ('A' - 'a');
1455 if(i >= 'A' && i <= 'Z')
1456 c = i;
1457 f->mapto[i] = c;
1458 if(args.vflag) {
1459 if((i & 15) == 0)
1460 fprint(2, " ");
1461 fprint(2, " %.2x", c);
1462 if((i & 15) == 15)
1463 fprint(2, "\n");
1468 void
1469 makemapd(Field *f)
1471 int i, j, c;
1473 for(i=0; i<nelem(f->mapto); i++) {
1474 c = i;
1475 if(f->flags & Iflag)
1476 if(c < 040 || c > 0176)
1477 c = -1;
1478 if((f->flags & Wflag) && c >= 0)
1479 if(c == ' ' || c == '\t')
1480 c = -1;
1481 if((f->flags & Dflag) && c >= 0)
1482 if(!(c == ' ' || c == '\t' ||
1483 (c >= 'a' && c <= 'z') ||
1484 (c >= 'A' && c <= 'Z') ||
1485 (c >= '0' && c <= '9'))) {
1486 for(j=0; latinmap[j]; j+=3)
1487 if(c == latinmap[j+0] ||
1488 c == latinmap[j+1])
1489 break;
1490 if(latinmap[j] == 0)
1491 c = -1;
1493 if((f->flags & Fflag) && c >= 0) {
1494 if(c >= 'a' && c <= 'z')
1495 c += 'A' - 'a';
1496 for(j=0; latinmap[j]; j+=3)
1497 if(c == latinmap[j+0] ||
1498 c == latinmap[j+1]) {
1499 c = latinmap[j+2];
1500 break;
1503 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself)
1504 c = ~c & 0xff;
1505 if(c < 0)
1506 c = 0;
1507 f->mapto[i] = c;
1508 if(args.vflag) {
1509 if((i & 15) == 0)
1510 fprint(2, " ");
1511 fprint(2, " %.2x", c);
1512 if((i & 15) == 15)
1513 fprint(2, "\n");
1518 int latinmap[] =
1520 /* lcase ucase fold */
1521 0xe0, 0xc0, 0x41, /* L'à', L'À', L'A', */
1522 0xe1, 0xc1, 0x41, /* L'á', L'Á', L'A', */
1523 0xe2, 0xc2, 0x41, /* L'â', L'Â', L'A', */
1524 0xe4, 0xc4, 0x41, /* L'ä', L'Ä', L'A', */
1525 0xe3, 0xc3, 0x41, /* L'ã', L'Ã', L'A', */
1526 0xe5, 0xc5, 0x41, /* L'å', L'Å', L'A', */
1527 0xe8, 0xc8, 0x45, /* L'è', L'È', L'E', */
1528 0xe9, 0xc9, 0x45, /* L'é', L'É', L'E', */
1529 0xea, 0xca, 0x45, /* L'ê', L'Ê', L'E', */
1530 0xeb, 0xcb, 0x45, /* L'ë', L'Ë', L'E', */
1531 0xec, 0xcc, 0x49, /* L'ì', L'Ì', L'I', */
1532 0xed, 0xcd, 0x49, /* L'í', L'Í', L'I', */
1533 0xee, 0xce, 0x49, /* L'î', L'Î', L'I', */
1534 0xef, 0xcf, 0x49, /* L'ï', L'Ï', L'I', */
1535 0xf2, 0xd2, 0x4f, /* L'ò', L'Ò', L'O', */
1536 0xf3, 0xd3, 0x4f, /* L'ó', L'Ó', L'O', */
1537 0xf4, 0xd4, 0x4f, /* L'ô', L'Ô', L'O', */
1538 0xf6, 0xd6, 0x4f, /* L'ö', L'Ö', L'O', */
1539 0xf5, 0xd5, 0x4f, /* L'õ', L'Õ', L'O', */
1540 0xf8, 0xd8, 0x4f, /* L'ø', L'Ø', L'O', */
1541 0xf9, 0xd9, 0x55, /* L'ù', L'Ù', L'U', */
1542 0xfa, 0xda, 0x55, /* L'ú', L'Ú', L'U', */
1543 0xfb, 0xdb, 0x55, /* L'û', L'Û', L'U', */
1544 0xfc, 0xdc, 0x55, /* L'ü', L'Ü', L'U', */
1545 0xe6, 0xc6, 0x41, /* L'æ', L'Æ', L'A', */
1546 0xf0, 0xd0, 0x44, /* L'ð', L'Ð', L'D', */
1547 0xf1, 0xd1, 0x4e, /* L'ñ', L'Ñ', L'N', */
1548 0xfd, 0xdd, 0x59, /* L'ý', L'Ý', L'Y', */
1549 0xe7, 0xc7, 0x43, /* L'ç', L'Ç', L'C', */
1553 Rune LJAN[] = { 'J', 'A', 'N', 0 };
1554 Rune LFEB[] = { 'F', 'E', 'B', 0 };
1555 Rune LMAR[] = { 'M', 'A', 'R', 0 };
1556 Rune LAPR[] = { 'A', 'P', 'R', 0 };
1557 Rune LMAY[] = { 'M', 'A', 'Y', 0 };
1558 Rune LJUN[] = { 'J', 'U', 'N', 0 };
1559 Rune LJUL[] = { 'J', 'U', 'L', 0 };
1560 Rune LAUG[] = { 'A', 'U', 'G', 0 };
1561 Rune LSEP[] = { 'S', 'E', 'P', 0 };
1562 Rune LOCT[] = { 'O', 'C', 'T', 0 };
1563 Rune LNOV[] = { 'N', 'O', 'V', 0 };
1564 Rune LDEC[] = { 'D', 'E', 'C', 0 };
1566 Rune* month[12] =
1568 LJAN,
1569 LFEB,
1570 LMAR,
1571 LAPR,
1572 LMAY,
1573 LJUN,
1574 LJUL,
1575 LAUG,
1576 LSEP,
1577 LOCT,
1578 LNOV,
1579 LDEC,
1582 /************** radix sort ***********/
1584 enum
1586 Threshold = 14,
1589 void rsort4(Key***, ulong, int);
1590 void bsort4(Key***, ulong, int);
1592 void
1593 sort4(void *a, ulong n)
1595 if(n > Threshold)
1596 rsort4((Key***)a, n, 0);
1597 else
1598 bsort4((Key***)a, n, 0);
1601 void
1602 rsort4(Key ***a, ulong n, int b)
1604 Key ***ea, ***t, ***u, **t1, **u1, *k;
1605 Key ***part[257];
1606 static long count[257];
1607 long clist[257+257], *cp, *cp1;
1608 int c, lowc, higc;
1611 * pass 1 over all keys,
1612 * count the number of each key[b].
1613 * find low count and high count.
1615 lowc = 256;
1616 higc = 0;
1617 ea = a+n;
1618 for(t=a; t<ea; t++) {
1619 k = **t;
1620 n = k->klen;
1621 if(b >= n) {
1622 count[256]++;
1623 continue;
1625 c = k->key[b];
1626 n = count[c]++;
1627 if(n == 0) {
1628 if(c < lowc)
1629 lowc = c;
1630 if(c > higc)
1631 higc = c;
1636 * pass 2 over all counts,
1637 * put partition pointers in part[c].
1638 * save compacted indexes and counts
1639 * in clist[].
1641 t = a;
1642 n = count[256];
1643 clist[0] = n;
1644 part[256] = t;
1645 t += n;
1647 cp1 = clist+1;
1648 cp = count+lowc;
1649 for(c=lowc; c<=higc; c++,cp++) {
1650 n = *cp;
1651 if(n) {
1652 cp1[0] = n;
1653 cp1[1] = c;
1654 cp1 += 2;
1655 part[c] = t;
1656 t += n;
1659 *cp1 = 0;
1662 * pass 3 over all counts.
1663 * chase lowest pointer in each partition
1664 * around a permutation until it comes
1665 * back and is stored where it started.
1666 * static array, count[], should be
1667 * reduced to zero entries except maybe
1668 * count[256].
1670 for(cp1=clist+1; cp1[0]; cp1+=2) {
1671 c = cp1[1];
1672 cp = count+c;
1673 while(*cp) {
1674 t1 = *part[c];
1675 for(;;) {
1676 k = *t1;
1677 n = 256;
1678 if(b < k->klen)
1679 n = k->key[b];
1680 u = part[n]++;
1681 count[n]--;
1682 u1 = *u;
1683 *u = t1;
1684 if(n == c)
1685 break;
1686 t1 = u1;
1692 * pass 4 over all partitions.
1693 * call recursively.
1695 b++;
1696 t = a + clist[0];
1697 count[256] = 0;
1698 for(cp1=clist+1; n=cp1[0]; cp1+=2) {
1699 if(n > Threshold)
1700 rsort4(t, n, b);
1701 else
1702 if(n > 1)
1703 bsort4(t, n, b);
1704 t += n;
1709 * bubble sort to pick up
1710 * the pieces.
1712 void
1713 bsort4(Key ***a, ulong n, int b)
1715 Key ***i, ***j, ***k, ***l, **t;
1716 Key *ka, *kb;
1717 int n1, n2;
1719 l = a+n;
1720 j = a;
1722 loop:
1723 i = j;
1724 j++;
1725 if(j >= l)
1726 return;
1728 ka = **i;
1729 kb = **j;
1730 n1 = ka->klen - b;
1731 n2 = kb->klen - b;
1732 if(n1 > n2)
1733 n1 = n2;
1734 if(n1 <= 0)
1735 goto loop;
1736 n2 = ka->key[b] - kb->key[b];
1737 if(n2 == 0)
1738 n2 = memcmp(ka->key+b, kb->key+b, n1);
1739 if(n2 <= 0)
1740 goto loop;
1742 for(;;) {
1743 k = i+1;
1745 t = *k;
1746 *k = *i;
1747 *i = t;
1749 if(i <= a)
1750 goto loop;
1752 i--;
1753 ka = **i;
1754 kb = *t;
1755 n1 = ka->klen - b;
1756 n2 = kb->klen - b;
1757 if(n1 > n2)
1758 n1 = n2;
1759 if(n1 <= 0)
1760 goto loop;
1761 n2 = ka->key[b] - kb->key[b];
1762 if(n2 == 0)
1763 n2 = memcmp(ka->key+b, kb->key+b, n1);
1764 if(n2 <= 0)
1765 goto loop;