Blob
1 #include <u.h>2 #include <libc.h>3 #include <bio.h>5 /*6 bugs:7 00/ff for end of file can conflict with 00/ff characters8 */10 enum11 {12 Nline = 500000, /* default max number of lines saved in memory */13 Nmerge = 10, /* max number of temporary files merged */14 Nfield = 20, /* max number of argument fields */16 Bflag = 1<<0, /* flags per field */17 B1flag = 1<<1,19 Dflag = 1<<2,20 Fflag = 1<<3,21 Gflag = 1<<4,22 Iflag = 1<<5,23 Mflag = 1<<6,24 Nflag = 1<<7,25 Rflag = 1<<8,26 Wflag = 1<<9,28 NSstart = 0, /* states for number to key decoding */29 NSsign,30 NSzero,31 NSdigit,32 NSpoint,33 NSfract,34 NSzerofract,35 NSexp,36 NSexpsign,37 NSexpdigit38 };40 typedef struct Line Line;41 typedef struct Key Key;42 typedef struct Merge Merge;43 typedef struct Field Field;45 struct Line46 {47 Key* key;48 int llen; /* always >= 1 */49 uchar line[1]; /* always ends in '\n' */50 };52 struct Merge53 {54 Key* key; /* copy of line->key so (Line*) looks like (Merge*) */55 Line* line; /* line at the head of a merged temp file */56 int fd; /* file descriptor */57 Biobuf b; /* iobuf for reading a temp file */58 };60 struct Key61 {62 int klen;63 uchar key[1];64 };66 struct Field67 {68 int beg1;69 int beg2;70 int end1;71 int end2;73 long flags;74 uchar mapto[256];76 void (*dokey)(Key*, uchar*, uchar*, Field*);77 };79 struct args80 {81 char* ofile;82 char* tname;83 Rune tabchar;84 char cflag;85 char uflag;86 char vflag;87 int nfield;88 int nfile;89 Field field[Nfield];91 Line** linep;92 long nline; /* number of lines in this temp file */93 long lineno; /* overall ordinal for -s option */94 int ntemp;95 long mline; /* max lines per file */96 } args;98 extern int latinmap[];99 extern Rune* month[12];101 void buildkey(Line*);102 void doargs(int, char*[]);103 void dofield(char*, int*, int*, int, int);104 void dofile(Biobuf*);105 void dokey_(Key*, uchar*, uchar*, Field*);106 void dokey_dfi(Key*, uchar*, uchar*, Field*);107 void dokey_gn(Key*, uchar*, uchar*, Field*);108 void dokey_m(Key*, uchar*, uchar*, Field*);109 void dokey_r(Key*, uchar*, uchar*, Field*);110 void done(char*);111 int kcmp(Key*, Key*);112 void makemapd(Field*);113 void makemapm(Field*);114 void mergefiles(int, int, Biobuf*);115 void mergeout(Biobuf*);116 void newfield(void);117 Line* newline(Biobuf*);118 void nomem(void);119 void notifyf(void*, char*);120 void printargs(void);121 void printout(Biobuf*);122 void setfield(int, int);123 uchar* skip(uchar*, int, int, int, int);124 void sort4(void*, ulong);125 char* tempfile(int);126 void tempout(void);127 void lineout(Biobuf*, Line*);129 void130 main(int argc, char *argv[])131 {132 int i, f;133 char *s;134 Biobuf bbuf;136 notify(notifyf); /**/137 doargs(argc, argv);138 if(args.vflag)139 printargs();141 for(i=1; i<argc; i++) {142 s = argv[i];143 if(s == 0)144 continue;145 if(strcmp(s, "-") == 0) {146 Binit(&bbuf, 0, OREAD);147 dofile(&bbuf);148 Bterm(&bbuf);149 continue;150 }151 f = open(s, OREAD);152 if(f < 0) {153 fprint(2, "sort: open %s: %r\n", s);154 done("open");155 }156 Binit(&bbuf, f, OREAD);157 dofile(&bbuf);158 Bterm(&bbuf);159 close(f);160 }161 if(args.nfile == 0) {162 Binit(&bbuf, 0, OREAD);163 dofile(&bbuf);164 Bterm(&bbuf);165 }166 if(args.cflag)167 done(0);168 if(args.vflag)169 fprint(2, "=========\n");171 f = 1;172 if(args.ofile) {173 f = create(args.ofile, OWRITE, 0666);174 if(f < 0) {175 fprint(2, "sort: create %s: %r\n", args.ofile);176 done("create");177 }178 }180 Binit(&bbuf, f, OWRITE);181 if(args.ntemp) {182 tempout();183 mergeout(&bbuf);184 } else {185 printout(&bbuf);186 }187 Bterm(&bbuf);188 done(0);189 }191 void192 dofile(Biobuf *b)193 {194 Line *l, *ol;195 int n;197 if(args.cflag) {198 ol = newline(b);199 if(ol == 0)200 return;201 for(;;) {202 l = newline(b);203 if(l == 0)204 break;205 n = kcmp(ol->key, l->key);206 if(n > 0 || (n == 0 && args.uflag)) {207 fprint(2, "sort: -c file not in sort\n"); /**/208 done("order");209 }210 free(ol->key);211 free(ol);212 ol = l;213 }214 return;215 }217 if(args.linep == 0) {218 args.linep = malloc(args.mline * sizeof(args.linep));219 if(args.linep == 0)220 nomem();221 }222 for(;;) {223 l = newline(b);224 if(l == 0)225 break;226 if(args.nline >= args.mline)227 tempout();228 args.linep[args.nline] = l;229 args.nline++;230 args.lineno++;231 }232 }234 void235 notifyf(void *a, char *s)236 {237 USED(a);238 if(strcmp(s, "interrupt") == 0)239 done(0);240 if(strcmp(s, "hangup") == 0)241 done(0);242 if(strcmp(s, "kill") == 0)243 done(0);244 if(strncmp(s, "sys: write on closed pipe", 25) == 0)245 done(0);246 fprint(2, "sort: note: %s\n", s);247 abort();248 }250 Line*251 newline(Biobuf *b)252 {253 Line *l;254 char *p;255 int n, c;257 p = Brdline(b, '\n');258 n = Blinelen(b);259 if(p == 0) {260 if(n == 0)261 return 0;262 l = 0;263 for(n=0;;) {264 if((n & 31) == 0) {265 l = realloc(l, sizeof(Line) +266 (n+31)*sizeof(l->line[0]));267 if(l == 0)268 nomem();269 }270 c = Bgetc(b);271 if(c < 0) {272 fprint(2, "sort: newline added\n");273 c = '\n';274 }275 l->line[n++] = c;276 if(c == '\n')277 break;278 }279 l->llen = n;280 buildkey(l);281 return l;282 }283 l = malloc(sizeof(Line) +284 (n-1)*sizeof(l->line[0]));285 if(l == 0)286 nomem();287 l->llen = n;288 memmove(l->line, p, n);289 buildkey(l);290 return l;291 }293 void294 lineout(Biobuf *b, Line *l)295 {296 int n, m;298 n = l->llen;299 m = Bwrite(b, l->line, n);300 if(n != m)301 exits("write");302 }304 void305 tempout(void)306 {307 long n;308 Line **lp, *l;309 char *tf;310 int f;311 Biobuf tb;313 sort4(args.linep, args.nline);314 tf = tempfile(args.ntemp);315 args.ntemp++;316 f = create(tf, OWRITE, 0666);317 if(f < 0) {318 fprint(2, "sort: create %s: %r\n", tf);319 done("create");320 }322 Binit(&tb, f, OWRITE);323 lp = args.linep;324 for(n=args.nline; n>0; n--) {325 l = *lp++;326 lineout(&tb, l);327 free(l->key);328 free(l);329 }330 args.nline = 0;331 Bterm(&tb);332 close(f);333 }335 void336 done(char *xs)337 {338 int i;340 for(i=0; i<args.ntemp; i++)341 remove(tempfile(i));342 exits(xs);343 }345 void346 nomem(void)347 {348 fprint(2, "sort: out of memory\n");349 done("mem");350 }352 char*353 tempfile(int n)354 {355 static char file[100];356 static uint pid;357 char *dir;359 dir = "/var/tmp";360 if(args.tname)361 dir = args.tname;362 if(strlen(dir) >= nelem(file)-20) {363 fprint(2, "temp file directory name is too long: %s\n", dir);364 done("tdir");365 }367 if(pid == 0) {368 pid = getpid();369 if(pid == 0) {370 pid = time(0);371 if(pid == 0)372 pid = 1;373 }374 }376 sprint(file, "%s/sort.%.4d.%.4d", dir, pid%10000, n);377 return file;378 }380 void381 mergeout(Biobuf *b)382 {383 int n, i, f;384 char *tf;385 Biobuf tb;387 for(i=0; i<args.ntemp; i+=n) {388 n = args.ntemp - i;389 if(n > Nmerge) {390 tf = tempfile(args.ntemp);391 args.ntemp++;392 f = create(tf, OWRITE, 0666);393 if(f < 0) {394 fprint(2, "sort: create %s: %r\n", tf);395 done("create");396 }397 Binit(&tb, f, OWRITE);399 n = Nmerge;400 mergefiles(i, n, &tb);402 Bterm(&tb);403 close(f);404 } else405 mergefiles(i, n, b);406 }407 }409 void410 mergefiles(int t, int n, Biobuf *b)411 {412 Merge *m, *mp, **mmp;413 Key *ok;414 Line *l;415 char *tf;416 int i, f, nn;418 mmp = malloc(n*sizeof(*mmp));419 mp = malloc(n*sizeof(*mp));420 if(mmp == 0 || mp == 0)421 nomem();423 nn = 0;424 m = mp;425 for(i=0; i<n; i++,m++) {426 tf = tempfile(t+i);427 f = open(tf, OREAD);428 if(f < 0) {429 fprint(2, "sort: reopen %s: %r\n", tf);430 done("open");431 }432 m->fd = f;433 Binit(&m->b, f, OREAD);434 mmp[nn] = m;436 l = newline(&m->b);437 if(l == 0)438 continue;439 nn++;440 m->line = l;441 m->key = l->key;442 }444 ok = 0;445 for(;;) {446 sort4(mmp, nn);447 m = *mmp;448 if(nn == 0)449 break;450 for(;;) {451 l = m->line;452 if(args.uflag && ok && kcmp(ok, l->key) == 0) {453 free(l->key);454 free(l);455 } else {456 lineout(b, l);457 if(ok)458 free(ok);459 ok = l->key;460 free(l);461 }463 l = newline(&m->b);464 if(l == 0) {465 nn--;466 mmp[0] = mmp[nn];467 break;468 }469 m->line = l;470 m->key = l->key;471 if(nn > 1 && kcmp(mmp[0]->key, mmp[1]->key) > 0)472 break;473 }474 }475 if(ok)476 free(ok);478 m = mp;479 for(i=0; i<n; i++,m++) {480 Bterm(&m->b);481 close(m->fd);482 }484 free(mp);485 free(mmp);486 }488 int489 kcmp(Key *ka, Key *kb)490 {491 int n, m;493 /*494 * set n to length of smaller key495 */496 n = ka->klen;497 m = kb->klen;498 if(n > m)499 n = m;500 return memcmp(ka->key, kb->key, n);501 }503 void504 printout(Biobuf *b)505 {506 long n;507 Line **lp, *l;508 Key *ok;510 sort4(args.linep, args.nline);511 lp = args.linep;512 ok = 0;513 for(n=args.nline; n>0; n--) {514 l = *lp++;515 if(args.uflag && ok && kcmp(ok, l->key) == 0)516 continue;517 lineout(b, l);518 ok = l->key;519 }520 }522 void523 setfield(int n, int c)524 {525 Field *f;527 f = &args.field[n];528 switch(c) {529 default:530 fprint(2, "sort: unknown option: field.%C\n", c);531 done("option");532 case 'b': /* skip blanks */533 f->flags |= Bflag;534 break;535 case 'd': /* directory order */536 f->flags |= Dflag;537 break;538 case 'f': /* fold case */539 f->flags |= Fflag;540 break;541 case 'g': /* floating point -n case */542 f->flags |= Gflag;543 break;544 case 'i': /* ignore non-ascii */545 f->flags |= Iflag;546 break;547 case 'M': /* month */548 f->flags |= Mflag;549 break;550 case 'n': /* numbers */551 f->flags |= Nflag;552 break;553 case 'r': /* reverse */554 f->flags |= Rflag;555 break;556 case 'w': /* ignore white */557 f->flags |= Wflag;558 break;559 }560 }562 void563 dofield(char *s, int *n1, int *n2, int off1, int off2)564 {565 int c, n;567 c = *s++;568 if(c >= '0' && c <= '9') {569 n = 0;570 while(c >= '0' && c <= '9') {571 n = n*10 + (c-'0');572 c = *s++;573 }574 n -= off1; /* posix committee: rot in hell */575 if(n < 0) {576 fprint(2, "sort: field offset must be positive\n");577 done("option");578 }579 *n1 = n;580 }581 if(c == '.') {582 c = *s++;583 if(c >= '0' && c <= '9') {584 n = 0;585 while(c >= '0' && c <= '9') {586 n = n*10 + (c-'0');587 c = *s++;588 }589 n -= off2;590 if(n < 0) {591 fprint(2, "sort: character offset must be positive\n");592 done("option");593 }594 *n2 = n;595 }596 }597 while(c != 0) {598 setfield(args.nfield, c);599 c = *s++;600 }601 }603 void604 printargs(void)605 {606 int i, n;607 Field *f;608 char *prefix;610 fprint(2, "sort");611 for(i=0; i<=args.nfield; i++) {612 f = &args.field[i];613 prefix = " -";614 if(i) {615 n = f->beg1;616 if(n >= 0)617 fprint(2, " +%d", n);618 else619 fprint(2, " +*");620 n = f->beg2;621 if(n >= 0)622 fprint(2, ".%d", n);623 else624 fprint(2, ".*");626 if(f->flags & B1flag)627 fprint(2, "b");629 n = f->end1;630 if(n >= 0)631 fprint(2, " -%d", n);632 else633 fprint(2, " -*");634 n = f->end2;635 if(n >= 0)636 fprint(2, ".%d", n);637 else638 fprint(2, ".*");639 prefix = "";640 }641 if(f->flags & Bflag)642 fprint(2, "%sb", prefix);643 if(f->flags & Dflag)644 fprint(2, "%sd", prefix);645 if(f->flags & Fflag)646 fprint(2, "%sf", prefix);647 if(f->flags & Gflag)648 fprint(2, "%sg", prefix);649 if(f->flags & Iflag)650 fprint(2, "%si", prefix);651 if(f->flags & Mflag)652 fprint(2, "%sM", prefix);653 if(f->flags & Nflag)654 fprint(2, "%sn", prefix);655 if(f->flags & Rflag)656 fprint(2, "%sr", prefix);657 if(f->flags & Wflag)658 fprint(2, "%sw", prefix);659 }660 if(args.cflag)661 fprint(2, " -c");662 if(args.uflag)663 fprint(2, " -u");664 if(args.ofile)665 fprint(2, " -o %s", args.ofile);666 if(args.mline != Nline)667 fprint(2, " -l %ld", args.mline);668 fprint(2, "\n");669 }671 void672 newfield(void)673 {674 int n;675 Field *f;677 n = args.nfield + 1;678 if(n >= Nfield) {679 fprint(2, "sort: too many fields specified\n");680 done("option");681 }682 args.nfield = n;683 f = &args.field[n];684 f->beg1 = -1;685 f->beg2 = -1;686 f->end1 = -1;687 f->end2 = -1;688 }690 void691 doargs(int argc, char *argv[])692 {693 int i, c, hadplus;694 char *s, *p, *q;695 Field *f;697 hadplus = 0;698 args.mline = Nline;699 for(i=1; i<argc; i++) {700 s = argv[i];701 c = *s++;702 if(c == '-') {703 c = *s;704 if(c == 0) /* forced end of arg marker */705 break;706 argv[i] = 0; /* clobber args processed */707 if(c == '.' || (c >= '0' && c <= '9')) {708 if(!hadplus)709 newfield();710 f = &args.field[args.nfield];711 dofield(s, &f->end1, &f->end2, 0, 0);712 hadplus = 0;713 continue;714 }716 while(c = *s++)717 switch(c) {718 case '-': /* end of options */719 i = argc;720 continue;721 case 'T': /* temp directory */722 if(*s == 0) {723 i++;724 if(i < argc) {725 args.tname = argv[i];726 argv[i] = 0;727 }728 } else729 args.tname = s;730 s = strchr(s, 0);731 break;732 case 'o': /* output file */733 if(*s == 0) {734 i++;735 if(i < argc) {736 args.ofile = argv[i];737 argv[i] = 0;738 }739 } else740 args.ofile = s;741 s = strchr(s, 0);742 break;743 case 'k': /* posix key (what were they thinking?) */744 p = 0;745 if(*s == 0) {746 i++;747 if(i < argc) {748 p = argv[i];749 argv[i] = 0;750 }751 } else752 p = s;753 s = strchr(s, 0);754 if(p == 0)755 break;757 newfield();758 q = strchr(p, ',');759 if(q)760 *q++ = 0;761 f = &args.field[args.nfield];762 dofield(p, &f->beg1, &f->beg2, 1, 1);763 if(f->flags & Bflag) {764 f->flags |= B1flag;765 f->flags &= ~Bflag;766 }767 if(q) {768 dofield(q, &f->end1, &f->end2, 1, 0);769 if(f->end2 <= 0)770 f->end1++;771 }772 hadplus = 0;773 break;774 case 't': /* tab character */775 if(*s == 0) {776 i++;777 if(i < argc) {778 chartorune(&args.tabchar, argv[i]);779 argv[i] = 0;780 }781 } else782 s += chartorune(&args.tabchar, s);783 if(args.tabchar == '\n') {784 fprint(2, "aw come on, rob\n");785 done("rob");786 }787 break;788 case 'c': /* check order */789 args.cflag = 1;790 break;791 case 'u': /* unique */792 args.uflag = 1;793 break;794 case 'v': /* debugging noise */795 args.vflag = 1;796 break;797 case 'l':798 if(*s == 0) {799 i++;800 if(i < argc) {801 args.mline = atol(argv[i]);802 argv[i] = 0;803 }804 } else805 args.mline = atol(s);806 s = strchr(s, 0);807 break;809 case 'M': /* month */810 case 'b': /* skip blanks */811 case 'd': /* directory order */812 case 'f': /* fold case */813 case 'g': /* floating numbers */814 case 'i': /* ignore non-ascii */815 case 'n': /* numbers */816 case 'r': /* reverse */817 case 'w': /* ignore white */818 if(args.nfield > 0)819 fprint(2, "sort: global field set after -k\n");820 setfield(0, c);821 break;822 case 'm':823 /* option m silently ignored but required by posix */824 break;825 default:826 fprint(2, "sort: unknown option: -%C\n", c);827 done("option");828 }829 continue;830 }831 if(c == '+') {832 argv[i] = 0; /* clobber args processed */833 c = *s;834 if(c == '.' || (c >= '0' && c <= '9')) {835 newfield();836 f = &args.field[args.nfield];837 dofield(s, &f->beg1, &f->beg2, 0, 0);838 if(f->flags & Bflag) {839 f->flags |= B1flag;840 f->flags &= ~Bflag;841 }842 hadplus = 1;843 continue;844 }845 fprint(2, "sort: unknown option: +%C\n", c);846 done("option");847 }848 args.nfile++;849 }851 for(i=0; i<=args.nfield; i++) {852 f = &args.field[i];854 /*855 * global options apply to fields that856 * specify no options857 */858 if(f->flags == 0) {859 f->flags = args.field[0].flags;860 if(args.field[0].flags & Bflag)861 f->flags |= B1flag;862 }865 /*866 * build buildkey specification867 */868 switch(f->flags & ~(Bflag|B1flag)) {869 default:870 fprint(2, "sort: illegal combination of flags: %lx\n", f->flags);871 done("option");872 case 0:873 f->dokey = dokey_;874 break;875 case Rflag:876 f->dokey = dokey_r;877 break;878 case Gflag:879 case Nflag:880 case Gflag|Nflag:881 case Gflag|Rflag:882 case Nflag|Rflag:883 case Gflag|Nflag|Rflag:884 f->dokey = dokey_gn;885 break;886 case Mflag:887 case Mflag|Rflag:888 f->dokey = dokey_m;889 makemapm(f);890 break;891 case Dflag:892 case Dflag|Fflag:893 case Dflag|Fflag|Iflag:894 case Dflag|Fflag|Iflag|Rflag:895 case Dflag|Fflag|Iflag|Rflag|Wflag:896 case Dflag|Fflag|Iflag|Wflag:897 case Dflag|Fflag|Rflag:898 case Dflag|Fflag|Rflag|Wflag:899 case Dflag|Fflag|Wflag:900 case Dflag|Iflag:901 case Dflag|Iflag|Rflag:902 case Dflag|Iflag|Rflag|Wflag:903 case Dflag|Iflag|Wflag:904 case Dflag|Rflag:905 case Dflag|Rflag|Wflag:906 case Dflag|Wflag:907 case Fflag:908 case Fflag|Iflag:909 case Fflag|Iflag|Rflag:910 case Fflag|Iflag|Rflag|Wflag:911 case Fflag|Iflag|Wflag:912 case Fflag|Rflag:913 case Fflag|Rflag|Wflag:914 case Fflag|Wflag:915 case Iflag:916 case Iflag|Rflag:917 case Iflag|Rflag|Wflag:918 case Iflag|Wflag:919 case Wflag:920 f->dokey = dokey_dfi;921 makemapd(f);922 break;923 }924 }926 /*927 * random spot checks928 */929 if(args.nfile > 1 && args.cflag) {930 fprint(2, "sort: -c can have at most one input file\n");931 done("option");932 }933 return;934 }936 uchar*937 skip(uchar *l, int n1, int n2, int bflag, int endfield)938 {939 int i, c, tc;940 Rune r;942 if(endfield && n1 < 0)943 return 0;945 c = *l++;946 tc = args.tabchar;947 if(tc) {948 if(tc < Runeself) {949 for(i=n1; i>0; i--) {950 while(c != tc) {951 if(c == '\n')952 return 0;953 c = *l++;954 }955 if(!(endfield && i == 1))956 c = *l++;957 }958 } else {959 l--;960 l += chartorune(&r, (char*)l);961 for(i=n1; i>0; i--) {962 while(r != tc) {963 if(r == '\n')964 return 0;965 l += chartorune(&r, (char*)l);966 }967 if(!(endfield && i == 1))968 l += chartorune(&r, (char*)l);969 }970 c = r;971 }972 } else {973 for(i=n1; i>0; i--) {974 while(c == ' ' || c == '\t')975 c = *l++;976 while(c != ' ' && c != '\t') {977 if(c == '\n')978 return 0;979 c = *l++;980 }981 }982 }984 if(bflag)985 while(c == ' ' || c == '\t')986 c = *l++;988 l--;989 for(i=n2; i>0; i--) {990 c = *l;991 if(c < Runeself) {992 if(c == '\n')993 return 0;994 l++;995 continue;996 }997 l += chartorune(&r, (char*)l);998 }999 return l;1000 }1002 void1003 dokey_gn(Key *k, uchar *lp, uchar *lpe, Field *f)1004 {1005 uchar *kp;1006 int c, cl, dp;1007 int state, nzero, exp, expsign, rflag;1009 cl = k->klen + 3;1010 kp = k->key + cl; /* skip place for sign, exponent[2] */1012 nzero = 0; /* number of trailing zeros */1013 exp = 0; /* value of the exponent */1014 expsign = 0; /* sign of the exponent */1015 dp = 0x4040; /* location of decimal point */1016 rflag = f->flags&Rflag; /* xor of rflag and - sign */1017 state = NSstart;1019 for(;; lp++) {1020 if(lp >= lpe)1021 break;1022 c = *lp;1024 if(c == ' ' || c == '\t') {1025 switch(state) {1026 case NSstart:1027 case NSsign:1028 continue;1029 }1030 break;1031 }1032 if(c == '+' || c == '-') {1033 switch(state) {1034 case NSstart:1035 state = NSsign;1036 if(c == '-')1037 rflag = !rflag;1038 continue;1039 case NSexp:1040 state = NSexpsign;1041 if(c == '-')1042 expsign = 1;1043 continue;1044 }1045 break;1046 }1047 if(c == '0') {1048 switch(state) {1049 case NSdigit:1050 if(rflag)1051 c = ~c;1052 *kp++ = c;1053 cl++;1054 nzero++;1055 dp++;1056 state = NSdigit;1057 continue;1058 case NSfract:1059 if(rflag)1060 c = ~c;1061 *kp++ = c;1062 cl++;1063 nzero++;1064 state = NSfract;1065 continue;1066 case NSstart:1067 case NSsign:1068 case NSzero:1069 state = NSzero;1070 continue;1071 case NSzerofract:1072 case NSpoint:1073 dp--;1074 state = NSzerofract;1075 continue;1076 case NSexpsign:1077 case NSexp:1078 case NSexpdigit:1079 exp = exp*10 + (c - '0');1080 state = NSexpdigit;1081 continue;1082 }1083 break;1084 }1085 if(c >= '1' && c <= '9') {1086 switch(state) {1087 case NSzero:1088 case NSstart:1089 case NSsign:1090 case NSdigit:1091 if(rflag)1092 c = ~c;1093 *kp++ = c;1094 cl++;1095 nzero = 0;1096 dp++;1097 state = NSdigit;1098 continue;1099 case NSzerofract:1100 case NSpoint:1101 case NSfract:1102 if(rflag)1103 c = ~c;1104 *kp++ = c;1105 cl++;1106 nzero = 0;1107 state = NSfract;1108 continue;1109 case NSexpsign:1110 case NSexp:1111 case NSexpdigit:1112 exp = exp*10 + (c - '0');1113 state = NSexpdigit;1114 continue;1115 }1116 break;1117 }1118 if(c == '.') {1119 switch(state) {1120 case NSstart:1121 case NSsign:1122 state = NSpoint;1123 continue;1124 case NSzero:1125 state = NSzerofract;1126 continue;1127 case NSdigit:1128 state = NSfract;1129 continue;1130 }1131 break;1132 }1133 if((f->flags & Gflag) && (c == 'e' || c == 'E')) {1134 switch(state) {1135 case NSdigit:1136 case NSfract:1137 state = NSexp;1138 continue;1139 }1140 break;1141 }1142 break;1143 }1145 switch(state) {1146 /*1147 * result is zero1148 */1149 case NSstart:1150 case NSsign:1151 case NSzero:1152 case NSzerofract:1153 case NSpoint:1154 kp = k->key + k->klen;1155 k->klen += 2;1156 kp[0] = 0x20; /* between + and - */1157 kp[1] = 0;1158 return;1159 /*1160 * result has exponent1161 */1162 case NSexpsign:1163 case NSexp:1164 case NSexpdigit:1165 if(expsign)1166 exp = -exp;1167 dp += exp;1169 /*1170 * result is fixed point number1171 */1172 case NSdigit:1173 case NSfract:1174 kp -= nzero;1175 cl -= nzero;1176 break;1177 }1179 /*1180 * end of number1181 */1182 c = 0;1183 if(rflag)1184 c = ~c;1185 *kp = c;1187 /*1188 * sign and exponent1189 */1190 c = 0x30;1191 if(rflag) {1192 c = 0x10;1193 dp = ~dp;1194 }1195 kp = k->key + k->klen;1196 kp[0] = c;1197 kp[1] = (dp >> 8);1198 kp[2] = dp;1199 k->klen = cl+1;1200 }1202 void1203 dokey_m(Key *k, uchar *lp, uchar *lpe, Field *f)1204 {1205 uchar *kp;1206 Rune r, place[3];1207 int c, cl, pc;1208 int rflag;1210 rflag = f->flags&Rflag;1211 pc = 0;1213 cl = k->klen;1214 kp = k->key + cl;1216 for(;;) {1217 /*1218 * get the character1219 */1220 if(lp >= lpe)1221 break;1222 c = *lp;1223 if(c >= Runeself) {1224 lp += chartorune(&r, (char*)lp);1225 c = r;1226 } else1227 lp++;1229 if(c < nelem(f->mapto)) {1230 c = f->mapto[c];1231 if(c == 0)1232 continue;1233 }1234 place[pc++] = c;1235 if(pc < 3)1236 continue;1237 for(c=11; c>=0; c--)1238 if(memcmp(month[c], place, sizeof(place)) == 0)1239 break;1240 c += 10;1241 if(rflag)1242 c = ~c;1243 *kp++ = c;1244 cl++;1245 break;1246 }1248 c = 0;1249 if(rflag)1250 c = ~c;1251 *kp = c;1252 k->klen = cl+1;1253 }1255 void1256 dokey_dfi(Key *k, uchar *lp, uchar *lpe, Field *f)1257 {1258 uchar *kp;1259 Rune r;1260 int c, cl, n, rflag;1262 cl = k->klen;1263 kp = k->key + cl;1264 rflag = f->flags & Rflag;1266 for(;;) {1267 /*1268 * get the character1269 */1270 if(lp >= lpe)1271 break;1272 c = *lp;1273 if(c >= Runeself) {1274 lp += chartorune(&r, (char*)lp);1275 c = r;1276 } else1277 lp++;1279 /*1280 * do the various mappings.1281 * the common case is handled1282 * completely by the table.1283 */1284 if(c != 0 && c < Runeself) {1285 c = f->mapto[c];1286 if(c) {1287 *kp++ = c;1288 cl++;1289 }1290 continue;1291 }1293 /*1294 * for characters out of range,1295 * the table does not do Rflag.1296 * ignore is based on mapto[255]1297 */1298 if(c != 0 && c < nelem(f->mapto)) {1299 c = f->mapto[c];1300 if(c == 0)1301 continue;1302 } else1303 if(f->mapto[nelem(f->mapto)-1] == 0)1304 continue;1306 /*1307 * put it in the key1308 */1309 r = c;1310 n = runetochar((char*)kp, &r);1311 kp += n;1312 cl += n;1313 if(rflag)1314 while(n > 0) {1315 kp[-n] = ~kp[-n];1316 n--;1317 }1318 }1320 /*1321 * end of key1322 */1323 k->klen = cl+1;1324 if(rflag) {1325 *kp = ~0;1326 return;1327 }1328 *kp = 0;1329 }1331 void1332 dokey_r(Key *k, uchar *lp, uchar *lpe, Field *f)1333 {1334 int cl, n;1335 uchar *kp;1337 USED(f);1338 n = lpe - lp;1339 if(n < 0)1340 n = 0;1341 cl = k->klen;1342 kp = k->key + cl;1343 k->klen = cl+n+1;1345 lpe -= 3;1346 while(lp < lpe) {1347 kp[0] = ~lp[0];1348 kp[1] = ~lp[1];1349 kp[2] = ~lp[2];1350 kp[3] = ~lp[3];1351 kp += 4;1352 lp += 4;1353 }1355 lpe += 3;1356 while(lp < lpe)1357 *kp++ = ~*lp++;1358 *kp = ~0;1359 }1361 void1362 dokey_(Key *k, uchar *lp, uchar *lpe, Field *f)1363 {1364 int n, cl;1365 uchar *kp;1367 USED(f);1368 n = lpe - lp;1369 if(n < 0)1370 n = 0;1371 cl = k->klen;1372 kp = k->key + cl;1373 k->klen = cl+n+1;1374 memmove(kp, lp, n);1375 kp[n] = 0;1376 }1378 void1379 buildkey(Line *l)1380 {1381 Key *k;1382 uchar *lp, *lpe;1383 int ll, kl, cl, i, n;1384 Field *f;1386 ll = l->llen - 1;1387 kl = 0; /* allocated length */1388 cl = 0; /* current length */1389 k = 0;1391 for(i=1; i<=args.nfield; i++) {1392 f = &args.field[i];1393 lp = skip(l->line, f->beg1, f->beg2, f->flags&B1flag, 0);1394 if(lp == 0)1395 lp = l->line + ll;1396 lpe = skip(l->line, f->end1, f->end2, f->flags&Bflag, 1);1397 if(lpe == 0)1398 lpe = l->line + ll;1399 n = (lpe - lp) + 1;1400 if(n <= 0)1401 n = 1;1402 if(cl+(n+4) > kl) {1403 kl = cl+(n+4);1404 k = realloc(k, sizeof(Key) +1405 (kl-1)*sizeof(k->key[0]));1406 if(k == 0)1407 nomem();1408 }1409 k->klen = cl;1410 (*f->dokey)(k, lp, lpe, f);1411 cl = k->klen;1412 }1414 /*1415 * global comparisons1416 */1417 if(!(args.uflag && cl > 0)) {1418 f = &args.field[0];1419 if(cl+(ll+4) > kl) {1420 kl = cl+(ll+4);1421 k = realloc(k, sizeof(Key) +1422 (kl-1)*sizeof(k->key[0]));1423 if(k == 0)1424 nomem();1425 }1426 k->klen = cl;1427 (*f->dokey)(k, l->line, l->line+ll, f);1428 cl = k->klen;1429 }1431 l->key = k;1432 k->klen = cl;1434 if(args.vflag) {1435 write(2, l->line, l->llen);1436 for(i=0; i<k->klen; i++) {1437 fprint(2, " %.2x", k->key[i]);1438 if(k->key[i] == 0x00 || k->key[i] == 0xff)1439 fprint(2, "\n");1440 }1441 }1442 }1444 void1445 makemapm(Field *f)1446 {1447 int i, c;1449 for(i=0; i<nelem(f->mapto); i++) {1450 c = 1;1451 if(i == ' ' || i == '\t')1452 c = 0;1453 if(i >= 'a' && i <= 'z')1454 c = i + ('A' - 'a');1455 if(i >= 'A' && i <= 'Z')1456 c = i;1457 f->mapto[i] = c;1458 if(args.vflag) {1459 if((i & 15) == 0)1460 fprint(2, " ");1461 fprint(2, " %.2x", c);1462 if((i & 15) == 15)1463 fprint(2, "\n");1464 }1465 }1466 }1468 void1469 makemapd(Field *f)1470 {1471 int i, j, c;1473 for(i=0; i<nelem(f->mapto); i++) {1474 c = i;1475 if(f->flags & Iflag)1476 if(c < 040 || c > 0176)1477 c = -1;1478 if((f->flags & Wflag) && c >= 0)1479 if(c == ' ' || c == '\t')1480 c = -1;1481 if((f->flags & Dflag) && c >= 0)1482 if(!(c == ' ' || c == '\t' ||1483 (c >= 'a' && c <= 'z') ||1484 (c >= 'A' && c <= 'Z') ||1485 (c >= '0' && c <= '9'))) {1486 for(j=0; latinmap[j]; j+=3)1487 if(c == latinmap[j+0] ||1488 c == latinmap[j+1])1489 break;1490 if(latinmap[j] == 0)1491 c = -1;1492 }1493 if((f->flags & Fflag) && c >= 0) {1494 if(c >= 'a' && c <= 'z')1495 c += 'A' - 'a';1496 for(j=0; latinmap[j]; j+=3)1497 if(c == latinmap[j+0] ||1498 c == latinmap[j+1]) {1499 c = latinmap[j+2];1500 break;1501 }1502 }1503 if((f->flags & Rflag) && c >= 0 && i > 0 && i < Runeself)1504 c = ~c & 0xff;1505 if(c < 0)1506 c = 0;1507 f->mapto[i] = c;1508 if(args.vflag) {1509 if((i & 15) == 0)1510 fprint(2, " ");1511 fprint(2, " %.2x", c);1512 if((i & 15) == 15)1513 fprint(2, "\n");1514 }1515 }1516 }1518 int latinmap[] =1519 {1520 /* lcase ucase fold */1521 0xe0, 0xc0, 0x41, /* L'à', L'À', L'A', */1522 0xe1, 0xc1, 0x41, /* L'á', L'Á', L'A', */1523 0xe2, 0xc2, 0x41, /* L'â', L'Â', L'A', */1524 0xe4, 0xc4, 0x41, /* L'ä', L'Ä', L'A', */1525 0xe3, 0xc3, 0x41, /* L'ã', L'Ã', L'A', */1526 0xe5, 0xc5, 0x41, /* L'å', L'Å', L'A', */1527 0xe8, 0xc8, 0x45, /* L'è', L'È', L'E', */1528 0xe9, 0xc9, 0x45, /* L'é', L'É', L'E', */1529 0xea, 0xca, 0x45, /* L'ê', L'Ê', L'E', */1530 0xeb, 0xcb, 0x45, /* L'ë', L'Ë', L'E', */1531 0xec, 0xcc, 0x49, /* L'ì', L'Ì', L'I', */1532 0xed, 0xcd, 0x49, /* L'í', L'Í', L'I', */1533 0xee, 0xce, 0x49, /* L'î', L'Î', L'I', */1534 0xef, 0xcf, 0x49, /* L'ï', L'Ï', L'I', */1535 0xf2, 0xd2, 0x4f, /* L'ò', L'Ò', L'O', */1536 0xf3, 0xd3, 0x4f, /* L'ó', L'Ó', L'O', */1537 0xf4, 0xd4, 0x4f, /* L'ô', L'Ô', L'O', */1538 0xf6, 0xd6, 0x4f, /* L'ö', L'Ö', L'O', */1539 0xf5, 0xd5, 0x4f, /* L'õ', L'Õ', L'O', */1540 0xf8, 0xd8, 0x4f, /* L'ø', L'Ø', L'O', */1541 0xf9, 0xd9, 0x55, /* L'ù', L'Ù', L'U', */1542 0xfa, 0xda, 0x55, /* L'ú', L'Ú', L'U', */1543 0xfb, 0xdb, 0x55, /* L'û', L'Û', L'U', */1544 0xfc, 0xdc, 0x55, /* L'ü', L'Ü', L'U', */1545 0xe6, 0xc6, 0x41, /* L'æ', L'Æ', L'A', */1546 0xf0, 0xd0, 0x44, /* L'ð', L'Ð', L'D', */1547 0xf1, 0xd1, 0x4e, /* L'ñ', L'Ñ', L'N', */1548 0xfd, 0xdd, 0x59, /* L'ý', L'Ý', L'Y', */1549 0xe7, 0xc7, 0x43, /* L'ç', L'Ç', L'C', */1550 0,1551 };1553 Rune LJAN[] = { 'J', 'A', 'N', 0 };1554 Rune LFEB[] = { 'F', 'E', 'B', 0 };1555 Rune LMAR[] = { 'M', 'A', 'R', 0 };1556 Rune LAPR[] = { 'A', 'P', 'R', 0 };1557 Rune LMAY[] = { 'M', 'A', 'Y', 0 };1558 Rune LJUN[] = { 'J', 'U', 'N', 0 };1559 Rune LJUL[] = { 'J', 'U', 'L', 0 };1560 Rune LAUG[] = { 'A', 'U', 'G', 0 };1561 Rune LSEP[] = { 'S', 'E', 'P', 0 };1562 Rune LOCT[] = { 'O', 'C', 'T', 0 };1563 Rune LNOV[] = { 'N', 'O', 'V', 0 };1564 Rune LDEC[] = { 'D', 'E', 'C', 0 };1566 Rune* month[12] =1567 {1568 LJAN,1569 LFEB,1570 LMAR,1571 LAPR,1572 LMAY,1573 LJUN,1574 LJUL,1575 LAUG,1576 LSEP,1577 LOCT,1578 LNOV,1579 LDEC,1580 };1582 /************** radix sort ***********/1584 enum1585 {1586 Threshold = 141587 };1589 void rsort4(Key***, ulong, int);1590 void bsort4(Key***, ulong, int);1592 void1593 sort4(void *a, ulong n)1594 {1595 if(n > Threshold)1596 rsort4((Key***)a, n, 0);1597 else1598 bsort4((Key***)a, n, 0);1599 }1601 void1602 rsort4(Key ***a, ulong n, int b)1603 {1604 Key ***ea, ***t, ***u, **t1, **u1, *k;1605 Key ***part[257];1606 static long count[257];1607 long clist[257+257], *cp, *cp1;1608 int c, lowc, higc;1610 /*1611 * pass 1 over all keys,1612 * count the number of each key[b].1613 * find low count and high count.1614 */1615 lowc = 256;1616 higc = 0;1617 ea = a+n;1618 for(t=a; t<ea; t++) {1619 k = **t;1620 n = k->klen;1621 if(b >= n) {1622 count[256]++;1623 continue;1624 }1625 c = k->key[b];1626 n = count[c]++;1627 if(n == 0) {1628 if(c < lowc)1629 lowc = c;1630 if(c > higc)1631 higc = c;1632 }1633 }1635 /*1636 * pass 2 over all counts,1637 * put partition pointers in part[c].1638 * save compacted indexes and counts1639 * in clist[].1640 */1641 t = a;1642 n = count[256];1643 clist[0] = n;1644 part[256] = t;1645 t += n;1647 cp1 = clist+1;1648 cp = count+lowc;1649 for(c=lowc; c<=higc; c++,cp++) {1650 n = *cp;1651 if(n) {1652 cp1[0] = n;1653 cp1[1] = c;1654 cp1 += 2;1655 part[c] = t;1656 t += n;1657 }1658 }1659 *cp1 = 0;1661 /*1662 * pass 3 over all counts.1663 * chase lowest pointer in each partition1664 * around a permutation until it comes1665 * back and is stored where it started.1666 * static array, count[], should be1667 * reduced to zero entries except maybe1668 * count[256].1669 */1670 for(cp1=clist+1; cp1[0]; cp1+=2) {1671 c = cp1[1];1672 cp = count+c;1673 while(*cp) {1674 t1 = *part[c];1675 for(;;) {1676 k = *t1;1677 n = 256;1678 if(b < k->klen)1679 n = k->key[b];1680 u = part[n]++;1681 count[n]--;1682 u1 = *u;1683 *u = t1;1684 if(n == c)1685 break;1686 t1 = u1;1687 }1688 }1689 }1691 /*1692 * pass 4 over all partitions.1693 * call recursively.1694 */1695 b++;1696 t = a + clist[0];1697 count[256] = 0;1698 for(cp1=clist+1; n=cp1[0]; cp1+=2) {1699 if(n > Threshold)1700 rsort4(t, n, b);1701 else1702 if(n > 1)1703 bsort4(t, n, b);1704 t += n;1705 }1706 }1708 /*1709 * bubble sort to pick up1710 * the pieces.1711 */1712 void1713 bsort4(Key ***a, ulong n, int b)1714 {1715 Key ***i, ***j, ***k, ***l, **t;1716 Key *ka, *kb;1717 int n1, n2;1719 l = a+n;1720 j = a;1722 loop:1723 i = j;1724 j++;1725 if(j >= l)1726 return;1728 ka = **i;1729 kb = **j;1730 n1 = ka->klen - b;1731 n2 = kb->klen - b;1732 if(n1 > n2)1733 n1 = n2;1734 if(n1 <= 0)1735 goto loop;1736 n2 = ka->key[b] - kb->key[b];1737 if(n2 == 0)1738 n2 = memcmp(ka->key+b, kb->key+b, n1);1739 if(n2 <= 0)1740 goto loop;1742 for(;;) {1743 k = i+1;1745 t = *k;1746 *k = *i;1747 *i = t;1749 if(i <= a)1750 goto loop;1752 i--;1753 ka = **i;1754 kb = *t;1755 n1 = ka->klen - b;1756 n2 = kb->klen - b;1757 if(n1 > n2)1758 n1 = n2;1759 if(n1 <= 0)1760 goto loop;1761 n2 = ka->key[b] - kb->key[b];1762 if(n2 == 0)1763 n2 = memcmp(ka->key+b, kb->key+b, n1);1764 if(n2 <= 0)1765 goto loop;1766 }1767 }