Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include "code.h"
/*
 * Character-class wrappers.  The argument is masked to 0..255 before it
 * reaches <ctype.h> (or voweltab), so a possibly negative signed char is
 * never used as a table index.
 */
#define	ISUPPER(c)	isupper((c)&0xff)
#define	ISLOWER(c)	islower((c)&0xff)
#define	ISALPHA(c)	isalpha((c)&0xff)
#define	ISDIGIT(c)	isdigit((c)&0xff)
#define	ISVOWEL(c)	voweltab[(c)&0xff]

/* lower-case an upper-case letter; anything else is returned unchanged */
#define	Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))

/* pack two characters into one integer, usable as a switch label */
#define	pair(a,b)	(((a)<<8) | (b))

/* derivation levels added by each trysuff call, and size limit of deriv[] */
#define	DLEV		2
#define	DSIZ		40

/* bit set describing a dictionary entry or an affix requirement */
typedef	long	Bits;

/* the bits of h selected by mask f */
#define	Set(h, f)	((long)(h) & (f))
21 Bits nop(char*, char*, char*, int, int);
22 Bits strip(char*, char*, char*, int, int);
23 Bits ize(char*, char*, char*, int, int);
24 Bits i_to_y(char*, char*, char*, int, int);
25 Bits ily(char*, char*, char*, int, int);
26 Bits subst(char*, char*, char*, int, int);
27 Bits CCe(char*, char*, char*, int, int);
28 Bits tion(char*, char*, char*, int, int);
29 Bits an(char*, char*, char*, int, int);
30 Bits s(char*, char*, char*, int, int);
31 Bits es(char*, char*, char*, int, int);
32 Bits bility(char*, char*, char*, int, int);
33 Bits y_to_e(char*, char*, char*, int, int);
34 Bits VCe(char*, char*, char*, int, int);
36 Bits trypref(char*, char*, int, int);
37 Bits tryword(char*, char*, int, int);
38 Bits trysuff(char*, int, int);
39 Bits dict(char*, char*);
40 void typeprint(Bits);
41 void pcomma(char*);
43 void ise(void);
44 int ordinal(void);
45 char* skipv(char*);
46 int inun(char*, Bits);
47 char* ztos(char*);
48 void readdict(char*);
/*
 * One prefix-table entry.  A table is an array of these ended by an
 * entry whose s is 0.
 */
typedef struct Ptab Ptab;
struct Ptab
{
	char	*s;	/* the prefix text */
	int	flag;	/* 0, or IN (trypref then consults inun) */
};
57 typedef struct Suftab Suftab;
58 struct Suftab
59 {
60 char *suf;
61 Bits (*p1)(char*, char*, char*, int, int);
62 int n1;
63 char *d1;
64 char *a1;
65 int flag;
66 int affixable;
67 Bits (*p2)(char*, char*, char*, int, int);
68 int n2;
69 char *d2;
70 char *a2;
71 };
73 Suftab staba[] = {
74 {"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
75 0
76 };
78 Suftab stabc[] =
79 {
80 {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
81 {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
82 {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
83 {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
84 {"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
85 {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
86 {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
87 {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
88 {"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
89 0
90 };
91 Suftab stabd[] =
92 {
93 {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
94 {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
95 0
96 };
97 Suftab stabe[] =
98 {
99 /*
100 * V_affix for comment ->commence->commentment??
101 */
102 {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
103 {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
104 {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
105 {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
106 {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
107 {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
108 {"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
110 };
111 Suftab stabg[] =
113 {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
114 {"gnikam",strip,6,"","+making",NOUN,NOUN},
115 {"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
116 {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
118 };
119 Suftab stabl[] =
121 {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
122 {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
123 {"latnem",strip,2,"","+al",N_AFFIX,ADJ},
124 {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
125 {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
127 };
128 Suftab stabm[] =
130 /* congregational + ism */
131 {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
132 {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
134 };
135 Suftab stabn[] =
137 {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
138 {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
139 {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
140 {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
141 {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
142 {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
143 {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
144 {"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
145 {"nem",strip,3,"","+man",MAN,PROP_COLLECT},
146 {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
148 };
149 Suftab stabp[] =
151 {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
153 };
154 Suftab stabr[] =
156 {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
157 {"reyhparg",nop,0,"","",0,NOUN},
158 {"reyl",nop,0,"","",0,NOUN},
159 {"rekam",strip,5,"","+maker",NOUN,NOUN},
160 {"repeek",strip,6,"","+keeper",NOUN,NOUN},
161 {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"},
162 {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
163 {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
164 {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
166 };
167 Suftab stabs[] =
169 {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
170 {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
171 {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"},
172 {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
173 {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
175 };
176 Suftab stabt[] =
178 {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
179 {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
180 {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
181 {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
183 };
184 Suftab staby[] =
186 {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
187 {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
188 {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
189 {"ytisuo",nop,0,"","",NOUN},
190 {"ytilb",nop,0,"","",0,NOUN},
191 {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
192 {"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
193 {"ylc",nop,0,"","",0},
194 {"ylelb",nop,0,"","",0},
195 {"ylelp",nop,0,"","",0},
196 {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
197 {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
198 {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
200 };
201 Suftab stabz[] =
204 };
205 Suftab* suftab[] =
207 staba,
208 stabz,
209 stabc,
210 stabd,
211 stabe,
212 stabz,
213 stabg,
214 stabz,
215 stabz,
216 stabz,
217 stabz,
218 stabl,
219 stabm,
220 stabn,
221 stabz,
222 stabp,
223 stabz,
224 stabr,
225 stabs,
226 stabt,
227 stabz,
228 stabz,
229 stabz,
230 stabz,
231 staby,
232 stabz
233 };
235 Ptab ptaba[] =
237 "anti", 0,
238 "auto", 0,
240 };
241 Ptab ptabb[] =
243 "bio", 0,
245 };
246 Ptab ptabc[] =
248 "counter", 0,
250 };
251 Ptab ptabd[] =
253 "dis", 0,
255 };
256 Ptab ptabe[] =
258 "electro", 0,
260 };
261 Ptab ptabf[] =
263 "femto", 0,
265 };
266 Ptab ptabg[] =
268 "geo", 0,
269 "giga", 0,
271 };
272 Ptab ptabh[] =
274 "hyper", 0,
276 };
277 Ptab ptabi[] =
279 "immuno", 0,
280 "im", IN,
281 "intra", 0,
282 "inter", 0,
283 "in", IN,
284 "ir", IN,
285 "iso", 0,
287 };
288 Ptab ptabj[] =
291 };
292 Ptab ptabk[] =
294 "kilo", 0,
296 };
297 Ptab ptabl[] =
300 };
301 Ptab ptabm[] =
303 "magneto", 0,
304 "mega", 0,
305 "meta", 0,
306 "micro", 0,
307 "mid", 0,
308 "milli", 0,
309 "mini", 0,
310 "mis", 0,
311 "mono", 0,
312 "multi", 0,
314 };
315 Ptab ptabn[] =
317 "nano", 0,
318 "neuro", 0,
319 "non", 0,
321 };
322 Ptab ptabo[] =
324 "out", 0,
325 "over", 0,
327 };
328 Ptab ptabp[] =
330 "para", 0,
331 "photo", 0,
332 "pico", 0,
333 "poly", 0,
334 "pre", 0,
335 "pseudo", 0,
336 "psycho", 0,
338 };
339 Ptab ptabq[] =
341 "quasi", 0,
343 };
344 Ptab ptabr[] =
346 "radio", 0,
347 "re", 0,
349 };
350 Ptab ptabs[] =
352 "semi", 0,
353 "stereo", 0,
354 "sub", 0,
355 "super", 0,
357 };
358 Ptab ptabt[] =
360 "tele", 0,
361 "tera", 0,
362 "thermo", 0,
364 };
365 Ptab ptabu[] =
367 "ultra", 0,
368 "under", 0, /*must precede un*/
369 "un", IN,
371 };
372 Ptab ptabv[] =
375 };
376 Ptab ptabw[] =
379 };
380 Ptab ptabx[] =
383 };
384 Ptab ptaby[] =
387 };
388 Ptab ptabz[] =
391 };
393 Ptab* preftab[] =
395 ptaba,
396 ptabb,
397 ptabc,
398 ptabd,
399 ptabe,
400 ptabf,
401 ptabg,
402 ptabh,
403 ptabi,
404 ptabj,
405 ptabk,
406 ptabl,
407 ptabm,
408 ptabn,
409 ptabo,
410 ptabp,
411 ptabq,
412 ptabr,
413 ptabs,
414 ptabt,
415 ptabu,
416 ptabv,
417 ptabw,
418 ptabx,
419 ptaby,
420 ptabz
421 };
/*
 * One step of a derivation, kept in deriv[] indexed by level.
 * type is NONE, SUFF, or a multiple of PREF: trypref adds PREF for
 * each prefix it strips, and tryword divides by PREF to count them.
 */
typedef struct {
	char	*mesg;				/* affix message for -v output */
	enum { NONE, SUFF, PREF}	type;
} Deriv;
428 int aflag;
429 int cflag;
430 int fflag;
431 int vflag;
432 int xflag;
433 int nflag;
434 char word[500];
435 char* original;
436 Deriv emptyderiv;
437 Deriv deriv[DSIZ+3];
438 char affix[DSIZ*10]; /* 10 is longest affix message */
439 int prefcount;
440 int suffcount;
441 char* acmeid;
442 char space[300000]; /* must be as large as "words"+"space" in pcode run */
443 Bits encode[2048]; /* must be as long as "codes" in pcode run */
444 int nencode;
445 char voweltab[256];
446 char* spacep[128*128+1]; /* pointer to words starting with 'xx' */
447 Biobuf bin;
448 Biobuf bout;
450 char* codefile = "#9/lib/amspell";
451 char* brfile = "#9/lib/brspell";
452 char* Usage = "usage";
454 void
455 main(int argc, char *argv[])
457 char *ep, *cp;
458 char *dp;
459 int j, i, c;
460 int low;
461 Bits h;
463 codefile = unsharp(codefile);
464 brfile = unsharp(brfile);
466 Binit(&bin, 0, OREAD);
467 Binit(&bout, 1, OWRITE);
468 for(i=0; c = "aeiouyAEIOUY"[i]; i++)
469 voweltab[c] = 1;
470 while(argc > 1) {
471 if(argv[1][0] != '-')
472 break;
473 for(i=1; c = argv[1][i]; i++)
474 switch(c) {
475 default:
476 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
477 exits(Usage);
479 case 'a':
480 aflag++;
481 continue;
483 case 'b':
484 ise();
485 if(!fflag)
486 codefile = brfile;
487 continue;
489 case 'C': /* for "correct" */
490 vflag++;
491 case 'c': /* for ocr */
492 cflag++;
493 continue;
495 case 'v':
496 vflag++;
497 continue;
499 case 'x':
500 xflag++;
501 continue;
503 case 'f':
504 if(argc <= 2) {
505 fprint(2, "spell: -f requires another argument\n");
506 exits(Usage);
508 argv++;
509 argc--;
510 codefile = argv[1];
511 fflag++;
512 goto brk;
514 brk:
515 argv++;
516 argc--;
518 readdict(codefile);
519 if(argc > 1) {
520 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
521 exits(Usage);
523 if(aflag)
524 cflag = vflag = 0;
526 for(;;) {
527 affix[0] = 0;
528 original = Brdline(&bin, '\n');
529 if(original == 0)
530 exits(0);
531 original[Blinelen(&bin)-1] = 0;
532 low = 0;
534 if(aflag) {
535 acmeid = original;
536 while(*original != ':')
537 if(*original++ == 0)
538 exits(0);
539 while(*++original != ':')
540 if(*original == 0)
541 exits(0);
542 *original++ = 0;
544 for(ep=word,dp=original; j = *dp; ep++,dp++) {
545 if(ISLOWER(j))
546 low++;
547 if(ep >= word+sizeof(word)-1)
548 break;
549 *ep = j;
551 *ep = 0;
553 if(ISDIGIT(word[0]) && ordinal())
554 continue;
556 h = 0;
557 if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
558 for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
559 *dp = Tolower(*cp);
560 if(!h)
561 for(;;) { /* at most twice */
562 if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
563 break;
564 if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
565 break;
566 if(!ISUPPER(word[0]))
567 break;
568 cp = original;
569 dp = word;
570 while(*dp = *cp++) {
571 if(!low)
572 *dp = Tolower(*dp);
573 dp++;
575 word[0] = Tolower(word[0]);
578 if(cflag) {
579 if(!h || Set(h,STOP))
580 print("-");
581 else if(!vflag)
582 print("+");
583 else
584 print("%c",'0' + (suffcount>0) +
585 (prefcount>4? 8: 2*prefcount));
586 } else if(!h || Set(h,STOP)) {
587 if(aflag)
588 Bprint(&bout, "%s:%s\n", acmeid, original);
589 else
590 Bprint(&bout, "%s\n", original);
591 } else if(affix[0] != 0 && affix[0] != '.')
592 print("%s\t%s\n", affix, original);
596 /* strip exactly one suffix and do
597 * indicated routine(s), which may recursively
598 * strip suffixes
599 */
600 Bits
601 trysuff(char* ep, int lev, int flag)
603 Suftab *t;
604 char *cp, *sp;
605 Bits h = 0;
606 int initchar = ep[-1];
608 flag &= ~MONO;
609 lev += DLEV;
610 if(lev < DSIZ) {
611 deriv[lev] = emptyderiv;
612 deriv[lev-1] = emptyderiv;
614 if(!ISLOWER(initchar))
615 return h;
616 for(t=suftab[initchar-'a']; sp=t->suf; t++) {
617 cp = ep;
618 while(*sp)
619 if(*--cp != *sp++)
620 goto next;
621 for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
623 if(sp < word)
624 continue;
625 if(!(t->affixable & flag))
626 return 0;
627 h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
628 if(!h && t->p2!=0) {
629 if(lev < DSIZ) {
630 deriv[lev] = emptyderiv;
631 deriv[lev+1] = emptyderiv;
633 h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
635 break;
636 next:;
638 return h;
641 Bits
642 nop(char* ep, char* d, char* a, int lev, int flag)
644 USED(ep);
645 USED(d);
646 USED(a);
647 USED(lev);
648 USED(flag);
649 return 0;
652 Bits
653 cstrip(char* ep, char* d, char* a, int lev, int flag)
655 int temp = ep[0];
657 if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
658 switch(pair(ep[-1],ep[0])) {
659 case pair('a', 'a'):
660 case pair('a', 'e'):
661 case pair('a', 'i'):
662 case pair('e', 'a'):
663 case pair('e', 'e'):
664 case pair('e', 'i'):
665 case pair('i', 'i'):
666 case pair('o', 'a'):
667 return 0;
669 } else
670 if(temp==ep[-1]&&temp==ep[-2])
671 return 0;
672 return strip(ep,d,a,lev,flag);
675 Bits
676 strip(char* ep, char* d, char* a, int lev, int flag)
678 Bits h = trypref(ep, a, lev, flag);
680 USED(d);
681 if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
682 h = 0;
683 if(h)
684 return h;
685 if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
686 h = trypref(ep-1,a,lev,flag|MONO);
687 if(h)
688 return h;
690 return trysuff(ep,lev,flag);
693 Bits
694 s(char* ep, char* d, char* a, int lev, int flag)
696 if(lev > DLEV+1)
697 return 0;
698 if(*ep=='s') {
699 switch(ep[-1]) {
700 case 'y':
701 if(ISVOWEL(ep[-2])||ISUPPER(*word))
702 break; /*says Kennedys*/
703 case 'x':
704 case 'z':
705 case 's':
706 return 0;
707 case 'h':
708 switch(ep[-2]) {
709 case 'c':
710 case 's':
711 return 0;
715 return strip(ep,d,a,lev,flag);
718 Bits
719 an(char* ep, char* d, char* a, int lev, int flag)
721 USED(d);
722 if(!ISUPPER(*word)) /*must be proper name*/
723 return 0;
724 return trypref(ep,a,lev,flag);
727 Bits
728 ize(char* ep, char* d, char* a, int lev, int flag)
730 int temp = ep[-1];
731 Bits h;
733 USED(a);
734 ep[-1] = 'e';
735 h = strip(ep,"",d,lev,flag);
736 ep[-1] = temp;
737 return h;
740 Bits
741 y_to_e(char* ep, char* d, char* a, int lev, int flag)
743 Bits h;
744 int temp;
746 USED(a);
747 switch(ep[-1]) {
748 case 'a':
749 case 'e':
750 case 'i':
751 return 0;
753 temp = *ep;
754 *ep++ = 'e';
755 h = strip(ep,"",d,lev,flag);
756 ep[-1] = temp;
757 return h;
760 Bits
761 ily(char* ep, char* d, char* a, int lev, int flag)
763 int temp = ep[0];
764 char *cp = ep;
766 if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
767 return 0;
768 if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
769 while(cp>word)
770 if(ISVOWEL(*--cp)) /* shyness */
771 return 0;
772 if(ep[-1]=='i')
773 return i_to_y(ep,d,a,lev,flag);
774 return cstrip(ep,d,a,lev,flag);
777 Bits
778 bility(char* ep, char* d, char* a, int lev, int flag)
780 *ep++ = 'l';
781 return y_to_e(ep,d,a,lev,flag);
784 Bits
785 i_to_y(char* ep, char* d, char* a, int lev, int flag)
787 Bits h;
788 int temp;
790 if(ISUPPER(*word))
791 return 0;
792 if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
793 ep[-1] = 'y';
794 a = d;
796 h = cstrip(ep,"",a,lev,flag);
797 ep[-1] = temp;
798 return h;
801 Bits
802 es(char* ep, char* d, char* a, int lev, int flag)
804 if(lev>DLEV)
805 return 0;
806 switch(ep[-1]) {
807 default:
808 return 0;
809 case 'i':
810 return i_to_y(ep,d,a,lev,flag);
811 case 'h':
812 switch(ep[-2]) {
813 default:
814 return 0;
815 case 'c':
816 case 's':
817 break;
819 case 's':
820 case 'z':
821 case 'x':
822 return strip(ep,d,a,lev,flag);
826 Bits
827 subst(char* ep, char* d, char* a, int lev, int flag)
829 char *u,*t;
830 Bits h;
832 USED(a);
833 if(skipv(skipv(ep-1)) < word)
834 return 0;
835 for(t=d; *t!='+'; t++)
836 continue;
837 for(u=ep; *--t!='-';)
838 *--u = *t;
839 h = strip(ep,"",d,lev,flag);
840 while(*++t != '+')
841 continue;
842 while(*++t)
843 *u++ = *t;
844 return h;
847 Bits
848 tion(char* ep, char* d, char* a, int lev, int flag)
850 switch(ep[-2]) {
851 default:
852 return trypref(ep,a,lev,flag);
853 case 'a':
854 case 'e':
855 case 'i':
856 case 'o':
857 case 'u':
858 return y_to_e(ep,d,a,lev,flag);
862 /*
863 * possible consonant-consonant-e ending
864 */
865 Bits
866 CCe(char* ep, char* d, char* a, int lev, int flag)
868 Bits h;
870 switch(ep[-1]) {
871 case 'l':
872 if(ISVOWEL(ep[-2]))
873 break;
874 switch(ep[-2]) {
875 case 'l':
876 case 'r':
877 case 'w':
878 break;
879 default:
880 return y_to_e(ep,d,a,lev,flag);
882 break;
883 case 'c':
884 case 'g':
885 if(*ep == 'a') /* prevent -able for -eable */
886 return 0;
887 case 's':
888 case 'v':
889 case 'z':
890 if(ep[-2]==ep[-1])
891 break;
892 if(ISVOWEL(ep[-2]))
893 break;
894 case 'u':
895 if(h = y_to_e(ep,d,a,lev,flag))
896 return h;
897 if(!(ep[-2]=='n' && ep[-1]=='g'))
898 return 0;
900 return VCe(ep,d,a,lev,flag);
903 /*
904 * possible consonant-vowel-consonant-e ending
905 */
906 Bits
907 VCe(char* ep, char* d, char* a, int lev, int flag)
909 int c;
910 Bits h;
912 c = ep[-1];
913 if(c=='e')
914 return 0;
915 if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
916 c = *ep;
917 *ep++ = 'e';
918 h = trypref(ep,d,lev,flag);
919 if(!h)
920 h = trysuff(ep,lev,flag);
921 if(h)
922 return h;
923 ep--;
924 *ep = c;
926 return cstrip(ep,d,a,lev,flag);
929 Ptab*
930 lookuppref(uchar** wp, char* ep)
932 Ptab *sp;
933 uchar *bp,*cp;
934 unsigned int initchar = Tolower(**wp);
936 if(!ISALPHA(initchar))
937 return 0;
938 for(sp=preftab[initchar-'a'];sp->s;sp++) {
939 bp = *wp;
940 for(cp= (uchar*)sp->s;*cp; )
941 if(*bp++!=*cp++)
942 goto next;
943 for(cp=bp;cp<(uchar*)ep;cp++)
944 if(ISVOWEL(*cp)) {
945 *wp = bp;
946 return sp;
948 next:;
950 return 0;
953 /* while word is not in dictionary try stripping
954 * prefixes. Fail if no more prefixes.
955 */
956 Bits
957 trypref(char* ep, char* a, int lev, int flag)
959 Ptab *tp;
960 char *bp, *cp;
961 char *pp;
962 Bits h;
963 char space[20];
965 if(lev<DSIZ) {
966 deriv[lev].mesg = a;
967 deriv[lev].type = *a=='.'? NONE: SUFF;
969 if(h = tryword(word,ep,lev,flag)) {
970 if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
971 return h;
972 h = 0;
974 bp = word;
975 pp = space;
976 if(lev<DSIZ) {
977 deriv[lev+1].mesg = pp;
978 deriv[lev+1].type = 0;
980 while(tp=lookuppref((uchar**)(void*)&bp,ep)) {
981 *pp++ = '+';
982 cp = tp->s;
983 while(pp<space+sizeof(space) && (*pp = *cp++))
984 pp++;
985 deriv[lev+1].type += PREF;
986 h = tryword(bp,ep,lev+1,flag);
987 if(Set(h,NOPREF) ||
988 ((tp->flag&IN) && inun(bp-2,h)==0)) {
989 h = 0;
990 break;
992 if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
993 break;
994 h = 0;
996 if(lev < DSIZ) {
997 deriv[lev+1] = emptyderiv;
998 deriv[lev+2] = emptyderiv;
1000 return h;
1003 Bits
1004 tryword(char* bp, char* ep, int lev, int flag)
1006 int j;
1007 Bits h = 0;
1008 char duple[3];
1010 if(ep-bp <= 1)
1011 return h;
1012 if(flag&MONO) {
1013 if(lev<DSIZ) {
1014 deriv[++lev].mesg = duple;
1015 deriv[lev].type = SUFF;
1017 duple[0] = '+';
1018 duple[1] = *ep;
1019 duple[2] = 0;
1021 h = dict(bp, ep);
1022 if(vflag==0 || h==0)
1023 return h;
1025 * when derivations are wanted, collect them
1026 * for printing
1028 j = lev;
1029 prefcount = suffcount = 0;
1030 do {
1031 if(j<DSIZ && deriv[j].type) {
1032 strcat(affix, deriv[j].mesg);
1033 if(deriv[j].type == SUFF)
1034 suffcount++;
1035 else if(deriv[j].type != NONE)
1036 prefcount = deriv[j].type/PREF;
1038 } while(--j > 0);
1039 return h;
1042 int
1043 inun(char* bp, Bits h)
1045 if(*bp == 'u')
1046 return Set(h, IN) == 0;
1047 /* *bp == 'i' */
1048 if(Set(h, IN) == 0)
1049 return 0;
1050 switch(bp[2]) {
1051 case 'r':
1052 return bp[1] == 'r';
1053 case 'm':
1054 case 'p':
1055 return bp[1] == 'm';
1057 return bp[1] == 'n';
1060 char*
1061 skipv(char *s)
1063 if(s >= word && ISVOWEL(*s))
1064 s--;
1065 while(s >= word && !ISVOWEL(*s))
1066 s--;
1067 return s;
1071 * crummy way to Britishise
1073 void
1074 ise(void)
1076 Suftab *p;
1077 int i;
1079 for(i=0; i<26; i++)
1080 for(p = suftab[i]; p->suf; p++) {
1081 p->suf = ztos(p->suf);
1082 p->d1 = ztos(p->d1);
1083 p->a1 = ztos(p->a1);
/*
 * Return a version of as in which every 'z' is replaced by 's'.
 * A string without 'z' is returned as is.  Otherwise a new copy is
 * allocated and returned, and the argument is left unmodified; the
 * copy is never freed by this file.
 * If the allocation fails the original string is returned unchanged
 * rather than dereferencing a null pointer.
 */
char*
ztos(char *as)
{
	char *s, *ds;

	for(s=as; *s; s++)
		if(*s == 'z')
			break;
	if(*s == 0)
		return as;

	ds = malloc(strlen(as)+1);
	if(ds == 0)
		return as;
	strcpy(ds, as);
	for(s=ds; *s; s++)
		if(*s == 'z')
			*s = 's';
	return ds;
}
1105 Bits
1106 dict(char* bp, char* ep)
1108 char *cp, *cp1, *w, *wp, *we;
1109 int n, f;
1111 w = bp;
1112 we = ep;
1113 n = ep-bp;
1114 if(n <= 1)
1115 return NOUN;
1117 f = w[0] & 0x7f;
1118 f *= 128;
1119 f += w[1] & 0x7f;
1120 bp = spacep[f];
1121 ep = spacep[f+1];
1123 loop:
1124 if(bp >= ep) {
1125 if(xflag)
1126 fprint(2, "=%.*s\n", utfnlen(w, n), w);
1127 return 0;
1130 * find the beginning of some word in the middle
1132 cp = bp + (ep-bp)/2;
1134 while(cp > bp && !(*cp & 0x80))
1135 cp--;
1136 while(cp > bp && (cp[-1] & 0x80))
1137 cp--;
1139 wp = w + 2; /* skip two letters */
1140 cp1 = cp + 2; /* skip affix code */
1141 for(;;) {
1142 if(wp >= we) {
1143 if(*cp1 & 0x80)
1144 goto found;
1145 else
1146 f = 1;
1147 break;
1149 if(*cp1 & 0x80) {
1150 f = -1;
1151 break;
1153 f = *cp1++ - *wp++;
1154 if(f != 0)
1155 break;
1158 if(f < 0) {
1159 while(!(*cp1 & 0x80))
1160 cp1++;
1161 bp = cp1;
1162 goto loop;
1164 ep = cp;
1165 goto loop;
1167 found:
1168 f = ((cp[0] & 0x7) << 8) |
1169 (cp[1] & 0xff);
1170 if(xflag) {
1171 fprint(2, "=%.*s ", utfnlen(w, n), w);
1172 typeprint(encode[f]);
1174 return encode[f];
1177 void
1178 typeprint(Bits h)
1181 pcomma("");
1182 if(h & NOUN)
1183 pcomma("n");
1184 if(h & PROP_COLLECT)
1185 pcomma("pc");
1186 if(h & VERB) {
1187 if((h & VERB) == VERB)
1188 pcomma("v");
1189 else
1190 if((h & VERB) == V_IRREG)
1191 pcomma("vi");
1192 else
1193 if(h & ED)
1194 pcomma("ed");
1196 if(h & ADJ)
1197 pcomma("a");
1198 if(h & COMP) {
1199 if((h & COMP) == ACTOR)
1200 pcomma("er");
1201 else
1202 pcomma("comp");
1204 if(h & DONT_TOUCH)
1205 pcomma("d");
1206 if(h & N_AFFIX)
1207 pcomma("na");
1208 if(h & ADV)
1209 pcomma("adv");
1210 if(h & ION)
1211 pcomma("ion");
1212 if(h & V_AFFIX)
1213 pcomma("va");
1214 if(h & MAN)
1215 pcomma("man");
1216 if(h & NOPREF)
1217 pcomma("nopref");
1218 if(h & MONO)
1219 pcomma("ms");
1220 if(h & IN)
1221 pcomma("in");
1222 if(h & _Y)
1223 pcomma("y");
1224 if(h & STOP)
1225 pcomma("s");
1226 fprint(2, "\n");
/*
 * Print s on fd 2 as the next item of a comma-separated list.
 * An empty string prints nothing and starts a new list, so the next
 * item is printed without a leading comma.
 */
void
pcomma(char *s)
{
	static int started;

	if(*s == 0) {
		started = 0;
		return;
	}
	fprint(2, started? ",%s": "%s", s);
	started = 1;
}
1246 * is the word on of the following
1247 * 12th teen
1248 * 21st end in 1
1249 * 23rd end in 3
1250 * 77th default
1251 * called knowing word[0] is a digit
1253 int
1254 ordinal(void)
1256 char *cp = word;
1257 static char sp[4];
1259 while(ISDIGIT(*cp))
1260 cp++;
1261 strncpy(sp,cp,3);
1262 if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
1263 sp[0] = Tolower(cp[0]);
1264 sp[1] = Tolower(cp[1]);
1266 return 0 == strncmp(sp,
1267 cp[-2]=='1'? "th": /* out of bounds if 1 digit */
1268 *--cp=='1'? "st": /* harmless */
1269 *cp=='2'? "nd":
1270 *cp=='3'? "rd":
1271 "th", 3);
1275 * read in the dictionary.
1276 * format is
1277 * {
1278 * short nencode;
1279 * long encode[nencode];
1280 * char space[*];
1281 * };
1283 * the encodings are a table all different
1284 * affixes.
1285 * the dictionary proper has 2 bytes
1286 * that demark and then the rest of the
1287 * word. the 2 bytes have the following
1288 * 0x80 0x00 flag
1289 * 0x78 0x00 count of prefix bytes
1290 * common with prev word
1291 * 0x07 0xff affix code
1293 * all ints are big endians in the file.
1295 void
1296 readdict(char *file)
1298 char *s, *is, *lasts, *ls;
1299 int c, i, sp, p;
1300 int f;
1301 long l;
1303 lasts = 0;
1304 f = open(file, 0);
1305 if(f == -1) {
1306 fprint(2, "cannot open %s\n", file);
1307 exits("open");
1309 if(read(f, space, 2) != 2)
1310 goto bad;
1311 nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
1312 if(read(f, space, 4*nencode) != 4*nencode)
1313 goto bad;
1314 s = space;
1315 for(i=0; i<nencode; i++) {
1316 l = (long)(s[0] & 0xff) << 24;
1317 l |= (s[1] & 0xff) << 16;
1318 l |= (s[2] & 0xff) << 8;
1319 l |= s[3] & 0xff;
1320 encode[i] = (Bits)l;
1321 s += 4;
1323 l = read(f, space, sizeof(space));
1324 if(l == sizeof(space))
1325 goto noroom;
1326 is = space + (sizeof(space) - l);
1327 memmove(is, space, l);
1329 s = space;
1330 c = *is++ & 0xff;
1331 sp = -1;
1332 i = 0;
1334 loop:
1335 if(s > is)
1336 goto noroom;
1337 if(c < 0) {
1338 close(f);
1339 while(sp < 128*128)
1340 spacep[++sp] = s;
1341 *s = (char)0x80; /* fence */
1342 return;
1344 p = (c>>3) & 0xf;
1345 *s++ = c;
1346 *s++ = *is++ & 0xff;
1347 if(p <= 0)
1348 i = (*is++ & 0xff)*128;
1349 if(p <= 1) {
1350 if(!(*is & 0x80))
1351 i = i/128*128 + (*is++ & 0xff);
1352 if(i <= sp) {
1353 fprint(2, "the dict isnt sorted or \n");
1354 fprint(2, "memmove didn't work\n");
1355 goto bad;
1357 while(sp < i)
1358 spacep[++sp] = s-2;
1360 ls = lasts;
1361 lasts = s;
1362 for(p-=2; p>0; p--)
1363 *s++ = *ls++;
1364 for(;;) {
1365 if(is >= space+sizeof(space)) {
1366 c = -1;
1367 break;
1369 c = *is++ & 0xff;
1370 if(c & 0x80)
1371 break;
1372 *s++ = c;
1374 *s = 0;
1375 goto loop;
1377 bad:
1378 fprint(2, "trouble reading %s\n", file);
1379 exits("read");
1380 noroom:
1381 fprint(2, "not enough space for dictionary\n");
1382 exits("space");