Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include "code.h"
/*
 * Character-class wrappers: mask the argument to 0..255 first so a
 * possibly signed char can be handed to the ctype routines or used as
 * a table index.  Tolower evaluates its argument more than once, so
 * do not pass an expression with side effects.
 */
#define	ISUPPER(c)	isupper((c)&0xff)
#define	ISLOWER(c)	islower((c)&0xff)
#define	ISALPHA(c)	isalpha((c)&0xff)
#define	ISDIGIT(c)	isdigit((c)&0xff)
#define	ISVOWEL(c)	voweltab[(c)&0xff]
#define	Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))

/* pack two characters into a single switch-able value */
#define	pair(a,b)	(((a)<<8) | (b))

/* derivation bookkeeping: level step used by trysuff, size base of deriv[] */
#define	DLEV		2
#define	DSIZ		40

/* a set of affix/part-of-speech flag bits */
typedef	long	Bits;
/* the bits of h that are also in f (non-zero when they intersect) */
#define	Set(h, f)	((long)(h) & (f))
21 Bits nop(char*, char*, char*, int, int);
22 Bits strip(char*, char*, char*, int, int);
23 Bits ize(char*, char*, char*, int, int);
24 Bits i_to_y(char*, char*, char*, int, int);
25 Bits ily(char*, char*, char*, int, int);
26 Bits subst(char*, char*, char*, int, int);
27 Bits CCe(char*, char*, char*, int, int);
28 Bits tion(char*, char*, char*, int, int);
29 Bits an(char*, char*, char*, int, int);
30 Bits s(char*, char*, char*, int, int);
31 Bits es(char*, char*, char*, int, int);
32 Bits bility(char*, char*, char*, int, int);
33 Bits y_to_e(char*, char*, char*, int, int);
34 Bits VCe(char*, char*, char*, int, int);
36 Bits trypref(char*, char*, int, int);
37 Bits tryword(char*, char*, int, int);
38 Bits trysuff(char*, int, int);
39 Bits dict(char*, char*);
40 void typeprint(Bits);
41 void pcomma(char*);
43 void ise(void);
44 int ordinal(void);
45 char* skipv(char*);
46 int inun(char*, Bits);
47 char* ztos(char*);
48 void readdict(char*);
/*
 * One prefix-table entry.  Tables are scanned until an entry whose
 * s is nil.
 */
typedef struct Ptab Ptab;
struct Ptab
{
	char	*s;	/* prefix text, e.g. "anti" */
	int	flag;	/* 0, or IN for the in-/im-/ir-/un- family (checked by trypref) */
};
57 typedef struct Suftab Suftab;
58 struct Suftab
59 {
60 char *suf;
61 Bits (*p1)(char*, char*, char*, int, int);
62 int n1;
63 char *d1;
64 char *a1;
65 int flag;
66 int affixable;
67 Bits (*p2)(char*, char*, char*, int, int);
68 int n2;
69 char *d2;
70 char *a2;
71 };
73 Suftab staba[] = {
74 {"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
75 0
76 };
78 Suftab stabc[] =
79 {
80 {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
81 {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
82 {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
83 {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
84 {"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
85 {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
86 {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
87 {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
88 {"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
89 0
90 };
91 Suftab stabd[] =
92 {
93 {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
94 {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
95 0
96 };
97 Suftab stabe[] =
98 {
99 /*
100 * V_affix for comment ->commence->commentment??
101 */
102 {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
103 {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
104 {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
105 {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
106 {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
107 {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
108 {"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
110 };
111 Suftab stabg[] =
113 {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
114 {"gnikam",strip,6,"","+making",NOUN,NOUN},
115 {"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
116 {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
118 };
119 Suftab stabl[] =
121 {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
122 {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
123 {"latnem",strip,2,"","+al",N_AFFIX,ADJ},
124 {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
125 {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
127 };
128 Suftab stabm[] =
130 /* congregational + ism */
131 {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
132 {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
134 };
135 Suftab stabn[] =
137 {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
138 {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
139 {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
140 {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
141 {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
142 {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
143 {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
144 {"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
145 {"nem",strip,3,"","+man",MAN,PROP_COLLECT},
146 {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
148 };
149 Suftab stabp[] =
151 {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
153 };
154 Suftab stabr[] =
156 {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
157 {"reyhparg",nop,0,"","",0,NOUN},
158 {"reyl",nop,0,"","",0,NOUN},
159 {"rekam",strip,5,"","+maker",NOUN,NOUN},
160 {"repeek",strip,6,"","+keeper",NOUN,NOUN},
161 {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"},
162 {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
163 {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
164 {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
166 };
167 Suftab stabs[] =
169 {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
170 {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
171 {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"},
172 {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
173 {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
175 };
176 Suftab stabt[] =
178 {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
179 {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
180 {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
181 {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
183 };
184 Suftab staby[] =
186 {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
187 {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
188 {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
189 {"ytisuo",nop,0,"","",NOUN},
190 {"ytilb",nop,0,"","",0,NOUN},
191 {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
192 {"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
193 {"ylc",nop,0,"","",0},
194 {"ylelb",nop,0,"","",0},
195 {"ylelp",nop,0,"","",0},
196 {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
197 {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
198 {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
200 };
201 Suftab stabz[] =
204 };
205 Suftab* suftab[] =
207 staba,
208 stabz,
209 stabc,
210 stabd,
211 stabe,
212 stabz,
213 stabg,
214 stabz,
215 stabz,
216 stabz,
217 stabz,
218 stabl,
219 stabm,
220 stabn,
221 stabz,
222 stabp,
223 stabz,
224 stabr,
225 stabs,
226 stabt,
227 stabz,
228 stabz,
229 stabz,
230 stabz,
231 staby,
232 stabz,
233 };
235 Ptab ptaba[] =
237 "anti", 0,
238 "auto", 0,
240 };
241 Ptab ptabb[] =
243 "bio", 0,
245 };
246 Ptab ptabc[] =
248 "counter", 0,
250 };
251 Ptab ptabd[] =
253 "dis", 0,
255 };
256 Ptab ptabe[] =
258 "electro", 0,
260 };
261 Ptab ptabf[] =
263 "femto", 0,
265 };
266 Ptab ptabg[] =
268 "geo", 0,
269 "giga", 0,
271 };
272 Ptab ptabh[] =
274 "hyper", 0,
276 };
277 Ptab ptabi[] =
279 "immuno", 0,
280 "im", IN,
281 "intra", 0,
282 "inter", 0,
283 "in", IN,
284 "ir", IN,
285 "iso", 0,
287 };
288 Ptab ptabj[] =
291 };
292 Ptab ptabk[] =
294 "kilo", 0,
296 };
297 Ptab ptabl[] =
300 };
301 Ptab ptabm[] =
303 "magneto", 0,
304 "mega", 0,
305 "meta", 0,
306 "micro", 0,
307 "mid", 0,
308 "milli", 0,
309 "mini", 0,
310 "mis", 0,
311 "mono", 0,
312 "multi", 0,
314 };
315 Ptab ptabn[] =
317 "nano", 0,
318 "neuro", 0,
319 "non", 0,
321 };
322 Ptab ptabo[] =
324 "out", 0,
325 "over", 0,
327 };
328 Ptab ptabp[] =
330 "para", 0,
331 "photo", 0,
332 "pico", 0,
333 "poly", 0,
334 "pre", 0,
335 "pseudo", 0,
336 "psycho", 0,
338 };
339 Ptab ptabq[] =
341 "quasi", 0,
343 };
344 Ptab ptabr[] =
346 "radio", 0,
347 "re", 0,
349 };
350 Ptab ptabs[] =
352 "semi", 0,
353 "stereo", 0,
354 "sub", 0,
355 "super", 0,
357 };
358 Ptab ptabt[] =
360 "tele", 0,
361 "tera", 0,
362 "thermo", 0,
364 };
365 Ptab ptabu[] =
367 "ultra", 0,
368 "under", 0, /*must precede un*/
369 "un", IN,
371 };
372 Ptab ptabv[] =
375 };
376 Ptab ptabw[] =
379 };
380 Ptab ptabx[] =
383 };
384 Ptab ptaby[] =
387 };
388 Ptab ptabz[] =
391 };
393 Ptab* preftab[] =
395 ptaba,
396 ptabb,
397 ptabc,
398 ptabd,
399 ptabe,
400 ptabf,
401 ptabg,
402 ptabh,
403 ptabi,
404 ptabj,
405 ptabk,
406 ptabl,
407 ptabm,
408 ptabn,
409 ptabo,
410 ptabp,
411 ptabq,
412 ptabr,
413 ptabs,
414 ptabt,
415 ptabu,
416 ptabv,
417 ptabw,
418 ptabx,
419 ptaby,
420 ptabz,
421 };
/*
 * One step of a derivation, recorded per level in deriv[].
 * type is NONE, SUFF, or a multiple of PREF: trypref adds PREF for
 * every prefix it strips and tryword recovers the count as type/PREF.
 */
typedef struct {
	char	*mesg;				/* text appended to affix[] by tryword */
	enum { NONE, SUFF, PREF}	type;
} Deriv;
428 int aflag;
429 int cflag;
430 int fflag;
431 int vflag;
432 int xflag;
433 int nflag;
434 char word[500];
435 char* original;
436 Deriv emptyderiv;
437 Deriv deriv[DSIZ+3];
438 char affix[DSIZ*10]; /* 10 is longest affix message */
439 int prefcount;
440 int suffcount;
441 char* acmeid;
442 char space[300000]; /* must be as large as "words"+"space" in pcode run */
443 Bits encode[2048]; /* must be as long as "codes" in pcode run */
444 int nencode;
445 char voweltab[256];
446 char* spacep[128*128+1]; /* pointer to words starting with 'xx' */
447 Biobuf bin;
448 Biobuf bout;
450 char* codefile = "#9/lib/amspell";
451 char* brfile = "#9/lib/brspell";
452 char* Usage = "usage";
454 void
455 main(int argc, char *argv[])
457 char *ep, *cp;
458 char *dp;
459 int j, i, c;
460 int low;
461 Bits h;
463 codefile = unsharp(codefile);
464 brfile = unsharp(brfile);
466 Binit(&bin, 0, OREAD);
467 Binit(&bout, 1, OWRITE);
468 for(i=0; c = "aeiouyAEIOUY"[i]; i++)
469 voweltab[c] = 1;
470 while(argc > 1) {
471 if(argv[1][0] != '-')
472 break;
473 for(i=1; c = argv[1][i]; i++)
474 switch(c) {
475 default:
476 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
477 exits(Usage);
479 case 'a':
480 aflag++;
481 continue;
483 case 'b':
484 ise();
485 if(!fflag)
486 codefile = brfile;
487 continue;
489 case 'C': /* for "correct" */
490 vflag++;
491 case 'c': /* for ocr */
492 cflag++;
493 continue;
495 case 'v':
496 vflag++;
497 continue;
499 case 'x':
500 xflag++;
501 continue;
503 case 'f':
504 if(argc <= 2) {
505 fprint(2, "spell: -f requires another argument\n");
506 exits(Usage);
508 argv++;
509 argc--;
510 codefile = argv[1];
511 fflag++;
512 goto brk;
514 brk:
515 argv++;
516 argc--;
518 readdict(codefile);
519 if(argc > 1) {
520 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
521 exits(Usage);
523 if(aflag)
524 cflag = vflag = 0;
526 for(;;) {
527 affix[0] = 0;
528 original = Brdline(&bin, '\n');
529 if(original == 0)
530 exits(0);
531 original[Blinelen(&bin)-1] = 0;
532 low = 0;
534 if(aflag) {
535 acmeid = original;
536 while(*original != ':')
537 if(*original++ == 0)
538 exits(0);
539 while(*++original != ':')
540 if(*original == 0)
541 exits(0);
542 *original++ = 0;
544 for(ep=word,dp=original; j = *dp; ep++,dp++) {
545 if(ISLOWER(j))
546 low++;
547 if(ep >= word+sizeof(word)-1)
548 break;
549 *ep = j;
551 *ep = 0;
553 if(ISDIGIT(word[0]) && ordinal())
554 continue;
556 h = 0;
557 if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
558 for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
559 *dp = Tolower(*cp);
560 if(!h)
561 for(;;) { /* at most twice */
562 if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
563 break;
564 if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
565 break;
566 if(!ISUPPER(word[0]))
567 break;
568 cp = original;
569 dp = word;
570 while(*dp = *cp++) {
571 if(!low)
572 *dp = Tolower(*dp);
573 dp++;
575 word[0] = Tolower(word[0]);
578 if(cflag) {
579 if(!h || Set(h,STOP))
580 print("-");
581 else if(!vflag)
582 print("+");
583 else
584 print("%c",'0' + (suffcount>0) +
585 (prefcount>4? 8: 2*prefcount));
586 } else if(!h || Set(h,STOP)) {
587 if(aflag)
588 Bprint(&bout, "%s:%s\n", acmeid, original);
589 else
590 Bprint(&bout, "%s\n", original);
591 } else if(affix[0] != 0 && affix[0] != '.')
592 print("%s\t%s\n", affix, original);
594 exits(0);
597 /* strip exactly one suffix and do
598 * indicated routine(s), which may recursively
599 * strip suffixes
600 */
601 Bits
602 trysuff(char* ep, int lev, int flag)
604 Suftab *t;
605 char *cp, *sp;
606 Bits h = 0;
607 int initchar = ep[-1];
609 flag &= ~MONO;
610 lev += DLEV;
611 if(lev < DSIZ) {
612 deriv[lev] = emptyderiv;
613 deriv[lev-1] = emptyderiv;
615 if(!ISLOWER(initchar))
616 return h;
617 for(t=suftab[initchar-'a']; sp=t->suf; t++) {
618 cp = ep;
619 while(*sp)
620 if(*--cp != *sp++)
621 goto next;
622 for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
624 if(sp < word)
625 continue;
626 if(!(t->affixable & flag))
627 return 0;
628 h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
629 if(!h && t->p2!=0) {
630 if(lev < DSIZ) {
631 deriv[lev] = emptyderiv;
632 deriv[lev+1] = emptyderiv;
634 h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
636 break;
637 next:;
639 return h;
642 Bits
643 nop(char* ep, char* d, char* a, int lev, int flag)
645 USED(ep);
646 USED(d);
647 USED(a);
648 USED(lev);
649 USED(flag);
650 return 0;
653 Bits
654 cstrip(char* ep, char* d, char* a, int lev, int flag)
656 int temp = ep[0];
658 if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
659 switch(pair(ep[-1],ep[0])) {
660 case pair('a', 'a'):
661 case pair('a', 'e'):
662 case pair('a', 'i'):
663 case pair('e', 'a'):
664 case pair('e', 'e'):
665 case pair('e', 'i'):
666 case pair('i', 'i'):
667 case pair('o', 'a'):
668 return 0;
670 } else
671 if(temp==ep[-1]&&temp==ep[-2])
672 return 0;
673 return strip(ep,d,a,lev,flag);
676 Bits
677 strip(char* ep, char* d, char* a, int lev, int flag)
679 Bits h = trypref(ep, a, lev, flag);
681 USED(d);
682 if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
683 h = 0;
684 if(h)
685 return h;
686 if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
687 h = trypref(ep-1,a,lev,flag|MONO);
688 if(h)
689 return h;
691 return trysuff(ep,lev,flag);
694 Bits
695 s(char* ep, char* d, char* a, int lev, int flag)
697 if(lev > DLEV+1)
698 return 0;
699 if(*ep=='s') {
700 switch(ep[-1]) {
701 case 'y':
702 if(ISVOWEL(ep[-2])||ISUPPER(*word))
703 break; /*says Kennedys*/
704 case 'x':
705 case 'z':
706 case 's':
707 return 0;
708 case 'h':
709 switch(ep[-2]) {
710 case 'c':
711 case 's':
712 return 0;
716 return strip(ep,d,a,lev,flag);
719 Bits
720 an(char* ep, char* d, char* a, int lev, int flag)
722 USED(d);
723 if(!ISUPPER(*word)) /*must be proper name*/
724 return 0;
725 return trypref(ep,a,lev,flag);
728 Bits
729 ize(char* ep, char* d, char* a, int lev, int flag)
731 int temp = ep[-1];
732 Bits h;
734 USED(a);
735 ep[-1] = 'e';
736 h = strip(ep,"",d,lev,flag);
737 ep[-1] = temp;
738 return h;
741 Bits
742 y_to_e(char* ep, char* d, char* a, int lev, int flag)
744 Bits h;
745 int temp;
747 USED(a);
748 switch(ep[-1]) {
749 case 'a':
750 case 'e':
751 case 'i':
752 return 0;
754 temp = *ep;
755 *ep++ = 'e';
756 h = strip(ep,"",d,lev,flag);
757 ep[-1] = temp;
758 return h;
761 Bits
762 ily(char* ep, char* d, char* a, int lev, int flag)
764 int temp = ep[0];
765 char *cp = ep;
767 if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
768 return 0;
769 if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
770 while(cp>word)
771 if(ISVOWEL(*--cp)) /* shyness */
772 return 0;
773 if(ep[-1]=='i')
774 return i_to_y(ep,d,a,lev,flag);
775 return cstrip(ep,d,a,lev,flag);
778 Bits
779 bility(char* ep, char* d, char* a, int lev, int flag)
781 *ep++ = 'l';
782 return y_to_e(ep,d,a,lev,flag);
785 Bits
786 i_to_y(char* ep, char* d, char* a, int lev, int flag)
788 Bits h;
789 int temp;
791 if(ISUPPER(*word))
792 return 0;
793 if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
794 ep[-1] = 'y';
795 a = d;
797 h = cstrip(ep,"",a,lev,flag);
798 ep[-1] = temp;
799 return h;
802 Bits
803 es(char* ep, char* d, char* a, int lev, int flag)
805 if(lev>DLEV)
806 return 0;
807 switch(ep[-1]) {
808 default:
809 return 0;
810 case 'i':
811 return i_to_y(ep,d,a,lev,flag);
812 case 'h':
813 switch(ep[-2]) {
814 default:
815 return 0;
816 case 'c':
817 case 's':
818 break;
820 case 's':
821 case 'z':
822 case 'x':
823 return strip(ep,d,a,lev,flag);
827 Bits
828 subst(char* ep, char* d, char* a, int lev, int flag)
830 char *u,*t;
831 Bits h;
833 USED(a);
834 if(skipv(skipv(ep-1)) < word)
835 return 0;
836 for(t=d; *t!='+'; t++)
837 continue;
838 for(u=ep; *--t!='-';)
839 *--u = *t;
840 h = strip(ep,"",d,lev,flag);
841 while(*++t != '+')
842 continue;
843 while(*++t)
844 *u++ = *t;
845 return h;
848 Bits
849 tion(char* ep, char* d, char* a, int lev, int flag)
851 switch(ep[-2]) {
852 default:
853 return trypref(ep,a,lev,flag);
854 case 'a':
855 case 'e':
856 case 'i':
857 case 'o':
858 case 'u':
859 return y_to_e(ep,d,a,lev,flag);
863 /*
864 * possible consonant-consonant-e ending
865 */
866 Bits
867 CCe(char* ep, char* d, char* a, int lev, int flag)
869 Bits h;
871 switch(ep[-1]) {
872 case 'l':
873 if(ISVOWEL(ep[-2]))
874 break;
875 switch(ep[-2]) {
876 case 'l':
877 case 'r':
878 case 'w':
879 break;
880 default:
881 return y_to_e(ep,d,a,lev,flag);
883 break;
884 case 'c':
885 case 'g':
886 if(*ep == 'a') /* prevent -able for -eable */
887 return 0;
888 case 's':
889 case 'v':
890 case 'z':
891 if(ep[-2]==ep[-1])
892 break;
893 if(ISVOWEL(ep[-2]))
894 break;
895 case 'u':
896 if(h = y_to_e(ep,d,a,lev,flag))
897 return h;
898 if(!(ep[-2]=='n' && ep[-1]=='g'))
899 return 0;
901 return VCe(ep,d,a,lev,flag);
904 /*
905 * possible consonant-vowel-consonant-e ending
906 */
907 Bits
908 VCe(char* ep, char* d, char* a, int lev, int flag)
910 int c;
911 Bits h;
913 c = ep[-1];
914 if(c=='e')
915 return 0;
916 if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
917 c = *ep;
918 *ep++ = 'e';
919 h = trypref(ep,d,lev,flag);
920 if(!h)
921 h = trysuff(ep,lev,flag);
922 if(h)
923 return h;
924 ep--;
925 *ep = c;
927 return cstrip(ep,d,a,lev,flag);
930 Ptab*
931 lookuppref(uchar** wp, char* ep)
933 Ptab *sp;
934 uchar *bp,*cp;
935 unsigned int initchar = Tolower(**wp);
937 if(!ISALPHA(initchar))
938 return 0;
939 for(sp=preftab[initchar-'a'];sp->s;sp++) {
940 bp = *wp;
941 for(cp= (uchar*)sp->s;*cp; )
942 if(*bp++!=*cp++)
943 goto next;
944 for(cp=bp;cp<(uchar*)ep;cp++)
945 if(ISVOWEL(*cp)) {
946 *wp = bp;
947 return sp;
949 next:;
951 return 0;
954 /* while word is not in dictionary try stripping
955 * prefixes. Fail if no more prefixes.
956 */
957 Bits
958 trypref(char* ep, char* a, int lev, int flag)
960 Ptab *tp;
961 char *bp, *cp;
962 char *pp;
963 Bits h;
964 char space[20];
966 if(lev<DSIZ) {
967 deriv[lev].mesg = a;
968 deriv[lev].type = *a=='.'? NONE: SUFF;
970 if(h = tryword(word,ep,lev,flag)) {
971 if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
972 return h;
973 h = 0;
975 bp = word;
976 pp = space;
977 if(lev<DSIZ) {
978 deriv[lev+1].mesg = pp;
979 deriv[lev+1].type = 0;
981 while(tp=lookuppref((uchar**)&bp,ep)) {
982 *pp++ = '+';
983 cp = tp->s;
984 while(pp<space+sizeof(space) && (*pp = *cp++))
985 pp++;
986 deriv[lev+1].type += PREF;
987 h = tryword(bp,ep,lev+1,flag);
988 if(Set(h,NOPREF) ||
989 ((tp->flag&IN) && inun(bp-2,h)==0)) {
990 h = 0;
991 break;
993 if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
994 break;
995 h = 0;
997 if(lev < DSIZ) {
998 deriv[lev+1] = emptyderiv;
999 deriv[lev+2] = emptyderiv;
1001 return h;
1004 Bits
1005 tryword(char* bp, char* ep, int lev, int flag)
1007 int j;
1008 Bits h = 0;
1009 char duple[3];
1011 if(ep-bp <= 1)
1012 return h;
1013 if(flag&MONO) {
1014 if(lev<DSIZ) {
1015 deriv[++lev].mesg = duple;
1016 deriv[lev].type = SUFF;
1018 duple[0] = '+';
1019 duple[1] = *ep;
1020 duple[2] = 0;
1022 h = dict(bp, ep);
1023 if(vflag==0 || h==0)
1024 return h;
1026 * when derivations are wanted, collect them
1027 * for printing
1029 j = lev;
1030 prefcount = suffcount = 0;
1031 do {
1032 if(j<DSIZ && deriv[j].type) {
1033 strcat(affix, deriv[j].mesg);
1034 if(deriv[j].type == SUFF)
1035 suffcount++;
1036 else if(deriv[j].type != NONE)
1037 prefcount = deriv[j].type/PREF;
1039 } while(--j > 0);
1040 return h;
1043 int
1044 inun(char* bp, Bits h)
1046 if(*bp == 'u')
1047 return Set(h, IN) == 0;
1048 /* *bp == 'i' */
1049 if(Set(h, IN) == 0)
1050 return 0;
1051 switch(bp[2]) {
1052 case 'r':
1053 return bp[1] == 'r';
1054 case 'm':
1055 case 'p':
1056 return bp[1] == 'm';
1058 return bp[1] == 'n';
1061 char*
1062 skipv(char *s)
1064 if(s >= word && ISVOWEL(*s))
1065 s--;
1066 while(s >= word && !ISVOWEL(*s))
1067 s--;
1068 return s;
1072 * crummy way to Britishise
1074 void
1075 ise(void)
1077 Suftab *p;
1078 int i;
1080 for(i=0; i<26; i++)
1081 for(p = suftab[i]; p->suf; p++) {
1082 p->suf = ztos(p->suf);
1083 p->d1 = ztos(p->d1);
1084 p->a1 = ztos(p->a1);
/*
 * Return as with every 'z' replaced by 's'.  A string without any
 * 'z' is returned unchanged; otherwise the result is a fresh copy
 * from strdup that is never freed (as is left alone).  Running out of
 * memory is fatal.
 */
char*
ztos(char *as)
{
	char *s, *ds;

	for(s=as; *s; s++)
		if(*s == 'z')
			break;
	if(*s == 0)
		return as;

	ds = strdup(as);
	if(ds == 0) {
		fprint(2, "spell: out of memory\n");
		exits("malloc");
	}
	for(s=ds; *s; s++)
		if(*s == 'z')
			*s = 's';
	return ds;
}
1106 Bits
1107 dict(char* bp, char* ep)
1109 char *cp, *cp1, *w, *wp, *we;
1110 int n, f;
1112 w = bp;
1113 we = ep;
1114 n = ep-bp;
1115 if(n <= 1)
1116 return NOUN;
1118 f = w[0] & 0x7f;
1119 f *= 128;
1120 f += w[1] & 0x7f;
1121 bp = spacep[f];
1122 ep = spacep[f+1];
1124 loop:
1125 if(bp >= ep) {
1126 if(xflag)
1127 fprint(2, "=%.*s\n", utfnlen(w, n), w);
1128 return 0;
1131 * find the beginning of some word in the middle
1133 cp = bp + (ep-bp)/2;
1135 while(cp > bp && !(*cp & 0x80))
1136 cp--;
1137 while(cp > bp && (cp[-1] & 0x80))
1138 cp--;
1140 wp = w + 2; /* skip two letters */
1141 cp1 = cp + 2; /* skip affix code */
1142 for(;;) {
1143 if(wp >= we) {
1144 if(*cp1 & 0x80)
1145 goto found;
1146 else
1147 f = 1;
1148 break;
1150 if(*cp1 & 0x80) {
1151 f = -1;
1152 break;
1154 f = *cp1++ - *wp++;
1155 if(f != 0)
1156 break;
1159 if(f < 0) {
1160 while(!(*cp1 & 0x80))
1161 cp1++;
1162 bp = cp1;
1163 goto loop;
1165 ep = cp;
1166 goto loop;
1168 found:
1169 f = ((cp[0] & 0x7) << 8) |
1170 (cp[1] & 0xff);
1171 if(xflag) {
1172 fprint(2, "=%.*s ", utfnlen(w, n), w);
1173 typeprint(encode[f]);
1175 return encode[f];
1178 void
1179 typeprint(Bits h)
1182 pcomma("");
1183 if(h & NOUN)
1184 pcomma("n");
1185 if(h & PROP_COLLECT)
1186 pcomma("pc");
1187 if(h & VERB) {
1188 if((h & VERB) == VERB)
1189 pcomma("v");
1190 else
1191 if((h & VERB) == V_IRREG)
1192 pcomma("vi");
1193 else
1194 if(h & ED)
1195 pcomma("ed");
1197 if(h & ADJ)
1198 pcomma("a");
1199 if(h & COMP) {
1200 if((h & COMP) == ACTOR)
1201 pcomma("er");
1202 else
1203 pcomma("comp");
1205 if(h & DONT_TOUCH)
1206 pcomma("d");
1207 if(h & N_AFFIX)
1208 pcomma("na");
1209 if(h & ADV)
1210 pcomma("adv");
1211 if(h & ION)
1212 pcomma("ion");
1213 if(h & V_AFFIX)
1214 pcomma("va");
1215 if(h & MAN)
1216 pcomma("man");
1217 if(h & NOPREF)
1218 pcomma("nopref");
1219 if(h & MONO)
1220 pcomma("ms");
1221 if(h & IN)
1222 pcomma("in");
1223 if(h & _Y)
1224 pcomma("y");
1225 if(h & STOP)
1226 pcomma("s");
1227 fprint(2, "\n");
/*
 * Print s on standard error, preceded by a comma unless it is the
 * first item since the list was reset.  An empty string resets the
 * list and prints nothing.
 */
void
pcomma(char *s)
{
	static int started;

	if(*s == 0) {
		started = 0;
		return;
	}
	if(started) {
		fprint(2, ",%s", s);
		return;
	}
	fprint(2, "%s", s);
	started = 1;
}
1247 * is the word on of the following
1248 * 12th teen
1249 * 21st end in 1
1250 * 23rd end in 3
1251 * 77th default
1252 * called knowing word[0] is a digit
1254 int
1255 ordinal(void)
1257 char *cp = word;
1258 static char sp[4];
1260 while(ISDIGIT(*cp))
1261 cp++;
1262 strncpy(sp,cp,3);
1263 if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
1264 sp[0] = Tolower(cp[0]);
1265 sp[1] = Tolower(cp[1]);
1267 return 0 == strncmp(sp,
1268 cp[-2]=='1'? "th": /* out of bounds if 1 digit */
1269 *--cp=='1'? "st": /* harmless */
1270 *cp=='2'? "nd":
1271 *cp=='3'? "rd":
1272 "th", 3);
1276 * read in the dictionary.
1277 * format is
1278 * {
1279 * short nencode;
1280 * long encode[nencode];
1281 * char space[*];
1282 * };
1284 * the encodings are a table all different
1285 * affixes.
1286 * the dictionary proper has 2 bytes
1287 * that demark and then the rest of the
1288 * word. the 2 bytes have the following
1289 * 0x80 0x00 flag
1290 * 0x78 0x00 count of prefix bytes
1291 * common with prev word
1292 * 0x07 0xff affix code
1294 * all ints are big endians in the file.
1296 void
1297 readdict(char *file)
1299 char *s, *is, *lasts, *ls;
1300 int c, i, sp, p;
1301 int f;
1302 long l;
1304 lasts = 0;
1305 f = open(file, 0);
1306 if(f == -1) {
1307 fprint(2, "cannot open %s\n", file);
1308 exits("open");
1310 if(read(f, space, 2) != 2)
1311 goto bad;
1312 nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
1313 if(read(f, space, 4*nencode) != 4*nencode)
1314 goto bad;
1315 s = space;
1316 for(i=0; i<nencode; i++) {
1317 l = (long)(s[0] & 0xff) << 24;
1318 l |= (s[1] & 0xff) << 16;
1319 l |= (s[2] & 0xff) << 8;
1320 l |= s[3] & 0xff;
1321 encode[i] = (Bits)l;
1322 s += 4;
1324 l = read(f, space, sizeof(space));
1325 if(l == sizeof(space))
1326 goto noroom;
1327 is = space + (sizeof(space) - l);
1328 memmove(is, space, l);
1330 s = space;
1331 c = *is++ & 0xff;
1332 sp = -1;
1333 i = 0;
1335 loop:
1336 if(s > is)
1337 goto noroom;
1338 if(c < 0) {
1339 close(f);
1340 while(sp < 128*128)
1341 spacep[++sp] = s;
1342 *s = 0x80; /* fence */
1343 return;
1345 p = (c>>3) & 0xf;
1346 *s++ = c;
1347 *s++ = *is++ & 0xff;
1348 if(p <= 0)
1349 i = (*is++ & 0xff)*128;
1350 if(p <= 1) {
1351 if(!(*is & 0x80))
1352 i = i/128*128 + (*is++ & 0xff);
1353 if(i <= sp) {
1354 fprint(2, "the dict isnt sorted or \n");
1355 fprint(2, "memmove didn't work\n");
1356 goto bad;
1358 while(sp < i)
1359 spacep[++sp] = s-2;
1361 ls = lasts;
1362 lasts = s;
1363 for(p-=2; p>0; p--)
1364 *s++ = *ls++;
1365 for(;;) {
1366 if(is >= space+sizeof(space)) {
1367 c = -1;
1368 break;
1370 c = *is++ & 0xff;
1371 if(c & 0x80)
1372 break;
1373 *s++ = c;
1375 *s = 0;
1376 goto loop;
1378 bad:
1379 fprint(2, "trouble reading %s\n", file);
1380 exits("read");
1381 noroom:
1382 fprint(2, "not enough space for dictionary\n");
1383 exits("space");