/*
 * Helpers for plain char values, which may be signed on this platform.
 * Every argument is masked to 0..255 before it reaches a <ctype.h>
 * predicate or is used as a table index, so a negative char can never
 * be passed to either.
 */
#define	ISUPPER(c)	isupper((c)&0xff)
#define	ISLOWER(c)	islower((c)&0xff)
#define	ISALPHA(c)	isalpha((c)&0xff)
#define	ISDIGIT(c)	isdigit((c)&0xff)
#define	ISVOWEL(c)	voweltab[(c)&0xff]
/*
 * Lower-case version of an upper-case letter; any other value is
 * returned unchanged.  The argument is evaluated more than once, so it
 * must not have side effects.
 */
#define	Tolower(c)	(ISUPPER(c)? (c)-'A'+'a': (c))
/*
 * Pack two characters into one int: a in bits 8-15, b in bits 0-7
 * (used as a switch key on adjacent letters).  Both operands are masked
 * like the macros above: without the mask a negative char would be
 * left-shifted (undefined behaviour) and a negative b would sign-extend
 * over a's byte.
 */
#define	pair(a,b)	((((a)&0xff)<<8) | ((b)&0xff))
/* Nonzero when at least one bit of mask f is set in h. */
#define	Set(h, f)	((long)(h) & (f))
21 Bits nop(char*, char*, char*, int, int);
22 Bits strip(char*, char*, char*, int, int);
23 Bits ize(char*, char*, char*, int, int);
24 Bits i_to_y(char*, char*, char*, int, int);
25 Bits ily(char*, char*, char*, int, int);
26 Bits subst(char*, char*, char*, int, int);
27 Bits CCe(char*, char*, char*, int, int);
28 Bits tion(char*, char*, char*, int, int);
29 Bits an(char*, char*, char*, int, int);
30 Bits s(char*, char*, char*, int, int);
31 Bits es(char*, char*, char*, int, int);
32 Bits bility(char*, char*, char*, int, int);
33 Bits y_to_e(char*, char*, char*, int, int);
34 Bits VCe(char*, char*, char*, int, int);
36 Bits trypref(char*, char*, int, int);
37 Bits tryword(char*, char*, int, int);
38 Bits trysuff(char*, int, int);
39 Bits dict(char*, char*);
46 int inun(char*, Bits);
50 typedef struct Ptab Ptab;
57 typedef struct Suftab Suftab;
61 Bits (*p1)(char*, char*, char*, int, int);
67 Bits (*p2)(char*, char*, char*, int, int);
74 {"aibohp",subst,1,"-e+ia","",NOUN, NOUN},
80 {"cai",strip,1,"","+c",N_AFFIX, ADJ|NOUN},
81 {"citsi",strip,2,"","+ic",N_AFFIX, ADJ | N_AFFIX | NOUN},
82 {"citi",ize,1,"-e+ic","",N_AFFIX, ADJ },
83 {"cihparg",i_to_y,1,"-y+ic","",NOUN, ADJ|NOUN },
84 {"cipocs",ize,1,"-e+ic","",NOUN, ADJ },
85 {"cirtem",i_to_y,1,"-y+ic","",NOUN, ADJ },
86 {"cigol",i_to_y,1,"-y+ic","",NOUN, ADJ },
87 {"cimono",i_to_y,1,"-y+ic","",NOUN, ADJ },
88 {"cibohp",subst,1,"-e+ic","",NOUN, ADJ },
93 {"de",strip,1,"","+d",ED,ADJ |COMP,i_to_y,2,"-y+ied","+ed"},
94 {"dooh",ily,4,"-y+ihood","+hood",NOUN | ADV, NOUN},
100 * V_affix for comment ->commence->commentment??
102 {"ecna",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
103 {"ecne",subst,1,"-t+ce","",ADJ,N_AFFIX|_Y|NOUN|VERB|ACTOR|V_AFFIX},
104 {"elbaif",i_to_y,4,"-y+iable","",V_IRREG,ADJ},
105 {"elba",CCe,4,"-e+able","+able",V_AFFIX,ADJ},
106 {"evi",subst,0,"-ion+ive","",N_AFFIX | V_AFFIX,NOUN | N_AFFIX| ADJ},
107 {"ezi",CCe,3,"-e+ize","+ize",N_AFFIX|ADJ ,V_AFFIX | VERB |ION | COMP},
108 {"ekil",strip,4,"","+like",N_AFFIX ,ADJ},
113 {"gniee",strip,3,"","+ing",V_IRREG ,ADJ|NOUN},
114 {"gnikam",strip,6,"","+making",NOUN,NOUN},
115 {"gnipeek",strip,7,"","+keeping",NOUN,NOUN},
116 {"gni",CCe,3,"-e+ing","+ing",V_IRREG ,ADJ|ED|NOUN},
121 {"ladio",strip,2,"","+al",NOUN |ADJ,ADJ},
122 {"laci",strip,2,"","+al",NOUN |ADJ,ADJ |NOUN|N_AFFIX},
123 {"latnem",strip,2,"","+al",N_AFFIX,ADJ},
124 {"lanoi",strip,2,"","+al",N_AFFIX,ADJ|NOUN},
125 {"luf",ily,3,"-y+iful","+ful",N_AFFIX,ADJ | NOUN},
130 /* congregational + ism */
131 {"msi",CCe,3,"-e+ism","ism",N_AFFIX|ADJ,NOUN},
132 {"margo",subst,-1,"-ph+m","",NOUN,NOUN},
137 {"noitacifi",i_to_y,6,"-y+ication","",ION,NOUN | N_AFFIX},
138 {"noitazi",ize,4,"-e+ation","",ION,NOUN| N_AFFIX},
139 {"noit",tion,3,"-e+ion","+ion",ION,NOUN| N_AFFIX | V_AFFIX |VERB|ACTOR},
140 {"naino",an,3,"","+ian",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
141 {"namow",strip,5,"","+woman",MAN,PROP_COLLECT|N_AFFIX},
142 {"nam",strip,3,"","+man",MAN,PROP_COLLECT | N_AFFIX | VERB},
143 {"na",an,1,"","+n",NOUN|PROP_COLLECT,NOUN | N_AFFIX},
144 {"nemow",strip,5,"","+women",MAN,PROP_COLLECT},
145 {"nem",strip,3,"","+man",MAN,PROP_COLLECT},
146 {"nosrep",strip,6,"","+person",MAN,PROP_COLLECT},
151 {"pihs",strip,4,"","+ship",NOUN|PROP_COLLECT,NOUN| N_AFFIX},
156 {"rehparg",subst,1,"-y+er","",ACTOR,NOUN,strip,2,"","+er"},
157 {"reyhparg",nop,0,"","",0,NOUN},
158 {"reyl",nop,0,"","",0,NOUN},
159 {"rekam",strip,5,"","+maker",NOUN,NOUN},
160 {"repeek",strip,6,"","+keeper",NOUN,NOUN},
161 {"re",strip,1,"","+r",ACTOR,NOUN | N_AFFIX|VERB|ADJ, i_to_y,2,"-y+ier","+er"},
162 {"rota",tion,2,"-e+or","",ION,NOUN| N_AFFIX|_Y},
163 {"rotc",tion,2,"","+or",ION,NOUN| N_AFFIX},
164 {"rotp",tion,2,"","+or",ION,NOUN| N_AFFIX},
169 {"ssen",ily,4,"-y+iness","+ness",ADJ|ADV,NOUN| N_AFFIX},
170 {"ssel",ily,4,"-y+iless","+less",NOUN | PROP_COLLECT,ADJ },
171 {"se",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH , es,2,"-y+ies","+es"},
172 {"s'",s,2,"","+'s",PROP_COLLECT | NOUN,DONT_TOUCH },
173 {"s",s,1,"","+s",NOUN | V_IRREG,DONT_TOUCH },
178 {"tnem",strip,4,"","+ment",V_AFFIX,NOUN | N_AFFIX | ADJ|VERB},
179 {"tse",strip,2,"","+st",EST,DONT_TOUCH, i_to_y,3,"-y+iest","+est" },
180 {"tsigol",i_to_y,2,"-y+ist","",N_AFFIX,NOUN | N_AFFIX},
181 {"tsi",CCe,3,"-e+ist","+ist",N_AFFIX|ADJ,NOUN | N_AFFIX|COMP},
186 {"ycna",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
187 {"ycne",subst,1,"-t+cy","",ADJ | N_AFFIX,NOUN | N_AFFIX},
188 {"ytilib",bility,5,"-le+ility","",ADJ | V_AFFIX,NOUN | N_AFFIX},
189 {"ytisuo",nop,0,"","",NOUN},
190 {"ytilb",nop,0,"","",0,NOUN},
191 {"yti",CCe,3,"-e+ity","+ity",ADJ ,NOUN | N_AFFIX },
192 {"ylb",y_to_e,1,"-e+y","",ADJ,ADV},
193 {"ylc",nop,0,"","",0},
194 {"ylelb",nop,0,"","",0},
195 {"ylelp",nop,0,"","",0},
196 {"yl",ily,2,"-y+ily","+ly",ADJ,ADV|COMP},
197 {"yrtem",subst,0,"-er+ry","",NOUN,NOUN | N_AFFIX},
198 {"y",CCe,1,"-e+y","+y",_Y,ADJ|COMP},
368 "under", 0, /*must precede un*/
425 enum { NONE, SUFF, PREF} type;
438 char affix[DSIZ*10]; /* 10 is longest affix message */
442 char space[300000]; /* must be as large as "words"+"space" in pcode run */
443 Bits encode[2048]; /* must be as long as "codes" in pcode run */
446 char* spacep[128*128+1]; /* pointer to words starting with 'xx' */
450 char* codefile = "#9/lib/amspell";
451 char* brfile = "#9/lib/brspell";
452 char* Usage = "usage";
455 main(int argc, char *argv[])
463 codefile = unsharp(codefile);
464 brfile = unsharp(brfile);
466 Binit(&bin, 0, OREAD);
467 Binit(&bout, 1, OWRITE);
468 for(i=0; c = "aeiouyAEIOUY"[i]; i++)
471 if(argv[1][0] != '-')
473 for(i=1; c = argv[1][i]; i++)
476 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
489 case 'C': /* for "correct" */
491 case 'c': /* for ocr */
505 fprint(2, "spell: -f requires another argument\n");
520 fprint(2, "usage: spell [-bcCvx] [-f file]\n");
528 original = Brdline(&bin, '\n');
531 original[Blinelen(&bin)-1] = 0;
536 while(*original != ':')
539 while(*++original != ':')
544 for(ep=word,dp=original; j = *dp; ep++,dp++) {
547 if(ep >= word+sizeof(word)-1)
553 if(ISDIGIT(word[0]) && ordinal())
557 if(!low && !(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH)))
558 for(cp=original+1,dp=word+1; dp<ep; dp++,cp++)
561 for(;;) { /* at most twice */
562 if(h = trypref(ep,".",0,ALL|STOP|DONT_TOUCH))
564 if(h = trysuff(ep,0,ALL|STOP|DONT_TOUCH))
566 if(!ISUPPER(word[0]))
575 word[0] = Tolower(word[0]);
579 if(!h || Set(h,STOP))
584 print("%c",'0' + (suffcount>0) +
585 (prefcount>4? 8: 2*prefcount));
586 } else if(!h || Set(h,STOP)) {
588 Bprint(&bout, "%s:%s\n", acmeid, original);
590 Bprint(&bout, "%s\n", original);
591 } else if(affix[0] != 0 && affix[0] != '.')
592 print("%s\t%s\n", affix, original);
596 /* strip exactly one suffix and do
597 * indicated routine(s), which may recursively
601 trysuff(char* ep, int lev, int flag)
606 int initchar = ep[-1];
611 deriv[lev] = emptyderiv;
612 deriv[lev-1] = emptyderiv;
614 if(!ISLOWER(initchar))
616 for(t=suftab[initchar-'a']; sp=t->suf; t++) {
621 for(sp=ep-t->n1; --sp >= word && !ISVOWEL(*sp);)
625 if(!(t->affixable & flag))
627 h = (*t->p1)(ep-t->n1, t->d1, t->a1, lev+1, t->flag|STOP);
630 deriv[lev] = emptyderiv;
631 deriv[lev+1] = emptyderiv;
633 h = (*t->p2)(ep-t->n2, t->d2, t->a2, lev, t->flag|STOP);
642 nop(char* ep, char* d, char* a, int lev, int flag)
653 cstrip(char* ep, char* d, char* a, int lev, int flag)
657 if(ISVOWEL(temp) && ISVOWEL(ep[-1])) {
658 switch(pair(ep[-1],ep[0])) {
670 if(temp==ep[-1]&&temp==ep[-2])
672 return strip(ep,d,a,lev,flag);
676 strip(char* ep, char* d, char* a, int lev, int flag)
678 Bits h = trypref(ep, a, lev, flag);
681 if(Set(h,MONO) && ISVOWEL(*ep) && ISVOWEL(ep[-2]))
685 if(ISVOWEL(*ep) && !ISVOWEL(ep[-1]) && ep[-1]==ep[-2]) {
686 h = trypref(ep-1,a,lev,flag|MONO);
690 return trysuff(ep,lev,flag);
694 s(char* ep, char* d, char* a, int lev, int flag)
701 if(ISVOWEL(ep[-2])||ISUPPER(*word))
702 break; /*says Kennedys*/
715 return strip(ep,d,a,lev,flag);
719 an(char* ep, char* d, char* a, int lev, int flag)
722 if(!ISUPPER(*word)) /*must be proper name*/
724 return trypref(ep,a,lev,flag);
728 ize(char* ep, char* d, char* a, int lev, int flag)
735 h = strip(ep,"",d,lev,flag);
741 y_to_e(char* ep, char* d, char* a, int lev, int flag)
755 h = strip(ep,"",d,lev,flag);
761 ily(char* ep, char* d, char* a, int lev, int flag)
766 if(temp==ep[-1]&&temp==ep[-2]) /* sillly */
768 if(*--cp=='y' && !ISVOWEL(*--cp)) /* happyly */
770 if(ISVOWEL(*--cp)) /* shyness */
773 return i_to_y(ep,d,a,lev,flag);
774 return cstrip(ep,d,a,lev,flag);
778 bility(char* ep, char* d, char* a, int lev, int flag)
781 return y_to_e(ep,d,a,lev,flag);
785 i_to_y(char* ep, char* d, char* a, int lev, int flag)
792 if((temp=ep[-1])=='i' && !ISVOWEL(ep[-2])) {
796 h = cstrip(ep,"",a,lev,flag);
802 es(char* ep, char* d, char* a, int lev, int flag)
810 return i_to_y(ep,d,a,lev,flag);
822 return strip(ep,d,a,lev,flag);
827 subst(char* ep, char* d, char* a, int lev, int flag)
833 if(skipv(skipv(ep-1)) < word)
835 for(t=d; *t!='+'; t++)
837 for(u=ep; *--t!='-';)
839 h = strip(ep,"",d,lev,flag);
848 tion(char* ep, char* d, char* a, int lev, int flag)
852 return trypref(ep,a,lev,flag);
858 return y_to_e(ep,d,a,lev,flag);
863 * possible consonant-consonant-e ending
866 CCe(char* ep, char* d, char* a, int lev, int flag)
880 return y_to_e(ep,d,a,lev,flag);
885 if(*ep == 'a') /* prevent -able for -eable */
895 if(h = y_to_e(ep,d,a,lev,flag))
897 if(!(ep[-2]=='n' && ep[-1]=='g'))
900 return VCe(ep,d,a,lev,flag);
904 * possible consonant-vowel-consonant-e ending
907 VCe(char* ep, char* d, char* a, int lev, int flag)
915 if(!ISVOWEL(c) && ISVOWEL(ep[-2])) {
918 h = trypref(ep,d,lev,flag);
920 h = trysuff(ep,lev,flag);
926 return cstrip(ep,d,a,lev,flag);
930 lookuppref(uchar** wp, char* ep)
934 unsigned int initchar = Tolower(**wp);
936 if(!ISALPHA(initchar))
938 for(sp=preftab[initchar-'a'];sp->s;sp++) {
940 for(cp= (uchar*)sp->s;*cp; )
943 for(cp=bp;cp<(uchar*)ep;cp++)
953 /* while word is not in dictionary try stripping
954 * prefixes. Fail if no more prefixes.
957 trypref(char* ep, char* a, int lev, int flag)
967 deriv[lev].type = *a=='.'? NONE: SUFF;
969 if(h = tryword(word,ep,lev,flag)) {
970 if(Set(h, flag&~MONO) && (flag&MONO) <= Set(h, MONO))
977 deriv[lev+1].mesg = pp;
978 deriv[lev+1].type = 0;
980 while(tp=lookuppref((uchar**)(void*)&bp,ep)) {
983 while(pp<space+sizeof(space) && (*pp = *cp++))
985 deriv[lev+1].type += PREF;
986 h = tryword(bp,ep,lev+1,flag);
988 ((tp->flag&IN) && inun(bp-2,h)==0)) {
992 if(Set(h,flag&~MONO) && (flag&MONO) <= Set(h, MONO))
997 deriv[lev+1] = emptyderiv;
998 deriv[lev+2] = emptyderiv;
1004 tryword(char* bp, char* ep, int lev, int flag)
1014 deriv[++lev].mesg = duple;
1015 deriv[lev].type = SUFF;
1022 if(vflag==0 || h==0)
1025 * when derivations are wanted, collect them
1029 prefcount = suffcount = 0;
1031 if(j<DSIZ && deriv[j].type) {
1032 strcat(affix, deriv[j].mesg);
1033 if(deriv[j].type == SUFF)
1035 else if(deriv[j].type != NONE)
1036 prefcount = deriv[j].type/PREF;
1043 inun(char* bp, Bits h)
1046 return Set(h, IN) == 0;
1052 return bp[1] == 'r';
1055 return bp[1] == 'm';
1057 return bp[1] == 'n';
1063 if(s >= word && ISVOWEL(*s))
1065 while(s >= word && !ISVOWEL(*s))
1071 * crummy way to Britishise
1080 for(p = suftab[i]; p->suf; p++) {
1081 p->suf = ztos(p->suf);
1082 p->d1 = ztos(p->d1);
1083 p->a1 = ztos(p->a1);
1106 dict(char* bp, char* ep)
1108 char *cp, *cp1, *w, *wp, *we;
1126 fprint(2, "=%.*s\n", utfnlen(w, n), w);
1130 * find the beginning of some word in the middle
1132 cp = bp + (ep-bp)/2;
1134 while(cp > bp && !(*cp & 0x80))
1136 while(cp > bp && (cp[-1] & 0x80))
1139 wp = w + 2; /* skip two letters */
1140 cp1 = cp + 2; /* skip affix code */
1159 while(!(*cp1 & 0x80))
1168 f = ((cp[0] & 0x7) << 8) |
1171 fprint(2, "=%.*s ", utfnlen(w, n), w);
1172 typeprint(encode[f]);
1184 if(h & PROP_COLLECT)
1187 if((h & VERB) == VERB)
1190 if((h & VERB) == V_IRREG)
1199 if((h & COMP) == ACTOR)
1242 fprint(2, ",%s", s);
 * is the word one of the following
1251 * called knowing word[0] is a digit
1262 if(ISUPPER(cp[0]) && ISUPPER(cp[1])) {
1263 sp[0] = Tolower(cp[0]);
1264 sp[1] = Tolower(cp[1]);
1266 return 0 == strncmp(sp,
1267 cp[-2]=='1'? "th": /* out of bounds if 1 digit */
1268 *--cp=='1'? "st": /* harmless */
1275 * read in the dictionary.
1279 * long encode[nencode];
1283 * the encodings are a table all different
1285 * the dictionary proper has 2 bytes
1286 * that demark and then the rest of the
1287 * word. the 2 bytes have the following
1289 * 0x78 0x00 count of prefix bytes
1290 * common with prev word
1291 * 0x07 0xff affix code
1293 * all ints are big endians in the file.
1296 readdict(char *file)
1298 char *s, *is, *lasts, *ls;
1306 fprint(2, "cannot open %s\n", file);
1309 if(read(f, space, 2) != 2)
1311 nencode = ((space[0]&0xff)<<8) | (space[1]&0xff);
1312 if(read(f, space, 4*nencode) != 4*nencode)
1315 for(i=0; i<nencode; i++) {
1316 l = (long)(s[0] & 0xff) << 24;
1317 l |= (s[1] & 0xff) << 16;
1318 l |= (s[2] & 0xff) << 8;
1320 encode[i] = (Bits)l;
1323 l = read(f, space, sizeof(space));
1324 if(l == sizeof(space))
1326 is = space + (sizeof(space) - l);
1327 memmove(is, space, l);
1341 *s = (char)0x80; /* fence */
1346 *s++ = *is++ & 0xff;
1348 i = (*is++ & 0xff)*128;
1351 i = i/128*128 + (*is++ & 0xff);
1353 fprint(2, "the dict isnt sorted or \n");
1354 fprint(2, "memmove didn't work\n");
1365 if(is >= space+sizeof(space)) {
1378 fprint(2, "trouble reading %s\n", file);
1381 fprint(2, "not enough space for dictionary\n");