6 #define HY_BIT 0200 /* stuff in here only works for 7-bit ascii */
7 /* this value is used (as a literal) in suftab.c */
8 /* to encode possible hyphenation points in suffixes. */
9 /* it could be changed, by widening the tables */
10 /* to be shorts instead of chars. */
18 int hexsize = 0; /* hyphenation exception list size */
19 char *hbufp = NULL; /* base of list */
20 char *nexth = NULL; /* first free slot in list */
23 #define THRESH 160 /* digram goodness threshold */
27 static int alpha(Tchar);
29 void hyphen(Tchar *wp)
42 hyend = wdend = --i - 1;
47 if (wdend - wdstart < 4) /* 4 chars is too short to hyphenate */
53 /* for now, try exceptions first, then tex (if hyphalg is non-zero),
54 then suffix and digram if tex didn't hyphenate it at all.
57 if (!exword() && !texhyphen() && !suffix())
60 /* this appears to sort hyphenation points into increasing order */
65 for (hyp = hyptr + 1; *hyp != 0; hyp++) {
66 if (*(hyp - 1) > *hyp) {
76 static int alpha(Tchar i) /* non-zero if really alphabetic */
80 else if (cbits(i) >= ALPHABET) /* this isn't very elegant, but there's */
81 return 0; /* no good way to make sure i is in range for */
82 else /* the call of isalpha */
83 return isalpha(cbits(i));
96 void caseha(void) /* set hyphenation algorithm */
107 void caseht(void) /* set hyphenation threshold; not in manual! */
118 char *growh(char *where)
123 if ((new = grow(hbufp, hexsize, sizeof(char))) == NULL)
129 diff = where - hbufp;
143 if ((nexth = hbufp = grow(hbufp, NHEX, sizeof(char))) == NULL) {
144 ERROR "No space for exception word list." WARN;
151 if ((j = nexth) >= hbufp + hexsize - 2)
152 if ((j = nexth = growh(j)) == NULL)
155 if (ismot(t = getch()))
158 if (i == ' ' || i == '\n') {
171 *j++ = maplow(i) | k;
173 if (j >= hbufp + hexsize - 2)
174 if ((j = growh(j)) == NULL)
180 ERROR "Cannot grow exception word list." WARN;
193 if (e == NULL || *e == 0)
196 while (*e && w <= hyend && (*e & 0177) == maplow(cbits(*w))) {
201 if (w-1 == hyend || (w == wdend && maplow(cbits(*w)) == 's')) {
203 for (e = save; *e; e++) {
206 if (hyp > hyptr + NHYP - 1)
207 hyp = hyptr + NHYP - 1;
227 extern char *suftab[];
235 if ((s0 = suftab[i-'a']) == 0)
238 if ((i = *s0 & 017) == 0)
242 while (s > s0 && w >= wdstart && (*s & 0177) == maplow(cbits(*w))) {
259 if (*s0 & 0100) /* 0100 used in suftab to encode something too */
298 Tchar *chkvow(Tchar *w)
300 while (--w >= wdstart)
301 if (vowel(cbits(*w)))
311 Tchar *nhyend, *maxw;
313 extern char bxh[26][13], bxxh[26][13], xxh[26][13], xhx[26][13], hxx[26][13];
316 if (!(w = chkvow(hyend + 1)))
319 if (!(w = chkvow(hyend)))
324 while (++w < hyend && w < wdend - 1) {
327 val *= dilook('a', cbits(*w), bxh);
328 else if (w == wdstart + 1)
329 val *= dilook(cbits(*(w-1)), cbits(*w), bxxh);
331 val *= dilook(cbits(*(w-1)), cbits(*w), xxh);
332 val *= dilook(cbits(*w), cbits(*(w+1)), xhx);
333 val *= dilook(cbits(*(w+1)), cbits(*(w+2)), hxx);
346 dilook(int a, int b, char t[26][13])
350 i = t[maplow(a)-'a'][(j = maplow(b)-'a')/2];
357 /* here beginneth the tex hyphenation code, as interpreted freely */
358 /* the main difference is that there is no attempt to squeeze space */
359 /* as tightly as tex does. */
361 static int texit(Tchar *, Tchar *);
362 static int readpats(void);
363 static void install(char *);
364 static void fixup(void);
365 static int trieindex(int, int);
367 static char pats[50000]; /* size ought to be computed dynamically */
368 static char *nextpat = pats;
369 static char *trie[27*27]; /* english-specific sizes */
373 static int loaded = 0; /* -1: couldn't find tex file */
375 if (hyphalg == 0 || loaded == -1) /* non-zero => tex for now */
383 return texit(wdstart, wdend);
386 static int texit(Tchar *start, Tchar *end) /* hyphenate as in tex, return # found */
388 int nw, i, k, equal, cnt[500];
389 char w[500+1], *np, *pp, *wp, *xpp, *xwp;
392 for (nw = 1; start <= end && nw < 500-1; nw++, start++)
393 w[nw] = maplow(tolower(cbits(*start)));
398 * printf("try %s\n", w);
400 for (i = 0; i <= nw; i++)
403 for (wp = w; wp+1 < w+nw; wp++) {
404 for (pp = trie[trieindex(*wp, *(wp+1))]; pp < nextpat; ) {
405 if (pp == 0 /* no trie entry */
406 || *pp != *wp /* no match on 1st letter */
407 || *(pp+1) != *(wp+1)) /* no match on 2nd letter */
408 break; /* so move to next letter of word */
410 for (xpp = pp+2, xwp = wp+2; *xpp; )
411 if (*xpp++ != *xwp++) {
416 np = xpp+1; /* numpat */
417 for (k = wp-w; *np; k++, np++)
421 * printf("match: %s %s\n", pp, xpp+1);
424 pp += *(pp-1); /* skip over pattern and numbers to next */
428 * for (i = 0; i < nw; i++) printf("%c", w[i]);
430 * for (i = 0; i <= nw; i++) printf("%c", cnt[i]);
434 * for (i = 1; i < nw - 1; i++) {
435 * if (i > 2 && i < nw - 3 && cnt[i] % 2)
437 * if (cbits(start[i-1]) != '.')
438 * printf("%c", cbits(start[i-1]));
442 for (i = 1; i < nw -1; i++)
443 if (i > 2 && i < nw - 3 && cnt[i] % 2)
444 *hyp++ = start + i - 1;
445 return hyp - hyptr; /* non-zero if a hyphen was found */
449 This code assumes that hyphen.tex looks like
451 \patterns{ % more comments
452 pat5ter4ns, 1 per line, SORTED, nothing else
455 \hyphenation{ % more comments
456 ex-cep-tions, one per line; i ignore this part for now
459 this code is NOT robust against variations. unfortunately,
460 it looks like every local language version of this file has
461 a different format. i have also made no provision for weird
465 static int readpats(void)
468 char buf[200], buf1[200];
470 if ((fp = fopen(unsharp(TEXHYPHENS), "r")) == NULL
471 && (fp = fopen(unsharp(DWBalthyphens), "r")) == NULL) {
472 ERROR "warning: can't find hyphen.tex" WARN;
476 while (fgets(buf, sizeof buf, fp) != NULL) {
477 sscanf(buf, "%s", buf1);
478 if (strcmp(buf1, "\\patterns{") == 0)
481 while (fgets(buf, sizeof buf, fp) != NULL) {
491 static void install(char *s) /* map ab4c5de to: 12 abcde \0 00405 \0 */
494 char num[500], *onextpat = nextpat;
497 *nextpat++ = ' '; /* fill in with count later */
498 for (npat = lastpat = 0; *s != '\n' && *s != '\0'; s++) {
499 if (isdigit((uchar)*s)) {
509 if (nextpat > pats + sizeof(pats)-20) {
510 ERROR "tex hyphenation table overflow, tail end ignored" WARN;
514 strcat(nextpat, num);
515 nextpat += strlen(nextpat) + 1;
518 static void fixup(void) /* build indexes of where . a b c ... start */
523 for (lastc = pats, p = pats+1; p < nextpat; p++)
529 for (p = pats+1; p < nextpat; ) {
530 n = trieindex(p[0], p[1]);
535 /* printf("pats = %d\n", nextpat - pats); */
538 static int trieindex(int d1, int d2)
542 z = 27 * (d1 == '.' ? 0 : d1 - 'a' + 1) + (d2 == '.' ? 0 : d2 - 'a' + 1);
543 assert(z >= 0 && z < 27*27);