/* troff2html: convert troff output to HTML */


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
/* Fixed limits used by the converter. */
enum
{
	Nfont	= 11,	/* number of font mount positions (size of font[]) */
	Wid	= 20	/* tmac.anhtml sets page width to 20" so we can recognize .nf text */
};
11 typedef ulong Char;
12 typedef struct Troffchar Troffchar;
13 typedef struct Htmlchar Htmlchar;
14 typedef struct Font Font;
15 typedef struct HTMLfont HTMLfont;
/*
 * A Char is 32 bits.  The low 16 bits are the rune; the bits
 * above are attributes.  Each constant below is the bit number
 * of one attribute (used as 1<<Italic, and so on).
 */
enum
{
	Italic	= 16,
	Bold	= 17,
	CW	= 18,
	Indent1	= 19,
	Indent2	= 20,
	Indent3	= 21,
	Heading	= 25,
	Anchor	= 26	/* must be last */
};
/*
 * Magic emissions stored in chars[] alongside ordinary Chars.
 *
 * Estring: rune field 0; the next word of chars[] is a char* to print.
 * Epp:     paragraph spacer, stored raw (no attribute bits or'ed in).
 *
 * Epp must not equal any value that Estring|attr can take.  It used
 * to be 1<<16, which is also 1<<Italic, so a string emitted while only
 * the italic attribute was set was indistinguishable from Epp.  Bit 27
 * lies above Anchor (26, the last attribute) and cannot collide.
 */
enum
{
	Estring	= 0,
	Epp	= 1<<27
};
36 int attrorder[] = { Indent1, Indent2, Indent3, Heading, Anchor, Italic, Bold, CW };
38 int nest[10];
39 int nnest;
/* One troff special-character name and the string troffchar() returns for it. */
struct Troffchar
{
	char	*name;	/* name compared against the input by troffchar() */
	char	*value;	/* replacement string */
};
/* One rune that is written as a named string instead of as itself. */
struct Htmlchar
{
	char	*utf;	/* UTF text of the rune; decoded into value by main() */
	char	*name;	/* string emitted by emithtmlchar() for this rune */
	int	value;	/* rune value; sort and search key */
};
54 #include "chars.h"
56 struct Font{
57 char *name;
58 HTMLfont *htmlfont;
59 };
/* How one troff font name is represented in the HTML output. */
struct HTMLfont
{
	char	*name;		/* troff font name matched by htmlfont() */
	char	*htmlname;	/* HTML tag name, or nil */
	int	bit;		/* attribute bit number set while the font is current; 0 means none */
};
67 /* R must be first; it's the default representation for fonts we don't recognize */
68 HTMLfont htmlfonts[] =
69 {
70 "R", nil, 0,
71 "LuxiSans", nil, 0,
72 "I", "i", Italic,
73 "LuxiSans-Oblique", "i", Italic,
74 "CW", "tt", CW,
75 "LuxiMono", "tt", CW,
76 nil, nil
77 };
79 #define TABLE "<table border=0 cellpadding=0 cellspacing=0>"
81 char*
82 onattr[8*sizeof(ulong)] =
83 {
84 0, 0, 0, 0, 0, 0, 0, 0,
85 0, 0, 0, 0, 0, 0, 0, 0,
86 "<i>", /* italic */
87 "<b>", /* bold */
88 "<tt><font size=+1>", /* cw */
89 "<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n", /* indent1 */
90 "<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n", /* indent2 */
91 "<+table border=0 cellpadding=0 cellspacing=0><tr height=2><td><tr><td width=20><td>\n", /* indent3 */
92 0,
93 0,
94 0,
95 "<p><font size=+1><b>", /* heading 25 */
96 "<unused>", /* anchor 26 */
97 };
99 char*
100 offattr[8*sizeof(ulong)] =
102 0, 0, 0, 0, 0, 0, 0, 0,
103 0, 0, 0, 0, 0, 0, 0, 0,
104 "</i>", /* italic */
105 "</b>", /* bold */
106 "</font></tt>", /* cw */
107 "<-/table>", /* indent1 */
108 "<-/table>", /* indent2 */
109 "<-/table>", /* indent3 */
110 0,
111 0,
112 0,
113 "</b></font>", /* heading 25 */
114 "</a>", /* anchor 26 */
115 };
117 Font *font[Nfont];
119 Biobuf bout;
120 int debug = 0;
122 /* troff state */
123 int page = 1;
124 int ft = 1;
125 int vp = 0;
126 int hp = 0;
127 int ps = 1;
128 int res = 720;
130 int didP = 0;
131 int atnewline = 1;
132 int prevlineH = 0;
133 ulong attr = 0; /* or'ed into each Char */
135 Char *chars;
136 int nchars;
137 int nalloc;
138 char** anchors; /* allocated in order */
139 int nanchors;
141 char *pagename;
142 char *section;
144 char *filename;
145 int cno;
146 char buf[8192];
147 char *title = "Plan 9 man page";
149 void process(Biobuf*, char*);
150 void mountfont(int, char*);
151 void switchfont(int);
152 void header(char*);
153 void flush(void);
154 void trailer(void);
156 void*
157 emalloc(ulong n)
159 void *p;
161 p = malloc(n);
162 if(p == nil)
163 sysfatal("malloc failed: %r");
164 return p;
167 void*
168 erealloc(void *p, ulong n)
171 p = realloc(p, n);
172 if(p == nil)
173 sysfatal("realloc failed: %r");
174 return p;
177 char*
178 estrdup(char *s)
180 char *t;
182 t = strdup(s);
183 if(t == nil)
184 sysfatal("strdup failed: %r");
185 return t;
/* Print the command synopsis on fd 2 and exit with status "usage". */
void
usage(void)
{
	fprint(2, "usage: troff2html [-d] [-t title] [file ...]\n");
	exits("usage");
}
195 int
196 hccmp(const void *va, const void *vb)
198 Htmlchar *a, *b;
200 a = (Htmlchar*)va;
201 b = (Htmlchar*)vb;
202 return a->value - b->value;
205 void
206 main(int argc, char *argv[])
208 int i;
209 Biobuf in, *inp;
210 Rune r;
212 for(i=0; i<nelem(htmlchars); i++){
213 chartorune(&r, htmlchars[i].utf);
214 htmlchars[i].value = r;
216 qsort(htmlchars, nelem(htmlchars), sizeof(htmlchars[0]), hccmp);
218 ARGBEGIN{
219 case 't':
220 title = ARGF();
221 if(title == nil)
222 usage();
223 break;
224 case 'd':
225 debug++;
226 break;
227 default:
228 usage();
229 }ARGEND
231 Binit(&bout, 1, OWRITE);
232 if(argc == 0){
233 Binit(&in, 0, OREAD);
234 process(&in, "<stdin>");
235 }else{
236 for(i=0; i<argc; i++){
237 inp = Bopen(argv[i], OREAD);
238 if(inp == nil)
239 sysfatal("can't open %s: %r", argv[i]);
240 process(inp, argv[i]);
241 Bterm(inp);
244 header(title);
245 flush();
246 trailer();
247 exits(nil);
250 void
251 emitul(ulong ul, int special)
253 ulong a, c;
255 if(nalloc == nchars){
256 nalloc += 10000;
257 chars = realloc(chars, nalloc*sizeof(chars[0]));
258 if(chars == nil)
259 sysfatal("malloc failed: %r");
262 if(!special){
263 a = ul&~0xFFFF;
264 c = ul&0xFFFF;
265 /*
266 * Attr-specific transformations.
267 */
268 if((a&(1<<CW)) && c=='-')
269 c = 0x2212;
270 if(!(a&(1<<CW))){
271 if(c == '`')
272 c = 0x2018;
273 if(c == '\'')
274 c = 0x2019;
276 ul = a|c;
278 /*
279 * Turn single quotes into double quotes.
280 */
281 if(nchars > 0){
282 if(c == 0x2018 && (chars[nchars-1]&0xFFFF) == 0x2018
283 && a==(chars[nchars-1]&~0xFFFF)){
284 chars[nchars-1] = (ul&~0xFFFF) | 0x201C;
285 return;
287 if(c == 0x2019 && (chars[nchars-1]&0xFFFF) == 0x2019
288 && a==(chars[nchars-1]&~0xFFFF)){
289 chars[nchars-1] = (ul&~0xFFFF) | 0x201D;
290 return;
294 chars[nchars++] = ul;
297 void
298 emit(Rune r)
300 emitul(r | attr, 0);
301 /*
302 * Close man page references early, so that
303 * .IR proof (1),
304 * doesn't make the comma part of the link.
305 */
306 if(r == ')')
307 attr &= ~(1<<Anchor);
310 void
311 emitstr(char *s)
313 emitul(Estring | attr, 0);
314 emitul((ulong)s, 1);
317 int indentlevel;
318 int linelen;
320 void
321 iputrune(Biobuf *b, Rune r)
323 int i;
325 if(linelen++ > 60 && r == ' ')
326 r = '\n';
327 if(r >= 0x80)
328 Bprint(b, "&#%d;", r);
329 else
330 Bputrune(b, r);
331 if(r == '\n'){
332 for(i=0; i<indentlevel; i++)
333 Bprint(b, " ");
334 linelen = 0;
338 void
339 iputs(Biobuf *b, char *s)
341 if(s[0]=='<' && s[1]=='+'){
342 iputrune(b, '\n');
343 Bprint(b, "<%s", s+2);
344 indentlevel++;
345 iputrune(b, '\n');
346 }else if(s[0]=='<' && s[1]=='-'){
347 indentlevel--;
348 iputrune(b, '\n');
349 Bprint(b, "<%s", s+2);
350 iputrune(b, '\n');
351 }else
352 Bprint(b, "%s", s);
355 void
356 setattr(ulong a)
358 int on, off, i, j;
360 on = a & ~attr;
361 off = attr & ~a;
363 /* walk up the nest stack until we reach something we need to turn off. */
364 for(i=0; i<nnest; i++)
365 if(off&(1<<nest[i]))
366 break;
368 /* turn off everything above that */
369 for(j=nnest-1; j>=i; j--)
370 iputs(&bout, offattr[nest[j]]);
372 /* turn on everything we just turned off but didn't want to */
373 for(j=i; j<nnest; j++)
374 if(a&(1<<nest[j]))
375 iputs(&bout, onattr[nest[j]]);
376 else
377 nest[j] = 0;
379 /* shift the zeros (turned off things) up */
380 for(i=j=0; i<nnest; i++)
381 if(nest[i] != 0)
382 nest[j++] = nest[i];
383 nnest = j;
385 /* now turn on the new attributes */
386 for(i=0; i<nelem(attrorder); i++){
387 j = attrorder[i];
388 if(on&(1<<j)){
389 if(j == Anchor)
390 onattr[j] = anchors[nanchors++];
391 iputs(&bout, onattr[j]);
392 nest[nnest++] = j;
395 attr = a;
398 void
399 flush(void)
401 int i;
402 ulong c, a;
404 nanchors = 0;
405 for(i=0; i<nchars; i++){
406 c = chars[i];
407 if(c == Epp){
408 iputrune(&bout, '\n');
409 iputs(&bout, TABLE "<tr height=5><td></table>");
410 iputrune(&bout, '\n');
411 continue;
413 a = c & ~0xFFFF;
414 c &= 0xFFFF;
415 /*
416 * If we're going to something off after a space,
417 * let's just turn it off before.
418 */
419 if(c==' ' && i<nchars-1 && (chars[i+1]&0xFFFF) >= 32)
420 a ^= a & ~chars[i+1];
421 setattr(a);
422 if(c == Estring){
423 /* next word is string to print */
424 iputs(&bout, (char*)chars[++i]);
425 continue;
427 iputrune(&bout, c & 0xFFFF);
431 void
432 header(char *s)
434 char *p;
436 Bprint(&bout, "<head>\n");
437 if(pagename && section){
438 char buf[512];
439 strecpy(buf, buf+sizeof buf, pagename);
440 for(p=buf; *p; p++)
441 *p = tolower((uchar)*p);
442 Bprint(&bout, "<title>%s(%s) - %s</title>\n", buf, section, s);
443 }else
444 Bprint(&bout, "<title>%s</title>\n", s);
445 Bprint(&bout, "<meta content=\"text/html; charset=utf-8\" http-equiv=Content-Type>\n");
446 Bprint(&bout, "</head>\n");
447 Bprint(&bout, "<body bgcolor=#ffffff>\n");
448 Bprint(&bout, "<table border=0 cellpadding=0 cellspacing=0 width=100%%>\n");
449 Bprint(&bout, "<tr height=10><td>\n");
450 Bprint(&bout, "<tr><td width=20><td>\n");
451 if(pagename && section){
452 Bprint(&bout, "<tr><td width=20><td><b>%s(%s)</b><td align=right><b>%s(%s)</b>\n",
453 pagename, section, pagename, section);
455 Bprint(&bout, "<tr><td width=20><td colspan=2>\n");
458 void
459 trailer(void)
461 Bprint(&bout, "<td width=20>\n");
462 Bprint(&bout, "<tr height=20><td>\n");
463 Bprint(&bout, "</table>\n");
465 #ifdef LUCENT
467 Tm *t;
469 t = localtime(time(nil));
470 Bprint(&bout, TABLE "<tr height=20><td></table>\n");
471 Bprint(&bout, "<font size=-1><a href=\"http:/*www.lucent.com/copyright.html\">\n"); */
472 Bprint(&bout, "Portions Copyright</A> &#169; %d Lucent Technologies. All rights reserved.</font>\n", t->year+1900);
474 #endif
475 Bprint(&bout, "<!-- TRAILER -->\n");
476 Bprint(&bout, "</body></html>\n");
479 int
480 getc(Biobuf *b)
482 cno++;
483 return Bgetrune(b);
486 void
487 ungetc(Biobuf *b)
489 cno--;
490 Bungetrune(b);
493 char*
494 getline(Biobuf *b)
496 int i, c;
498 for(i=0; i<sizeof buf; i++){
499 c = getc(b);
500 if(c == Beof)
501 return nil;
502 buf[i] = c;
503 if(c == '\n'){
504 buf[i] = '\0';
505 break;
508 return buf;
511 int
512 getnum(Biobuf *b)
514 int i, c;
516 i = 0;
517 for(;;){
518 c = getc(b);
519 if(c<'0' || '9'<c){
520 ungetc(b);
521 break;
523 i = i*10 + (c-'0');
525 return i;
528 char*
529 getstr(Biobuf *b)
531 int i, c;
533 for(i=0; i<sizeof buf; i++){
534 /* must get bytes not runes */
535 cno++;
536 c = Bgetc(b);
537 if(c == Beof)
538 return nil;
539 buf[i] = c;
540 if(c == '\n' || c==' ' || c=='\t'){
541 ungetc(b);
542 buf[i] = '\0';
543 break;
546 return buf;
549 int
550 setnum(Biobuf *b, char *name, int min, int max)
552 int i;
554 i = getnum(b);
555 if(debug > 2)
556 fprint(2, "set %s = %d\n", name, i);
557 if(min<=i && i<max)
558 return i;
559 sysfatal("value of %s is %d; min %d max %d at %s:#%d", name, i, min, max, filename, cno);
560 return i;
563 void
564 xcmd(Biobuf *b)
566 char *p, *fld[16], buf[1024];
568 int i, nfld;
570 p = getline(b);
571 if(p == nil)
572 sysfatal("xcmd error: %r");
573 if(debug)
574 fprint(2, "x command '%s'\n", p);
575 nfld = tokenize(p, fld, nelem(fld));
576 if(nfld == 0)
577 return;
578 switch(fld[0][0]){
579 case 'f':
580 /* mount font */
581 if(nfld != 3)
582 break;
583 i = atoi(fld[1]);
584 if(i<0 || Nfont<=i)
585 sysfatal("font %d out of range at %s:#%d", i, filename, cno);
586 mountfont(i, fld[2]);
587 return;
588 case 'i':
589 /* init */
590 return;
591 case 'r':
592 if(nfld<2 || atoi(fld[1])!=res)
593 sysfatal("typesetter has unexpected resolution %s", fld[1]? fld[1] : "<unspecified>");
594 return;
595 case 's':
596 /* stop */
597 return;
598 case 't':
599 /* trailer */
600 return;
601 case 'T':
602 if(nfld!=2 || strcmp(fld[1], "utf")!=0)
603 sysfatal("output for unknown typesetter type %s", fld[1]);
604 return;
605 case 'X':
606 if(nfld<3 || strcmp(fld[1], "html")!=0)
607 break;
608 /* is it a man reference of the form cp(1)? */
609 /* X manref start/end cp (1) */
610 if(nfld==6 && strcmp(fld[2], "manref")==0){
611 /* was the right macro; is it the right form? */
612 if(strlen(fld[5])>=3 &&
613 fld[5][0]=='('/*)*/ && (fld[5][2]==/*(*/')' || (isalpha((uchar)fld[5][2]) && fld[5][3]==/*(*/')')) &&
614 '0'<=fld[5][1] && fld[5][1]<='9'){
615 if(strcmp(fld[3], "start") == 0){
616 /* set anchor attribute and remember string */
617 attr |= (1<<Anchor);
618 #if 0
619 snprint(buf, sizeof buf,
620 "<a href=\"/magic/man2html/man%c/%s\">",
621 fld[5][1], fld[4]);
622 #else
623 snprint(buf, sizeof buf,
624 "<a href=\"../man%c/%s.html\">", fld[5][1], fld[4]);
625 for(p=buf; *p; p++)
626 if('A' <= *p && *p <= 'Z')
627 *p += 'a'-'A';
628 #endif
629 nanchors++;
630 anchors = erealloc(anchors, nanchors*sizeof(char*));
631 anchors[nanchors-1] = estrdup(buf);
632 }else if(strcmp(fld[3], "end") == 0)
633 attr &= ~(1<<Anchor);
635 }else if(nfld >= 4 && strcmp(fld[2], "href") == 0){
636 attr |= 1<<Anchor;
637 nanchors++;
638 anchors = erealloc(anchors, nanchors*sizeof(char*));
639 anchors[nanchors-1] = smprint("<a href=\"%s\">", fld[3]);
640 }else if(strcmp(fld[2], "/href") == 0){
641 attr &= ~(1<<Anchor);
642 }else if(strcmp(fld[2], "manPP") == 0){
643 didP = 1;
644 emitul(Epp, 1);
645 }else if(nfld>=5 && strcmp(fld[2], "manhead") == 0){
646 pagename = strdup(fld[3]);
647 section = strdup(fld[4]);
648 }else if(nfld<4 || strcmp(fld[2], "manref")!=0){
649 if(nfld>2 && strcmp(fld[2], "<P>")==0){ /* avoid triggering extra <br> */
650 didP = 1;
651 /* clear all font attributes before paragraph */
652 emitul(' ' | (attr & ~(0xFFFF|((1<<Italic)|(1<<Bold)|(1<<CW)))), 0);
653 emitstr("<P>");
654 /* next emittec char will turn font attributes back on */
655 }else if(nfld>2 && strcmp(fld[2], "<H4>")==0)
656 attr |= (1<<Heading);
657 else if(nfld>2 && strcmp(fld[2], "</H4>")==0)
658 attr &= ~(1<<Heading);
659 else if(debug)
660 fprint(2, "unknown in-line html %s... at %s:%#d\n",
661 fld[2], filename, cno);
663 return;
665 if(debug)
666 fprint(2, "unknown or badly formatted x command %s\n", fld[0]);
669 int
670 lookup(int c, Htmlchar tab[], int ntab)
672 int low, high, mid;
674 low = 0;
675 high = ntab - 1;
676 while(low <= high){
677 mid = (low+high)/2;
678 if(c < tab[mid].value)
679 high = mid - 1;
680 else if(c > tab[mid].value)
681 low = mid + 1;
682 else
683 return mid;
685 return -1; /* no match */
688 void
689 emithtmlchar(int r)
691 int i;
693 i = lookup(r, htmlchars, nelem(htmlchars));
694 if(i >= 0)
695 emitstr(htmlchars[i].name);
696 else
697 emit(r);
700 char*
701 troffchar(char *s)
703 int i;
705 for(i=0; troffchars[i].name!=nil; i++)
706 if(strcmp(s, troffchars[i].name) == 0)
707 return troffchars[i].value;
708 return strdup(s);
711 void
712 indent(void)
714 int nind;
716 didP = 0;
717 if(atnewline){
718 if(hp != prevlineH){
719 prevlineH = hp;
720 /* these most peculiar numbers appear in the troff -man output */
721 nind = ((prevlineH-1*res)+323)/324;
722 attr &= ~((1<<Indent1)|(1<<Indent2)|(1<<Indent3));
723 if(nind >= 1)
724 attr |= (1<<Indent1);
725 if(nind >= 2)
726 attr |= (1<<Indent2);
727 if(nind >= 3)
728 attr |= (1<<Indent3);
730 atnewline = 0;
734 void
735 process(Biobuf *b, char *name)
737 int c, r, v, i;
738 char *p;
740 cno = 0;
741 prevlineH = res;
742 filename = name;
743 for(;;){
744 c = getc(b);
745 switch(c){
746 case Beof:
747 /* go to ground state */
748 attr = 0;
749 emit('\n');
750 return;
751 case '\n':
752 break;
753 case '0': case '1': case '2': case '3': case '4':
754 case '5': case '6': case '7': case '8': case '9':
755 v = c-'0';
756 c = getc(b);
757 if(c<'0' || '9'<c)
758 sysfatal("illegal character motion at %s:#%d", filename, cno);
759 v = v*10 + (c-'0');
760 hp += v;
761 /* fall through to character case */
762 case 'c':
763 indent();
764 r = getc(b);
765 emithtmlchar(r);
766 break;
767 case 'D':
768 /* draw line; ignore */
769 do
770 c = getc(b);
771 while(c!='\n' && c!= Beof);
772 break;
773 case 'f':
774 v = setnum(b, "font", 0, Nfont);
775 switchfont(v);
776 break;
777 case 'h':
778 v = setnum(b, "hpos", -20000, 20000);
779 /* generate spaces if motion is large and within a line */
780 if(!atnewline && v>2*72)
781 for(i=0; i<v; i+=72)
782 emitstr("&nbsp;");
783 hp += v;
784 break;
785 case 'n':
786 setnum(b, "n1", -10000, 10000);
787 /*Bprint(&bout, " N1=%d", v); */
788 getc(b); /* space separates */
789 setnum(b, "n2", -10000, 10000);
790 atnewline = 1;
791 if(!didP && hp < (Wid-1)*res) /* if line is less than 19" long, probably need a line break */
792 emitstr("<br>");
793 emit('\n');
794 break;
795 case 'p':
796 page = setnum(b, "ps", -10000, 10000);
797 break;
798 case 's':
799 ps = setnum(b, "ps", 1, 1000);
800 break;
801 case 'v':
802 vp += setnum(b, "vpos", -10000, 10000);
803 /* BUG: ignore motion */
804 break;
805 case 'x':
806 xcmd(b);
807 break;
808 case 'w':
809 emit(' ');
810 break;
811 case 'C':
812 indent();
813 p = getstr(b);
814 emitstr(troffchar(p));
815 break;
816 case 'H':
817 hp = setnum(b, "hpos", 0, 20000);
818 /*Bprint(&bout, " H=%d ", hp); */
819 break;
820 case 'V':
821 vp = setnum(b, "vpos", 0, 10000);
822 break;
823 default:
824 fprint(2, "dhtml: unknown directive %c(0x%.2ux) at %s:#%d\n", c, c, filename, cno);
825 return;
830 HTMLfont*
831 htmlfont(char *name)
833 int i;
835 for(i=0; htmlfonts[i].name!=nil; i++)
836 if(strcmp(name, htmlfonts[i].name) == 0)
837 return &htmlfonts[i];
838 return &htmlfonts[0];
841 void
842 mountfont(int pos, char *name)
844 if(debug)
845 fprint(2, "mount font %s on %d\n", name, pos);
846 if(font[pos] != nil){
847 free(font[pos]->name);
848 free(font[pos]);
850 font[pos] = emalloc(sizeof(Font));
851 font[pos]->name = estrdup(name);
852 font[pos]->htmlfont = htmlfont(name);
855 void
856 switchfont(int pos)
858 HTMLfont *hf;
860 if(debug)
861 fprint(2, "font change from %d (%s) to %d (%s)\n", ft, font[ft]->name, pos, font[pos]->name);
862 if(pos == ft)
863 return;
864 hf = font[ft]->htmlfont;
865 if(hf->bit != 0)
866 attr &= ~(1<<hf->bit);
867 ft = pos;
868 hf = font[ft]->htmlfont;
869 if(hf->bit != 0)
870 attr |= (1<<hf->bit);