Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
7 /*
8 * file - determine type of file
9 */
/*
 * Assemble the first four bytes at p into a little-endian 32-bit value.
 * The bytes are widened to unsigned long before shifting so that a top
 * byte >= 0x80 is never shifted into the sign bit of an int.
 */
#define LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
12 uchar buf[6001];
13 short cfreq[140];
14 short wfreq[50];
15 int nbuf;
16 Dir* mbuf;
17 int fd;
18 char *fname;
19 char *slash;
/*
 * Classes counted in wfreq[] (the small values) and in cfreq[]
 * (Clatin and up, which sit above the ASCII range).
 */
enum
{
	Cword,			/* C keyword */
	Fword,			/* presumably Fortran keyword -- no classifier in view reads it */
	Aword,			/* assembler keyword, read by isas() */
	Alword,			/* alef keyword, read by isc() */
	Lword,			/* limbo keyword, read by islimbo() */
	I1,			/* the word "include" */
	I2,			/* a header base name: bio, libc, stdio, u */
	I3,			/* the word "h" */
	Clatin	= 128,		/* Latin-1 character */
	Cbinary,		/* rune in 0x81-0x9F */
	Cnull,			/* NUL */
	Ceascii,		/* ASCII control character */
	Cutf,			/* rune counted through bump_utf_count() */
};

/*
 * Words looked up by wordfreq().  The lookup is a binary search using
 * strcmp(), so the table MUST stay sorted in strcmp() order (upper case
 * sorts before lower case).
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	{"PATH",	Lword},
	{"TEXT",	Aword},
	{"adt",		Alword},
	{"aggr",	Alword},
	{"alef",	Alword},
	{"array",	Lword},
	{"bio",		I2},
	{"block",	Fword},
	{"chan",	Alword},
	{"char",	Cword},
	{"common",	Fword},
	{"con",		Lword},
	{"data",	Fword},
	{"dimension",	Fword},
	{"double",	Cword},
	{"extern",	Cword},
	{"float",	Cword},
	{"fn",		Lword},
	{"function",	Fword},
	{"h",		I3},
	{"implement",	Lword},
	{"import",	Lword},
	{"include",	I1},
	{"int",		Cword},
	{"integer",	Fword},
	{"iota",	Lword},
	{"libc",	I2},
	{"long",	Cword},
	{"module",	Lword},
	{"real",	Fword},
	{"ref",		Lword},
	{"register",	Cword},
	{"self",	Lword},
	{"short",	Cword},
	{"static",	Cword},
	{"stdio",	I2},
	{"struct",	Cword},
	{"subroutine",	Fword},
	{"u",		I2},
	{"void",	Cword},
};
/* values of the 'mode' field in the language table */
enum {
	Normal	= 0,
	First,		/* first entry for language spanning several ranges */
	Multi,		/* later entries for the same language */
	Shared,		/* codes used in several languages */
};

/*
 * Rune ranges and the language each one is reported as.
 * bump_utf_count() binary-searches this table, so the rows are in
 * ascending range order; the row with a null name ends the table.
 */
struct
{
	int	mode;		/* see enum above */
	int	count;		/* runes seen in this range */
	int	low;		/* first rune of the range */
	int	high;		/* last rune of the range */
	char	*name;
} language[] =
{
	{Normal,	0,	0x0080,	0x0080,	"Extended Latin"},
	{Normal,	0,	0x0100,	0x01FF,	"Extended Latin"},
	{Normal,	0,	0x0370,	0x03FF,	"Greek"},
	{Normal,	0,	0x0400,	0x04FF,	"Cyrillic"},
	{Normal,	0,	0x0530,	0x058F,	"Armenian"},
	{Normal,	0,	0x0590,	0x05FF,	"Hebrew"},
	{Normal,	0,	0x0600,	0x06FF,	"Arabic"},
	{Normal,	0,	0x0900,	0x097F,	"Devanagari"},
	{Normal,	0,	0x0980,	0x09FF,	"Bengali"},
	{Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi"},
	{Normal,	0,	0x0A80,	0x0AFF,	"Gujarati"},
	{Normal,	0,	0x0B00,	0x0B7F,	"Oriya"},
	{Normal,	0,	0x0B80,	0x0BFF,	"Tamil"},
	{Normal,	0,	0x0C00,	0x0C7F,	"Telugu"},
	{Normal,	0,	0x0C80,	0x0CFF,	"Kannada"},
	{Normal,	0,	0x0D00,	0x0D7F,	"Malayalam"},
	{Normal,	0,	0x0E00,	0x0E7F,	"Thai"},
	{Normal,	0,	0x0E80,	0x0EFF,	"Lao"},
	{Normal,	0,	0x1000,	0x105F,	"Tibetan"},
	{Normal,	0,	0x10A0,	0x10FF,	"Georgian"},
	{Normal,	0,	0x3040,	0x30FF,	"Japanese"},
	{Normal,	0,	0x3100,	0x312F,	"Chinese"},
	{First,		0,	0x3130,	0x318F,	"Korean"},
	{Multi,		0,	0x3400,	0x3D2F,	"Korean"},
	{Shared,	0,	0x4e00,	0x9fff,	"CJK"},
	{Normal,	0,	0,	0,	0},	/* terminal entry */
};
/*
 * Gross classification of the buffer, chosen in filetype()
 * from the character-class histogram.
 */
enum
{
	Fascii,		/* printable ascii */
	Flatin,		/* latin 1 */
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ASCII with control chars */
	Fnull,		/* NULL in file */
} guess;
142 void bump_utf_count(Rune);
143 int cistrncmp(char*, char*, int);
144 void filetype(int);
145 int getfontnum(uchar*, uchar**);
146 int isas(void);
147 int isc(void);
148 int isenglish(void);
149 int ishp(void);
150 int ishtml(void);
151 int isrfc822(void);
152 int ismbox(void);
153 int islimbo(void);
154 int ismung(void);
155 int isp9bit(void);
156 int isp9font(void);
157 int isrtf(void);
158 int ismsdos(void);
159 int iself(void);
160 int istring(void);
161 int iff(void);
162 int long0(void);
163 int istar(void);
164 int p9bitnum(uchar*);
165 int p9subfont(uchar*);
166 void print_utf(void);
167 void type(char*, int);
168 int utf_count(void);
169 void wordfreq(void);
171 int (*call[])(void) =
173 long0, /* recognizable by first 4 bytes */
174 istring, /* recognizable by first string */
175 iff, /* interchange file format (strings) */
176 isrfc822, /* email file */
177 ismbox, /* mail box */
178 istar, /* recognizable by tar checksum */
179 ishtml, /* html keywords */
180 /* iscint, /* compiler/assembler intermediate */
181 islimbo, /* limbo source */
182 isc, /* c & alef compiler key words */
183 isas, /* assembler key words */
184 ismung, /* entropy compressed/encrypted */
185 isp9font, /* plan 9 font */
186 isp9bit, /* plan 9 image (as from /dev/window) */
187 isenglish, /* char frequency English */
188 isrtf, /* rich text format */
189 ismsdos, /* msdos exe (virus file attachement) */
190 iself, /* ELF (foreign) executable */
192 };
/* non-zero after -m: print MIME types instead of descriptions */
int	mime;

/* fallback MIME types; both already end in a newline */
#define	OCTET	"application/octet-stream\n"
#define	PLAIN	"text/plain\n"
199 void
200 main(int argc, char *argv[])
202 int i, j, maxlen;
203 char *cp;
204 Rune r;
206 ARGBEGIN{
207 case 'm':
208 mime = 1;
209 break;
210 default:
211 fprint(2, "usage: file [-m] [file...]\n");
212 exits("usage");
213 }ARGEND;
215 maxlen = 0;
216 if(mime == 0 || argc > 1){
217 for(i = 0; i < argc; i++) {
218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
220 if(j > maxlen)
221 maxlen = j;
224 if (argc <= 0) {
225 if(!mime)
226 print ("stdin: ");
227 filetype(0);
229 else {
230 for(i = 0; i < argc; i++)
231 type(argv[i], maxlen);
233 exits(0);
236 void
237 type(char *file, int nlen)
239 Rune r;
240 int i;
241 char *p;
243 if(nlen > 0){
244 slash = 0;
245 for (i = 0, p = file; *p; i++) {
246 if (*p == '/') /* find rightmost slash */
247 slash = p;
248 p += chartorune(&r, p); /* count runes */
250 print("%s:%*s",file, nlen-i+1, "");
252 fname = file;
253 if ((fd = open(file, OREAD)) < 0) {
254 print("cannot open\n");
255 return;
257 filetype(fd);
258 close(fd);
261 void
262 filetype(int fd)
264 Rune r;
265 int i, f, n;
266 char *p, *eob;
268 free(mbuf);
269 mbuf = dirfstat(fd);
270 if(mbuf == nil){
271 print("cannot stat: %r\n");
272 return;
274 if(mbuf->mode & DMDIR) {
275 print(mime ? "text/directory\n" : "directory\n");
276 return;
278 if(mbuf->type != 'M' && mbuf->type != '|') {
279 print(mime ? OCTET : "special file #%c/%s\n",
280 mbuf->type, mbuf->name);
281 return;
283 nbuf = read(fd, buf, sizeof(buf)-1);
285 if(nbuf < 0) {
286 print("cannot read\n");
287 return;
289 if(nbuf == 0) {
290 print(mime ? PLAIN : "empty file\n");
291 return;
293 buf[nbuf] = 0;
295 /*
296 * build histogram table
297 */
298 memset(cfreq, 0, sizeof(cfreq));
299 for (i = 0; language[i].name; i++)
300 language[i].count = 0;
301 eob = (char *)buf+nbuf;
302 for(n = 0, p = (char *)buf; p < eob; n++) {
303 if (!fullrune(p, eob-p) && eob-p < UTFmax)
304 break;
305 p += chartorune(&r, p);
306 if (r == 0)
307 f = Cnull;
308 else if (r <= 0x7f) {
309 if (!isprint(r) && !isspace(r))
310 f = Ceascii; /* ASCII control char */
311 else f = r;
312 } else if (r == 0x080) {
313 bump_utf_count(r);
314 f = Cutf;
315 } else if (r < 0xA0)
316 f = Cbinary; /* Invalid Runes */
317 else if (r <= 0xff)
318 f = Clatin; /* Latin 1 */
319 else {
320 bump_utf_count(r);
321 f = Cutf; /* UTF extension */
323 cfreq[f]++; /* ASCII chars peg directly */
325 /*
326 * gross classify
327 */
328 if (cfreq[Cbinary])
329 guess = Fbinary;
330 else if (cfreq[Cutf])
331 guess = Futf;
332 else if (cfreq[Clatin])
333 guess = Flatin;
334 else if (cfreq[Ceascii])
335 guess = Feascii;
336 else if (cfreq[Cnull] == n) {
337 print(mime ? OCTET : "first block all null bytes\n");
338 return;
340 else guess = Fascii;
341 /*
342 * lookup dictionary words
343 */
344 memset(wfreq, 0, sizeof(wfreq));
345 if(guess == Fascii || guess == Flatin || guess == Futf)
346 wordfreq();
347 /*
348 * call individual classify routines
349 */
350 for(i=0; call[i]; i++)
351 if((*call[i])())
352 return;
354 /*
355 * if all else fails,
356 * print out gross classification
357 */
358 if (nbuf < 100 && !mime)
359 print(mime ? PLAIN : "short ");
360 if (guess == Fascii)
361 print(mime ? PLAIN : "Ascii\n");
362 else if (guess == Feascii)
363 print(mime ? PLAIN : "extended ascii\n");
364 else if (guess == Flatin)
365 print(mime ? PLAIN : "latin ascii\n");
366 else if (guess == Futf && utf_count() < 4)
367 print_utf();
368 else print(mime ? OCTET : "binary\n");
371 void
372 bump_utf_count(Rune r)
374 int low, high, mid;
376 high = sizeof(language)/sizeof(language[0])-1;
377 for (low = 0; low < high;) {
378 mid = (low+high)/2;
379 if (r >=language[mid].low) {
380 if (r <= language[mid].high) {
381 language[mid].count++;
382 break;
383 } else low = mid+1;
384 } else high = mid;
388 int
389 utf_count(void)
391 int i, count;
393 count = 0;
394 for (i = 0; language[i].name; i++)
395 if (language[i].count > 0)
396 switch (language[i].mode) {
397 case Normal:
398 case First:
399 count++;
400 break;
401 default:
402 break;
404 return count;
407 int
408 chkascii(void)
410 int i;
412 for (i = 'a'; i < 'z'; i++)
413 if (cfreq[i])
414 return 1;
415 for (i = 'A'; i < 'Z'; i++)
416 if (cfreq[i])
417 return 1;
418 return 0;
421 int
422 find_first(char *name)
424 int i;
426 for (i = 0; language[i].name != 0; i++)
427 if (language[i].mode == First
428 && strcmp(language[i].name, name) == 0)
429 return i;
430 return -1;
433 void
434 print_utf(void)
436 int i, printed, j;
438 if(mime){
439 print(PLAIN);
440 return;
442 if (chkascii()) {
443 printed = 1;
444 print("Ascii");
445 } else
446 printed = 0;
447 for (i = 0; language[i].name; i++)
448 if (language[i].count) {
449 switch(language[i].mode) {
450 case Multi:
451 j = find_first(language[i].name);
452 if (j < 0)
453 break;
454 if (language[j].count > 0)
455 break;
456 /* Fall through */
457 case Normal:
458 case First:
459 if (printed)
460 print(" & ");
461 else printed = 1;
462 print("%s", language[i].name);
463 break;
464 case Shared:
465 default:
466 break;
469 if(!printed)
470 print("UTF");
471 print(" text\n");
474 void
475 wordfreq(void)
477 int low, high, mid, r;
478 uchar *p, *p2, c;
480 p = buf;
481 for(;;) {
482 while (p < buf+nbuf && !isalpha(*p))
483 p++;
484 if (p >= buf+nbuf)
485 return;
486 p2 = p;
487 while(p < buf+nbuf && isalpha(*p))
488 p++;
489 c = *p;
490 *p = 0;
491 high = sizeof(dict)/sizeof(dict[0]);
492 for(low = 0;low < high;) {
493 mid = (low+high)/2;
494 r = strcmp(dict[mid].word, (char*)p2);
495 if(r == 0) {
496 wfreq[dict[mid].class]++;
497 break;
499 if(r < 0)
500 low = mid+1;
501 else
502 high = mid;
504 *p++ = c;
508 typedef struct Filemagic Filemagic;
509 struct Filemagic {
510 ulong x;
511 ulong mask;
512 char *desc;
513 char *mime;
514 };
516 Filemagic long0tab[] = {
517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip",
523 070707, 0xFFFF, "cpio archive\n", OCTET,
524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi",
525 0xfffa0000, 0xfffe0000, "mp3 audio", "audio/mpeg",
526 };
528 int
529 filemagic(Filemagic *tab, int ntab, ulong x)
531 int i;
533 for(i=0; i<ntab; i++)
534 if((x&tab[i].mask) == tab[i].x){
535 print(mime ? tab[i].mime : tab[i].desc);
536 return 1;
538 return 0;
541 int
542 long0(void)
544 // Fhdr *f;
545 long x;
547 seek(fd, 0, 0); /* reposition to start of file */
548 /*
549 if(crackhdr(fd, &f)) {
550 print(mime ? OCTET : "%s\n", f.name);
551 return 1;
553 */
554 x = LENDIAN(buf);
555 if(filemagic(long0tab, nelem(long0tab), x))
556 return 1;
557 return 0;
/* from tar.c */
enum { NAMSIZ = 100, TBLOCK = 512 };

/* one tar block, viewed either as raw bytes or as a header */
union hblock
{
	char dummy[TBLOCK];
	struct header
	{
		char name[NAMSIZ];
		char mode[8];
		char uid[8];
		char gid[8];
		char size[12];
		char mtime[12];
		char chksum[8];
		char linkflag;
		char linkname[NAMSIZ];
		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char magic[6];		/* "ustar" */
		char version[2];
		char uname[32];
		char gname[32];
		char devmajor[8];
		char devminor[8];
		char prefix[155];	/* if non-null, path = prefix "/" name */
	} dbuf;
};

/*
 * Return the sum of all TBLOCK bytes of *hp, each taken as an unsigned
 * value, with the chksum field counted as blanks.  The chksum field of
 * *hp is overwritten with blanks as a side effect.
 */
int
checksum(union hblock *hp)
{
	int k, sum;

	memset(hp->dbuf.chksum, ' ', sizeof hp->dbuf.chksum);
	sum = 0;
	for(k = 0; k < TBLOCK; k++)
		sum += hp->dummy[k] & 0xff;
	return sum;
}
603 int
604 istar(void)
606 int chksum;
607 char tblock[TBLOCK];
608 union hblock *hp = (union hblock *)tblock;
609 struct header *hdr = &hp->dbuf;
611 seek(fd, 0, 0); /* reposition to start of file */
612 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
613 return 0;
614 chksum = strtol(hdr->chksum, 0, 8);
615 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
616 if (strcmp(hdr->magic, "ustar") == 0)
617 print(mime? "application/x-ustar\n":
618 "posix tar archive\n");
619 else
620 print(mime? "application/x-tar\n": "tar archive\n");
621 return 1;
623 return 0;
/*
 * Leading byte strings that identify a file type.  istring() compares
 * the first 'length' bytes of the buffer with 'key'; keys may contain
 * NUL bytes, which is why the length is given explicitly.  Earlier rows
 * win, so a longer key must come before any key that is its prefix.
 * The row with a null key ends the table.
 */
struct	FILE_STRING
{
	char 	*key;		/* bytes expected at the start of the file */
	char	*filetype;	/* description printed without -m */
	int	length;		/* number of bytes of key to compare */
	char	*mime;		/* MIME type printed with -m */
} file_string[] =
{
	{"!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream"},
	{"!<arch>\n",		"archive",			8,	"application/octet-stream"},
	{"070707",		"cpio archive - ascii header",	6,	"application/octet-stream"},
	{"#!/bin/rc",		"rc executable file",		9,	"text/plain"},
	{"#!/bin/sh",		"sh executable file",		9,	"text/plain"},
	{"%!",			"postscript",			2,	"application/postscript"},
	{"\004%!",		"postscript",			3,	"application/postscript"},
	{"x T post",		"troff output for post",	8,	"application/troff"},
	{"x T Latin1",		"troff output for Latin1",	10,	"application/troff"},
	{"x T utf",		"troff output for UTF",		7,	"application/troff"},
	{"x T 202",		"troff output for 202",		7,	"application/troff"},
	{"x T aps",		"troff output for aps",		7,	"application/troff"},
	{"GIF",			"GIF image", 			3,	"image/gif"},
	{"\0PC Research, Inc\0",	"ghostscript fax file",		18,	"application/ghostscript"},
	{"%PDF",		"PDF",				4,	"application/pdf"},
	{"<html>\n",		"HTML file",			7,	"text/html"},
	{"<HTML>\n",		"HTML file",			7,	"text/html"},
	{"compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream"},
	{"\111\111\052\000",	"tiff",				4,	"image/tiff"},
	{"\115\115\000\052",	"tiff",				4,	"image/tiff"},
	{"\377\330\377\340",	"jpeg",				4,	"image/jpeg"},
	{"\377\330\377\341",	"jpeg",				4,	"image/jpeg"},
	{"\377\330\377\333",	"jpeg",				4,	"image/jpeg"},
	{"BM",			"bmp",				2,	"image/bmp"},
	{"\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream"},
	{"<MakerFile ",		"FrameMaker file",		11,	"application/framemaker"},
	{"\033%-12345X",	"HPJCL file",			9,	"application/hpjcl"},
	{"ID3",			"mp3 audio with id3",		3,	"audio/mpeg"},
	{0, 0, 0, 0}
};
668 int
669 istring(void)
671 int i;
672 struct FILE_STRING *p;
674 for(p = file_string; p->key; p++) {
675 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
676 if(mime)
677 print("%s\n", p->mime);
678 else
679 print("%s\n", p->filetype);
680 return 1;
683 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
684 for(i = 5; i < nbuf; i++)
685 if(buf[i] == '\n')
686 break;
687 if(mime)
688 print(OCTET);
689 else
690 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
691 return 1;
693 return 0;
696 int
697 iff(void)
699 if (strncmp((char*)buf, "FORM", 4) == 0 &&
700 strncmp((char*)buf+8, "AIFF", 4) == 0) {
701 print("%s\n", mime? "audio/x-aiff": "aiff audio");
702 return 1;
704 return 0;
/*
 * Tag names that ishtml() looks for between '<' and '>';
 * the null pointer ends the list.
 */
char*	html_string[] =
{
	"title", "body", "head", "strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl", "br", "em",
	0,
};
727 int
728 ishtml(void)
730 uchar *p, *q;
731 int i, count;
733 /* compare strings between '<' and '>' to html table */
734 count = 0;
735 p = buf;
736 for(;;) {
737 while (p < buf+nbuf && *p != '<')
738 p++;
739 p++;
740 if (p >= buf+nbuf)
741 break;
742 if(*p == '/')
743 p++;
744 q = p;
745 while(p < buf+nbuf && *p != '>')
746 p++;
747 if (p >= buf+nbuf)
748 break;
749 for(i = 0; html_string[i]; i++) {
750 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
751 if(count++ > 4) {
752 print(mime ? "text/html\n" : "HTML file\n");
753 return 1;
755 break;
758 p++;
760 return 0;
/*
 * Header field names that isrfc822() counts, compared without regard to
 * case; the null pointer ends the list.  A field name cannot contain a
 * space, so the reply header is spelled "reply-to:".
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"sender:",
	0,
};
775 int
776 isrfc822(void)
779 char *p, *q, *r;
780 int i, count;
782 count = 0;
783 p = (char*)buf;
784 for(;;) {
785 q = strchr(p, '\n');
786 if(q == nil)
787 break;
788 *q = 0;
789 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
790 count++;
791 *q = '\n';
792 p = q+1;
793 continue;
795 *q = '\n';
796 if(*p != '\t' && *p != ' '){
797 r = strchr(p, ':');
798 if(r == 0 || r > q)
799 break;
800 for(i = 0; rfc822_string[i]; i++) {
801 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
802 count++;
803 break;
807 p = q+1;
809 if(count >= 3){
810 print(mime ? "message/rfc822\n" : "email file\n");
811 return 1;
813 return 0;
816 int
817 ismbox(void)
819 char *p, *q;
821 p = (char*)buf;
822 q = strchr(p, '\n');
823 if(q == nil)
824 return 0;
825 *q = 0;
826 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
827 print(mime ? "text/plain\n" : "mail box\n");
828 return 1;
830 *q = '\n';
831 return 0;
834 int
835 isc(void)
837 int n;
839 n = wfreq[I1];
840 /*
841 * includes
842 */
843 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
844 goto yes;
845 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
846 goto yes;
847 /*
848 * declarations
849 */
850 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
851 goto yes;
852 /*
853 * assignments
854 */
855 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
856 goto yes;
857 return 0;
859 yes:
860 if(mime){
861 print(PLAIN);
862 return 1;
864 if(wfreq[Alword] > 0)
865 print("alef program\n");
866 else
867 print("c program\n");
868 return 1;
871 int
872 islimbo(void)
875 /*
876 * includes
877 */
878 if(wfreq[Lword] < 4)
879 return 0;
880 print(mime ? PLAIN : "limbo program\n");
881 return 1;
884 int
885 isas(void)
888 /*
889 * includes
890 */
891 if(wfreq[Aword] < 2)
892 return 0;
893 print(mime ? PLAIN : "as program\n");
894 return 1;
897 /*
898 * low entropy means encrypted
899 */
900 int
901 ismung(void)
903 int i, bucket[8];
904 float cs;
906 if(nbuf < 64)
907 return 0;
908 memset(bucket, 0, sizeof(bucket));
909 for(i=0; i<64; i++)
910 bucket[(buf[i]>>5)&07] += 1;
912 cs = 0.;
913 for(i=0; i<8; i++)
914 cs += (bucket[i]-8)*(bucket[i]-8);
915 cs /= 8.;
916 if(cs <= 24.322) {
917 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
918 print(mime ? OCTET : "compressed\n");
919 else
920 print(mime ? OCTET : "encrypted\n");
921 return 1;
923 return 0;
926 /*
927 * english by punctuation and frequencies
928 */
929 int
930 isenglish(void)
932 int vow, comm, rare, badpun, punct;
933 char *p;
935 if(guess != Fascii && guess != Feascii)
936 return 0;
937 badpun = 0;
938 punct = 0;
939 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
940 switch(*p) {
941 case '.':
942 case ',':
943 case ')':
944 case '%':
945 case ';':
946 case ':':
947 case '?':
948 punct++;
949 if(p[1] != ' ' && p[1] != '\n')
950 badpun++;
952 if(badpun*5 > punct)
953 return 0;
954 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
955 return 0;
956 if(2*cfreq[';'] > cfreq['e'])
957 return 0;
959 vow = 0;
960 for(p="AEIOU"; *p; p++) {
961 vow += cfreq[(uchar)*p];
962 vow += cfreq[tolower((uchar)*p)];
964 comm = 0;
965 for(p="ETAION"; *p; p++) {
966 comm += cfreq[(uchar)*p];
967 comm += cfreq[tolower((uchar)*p)];
969 rare = 0;
970 for(p="VJKQXZ"; *p; p++) {
971 rare += cfreq[(uchar)*p];
972 rare += cfreq[tolower((uchar)*p)];
974 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
975 print(mime ? PLAIN : "English text\n");
976 return 1;
978 return 0;
981 /*
982 * pick up a number with
983 * syntax _*[0-9]+_
984 */
985 #define P9BITLEN 12
986 int
987 p9bitnum(uchar *bp)
989 int n, c, len;
991 len = P9BITLEN;
992 while(*bp == ' ') {
993 bp++;
994 len--;
995 if(len <= 0)
996 return -1;
998 n = 0;
999 while(len > 1) {
1000 c = *bp++;
1001 if(!isdigit(c))
1002 return -1;
1003 n = n*10 + c-'0';
1004 len--;
1006 if(*bp != ' ')
1007 return -1;
1008 return n;
/*
 * Decode the depth field (12 bytes at s) of a plan 9 image header and
 * return the number of bits per pixel, or -1 if the field is not valid.
 * *newp is set to 1 for the new channel-descriptor form and 0 otherwise.
 *
 * Old form: a decimal number n, giving a depth of 1<<n.  n is limited
 * to 0..5 (depths 1 to 32, the largest depth the new form accepts) so
 * that a stray number can neither cause an undefined shift nor produce
 * a zero depth.
 * New form: letter/width pairs such as r8g8b8; the widths are summed and
 * the total must be 8, 16, 24 or 32.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d;
	long ldepth;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ldepth = strtol(s, 0, 10);
		if(ldepth < 0 || ldepth > 5)
			return -1;
		return 1<<ldepth;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;			/* skip letter */
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
1043 int
1044 isp9bit(void)
1046 int dep, lox, loy, hix, hiy, px, new;
1047 ulong t;
1048 long len;
1049 char *newlabel;
1051 newlabel = "old ";
1053 dep = depthof((char*)buf + 0*P9BITLEN, &new);
1054 if(new)
1055 newlabel = "";
1056 lox = p9bitnum(buf + 1*P9BITLEN);
1057 loy = p9bitnum(buf + 2*P9BITLEN);
1058 hix = p9bitnum(buf + 3*P9BITLEN);
1059 hiy = p9bitnum(buf + 4*P9BITLEN);
1060 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1061 return 0;
1063 if(dep < 8){
1064 px = 8/dep; /* pixels per byte */
1065 /* set l to number of bytes of data per scan line */
1066 if(lox >= 0)
1067 len = (hix+px-1)/px - lox/px;
1068 else{ /* make positive before divide */
1069 t = (-lox)+px-1;
1070 t = (t/px)*px;
1071 len = (t+hix+px-1)/px;
1073 }else
1074 len = (hix-lox)*dep/8;
1075 len *= (hiy-loy); /* col length */
1076 len += 5*P9BITLEN; /* size of initial ascii */
1079 * for image file, length is non-zero and must match calculation above
1080 * for /dev/window and /dev/screen the length is always zero
1081 * for subfont, the subfont header should follow immediately.
1083 if (len != 0 && mbuf->length == 0) {
1084 print("%splan 9 image\n", newlabel);
1085 return 1;
1087 if (mbuf->length == len) {
1088 print("%splan 9 image\n", newlabel);
1089 return 1;
1091 /* Ghostscript sometimes produces a little extra on the end */
1092 if (mbuf->length < len+P9BITLEN) {
1093 print("%splan 9 image\n", newlabel);
1094 return 1;
1096 if (p9subfont(buf+len)) {
1097 print("%ssubfont file\n", newlabel);
1098 return 1;
1100 return 0;
1103 int
1104 p9subfont(uchar *p)
1106 int n, h, a;
1108 /* if image too big, assume it's a subfont */
1109 if (p+3*P9BITLEN > buf+sizeof(buf))
1110 return 1;
1112 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1113 if (n < 0)
1114 return 0;
1115 h = p9bitnum(p + 1*P9BITLEN); /* height */
1116 if (h < 0)
1117 return 0;
1118 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
1119 if (a < 0)
1120 return 0;
1121 return 1;
1124 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
1126 int
1127 isp9font(void)
1129 uchar *cp, *p;
1130 int i, n;
1131 char pathname[1024];
1133 cp = buf;
1134 if (!getfontnum(cp, &cp)) /* height */
1135 return 0;
1136 if (!getfontnum(cp, &cp)) /* ascent */
1137 return 0;
1138 for (i = 0; 1; i++) {
1139 if (!getfontnum(cp, &cp)) /* min */
1140 break;
1141 if (!getfontnum(cp, &cp)) /* max */
1142 return 0;
1143 while (WHITESPACE(*cp))
1144 cp++;
1145 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1147 /* construct a path name, if needed */
1148 n = 0;
1149 if (*p != '/' && slash) {
1150 n = slash-fname+1;
1151 if (n < sizeof(pathname))
1152 memcpy(pathname, fname, n);
1153 else n = 0;
1155 if (n+cp-p < sizeof(pathname)) {
1156 memcpy(pathname+n, p, cp-p);
1157 n += cp-p;
1158 pathname[n] = 0;
1159 if (access(pathname, AEXIST) < 0)
1160 return 0;
1163 if (i) {
1164 print(mime ? "text/plain\n" : "font file\n");
1165 return 1;
1167 return 0;
1170 int
1171 getfontnum(uchar *cp, uchar **rp)
1173 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1174 cp++;
1175 if (*cp < '0' || *cp > '9')
1176 return 0;
1177 strtoul((char *)cp, (char **)rp, 0);
1178 if (!WHITESPACE(**rp))
1179 return 0;
1180 return 1;
1183 int
1184 isrtf(void)
1186 if(strstr((char *)buf, "\\rtf1")){
1187 print(mime ? "application/rtf\n" : "rich text format\n");
1188 return 1;
1190 return 0;
1193 int
1194 ismsdos(void)
1196 if (buf[0] == 0x4d && buf[1] == 0x5a){
1197 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1198 return 1;
1200 return 0;
1203 int
1204 iself(void)
1206 static char *cpu[] = { /* NB: incomplete and arbitary list */
1207 nil,
1208 /*1*/ "WE32100",
1209 /*2*/ "SPARC",
1210 /*3*/ "i386",
1211 /*4*/ "M68000",
1212 /*5*/ "M88000",
1213 /*6*/ "i486",
1214 /*7*/ "i860",
1215 /*8*/ "R3000",
1216 /*9*/ "S370",
1217 /*10*/ "R4000",
1218 nil, nil, nil, nil,
1219 /*15*/ "HP-PA",
1220 nil,
1221 nil,
1222 /*18*/ "sparc v8+",
1223 /*19*/ "i960",
1224 /*20*/ "PPC-32",
1225 /*21*/ "PPC-64",
1226 nil, nil, nil, nil,
1227 nil, nil, nil, nil, nil,
1228 nil, nil, nil, nil, nil,
1229 nil, nil, nil, nil,
1230 /*40*/ "ARM",
1231 /*41*/ "Alpha",
1232 nil,
1233 /*43*/ "sparc v9",
1234 nil, nil,
1235 nil, nil, nil, nil,
1236 /*50*/ "IA-46",
1237 nil, nil, nil, nil, nil,
1238 nil, nil, nil, nil, nil,
1239 nil,
1240 /*62*/ "AMD64",
1241 nil, nil, nil,
1242 nil, nil, nil, nil, nil,
1243 nil, nil, nil, nil,
1244 /*75*/ "VAX",
1248 if (memcmp(buf, "\0177ELF", 4) == 0){
1249 /* gcc misparses \x7FELF as \x7FE L F */
1250 if (!mime){
1251 int n = (buf[19] << 8) | buf[18];
1252 char *p = "unknown";
1254 if (n > 0 && n < nelem(cpu) && cpu[n])
1255 p = cpu[n];
1256 else {
1257 /* try the other byte order */
1258 n = (buf[18] << 8) | buf[19];
1259 if (n > 0 && n < nelem(cpu) && cpu[n])
1260 p = cpu[n];
1262 print("%s ELF executable\n", p);
1264 else
1265 print("application/x-elf-executable");
1266 return 1;
1269 return 0;