Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
7 /*
8 * file - determine type of file
9 */
/*
 * LENDIAN - assemble the four bytes at p into a little-endian 32-bit value.
 * Each byte is widened to unsigned long before it is shifted, so a top byte
 * >= 0x80 is never shifted into the sign bit of an int (which would make
 * the result negative and sign-extend on machines with 64-bit longs).
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
12 uchar buf[6001];
13 short cfreq[140];
14 short wfreq[50];
15 int nbuf;
16 Dir* mbuf;
17 int fd;
18 char *fname;
19 char *slash;
/*
 * Classes used as indices into wfreq[] (word classes) and
 * cfreq[] (character classes, starting at Clatin so that they
 * sit above the directly-indexed ASCII range).
 */
enum
{
	Cword,		/* C keyword */
	Fword,		/* "block", "common", "subroutine", ... (presumably Fortran) */
	Aword,		/* assembler keyword, tested by isas() */
	Alword,		/* alef keyword, tested by isc() */
	Lword,		/* limbo keyword, tested by islimbo() */
	I1,		/* the word "include" */
	I2,		/* a header base name: bio, libc, stdio, u */
	I3,		/* the word "h" */
	Clatin	= 128,	/* Latin-1 character */
	Cbinary,	/* rune in the invalid 0x81-0x9F range */
	Cnull,		/* NUL */
	Ceascii,	/* ASCII control character */
	Cutf,		/* any other rune */
};

/*
 * Keyword dictionary.  wordfreq() binary-searches this table with
 * strcmp(), so the entries MUST stay in strcmp (byte) order:
 * upper case sorts before lower case.  "bio" used to sit between
 * "extern" and "float", which made it (and potentially its
 * neighbours) unreachable by the search.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	{ "PATH",	Lword },
	{ "TEXT",	Aword },
	{ "adt",	Alword },
	{ "aggr",	Alword },
	{ "alef",	Alword },
	{ "array",	Lword },
	{ "bio",	I2 },
	{ "block",	Fword },
	{ "chan",	Alword },
	{ "char",	Cword },
	{ "common",	Fword },
	{ "con",	Lword },
	{ "data",	Fword },
	{ "dimension",	Fword },
	{ "double",	Cword },
	{ "extern",	Cword },
	{ "float",	Cword },
	{ "fn",		Lword },
	{ "function",	Fword },
	{ "h",		I3 },
	{ "implement",	Lword },
	{ "import",	Lword },
	{ "include",	I1 },
	{ "int",	Cword },
	{ "integer",	Fword },
	{ "iota",	Lword },
	{ "libc",	I2 },
	{ "long",	Cword },
	{ "module",	Lword },
	{ "real",	Fword },
	{ "ref",	Lword },
	{ "register",	Cword },
	{ "self",	Lword },
	{ "short",	Cword },
	{ "static",	Cword },
	{ "stdio",	I2 },
	{ "struct",	Cword },
	{ "subroutine",	Fword },
	{ "u",		I2 },
	{ "void",	Cword },
};
/*
 * Values of the 'mode' field in the language table below.
 */
enum {
	Normal	= 0,	/* language occupying a single range */
	First,		/* first range of a language spanning several ranges */
	Multi,		/* a later range of such a language */
	Shared,		/* codes used by several languages */
};

/*
 * Rune ranges and the language reported for each.  bump_utf_count()
 * binary-searches on low/high, so entries are kept in ascending
 * order; the all-zero entry terminates the table.  'count' is
 * reset by filetype() for every file.
 */
struct
{
	int	mode;		/* Normal, First, Multi or Shared */
	int	count;		/* runes seen in [low, high] */
	int	low;
	int	high;
	char	*name;
} language[] =
{
	{ Normal,	0,	0x0080,	0x0080,	"Extended Latin" },
	{ Normal,	0,	0x0100,	0x01FF,	"Extended Latin" },
	{ Normal,	0,	0x0370,	0x03FF,	"Greek" },
	{ Normal,	0,	0x0400,	0x04FF,	"Cyrillic" },
	{ Normal,	0,	0x0530,	0x058F,	"Armenian" },
	{ Normal,	0,	0x0590,	0x05FF,	"Hebrew" },
	{ Normal,	0,	0x0600,	0x06FF,	"Arabic" },
	{ Normal,	0,	0x0900,	0x097F,	"Devanagari" },
	{ Normal,	0,	0x0980,	0x09FF,	"Bengali" },
	{ Normal,	0,	0x0A00,	0x0A7F,	"Gurmukhi" },
	{ Normal,	0,	0x0A80,	0x0AFF,	"Gujarati" },
	{ Normal,	0,	0x0B00,	0x0B7F,	"Oriya" },
	{ Normal,	0,	0x0B80,	0x0BFF,	"Tamil" },
	{ Normal,	0,	0x0C00,	0x0C7F,	"Telugu" },
	{ Normal,	0,	0x0C80,	0x0CFF,	"Kannada" },
	{ Normal,	0,	0x0D00,	0x0D7F,	"Malayalam" },
	{ Normal,	0,	0x0E00,	0x0E7F,	"Thai" },
	{ Normal,	0,	0x0E80,	0x0EFF,	"Lao" },
	{ Normal,	0,	0x1000,	0x105F,	"Tibetan" },
	{ Normal,	0,	0x10A0,	0x10FF,	"Georgian" },
	{ Normal,	0,	0x3040,	0x30FF,	"Japanese" },
	{ Normal,	0,	0x3100,	0x312F,	"Chinese" },
	{ First,	0,	0x3130,	0x318F,	"Korean" },
	{ Multi,	0,	0x3400,	0x3D2F,	"Korean" },
	{ Shared,	0,	0x4e00,	0x9fff,	"CJK" },
	{ Normal,	0,	0,	0,	0 },		/* terminal entry */
};
/*
 * Gross classification of the buffer, chosen by filetype() from
 * the character histogram and consulted by the classifiers.
 */
enum
{
	Fascii	= 0,	/* printable ascii */
	Flatin	= 1,	/* latin 1 */
	Futf	= 2,	/* UTF character set */
	Fbinary	= 3,	/* binary */
	Feascii	= 4,	/* ASCII with control chars */
	Fnull	= 5,	/* NULL in file */
} guess;
142 void bump_utf_count(Rune);
143 int cistrncmp(char*, char*, int);
144 void filetype(int);
145 int getfontnum(uchar*, uchar**);
146 int isas(void);
147 int isc(void);
148 int isenglish(void);
149 int ishp(void);
150 int ishtml(void);
151 int isrfc822(void);
152 int ismbox(void);
153 int islimbo(void);
154 int ismung(void);
155 int isp9bit(void);
156 int isp9font(void);
157 int isrtf(void);
158 int ismsdos(void);
159 int iself(void);
160 int istring(void);
161 int iff(void);
162 int long0(void);
163 int istar(void);
164 int p9bitnum(uchar*);
165 int p9subfont(uchar*);
166 void print_utf(void);
167 void type(char*, int);
168 int utf_count(void);
169 void wordfreq(void);
171 int (*call[])(void) =
173 long0, /* recognizable by first 4 bytes */
174 istring, /* recognizable by first string */
175 iff, /* interchange file format (strings) */
176 isrfc822, /* email file */
177 ismbox, /* mail box */
178 istar, /* recognizable by tar checksum */
179 ishtml, /* html keywords */
180 /* iscint, /* compiler/assembler intermediate */
181 islimbo, /* limbo source */
182 isc, /* c & alef compiler key words */
183 isas, /* assembler key words */
184 ismung, /* entropy compressed/encrypted */
185 isp9font, /* plan 9 font */
186 isp9bit, /* plan 9 image (as from /dev/window) */
187 isenglish, /* char frequency English */
188 isrtf, /* rich text format */
189 ismsdos, /* msdos exe (virus file attachement) */
190 iself, /* ELF (foreign) executable */
192 };
/* non-zero when -m was given: print MIME types instead of descriptions */
int	mime;

/* MIME answers used when nothing more specific is known */
#define	OCTET	"application/octet-stream\n"
#define	PLAIN	"text/plain\n"
199 void
200 main(int argc, char *argv[])
202 int i, j, maxlen;
203 char *cp;
204 Rune r;
206 ARGBEGIN{
207 case 'm':
208 mime = 1;
209 break;
210 default:
211 fprint(2, "usage: file [-m] [file...]\n");
212 exits("usage");
213 }ARGEND;
215 maxlen = 0;
216 if(mime == 0 || argc > 1){
217 for(i = 0; i < argc; i++) {
218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
220 if(j > maxlen)
221 maxlen = j;
224 if (argc <= 0) {
225 if(!mime)
226 print ("stdin: ");
227 filetype(0);
229 else {
230 for(i = 0; i < argc; i++)
231 type(argv[i], maxlen);
233 exits(0);
236 void
237 type(char *file, int nlen)
239 Rune r;
240 int i;
241 char *p;
243 if(nlen > 0){
244 slash = 0;
245 for (i = 0, p = file; *p; i++) {
246 if (*p == '/') /* find rightmost slash */
247 slash = p;
248 p += chartorune(&r, p); /* count runes */
250 print("%s:%*s",file, nlen-i+1, "");
252 fname = file;
253 if ((fd = open(file, OREAD)) < 0) {
254 print("cannot open\n");
255 return;
257 filetype(fd);
258 close(fd);
261 void
262 filetype(int fd)
264 Rune r;
265 int i, f, n;
266 char *p, *eob;
268 free(mbuf);
269 mbuf = dirfstat(fd);
270 if(mbuf == nil){
271 print("cannot stat: %r\n");
272 return;
274 if(mbuf->mode & DMDIR) {
275 print(mime ? "text/directory\n" : "directory\n");
276 return;
278 if(mbuf->type != 'M' && mbuf->type != '|') {
279 print(mime ? OCTET : "special file #%c/%s\n",
280 mbuf->type, mbuf->name);
281 return;
283 nbuf = read(fd, buf, sizeof(buf)-1);
285 if(nbuf < 0) {
286 print("cannot read\n");
287 return;
289 if(nbuf == 0) {
290 print(mime ? PLAIN : "empty file\n");
291 return;
293 buf[nbuf] = 0;
295 /*
296 * build histogram table
297 */
298 memset(cfreq, 0, sizeof(cfreq));
299 for (i = 0; language[i].name; i++)
300 language[i].count = 0;
301 eob = (char *)buf+nbuf;
302 for(n = 0, p = (char *)buf; p < eob; n++) {
303 if (!fullrune(p, eob-p) && eob-p < UTFmax)
304 break;
305 p += chartorune(&r, p);
306 if (r == 0)
307 f = Cnull;
308 else if (r <= 0x7f) {
309 if (!isprint(r) && !isspace(r))
310 f = Ceascii; /* ASCII control char */
311 else f = r;
312 } else if (r == 0x080) {
313 bump_utf_count(r);
314 f = Cutf;
315 } else if (r < 0xA0)
316 f = Cbinary; /* Invalid Runes */
317 else if (r <= 0xff)
318 f = Clatin; /* Latin 1 */
319 else {
320 bump_utf_count(r);
321 f = Cutf; /* UTF extension */
323 cfreq[f]++; /* ASCII chars peg directly */
325 /*
326 * gross classify
327 */
328 if (cfreq[Cbinary])
329 guess = Fbinary;
330 else if (cfreq[Cutf])
331 guess = Futf;
332 else if (cfreq[Clatin])
333 guess = Flatin;
334 else if (cfreq[Ceascii])
335 guess = Feascii;
336 else if (cfreq[Cnull] == n) {
337 print(mime ? OCTET : "first block all null bytes\n");
338 return;
340 else guess = Fascii;
341 /*
342 * lookup dictionary words
343 */
344 memset(wfreq, 0, sizeof(wfreq));
345 if(guess == Fascii || guess == Flatin || guess == Futf)
346 wordfreq();
347 /*
348 * call individual classify routines
349 */
350 for(i=0; call[i]; i++)
351 if((*call[i])())
352 return;
354 /*
355 * if all else fails,
356 * print out gross classification
357 */
358 if (nbuf < 100 && !mime)
359 print(mime ? PLAIN : "short ");
360 if (guess == Fascii)
361 print(mime ? PLAIN : "Ascii\n");
362 else if (guess == Feascii)
363 print(mime ? PLAIN : "extended ascii\n");
364 else if (guess == Flatin)
365 print(mime ? PLAIN : "latin ascii\n");
366 else if (guess == Futf && utf_count() < 4)
367 print_utf();
368 else print(mime ? OCTET : "binary\n");
371 void
372 bump_utf_count(Rune r)
374 int low, high, mid;
376 high = sizeof(language)/sizeof(language[0])-1;
377 for (low = 0; low < high;) {
378 mid = (low+high)/2;
379 if (r >=language[mid].low) {
380 if (r <= language[mid].high) {
381 language[mid].count++;
382 break;
383 } else low = mid+1;
384 } else high = mid;
388 int
389 utf_count(void)
391 int i, count;
393 count = 0;
394 for (i = 0; language[i].name; i++)
395 if (language[i].count > 0)
396 switch (language[i].mode) {
397 case Normal:
398 case First:
399 count++;
400 break;
401 default:
402 break;
404 return count;
407 int
408 chkascii(void)
410 int i;
412 for (i = 'a'; i < 'z'; i++)
413 if (cfreq[i])
414 return 1;
415 for (i = 'A'; i < 'Z'; i++)
416 if (cfreq[i])
417 return 1;
418 return 0;
421 int
422 find_first(char *name)
424 int i;
426 for (i = 0; language[i].name != 0; i++)
427 if (language[i].mode == First
428 && strcmp(language[i].name, name) == 0)
429 return i;
430 return -1;
433 void
434 print_utf(void)
436 int i, printed, j;
438 if(mime){
439 print(PLAIN);
440 return;
442 if (chkascii()) {
443 printed = 1;
444 print("Ascii");
445 } else
446 printed = 0;
447 for (i = 0; language[i].name; i++)
448 if (language[i].count) {
449 switch(language[i].mode) {
450 case Multi:
451 j = find_first(language[i].name);
452 if (j < 0)
453 break;
454 if (language[j].count > 0)
455 break;
456 /* Fall through */
457 case Normal:
458 case First:
459 if (printed)
460 print(" & ");
461 else printed = 1;
462 print("%s", language[i].name);
463 break;
464 case Shared:
465 default:
466 break;
469 if(!printed)
470 print("UTF");
471 print(" text\n");
474 void
475 wordfreq(void)
477 int low, high, mid, r;
478 uchar *p, *p2, c;
480 p = buf;
481 for(;;) {
482 while (p < buf+nbuf && !isalpha(*p))
483 p++;
484 if (p >= buf+nbuf)
485 return;
486 p2 = p;
487 while(p < buf+nbuf && isalpha(*p))
488 p++;
489 c = *p;
490 *p = 0;
491 high = sizeof(dict)/sizeof(dict[0]);
492 for(low = 0;low < high;) {
493 mid = (low+high)/2;
494 r = strcmp(dict[mid].word, (char*)p2);
495 if(r == 0) {
496 wfreq[dict[mid].class]++;
497 break;
499 if(r < 0)
500 low = mid+1;
501 else
502 high = mid;
504 *p++ = c;
508 typedef struct Filemagic Filemagic;
509 struct Filemagic {
510 ulong x;
511 ulong mask;
512 char *desc;
513 char *mime;
514 };
516 Filemagic long0tab[] = {
517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip",
523 070707, 0xFFFF, "cpio archive\n", OCTET,
524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi",
525 0xfffa0000, 0xfffe0000, "mp3 audio", "audio/mpeg",
526 };
528 int
529 filemagic(Filemagic *tab, int ntab, ulong x)
531 int i;
533 for(i=0; i<ntab; i++)
534 if((x&tab[i].mask) == tab[i].x){
535 print(mime ? tab[i].mime : tab[i].desc);
536 return 1;
538 return 0;
541 int
542 long0(void)
544 // Fhdr *f;
545 long x;
547 seek(fd, 0, 0); /* reposition to start of file */
548 /*
549 if(crackhdr(fd, &f)) {
550 print(mime ? OCTET : "%s\n", f.name);
551 return 1;
553 */
554 x = LENDIAN(buf);
555 if(filemagic(long0tab, nelem(long0tab), x))
556 return 1;
557 return 0;
/*
 * tar header block, from tar.c
 */
enum {
	NAMSIZ	= 100,		/* size of the name fields */
	TBLOCK	= 512,		/* size of a tar block */
};

union hblock
{
	char	dummy[TBLOCK];		/* the block as raw bytes, for checksumming */
	struct header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];	/* parsed as octal by istar() */
		char	linkflag;
		char	linkname[NAMSIZ];

		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];	/* if non-null, path = prefix "/" name */
	} dbuf;
};
588 int
589 checksum(union hblock *hp)
591 int i;
592 char *cp;
593 struct header *hdr = &hp->dbuf;
595 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
596 *cp = ' ';
597 i = 0;
598 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
599 i += *cp & 0xff;
600 return i;
603 int
604 istar(void)
606 int chksum;
607 char tblock[TBLOCK];
608 union hblock *hp = (union hblock *)tblock;
609 struct header *hdr = &hp->dbuf;
611 seek(fd, 0, 0); /* reposition to start of file */
612 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
613 return 0;
614 chksum = strtol(hdr->chksum, 0, 8);
615 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
616 if (strcmp(hdr->magic, "ustar") == 0)
617 print(mime? "application/x-ustar\n":
618 "posix tar archive\n");
619 else
620 print(mime? "application/x-tar\n": "tar archive\n");
621 return 1;
623 return 0;
/*
 * Leading byte strings that identify a file outright.  istring()
 * compares the first 'length' bytes of the buffer with key and
 * prints filetype (or mime) followed by a newline.  Order matters:
 * the first match wins.  A null key ends the table.
 */
struct	FILE_STRING
{
	char	*key;		/* bytes expected at the start of the file */
	char	*filetype;	/* description printed in normal mode */
	int	length;		/* number of bytes of key to compare */
	char	*mime;		/* type printed in mime mode */
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0", "ghostscript fax file",	18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ 0, 0, 0, 0 },
};
666 int
667 istring(void)
669 int i, j;
670 struct FILE_STRING *p;
672 for(p = file_string; p->key; p++) {
673 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
674 if(mime)
675 print("%s\n", p->mime);
676 else
677 print("%s\n", p->filetype);
678 return 1;
681 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
682 for(i = 5; i < nbuf; i++)
683 if(buf[i] == '\n')
684 break;
685 if(mime)
686 print(OCTET);
687 else
688 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
689 return 1;
691 if(buf[0]=='#' && buf[1]=='!'){
692 i=2;
693 for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++)
694 if(buf[j] == '/')
695 i = j+1;
696 if(mime)
697 print(PLAIN);
698 else
699 print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i);
700 return 1;
702 return 0;
705 int
706 iff(void)
708 if (strncmp((char*)buf, "FORM", 4) == 0 &&
709 strncmp((char*)buf+8, "AIFF", 4) == 0) {
710 print("%s\n", mime? "audio/x-aiff": "aiff audio");
711 return 1;
713 return 0;
/*
 * Tag names that ishtml() looks for between '<' and '>';
 * terminated by a null pointer.
 */
char*	html_string[] =
{
	"title", "body", "head", "strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl", "br", "em",
	0,
};
736 int
737 ishtml(void)
739 uchar *p, *q;
740 int i, count;
742 /* compare strings between '<' and '>' to html table */
743 count = 0;
744 p = buf;
745 for(;;) {
746 while (p < buf+nbuf && *p != '<')
747 p++;
748 p++;
749 if (p >= buf+nbuf)
750 break;
751 if(*p == '/')
752 p++;
753 q = p;
754 while(p < buf+nbuf && *p != '>')
755 p++;
756 if (p >= buf+nbuf)
757 break;
758 for(i = 0; html_string[i]; i++) {
759 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
760 if(count++ > 4) {
761 print(mime ? "text/html\n" : "HTML file\n");
762 return 1;
764 break;
767 p++;
769 return 0;
/*
 * Header field names that isrfc822() counts; compared ignoring
 * case against the start of each header line.  Terminated by a
 * null pointer.  The reply field is spelled "Reply-To:" in mail
 * headers, so the entry uses a hyphen; "reply to:" with a blank
 * could never match a real header.
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply-to:",
	"sender:",
	0,
};
784 int
785 isrfc822(void)
788 char *p, *q, *r;
789 int i, count;
791 count = 0;
792 p = (char*)buf;
793 for(;;) {
794 q = strchr(p, '\n');
795 if(q == nil)
796 break;
797 *q = 0;
798 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
799 count++;
800 *q = '\n';
801 p = q+1;
802 continue;
804 *q = '\n';
805 if(*p != '\t' && *p != ' '){
806 r = strchr(p, ':');
807 if(r == 0 || r > q)
808 break;
809 for(i = 0; rfc822_string[i]; i++) {
810 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
811 count++;
812 break;
816 p = q+1;
818 if(count >= 3){
819 print(mime ? "message/rfc822\n" : "email file\n");
820 return 1;
822 return 0;
825 int
826 ismbox(void)
828 char *p, *q;
830 p = (char*)buf;
831 q = strchr(p, '\n');
832 if(q == nil)
833 return 0;
834 *q = 0;
835 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
836 print(mime ? "text/plain\n" : "mail box\n");
837 return 1;
839 *q = '\n';
840 return 0;
843 int
844 isc(void)
846 int n;
848 n = wfreq[I1];
849 /*
850 * includes
851 */
852 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
853 goto yes;
854 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
855 goto yes;
856 /*
857 * declarations
858 */
859 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
860 goto yes;
861 /*
862 * assignments
863 */
864 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
865 goto yes;
866 return 0;
868 yes:
869 if(mime){
870 print(PLAIN);
871 return 1;
873 if(wfreq[Alword] > 0)
874 print("alef program\n");
875 else
876 print("c program\n");
877 return 1;
880 int
881 islimbo(void)
884 /*
885 * includes
886 */
887 if(wfreq[Lword] < 4)
888 return 0;
889 print(mime ? PLAIN : "limbo program\n");
890 return 1;
893 int
894 isas(void)
897 /*
898 * includes
899 */
900 if(wfreq[Aword] < 2)
901 return 0;
902 print(mime ? PLAIN : "as program\n");
903 return 1;
906 /*
907 * low entropy means encrypted
908 */
909 int
910 ismung(void)
912 int i, bucket[8];
913 float cs;
915 if(nbuf < 64)
916 return 0;
917 memset(bucket, 0, sizeof(bucket));
918 for(i=0; i<64; i++)
919 bucket[(buf[i]>>5)&07] += 1;
921 cs = 0.;
922 for(i=0; i<8; i++)
923 cs += (bucket[i]-8)*(bucket[i]-8);
924 cs /= 8.;
925 if(cs <= 24.322) {
926 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
927 print(mime ? OCTET : "compressed\n");
928 else
929 print(mime ? OCTET : "encrypted\n");
930 return 1;
932 return 0;
935 /*
936 * english by punctuation and frequencies
937 */
938 int
939 isenglish(void)
941 int vow, comm, rare, badpun, punct;
942 char *p;
944 if(guess != Fascii && guess != Feascii)
945 return 0;
946 badpun = 0;
947 punct = 0;
948 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
949 switch(*p) {
950 case '.':
951 case ',':
952 case ')':
953 case '%':
954 case ';':
955 case ':':
956 case '?':
957 punct++;
958 if(p[1] != ' ' && p[1] != '\n')
959 badpun++;
961 if(badpun*5 > punct)
962 return 0;
963 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
964 return 0;
965 if(2*cfreq[';'] > cfreq['e'])
966 return 0;
968 vow = 0;
969 for(p="AEIOU"; *p; p++) {
970 vow += cfreq[(uchar)*p];
971 vow += cfreq[tolower((uchar)*p)];
973 comm = 0;
974 for(p="ETAION"; *p; p++) {
975 comm += cfreq[(uchar)*p];
976 comm += cfreq[tolower((uchar)*p)];
978 rare = 0;
979 for(p="VJKQXZ"; *p; p++) {
980 rare += cfreq[(uchar)*p];
981 rare += cfreq[tolower((uchar)*p)];
983 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
984 print(mime ? PLAIN : "English text\n");
985 return 1;
987 return 0;
990 /*
991 * pick up a number with
992 * syntax _*[0-9]+_
993 */
994 #define P9BITLEN 12
995 int
996 p9bitnum(uchar *bp)
998 int n, c, len;
1000 len = P9BITLEN;
1001 while(*bp == ' ') {
1002 bp++;
1003 len--;
1004 if(len <= 0)
1005 return -1;
1007 n = 0;
1008 while(len > 1) {
1009 c = *bp++;
1010 if(!isdigit(c))
1011 return -1;
1012 n = n*10 + c-'0';
1013 len--;
1015 if(*bp != ' ')
1016 return -1;
1017 return n;
/*
 * depthof - parse the first 12-byte field of a plan 9 image header
 * and return the pixel depth in bits, or -1 if it is not valid.
 *
 * s	start of the field
 * newp	set to 0 for the old format, where the field is a number n
 *	and the depth is 1<<n, and to 1 for the new format, where it
 *	is a channel descriptor such as "r8g8b8" whose digits are
 *	summed.
 *
 * A new-format depth is accepted when it is a divisor or a
 * multiple of eight, so 1, 2 and 4 bit images (e.g. "k1", "k4")
 * are recognised as well as 8, 16, 24 and 32.  An old-format
 * exponent too large to shift an int by is rejected instead of
 * being shifted.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d;
	unsigned long ld;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtoul(s, 0, 10);
		if(ld > 30)
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;			/* skip letter */
		d += strtoul(s, &s, 10);
	}

	/* d > 0 also keeps 8 % d from dividing by zero */
	if(d > 0 && (d % 8 == 0 || 8 % d == 0))
		return d;
	return -1;
}
1052 int
1053 isp9bit(void)
1055 int dep, lox, loy, hix, hiy, px, new;
1056 ulong t;
1057 long len;
1058 char *newlabel;
1060 newlabel = "old ";
1062 dep = depthof((char*)buf + 0*P9BITLEN, &new);
1063 if(new)
1064 newlabel = "";
1065 lox = p9bitnum(buf + 1*P9BITLEN);
1066 loy = p9bitnum(buf + 2*P9BITLEN);
1067 hix = p9bitnum(buf + 3*P9BITLEN);
1068 hiy = p9bitnum(buf + 4*P9BITLEN);
1069 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1070 return 0;
1072 if(dep < 8){
1073 px = 8/dep; /* pixels per byte */
1074 /* set l to number of bytes of data per scan line */
1075 if(lox >= 0)
1076 len = (hix+px-1)/px - lox/px;
1077 else{ /* make positive before divide */
1078 t = (-lox)+px-1;
1079 t = (t/px)*px;
1080 len = (t+hix+px-1)/px;
1082 }else
1083 len = (hix-lox)*dep/8;
1084 len *= (hiy-loy); /* col length */
1085 len += 5*P9BITLEN; /* size of initial ascii */
1088 * for image file, length is non-zero and must match calculation above
1089 * for /dev/window and /dev/screen the length is always zero
1090 * for subfont, the subfont header should follow immediately.
1092 if (len != 0 && mbuf->length == 0) {
1093 print("%splan 9 image\n", newlabel);
1094 return 1;
1096 if (mbuf->length == len) {
1097 print("%splan 9 image\n", newlabel);
1098 return 1;
1100 /* Ghostscript sometimes produces a little extra on the end */
1101 if (mbuf->length < len+P9BITLEN) {
1102 print("%splan 9 image\n", newlabel);
1103 return 1;
1105 if (p9subfont(buf+len)) {
1106 print("%ssubfont file\n", newlabel);
1107 return 1;
1109 return 0;
1112 int
1113 p9subfont(uchar *p)
1115 int n, h, a;
1117 /* if image too big, assume it's a subfont */
1118 if (p+3*P9BITLEN > buf+sizeof(buf))
1119 return 1;
1121 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1122 if (n < 0)
1123 return 0;
1124 h = p9bitnum(p + 1*P9BITLEN); /* height */
1125 if (h < 0)
1126 return 0;
1127 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
1128 if (a < 0)
1129 return 0;
1130 return 1;
1133 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
1135 int
1136 isp9font(void)
1138 uchar *cp, *p;
1139 int i, n;
1140 char pathname[1024];
1142 cp = buf;
1143 if (!getfontnum(cp, &cp)) /* height */
1144 return 0;
1145 if (!getfontnum(cp, &cp)) /* ascent */
1146 return 0;
1147 for (i = 0; 1; i++) {
1148 if (!getfontnum(cp, &cp)) /* min */
1149 break;
1150 if (!getfontnum(cp, &cp)) /* max */
1151 return 0;
1152 while (WHITESPACE(*cp))
1153 cp++;
1154 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1156 /* construct a path name, if needed */
1157 n = 0;
1158 if (*p != '/' && slash) {
1159 n = slash-fname+1;
1160 if (n < sizeof(pathname))
1161 memcpy(pathname, fname, n);
1162 else n = 0;
1164 if (n+cp-p < sizeof(pathname)) {
1165 memcpy(pathname+n, p, cp-p);
1166 n += cp-p;
1167 pathname[n] = 0;
1168 if (access(pathname, AEXIST) < 0)
1169 return 0;
1172 if (i) {
1173 print(mime ? "text/plain\n" : "font file\n");
1174 return 1;
1176 return 0;
1179 int
1180 getfontnum(uchar *cp, uchar **rp)
1182 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1183 cp++;
1184 if (*cp < '0' || *cp > '9')
1185 return 0;
1186 strtoul((char *)cp, (char **)rp, 0);
1187 if (!WHITESPACE(**rp))
1188 return 0;
1189 return 1;
1192 int
1193 isrtf(void)
1195 if(strstr((char *)buf, "\\rtf1")){
1196 print(mime ? "application/rtf\n" : "rich text format\n");
1197 return 1;
1199 return 0;
1202 int
1203 ismsdos(void)
1205 if (buf[0] == 0x4d && buf[1] == 0x5a){
1206 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1207 return 1;
1209 return 0;
1212 int
1213 iself(void)
1215 static char *cpu[] = { /* NB: incomplete and arbitary list */
1216 nil,
1217 /*1*/ "WE32100",
1218 /*2*/ "SPARC",
1219 /*3*/ "i386",
1220 /*4*/ "M68000",
1221 /*5*/ "M88000",
1222 /*6*/ "i486",
1223 /*7*/ "i860",
1224 /*8*/ "R3000",
1225 /*9*/ "S370",
1226 /*10*/ "R4000",
1227 nil, nil, nil, nil,
1228 /*15*/ "HP-PA",
1229 nil,
1230 nil,
1231 /*18*/ "sparc v8+",
1232 /*19*/ "i960",
1233 /*20*/ "PPC-32",
1234 /*21*/ "PPC-64",
1235 nil, nil, nil, nil,
1236 nil, nil, nil, nil, nil,
1237 nil, nil, nil, nil, nil,
1238 nil, nil, nil, nil,
1239 /*40*/ "ARM",
1240 /*41*/ "Alpha",
1241 nil,
1242 /*43*/ "sparc v9",
1243 nil, nil,
1244 nil, nil, nil, nil,
1245 /*50*/ "IA-64",
1246 nil, nil, nil, nil, nil,
1247 nil, nil, nil, nil, nil,
1248 nil,
1249 /*62*/ "AMD64",
1250 nil, nil, nil,
1251 nil, nil, nil, nil, nil,
1252 nil, nil, nil, nil,
1253 /*75*/ "VAX",
1257 if (memcmp(buf, "\177ELF", 4) == 0){
1258 /* gcc misparses \x7FELF as \x7FE L F */
1259 if (!mime){
1260 int n = (buf[19] << 8) | buf[18];
1261 char *p = "unknown";
1263 if (n > 0 && n < nelem(cpu) && cpu[n])
1264 p = cpu[n];
1265 else {
1266 /* try the other byte order */
1267 n = (buf[18] << 8) | buf[19];
1268 if (n > 0 && n < nelem(cpu) && cpu[n])
1269 p = cpu[n];
1271 print("%s ELF executable\n", p);
1273 else
1274 print("application/x-elf-executable");
1275 return 1;
1278 return 0;