Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
7 /*
8 * file - determine type of file
9 */
/*
 * Assemble a 32-bit value from four bytes stored least-significant first.
 * Each byte is widened to unsigned long before it is shifted, so a top
 * byte >= 0x80 is never shifted into the sign bit of an int and the
 * result is never sign-extended where long is wider than 32 bits.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
12 uchar buf[6001];
13 short cfreq[140];
14 short wfreq[50];
15 int nbuf;
16 Dir* mbuf;
17 int fd;
18 char *fname;
19 char *slash;
/*
 * Word classes (indices into wfreq) followed by the character
 * classes that share cfreq with the plain ASCII codes.
 */
enum
{
	Cword,			/* C keyword */
	Fword,			/* Fortran keyword */
	Aword,			/* assembler keyword */
	Alword,			/* alef keyword */
	Lword,			/* limbo keyword */
	I1,			/* "include" */
	I2,			/* well-known header base name */
	I3,			/* "h" suffix */
	Clatin	= 128,		/* first class above the ASCII range */
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};

/*
 * Keyword table searched by wordfreq() with a binary search using
 * strcmp(), so the entries MUST stay in strcmp() order (upper case
 * sorts before lower case).  "bio" belongs between "array" and
 * "block"; anywhere else it makes itself and "extern" unfindable.
 */
struct
{
	char*	word;
	int	class;
} dict[] =
{
	{ "PATH",	Lword },
	{ "TEXT",	Aword },
	{ "adt",	Alword },
	{ "aggr",	Alword },
	{ "alef",	Alword },
	{ "array",	Lword },
	{ "bio",	I2 },
	{ "block",	Fword },
	{ "chan",	Alword },
	{ "char",	Cword },
	{ "common",	Fword },
	{ "con",	Lword },
	{ "data",	Fword },
	{ "dimension",	Fword },
	{ "double",	Cword },
	{ "extern",	Cword },
	{ "float",	Cword },
	{ "fn",		Lword },
	{ "function",	Fword },
	{ "h",		I3 },
	{ "implement",	Lword },
	{ "import",	Lword },
	{ "include",	I1 },
	{ "int",	Cword },
	{ "integer",	Fword },
	{ "iota",	Lword },
	{ "libc",	I2 },
	{ "long",	Cword },
	{ "module",	Lword },
	{ "real",	Fword },
	{ "ref",	Lword },
	{ "register",	Cword },
	{ "self",	Lword },
	{ "short",	Cword },
	{ "static",	Cword },
	{ "stdio",	I2 },
	{ "struct",	Cword },
	{ "subroutine",	Fword },
	{ "u",		I2 },
	{ "void",	Cword },
};
/* values of the 'mode' member of a language[] entry */
enum
{
	Normal	= 0,	/* single range, reported under its own name */
	First	= 1,	/* first entry for language spanning several ranges */
	Multi	= 2,	/* later entries for the same language */
	Shared	= 3,	/* codes used in several languages */
};
93 struct
94 {
95 int mode; /* see enum above */
96 int count;
97 int low;
98 int high;
99 char *name;
101 } language[] =
103 Normal, 0, 0x0080, 0x0080, "Extended Latin",
104 Normal, 0, 0x0100, 0x01FF, "Extended Latin",
105 Normal, 0, 0x0370, 0x03FF, "Greek",
106 Normal, 0, 0x0400, 0x04FF, "Cyrillic",
107 Normal, 0, 0x0530, 0x058F, "Armenian",
108 Normal, 0, 0x0590, 0x05FF, "Hebrew",
109 Normal, 0, 0x0600, 0x06FF, "Arabic",
110 Normal, 0, 0x0900, 0x097F, "Devanagari",
111 Normal, 0, 0x0980, 0x09FF, "Bengali",
112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi",
113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati",
114 Normal, 0, 0x0B00, 0x0B7F, "Oriya",
115 Normal, 0, 0x0B80, 0x0BFF, "Tamil",
116 Normal, 0, 0x0C00, 0x0C7F, "Telugu",
117 Normal, 0, 0x0C80, 0x0CFF, "Kannada",
118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam",
119 Normal, 0, 0x0E00, 0x0E7F, "Thai",
120 Normal, 0, 0x0E80, 0x0EFF, "Lao",
121 Normal, 0, 0x1000, 0x105F, "Tibetan",
122 Normal, 0, 0x10A0, 0x10FF, "Georgian",
123 Normal, 0, 0x3040, 0x30FF, "Japanese",
124 Normal, 0, 0x3100, 0x312F, "Chinese",
125 First, 0, 0x3130, 0x318F, "Korean",
126 Multi, 0, 0x3400, 0x3D2F, "Korean",
127 Shared, 0, 0x4e00, 0x9fff, "CJK",
128 Normal, 0, 0, 0, 0, /* terminal entry */
129 };
/* gross classification of the buffer, chosen by filetype() */
enum
{
	Fascii	= 0,	/* printable ascii */
	Flatin	= 1,	/* latin 1 */
	Futf	= 2,	/* UTF character set */
	Fbinary	= 3,	/* binary */
	Feascii	= 4,	/* ASCII with control chars */
	Fnull	= 5,	/* NULL in file */
} guess;
142 void bump_utf_count(Rune);
143 int cistrncmp(char*, char*, int);
144 void filetype(int);
145 int getfontnum(uchar*, uchar**);
146 int isas(void);
147 int isc(void);
148 int isenglish(void);
149 int ishp(void);
150 int ishtml(void);
151 int isrfc822(void);
152 int ismbox(void);
153 int islimbo(void);
154 int ismung(void);
155 int isp9bit(void);
156 int isp9font(void);
157 int isrtf(void);
158 int ismsdos(void);
159 int iself(void);
160 int istring(void);
161 int iff(void);
162 int long0(void);
163 int istar(void);
164 int p9bitnum(uchar*);
165 int p9subfont(uchar*);
166 void print_utf(void);
167 void type(char*, int);
168 int utf_count(void);
169 void wordfreq(void);
171 int (*call[])(void) =
173 long0, /* recognizable by first 4 bytes */
174 istring, /* recognizable by first string */
175 iff, /* interchange file format (strings) */
176 isrfc822, /* email file */
177 ismbox, /* mail box */
178 istar, /* recognizable by tar checksum */
179 ishtml, /* html keywords */
180 /* iscint, /* compiler/assembler intermediate */
181 islimbo, /* limbo source */
182 isc, /* c & alef compiler key words */
183 isas, /* assembler key words */
184 ismung, /* entropy compressed/encrypted */
185 isp9font, /* plan 9 font */
186 isp9bit, /* plan 9 image (as from /dev/window) */
187 isenglish, /* char frequency English */
188 isrtf, /* rich text format */
189 ismsdos, /* msdos exe (virus file attachement) */
190 iself, /* ELF (foreign) executable */
192 };
/* MIME answers used by several classifiers; each one ends the output line */
#define	PLAIN	"text/plain\n"
#define	OCTET	"application/octet-stream\n"

int	mime;		/* non-zero after -m: print MIME types, not descriptions */
199 void
200 main(int argc, char *argv[])
202 int i, j, maxlen;
203 char *cp;
204 Rune r;
206 ARGBEGIN{
207 case 'm':
208 mime = 1;
209 break;
210 default:
211 fprint(2, "usage: file [-m] [file...]\n");
212 exits("usage");
213 }ARGEND;
215 maxlen = 0;
216 if(mime == 0 || argc > 1){
217 for(i = 0; i < argc; i++) {
218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
220 if(j > maxlen)
221 maxlen = j;
224 if (argc <= 0) {
225 if(!mime)
226 print ("stdin: ");
227 filetype(0);
229 else {
230 for(i = 0; i < argc; i++)
231 type(argv[i], maxlen);
233 exits(0);
236 void
237 type(char *file, int nlen)
239 Rune r;
240 int i;
241 char *p;
243 if(nlen > 0){
244 slash = 0;
245 for (i = 0, p = file; *p; i++) {
246 if (*p == '/') /* find rightmost slash */
247 slash = p;
248 p += chartorune(&r, p); /* count runes */
250 print("%s:%*s",file, nlen-i+1, "");
252 fname = file;
253 if ((fd = open(file, OREAD)) < 0) {
254 print("cannot open\n");
255 return;
257 filetype(fd);
258 close(fd);
261 void
262 filetype(int fd)
264 Rune r;
265 int i, f, n;
266 char *p, *eob;
268 free(mbuf);
269 mbuf = dirfstat(fd);
270 if(mbuf == nil){
271 print("cannot stat: %r\n");
272 return;
274 if(mbuf->mode & DMDIR) {
275 print(mime ? "text/directory\n" : "directory\n");
276 return;
278 if(mbuf->type != 'M' && mbuf->type != '|') {
279 print(mime ? OCTET : "special file #%c/%s\n",
280 mbuf->type, mbuf->name);
281 return;
283 nbuf = read(fd, buf, sizeof(buf)-1);
285 if(nbuf < 0) {
286 print("cannot read\n");
287 return;
289 if(nbuf == 0) {
290 print(mime ? PLAIN : "empty file\n");
291 return;
293 buf[nbuf] = 0;
295 /*
296 * build histogram table
297 */
298 memset(cfreq, 0, sizeof(cfreq));
299 for (i = 0; language[i].name; i++)
300 language[i].count = 0;
301 eob = (char *)buf+nbuf;
302 for(n = 0, p = (char *)buf; p < eob; n++) {
303 if (!fullrune(p, eob-p) && eob-p < UTFmax)
304 break;
305 p += chartorune(&r, p);
306 if (r == 0)
307 f = Cnull;
308 else if (r <= 0x7f) {
309 if (!isprint(r) && !isspace(r))
310 f = Ceascii; /* ASCII control char */
311 else f = r;
312 } else if (r == 0x080) {
313 bump_utf_count(r);
314 f = Cutf;
315 } else if (r < 0xA0)
316 f = Cbinary; /* Invalid Runes */
317 else if (r <= 0xff)
318 f = Clatin; /* Latin 1 */
319 else {
320 bump_utf_count(r);
321 f = Cutf; /* UTF extension */
323 cfreq[f]++; /* ASCII chars peg directly */
325 /*
326 * gross classify
327 */
328 if (cfreq[Cbinary])
329 guess = Fbinary;
330 else if (cfreq[Cutf])
331 guess = Futf;
332 else if (cfreq[Clatin])
333 guess = Flatin;
334 else if (cfreq[Ceascii])
335 guess = Feascii;
336 else if (cfreq[Cnull] == n) {
337 print(mime ? OCTET : "first block all null bytes\n");
338 return;
340 else guess = Fascii;
341 /*
342 * lookup dictionary words
343 */
344 memset(wfreq, 0, sizeof(wfreq));
345 if(guess == Fascii || guess == Flatin || guess == Futf)
346 wordfreq();
347 /*
348 * call individual classify routines
349 */
350 for(i=0; call[i]; i++)
351 if((*call[i])())
352 return;
354 /*
355 * if all else fails,
356 * print out gross classification
357 */
358 if (nbuf < 100 && !mime)
359 print(mime ? PLAIN : "short ");
360 if (guess == Fascii)
361 print(mime ? PLAIN : "Ascii\n");
362 else if (guess == Feascii)
363 print(mime ? PLAIN : "extended ascii\n");
364 else if (guess == Flatin)
365 print(mime ? PLAIN : "latin ascii\n");
366 else if (guess == Futf && utf_count() < 4)
367 print_utf();
368 else print(mime ? OCTET : "binary\n");
371 void
372 bump_utf_count(Rune r)
374 int low, high, mid;
376 high = sizeof(language)/sizeof(language[0])-1;
377 for (low = 0; low < high;) {
378 mid = (low+high)/2;
379 if (r >=language[mid].low) {
380 if (r <= language[mid].high) {
381 language[mid].count++;
382 break;
383 } else low = mid+1;
384 } else high = mid;
388 int
389 utf_count(void)
391 int i, count;
393 count = 0;
394 for (i = 0; language[i].name; i++)
395 if (language[i].count > 0)
396 switch (language[i].mode) {
397 case Normal:
398 case First:
399 count++;
400 break;
401 default:
402 break;
404 return count;
407 int
408 chkascii(void)
410 int i;
412 for (i = 'a'; i < 'z'; i++)
413 if (cfreq[i])
414 return 1;
415 for (i = 'A'; i < 'Z'; i++)
416 if (cfreq[i])
417 return 1;
418 return 0;
421 int
422 find_first(char *name)
424 int i;
426 for (i = 0; language[i].name != 0; i++)
427 if (language[i].mode == First
428 && strcmp(language[i].name, name) == 0)
429 return i;
430 return -1;
433 void
434 print_utf(void)
436 int i, printed, j;
438 if(mime){
439 print(PLAIN);
440 return;
442 if (chkascii()) {
443 printed = 1;
444 print("Ascii");
445 } else
446 printed = 0;
447 for (i = 0; language[i].name; i++)
448 if (language[i].count) {
449 switch(language[i].mode) {
450 case Multi:
451 j = find_first(language[i].name);
452 if (j < 0)
453 break;
454 if (language[j].count > 0)
455 break;
456 /* Fall through */
457 case Normal:
458 case First:
459 if (printed)
460 print(" & ");
461 else printed = 1;
462 print("%s", language[i].name);
463 break;
464 case Shared:
465 default:
466 break;
469 if(!printed)
470 print("UTF");
471 print(" text\n");
474 void
475 wordfreq(void)
477 int low, high, mid, r;
478 uchar *p, *p2, c;
480 p = buf;
481 for(;;) {
482 while (p < buf+nbuf && !isalpha(*p))
483 p++;
484 if (p >= buf+nbuf)
485 return;
486 p2 = p;
487 while(p < buf+nbuf && isalpha(*p))
488 p++;
489 c = *p;
490 *p = 0;
491 high = sizeof(dict)/sizeof(dict[0]);
492 for(low = 0;low < high;) {
493 mid = (low+high)/2;
494 r = strcmp(dict[mid].word, (char*)p2);
495 if(r == 0) {
496 wfreq[dict[mid].class]++;
497 break;
499 if(r < 0)
500 low = mid+1;
501 else
502 high = mid;
504 *p++ = c;
508 typedef struct Filemagic Filemagic;
509 struct Filemagic {
510 ulong x;
511 ulong mask;
512 char *desc;
513 char *mime;
514 };
516 Filemagic long0tab[] = {
517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip\n",
523 070707, 0xFFFF, "cpio archive\n", OCTET,
524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi\n",
525 0xfffa0000, 0xfffe0000, "mp3 audio\n", "audio/mpeg\n",
526 0xcafebabe, 0xFFFFFFFF, "Mach-O fat executable\n", "application/x-mach-binary\n",
527 0xfeedface, 0xFFFFFFFE, "Mach-O executable\n", "application/x-mach-binary\n",
528 0xbebafeca, 0xFFFFFFFF, "Java class\n", "application/x-java-applet\n",
529 };
531 int
532 filemagic(Filemagic *tab, int ntab, ulong x)
534 int i;
536 for(i=0; i<ntab; i++)
537 if((x&tab[i].mask) == tab[i].x){
538 print(mime ? tab[i].mime : tab[i].desc);
539 return 1;
541 return 0;
544 int
545 long0(void)
547 /* Fhdr *f; */
548 long x;
550 seek(fd, 0, 0); /* reposition to start of file */
551 /*
552 if(crackhdr(fd, &f)) {
553 print(mime ? OCTET : "%s\n", f.name);
554 return 1;
556 */
557 x = LENDIAN(buf);
558 if(filemagic(long0tab, nelem(long0tab), x))
559 return 1;
560 return 0;
/* from tar.c */
enum
{
	NAMSIZ	= 100,
	TBLOCK	= 512,
};

/* one tar header block, viewed as raw bytes or as named fields */
union hblock
{
	char	dummy[TBLOCK];
	struct header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];

		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];	/* if non-null, path = prefix "/" name */
	} dbuf;
};
591 int
592 checksum(union hblock *hp)
594 int i;
595 char *cp;
596 struct header *hdr = &hp->dbuf;
598 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
599 *cp = ' ';
600 i = 0;
601 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
602 i += *cp & 0xff;
603 return i;
606 int
607 istar(void)
609 int chksum;
610 char tblock[TBLOCK];
611 union hblock *hp = (union hblock *)tblock;
612 struct header *hdr = &hp->dbuf;
614 seek(fd, 0, 0); /* reposition to start of file */
615 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
616 return 0;
617 chksum = strtol(hdr->chksum, 0, 8);
618 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
619 if (strcmp(hdr->magic, "ustar") == 0)
620 print(mime? "application/x-ustar\n":
621 "posix tar archive\n");
622 else
623 print(mime? "application/x-tar\n": "tar archive\n");
624 return 1;
626 return 0;
/*
 * initial words to classify file
 * istring() compares the first 'length' bytes of buf with 'key';
 * rows are tried in order and the row with key == 0 ends the list.
 */
struct	FILE_STRING
{
	char 	*key;		/* leading bytes to look for */
	char	*filetype;	/* description, printed without -m */
	int	length;		/* number of bytes of key to compare */
	char	*mime;		/* MIME type, printed with -m */
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0",	"ghostscript fax file",	18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "\106\117\126\142",	"x3f",				4,	"image/x3f" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1",	"microsoft office document",	8,	"application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ 0,0,0,0 },
};
670 int
671 istring(void)
673 int i, j;
674 struct FILE_STRING *p;
676 for(p = file_string; p->key; p++) {
677 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
678 if(mime)
679 print("%s\n", p->mime);
680 else
681 print("%s\n", p->filetype);
682 return 1;
685 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
686 for(i = 5; i < nbuf; i++)
687 if(buf[i] == '\n')
688 break;
689 if(mime)
690 print(OCTET);
691 else
692 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
693 return 1;
695 if(buf[0]=='#' && buf[1]=='!'){
696 i=2;
697 for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++)
698 if(buf[j] == '/')
699 i = j+1;
700 if(mime)
701 print(PLAIN);
702 else
703 print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i);
704 return 1;
706 return 0;
709 int
710 iff(void)
712 if (strncmp((char*)buf, "FORM", 4) == 0 &&
713 strncmp((char*)buf+8, "AIFF", 4) == 0) {
714 print("%s\n", mime? "audio/x-aiff": "aiff audio");
715 return 1;
717 return 0;
/* tag names counted by ishtml(); the list ends with 0 */
char*	html_string[] =
{
	"title", "body", "head", "strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl", "br", "em",
	0,
};
740 int
741 ishtml(void)
743 uchar *p, *q;
744 int i, count;
746 /* compare strings between '<' and '>' to html table */
747 count = 0;
748 p = buf;
749 for(;;) {
750 while (p < buf+nbuf && *p != '<')
751 p++;
752 p++;
753 if (p >= buf+nbuf)
754 break;
755 if(*p == '/')
756 p++;
757 q = p;
758 while(p < buf+nbuf && *p != '>')
759 p++;
760 if (p >= buf+nbuf)
761 break;
762 for(i = 0; html_string[i]; i++) {
763 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
764 if(count++ > 4) {
765 print(mime ? "text/html\n" : "HTML file\n");
766 return 1;
768 break;
771 p++;
773 return 0;
/* header prefixes counted by isrfc822(); the list ends with 0 */
char*	rfc822_string[] =
{
	"from:", "date:", "to:", "subject:",
	"received:", "reply to:", "sender:",
	0,
};
788 int
789 isrfc822(void)
792 char *p, *q, *r;
793 int i, count;
795 count = 0;
796 p = (char*)buf;
797 for(;;) {
798 q = strchr(p, '\n');
799 if(q == nil)
800 break;
801 *q = 0;
802 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
803 count++;
804 *q = '\n';
805 p = q+1;
806 continue;
808 *q = '\n';
809 if(*p != '\t' && *p != ' '){
810 r = strchr(p, ':');
811 if(r == 0 || r > q)
812 break;
813 for(i = 0; rfc822_string[i]; i++) {
814 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
815 count++;
816 break;
820 p = q+1;
822 if(count >= 3){
823 print(mime ? "message/rfc822\n" : "email file\n");
824 return 1;
826 return 0;
829 int
830 ismbox(void)
832 char *p, *q;
834 p = (char*)buf;
835 q = strchr(p, '\n');
836 if(q == nil)
837 return 0;
838 *q = 0;
839 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
840 print(mime ? "text/plain\n" : "mail box\n");
841 return 1;
843 *q = '\n';
844 return 0;
847 int
848 isc(void)
850 int n;
852 n = wfreq[I1];
853 /*
854 * includes
855 */
856 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
857 goto yes;
858 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
859 goto yes;
860 /*
861 * declarations
862 */
863 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
864 goto yes;
865 /*
866 * assignments
867 */
868 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
869 goto yes;
870 return 0;
872 yes:
873 if(mime){
874 print(PLAIN);
875 return 1;
877 if(wfreq[Alword] > 0)
878 print("alef program\n");
879 else
880 print("c program\n");
881 return 1;
884 int
885 islimbo(void)
888 /*
889 * includes
890 */
891 if(wfreq[Lword] < 4)
892 return 0;
893 print(mime ? PLAIN : "limbo program\n");
894 return 1;
897 int
898 isas(void)
901 /*
902 * includes
903 */
904 if(wfreq[Aword] < 2)
905 return 0;
906 print(mime ? PLAIN : "as program\n");
907 return 1;
910 /*
911 * low entropy means encrypted
912 */
913 int
914 ismung(void)
916 int i, bucket[8];
917 float cs;
919 if(nbuf < 64)
920 return 0;
921 memset(bucket, 0, sizeof(bucket));
922 for(i=0; i<64; i++)
923 bucket[(buf[i]>>5)&07] += 1;
925 cs = 0.;
926 for(i=0; i<8; i++)
927 cs += (bucket[i]-8)*(bucket[i]-8);
928 cs /= 8.;
929 if(cs <= 24.322) {
930 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
931 print(mime ? OCTET : "compressed\n");
932 else
933 print(mime ? OCTET : "encrypted\n");
934 return 1;
936 return 0;
939 /*
940 * english by punctuation and frequencies
941 */
942 int
943 isenglish(void)
945 int vow, comm, rare, badpun, punct;
946 char *p;
948 if(guess != Fascii && guess != Feascii)
949 return 0;
950 badpun = 0;
951 punct = 0;
952 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
953 switch(*p) {
954 case '.':
955 case ',':
956 case ')':
957 case '%':
958 case ';':
959 case ':':
960 case '?':
961 punct++;
962 if(p[1] != ' ' && p[1] != '\n')
963 badpun++;
965 if(badpun*5 > punct)
966 return 0;
967 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
968 return 0;
969 if(2*cfreq[';'] > cfreq['e'])
970 return 0;
972 vow = 0;
973 for(p="AEIOU"; *p; p++) {
974 vow += cfreq[(uchar)*p];
975 vow += cfreq[tolower((uchar)*p)];
977 comm = 0;
978 for(p="ETAION"; *p; p++) {
979 comm += cfreq[(uchar)*p];
980 comm += cfreq[tolower((uchar)*p)];
982 rare = 0;
983 for(p="VJKQXZ"; *p; p++) {
984 rare += cfreq[(uchar)*p];
985 rare += cfreq[tolower((uchar)*p)];
987 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
988 print(mime ? PLAIN : "English text\n");
989 return 1;
991 return 0;
994 /*
995 * pick up a number with
996 * syntax _*[0-9]+_
997 */
998 #define P9BITLEN 12
999 int
1000 p9bitnum(uchar *bp)
1002 int n, c, len;
1004 len = P9BITLEN;
1005 while(*bp == ' ') {
1006 bp++;
1007 len--;
1008 if(len <= 0)
1009 return -1;
1011 n = 0;
1012 while(len > 1) {
1013 c = *bp++;
1014 if(!isdigit(c))
1015 return -1;
1016 n = n*10 + c-'0';
1017 len--;
1019 if(*bp != ' ')
1020 return -1;
1021 return n;
/*
 * Decode the first 12-byte field of an image header.
 *	s	start of the field
 *	newp	set to 1 when the field is a channel descriptor,
 *		0 when it is a plain number (or blank)
 *
 * A field starting with a digit holds the log2 of the depth and
 * 1<<n is returned; exponents above 30 are rejected because the
 * shift would not fit in an int.  Any other field is read as
 * letter/number pairs (such as r8g8b8) whose numbers are summed;
 * only totals of 8, 16, 24 and 32 are accepted.
 * Returns the depth in bits, or -1 if the field is not acceptable.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d;
	unsigned long ld;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = strtoul(s, 0, 10);
		if(ld > 30)
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;	/* skip letter */
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
1056 int
1057 isp9bit(void)
1059 int dep, lox, loy, hix, hiy, px, new;
1060 ulong t;
1061 long len;
1062 char *newlabel;
1064 newlabel = "old ";
1066 dep = depthof((char*)buf + 0*P9BITLEN, &new);
1067 if(new)
1068 newlabel = "";
1069 lox = p9bitnum(buf + 1*P9BITLEN);
1070 loy = p9bitnum(buf + 2*P9BITLEN);
1071 hix = p9bitnum(buf + 3*P9BITLEN);
1072 hiy = p9bitnum(buf + 4*P9BITLEN);
1073 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1074 return 0;
1076 if(dep < 8){
1077 px = 8/dep; /* pixels per byte */
1078 /* set l to number of bytes of data per scan line */
1079 if(lox >= 0)
1080 len = (hix+px-1)/px - lox/px;
1081 else{ /* make positive before divide */
1082 t = (-lox)+px-1;
1083 t = (t/px)*px;
1084 len = (t+hix+px-1)/px;
1086 }else
1087 len = (hix-lox)*dep/8;
1088 len *= (hiy-loy); /* col length */
1089 len += 5*P9BITLEN; /* size of initial ascii */
1092 * for image file, length is non-zero and must match calculation above
1093 * for /dev/window and /dev/screen the length is always zero
1094 * for subfont, the subfont header should follow immediately.
1096 if (len != 0 && mbuf->length == 0) {
1097 print("%splan 9 image\n", newlabel);
1098 return 1;
1100 if (mbuf->length == len) {
1101 print("%splan 9 image\n", newlabel);
1102 return 1;
1104 /* Ghostscript sometimes produces a little extra on the end */
1105 if (mbuf->length < len+P9BITLEN) {
1106 print("%splan 9 image\n", newlabel);
1107 return 1;
1109 if (p9subfont(buf+len)) {
1110 print("%ssubfont file\n", newlabel);
1111 return 1;
1113 return 0;
1116 int
1117 p9subfont(uchar *p)
1119 int n, h, a;
1121 /* if image too big, assume it's a subfont */
1122 if (p+3*P9BITLEN > buf+sizeof(buf))
1123 return 1;
1125 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1126 if (n < 0)
1127 return 0;
1128 h = p9bitnum(p + 1*P9BITLEN); /* height */
1129 if (h < 0)
1130 return 0;
1131 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
1132 if (a < 0)
1133 return 0;
1134 return 1;
1137 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
1139 int
1140 isp9font(void)
1142 uchar *cp, *p;
1143 int i, n;
1144 char pathname[1024];
1146 cp = buf;
1147 if (!getfontnum(cp, &cp)) /* height */
1148 return 0;
1149 if (!getfontnum(cp, &cp)) /* ascent */
1150 return 0;
1151 for (i = 0; 1; i++) {
1152 if (!getfontnum(cp, &cp)) /* min */
1153 break;
1154 if (!getfontnum(cp, &cp)) /* max */
1155 return 0;
1156 while (WHITESPACE(*cp))
1157 cp++;
1158 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1160 /* construct a path name, if needed */
1161 n = 0;
1162 if (*p != '/' && slash) {
1163 n = slash-fname+1;
1164 if (n < sizeof(pathname))
1165 memcpy(pathname, fname, n);
1166 else n = 0;
1168 if (n+cp-p < sizeof(pathname)) {
1169 memcpy(pathname+n, p, cp-p);
1170 n += cp-p;
1171 pathname[n] = 0;
1172 if (access(pathname, AEXIST) < 0)
1173 return 0;
1176 if (i) {
1177 print(mime ? "text/plain\n" : "font file\n");
1178 return 1;
1180 return 0;
1183 int
1184 getfontnum(uchar *cp, uchar **rp)
1186 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1187 cp++;
1188 if (*cp < '0' || *cp > '9')
1189 return 0;
1190 strtoul((char *)cp, (char **)rp, 0);
1191 if (!WHITESPACE(**rp))
1192 return 0;
1193 return 1;
1196 int
1197 isrtf(void)
1199 if(strstr((char *)buf, "\\rtf1")){
1200 print(mime ? "application/rtf\n" : "rich text format\n");
1201 return 1;
1203 return 0;
1206 int
1207 ismsdos(void)
1209 if (buf[0] == 0x4d && buf[1] == 0x5a){
1210 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1211 return 1;
1213 return 0;
1216 int
1217 iself(void)
1219 static char *cpu[] = { /* NB: incomplete and arbitary list */
1220 nil,
1221 /*1*/ "WE32100",
1222 /*2*/ "SPARC",
1223 /*3*/ "i386",
1224 /*4*/ "M68000",
1225 /*5*/ "M88000",
1226 /*6*/ "i486",
1227 /*7*/ "i860",
1228 /*8*/ "R3000",
1229 /*9*/ "S370",
1230 /*10*/ "R4000",
1231 nil, nil, nil, nil,
1232 /*15*/ "HP-PA",
1233 nil,
1234 nil,
1235 /*18*/ "sparc v8+",
1236 /*19*/ "i960",
1237 /*20*/ "PPC-32",
1238 /*21*/ "PPC-64",
1239 nil, nil, nil, nil,
1240 nil, nil, nil, nil, nil,
1241 nil, nil, nil, nil, nil,
1242 nil, nil, nil, nil,
1243 /*40*/ "ARM",
1244 /*41*/ "Alpha",
1245 nil,
1246 /*43*/ "sparc v9",
1247 nil, nil,
1248 nil, nil, nil, nil,
1249 /*50*/ "IA-64",
1250 nil, nil, nil, nil, nil,
1251 nil, nil, nil, nil, nil,
1252 nil,
1253 /*62*/ "AMD64",
1254 nil, nil, nil,
1255 nil, nil, nil, nil, nil,
1256 nil, nil, nil, nil,
1257 /*75*/ "VAX",
1261 if (memcmp(buf, "\177ELF", 4) == 0){
1262 /* gcc misparses \x7FELF as \x7FE L F */
1263 if (!mime){
1264 int n = (buf[19] << 8) | buf[18];
1265 char *p = "unknown";
1267 if (n > 0 && n < nelem(cpu) && cpu[n])
1268 p = cpu[n];
1269 else {
1270 /* try the other byte order */
1271 n = (buf[18] << 8) | buf[19];
1272 if (n > 0 && n < nelem(cpu) && cpu[n])
1273 p = cpu[n];
1275 print("%s ELF executable\n", p);
1277 else
1278 print("application/x-elf-executable");
1279 return 1;
1282 return 0;