Blob


1 #include <u.h>
2 #include <libc.h>
3 #include <bio.h>
4 #include <ctype.h>
5 #include <mach.h>
7 /*
8 * file - determine type of file
9 */
/*
 * Assemble the four bytes at p into a little-endian 32-bit value.
 * The arithmetic is done in unsigned long so that a top byte with
 * its high bit set is not shifted into the sign bit of an int.
 */
#define	LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
12 uchar buf[6001];
13 short cfreq[140];
14 short wfreq[50];
15 int nbuf;
16 Dir* mbuf;
17 int fd;
18 char *fname;
19 char *slash;
enum
{
	/* dictionary word classes; used as indices into wfreq[] */
	Cword	= 0,
	Fword,
	Aword,
	Alword,
	Lword,
	I1,
	I2,
	I3,

	/* character classes; used as indices into cfreq[] above the ASCII codes */
	Clatin	= 128,
	Cbinary,
	Cnull,
	Ceascii,
	Cutf,
};
37 struct
38 {
39 char* word;
40 int class;
41 } dict[] =
42 {
43 "PATH", Lword,
44 "TEXT", Aword,
45 "adt", Alword,
46 "aggr", Alword,
47 "alef", Alword,
48 "array", Lword,
49 "block", Fword,
50 "chan", Alword,
51 "char", Cword,
52 "common", Fword,
53 "con", Lword,
54 "data", Fword,
55 "dimension", Fword,
56 "double", Cword,
57 "extern", Cword,
58 "bio", I2,
59 "float", Cword,
60 "fn", Lword,
61 "function", Fword,
62 "h", I3,
63 "implement", Lword,
64 "import", Lword,
65 "include", I1,
66 "int", Cword,
67 "integer", Fword,
68 "iota", Lword,
69 "libc", I2,
70 "long", Cword,
71 "module", Lword,
72 "real", Fword,
73 "ref", Lword,
74 "register", Cword,
75 "self", Lword,
76 "short", Cword,
77 "static", Cword,
78 "stdio", I2,
79 "struct", Cword,
80 "subroutine", Fword,
81 "u", I2,
82 "void", Cword,
83 };
/* values of the 'mode' field in the language table */
enum
{
	Normal	= 0,
	First,		/* first entry for a language spanning several ranges */
	Multi,		/* later entries for such a language */
	Shared,		/* codes used in several languages */
};
93 struct
94 {
95 int mode; /* see enum above */
96 int count;
97 int low;
98 int high;
99 char *name;
101 } language[] =
103 Normal, 0, 0x0080, 0x0080, "Extended Latin",
104 Normal, 0, 0x0100, 0x01FF, "Extended Latin",
105 Normal, 0, 0x0370, 0x03FF, "Greek",
106 Normal, 0, 0x0400, 0x04FF, "Cyrillic",
107 Normal, 0, 0x0530, 0x058F, "Armenian",
108 Normal, 0, 0x0590, 0x05FF, "Hebrew",
109 Normal, 0, 0x0600, 0x06FF, "Arabic",
110 Normal, 0, 0x0900, 0x097F, "Devanagari",
111 Normal, 0, 0x0980, 0x09FF, "Bengali",
112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi",
113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati",
114 Normal, 0, 0x0B00, 0x0B7F, "Oriya",
115 Normal, 0, 0x0B80, 0x0BFF, "Tamil",
116 Normal, 0, 0x0C00, 0x0C7F, "Telugu",
117 Normal, 0, 0x0C80, 0x0CFF, "Kannada",
118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam",
119 Normal, 0, 0x0E00, 0x0E7F, "Thai",
120 Normal, 0, 0x0E80, 0x0EFF, "Lao",
121 Normal, 0, 0x1000, 0x105F, "Tibetan",
122 Normal, 0, 0x10A0, 0x10FF, "Georgian",
123 Normal, 0, 0x3040, 0x30FF, "Japanese",
124 Normal, 0, 0x3100, 0x312F, "Chinese",
125 First, 0, 0x3130, 0x318F, "Korean",
126 Multi, 0, 0x3400, 0x3D2F, "Korean",
127 Shared, 0, 0x4e00, 0x9fff, "CJK",
128 Normal, 0, 0, 0, 0, /* terminal entry */
129 };
/* gross classification chosen by filetype() */
enum
{
	Fascii	= 0,	/* printable ascii */
	Flatin,		/* latin 1 */
	Futf,		/* UTF character set */
	Fbinary,	/* binary */
	Feascii,	/* ASCII with control chars */
	Fnull,		/* NULL in file */
} guess;
142 void bump_utf_count(Rune);
143 int cistrncmp(char*, char*, int);
144 void filetype(int);
145 int getfontnum(uchar*, uchar**);
146 int isas(void);
147 int isc(void);
148 int isenglish(void);
149 int ishp(void);
150 int ishtml(void);
151 int isrfc822(void);
152 int ismbox(void);
153 int islimbo(void);
154 int ismung(void);
155 int isp9bit(void);
156 int isp9font(void);
157 int isrtf(void);
158 int ismsdos(void);
159 int iself(void);
160 int istring(void);
161 int iff(void);
162 int long0(void);
163 int istar(void);
164 int p9bitnum(uchar*);
165 int p9subfont(uchar*);
166 void print_utf(void);
167 void type(char*, int);
168 int utf_count(void);
169 void wordfreq(void);
171 int (*call[])(void) =
173 long0, /* recognizable by first 4 bytes */
174 istring, /* recognizable by first string */
175 iff, /* interchange file format (strings) */
176 isrfc822, /* email file */
177 ismbox, /* mail box */
178 istar, /* recognizable by tar checksum */
179 ishtml, /* html keywords */
180 /* iscint, /* compiler/assembler intermediate */
181 islimbo, /* limbo source */
182 isc, /* c & alef compiler key words */
183 isas, /* assembler key words */
184 ismung, /* entropy compressed/encrypted */
185 isp9font, /* plan 9 font */
186 isp9bit, /* plan 9 image (as from /dev/window) */
187 isenglish, /* char frequency English */
188 isrtf, /* rich text format */
189 ismsdos, /* msdos exe (virus file attachement) */
190 iself, /* ELF (foreign) executable */
192 };
int	mime;		/* set by the -m flag: print MIME types instead of descriptions */

/* MIME answers shared by several classifiers */
#define	OCTET	"application/octet-stream\n"
#define	PLAIN	"text/plain\n"
199 void
200 main(int argc, char *argv[])
202 int i, j, maxlen;
203 char *cp;
204 Rune r;
206 ARGBEGIN{
207 case 'm':
208 mime = 1;
209 break;
210 default:
211 fprint(2, "usage: file [-m] [file...]\n");
212 exits("usage");
213 }ARGEND;
215 maxlen = 0;
216 if(mime == 0 || argc > 1){
217 for(i = 0; i < argc; i++) {
218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
220 if(j > maxlen)
221 maxlen = j;
224 if (argc <= 0) {
225 if(!mime)
226 print ("stdin: ");
227 filetype(0);
229 else {
230 for(i = 0; i < argc; i++)
231 type(argv[i], maxlen);
233 exits(0);
236 void
237 type(char *file, int nlen)
239 Rune r;
240 int i;
241 char *p;
243 if(nlen > 0){
244 slash = 0;
245 for (i = 0, p = file; *p; i++) {
246 if (*p == '/') /* find rightmost slash */
247 slash = p;
248 p += chartorune(&r, p); /* count runes */
250 print("%s:%*s",file, nlen-i+1, "");
252 fname = file;
253 if ((fd = open(file, OREAD)) < 0) {
254 print("cannot open\n");
255 return;
257 filetype(fd);
258 close(fd);
261 void
262 filetype(int fd)
264 Rune r;
265 int i, f, n;
266 char *p, *eob;
268 free(mbuf);
269 mbuf = dirfstat(fd);
270 if(mbuf == nil){
271 print("cannot stat: %r\n");
272 return;
274 if(mbuf->mode & DMDIR) {
275 print(mime ? "text/directory\n" : "directory\n");
276 return;
278 if(mbuf->type != 'M' && mbuf->type != '|') {
279 print(mime ? OCTET : "special file #%c/%s\n",
280 mbuf->type, mbuf->name);
281 return;
283 nbuf = read(fd, buf, sizeof(buf)-1);
285 if(nbuf < 0) {
286 print("cannot read\n");
287 return;
289 if(nbuf == 0) {
290 print(mime ? PLAIN : "empty file\n");
291 return;
293 buf[nbuf] = 0;
295 /*
296 * build histogram table
297 */
298 memset(cfreq, 0, sizeof(cfreq));
299 for (i = 0; language[i].name; i++)
300 language[i].count = 0;
301 eob = (char *)buf+nbuf;
302 for(n = 0, p = (char *)buf; p < eob; n++) {
303 if (!fullrune(p, eob-p) && eob-p < UTFmax)
304 break;
305 p += chartorune(&r, p);
306 if (r == 0)
307 f = Cnull;
308 else if (r <= 0x7f) {
309 if (!isprint(r) && !isspace(r))
310 f = Ceascii; /* ASCII control char */
311 else f = r;
312 } else if (r == 0x080) {
313 bump_utf_count(r);
314 f = Cutf;
315 } else if (r < 0xA0)
316 f = Cbinary; /* Invalid Runes */
317 else if (r <= 0xff)
318 f = Clatin; /* Latin 1 */
319 else {
320 bump_utf_count(r);
321 f = Cutf; /* UTF extension */
323 cfreq[f]++; /* ASCII chars peg directly */
325 /*
326 * gross classify
327 */
328 if (cfreq[Cbinary])
329 guess = Fbinary;
330 else if (cfreq[Cutf])
331 guess = Futf;
332 else if (cfreq[Clatin])
333 guess = Flatin;
334 else if (cfreq[Ceascii])
335 guess = Feascii;
336 else if (cfreq[Cnull] == n) {
337 print(mime ? OCTET : "first block all null bytes\n");
338 return;
340 else guess = Fascii;
341 /*
342 * lookup dictionary words
343 */
344 memset(wfreq, 0, sizeof(wfreq));
345 if(guess == Fascii || guess == Flatin || guess == Futf)
346 wordfreq();
347 /*
348 * call individual classify routines
349 */
350 for(i=0; call[i]; i++)
351 if((*call[i])())
352 return;
354 /*
355 * if all else fails,
356 * print out gross classification
357 */
358 if (nbuf < 100 && !mime)
359 print(mime ? PLAIN : "short ");
360 if (guess == Fascii)
361 print(mime ? PLAIN : "Ascii\n");
362 else if (guess == Feascii)
363 print(mime ? PLAIN : "extended ascii\n");
364 else if (guess == Flatin)
365 print(mime ? PLAIN : "latin ascii\n");
366 else if (guess == Futf && utf_count() < 4)
367 print_utf();
368 else print(mime ? OCTET : "binary\n");
371 void
372 bump_utf_count(Rune r)
374 int low, high, mid;
376 high = sizeof(language)/sizeof(language[0])-1;
377 for (low = 0; low < high;) {
378 mid = (low+high)/2;
379 if (r >=language[mid].low) {
380 if (r <= language[mid].high) {
381 language[mid].count++;
382 break;
383 } else low = mid+1;
384 } else high = mid;
388 int
389 utf_count(void)
391 int i, count;
393 count = 0;
394 for (i = 0; language[i].name; i++)
395 if (language[i].count > 0)
396 switch (language[i].mode) {
397 case Normal:
398 case First:
399 count++;
400 break;
401 default:
402 break;
404 return count;
407 int
408 chkascii(void)
410 int i;
412 for (i = 'a'; i < 'z'; i++)
413 if (cfreq[i])
414 return 1;
415 for (i = 'A'; i < 'Z'; i++)
416 if (cfreq[i])
417 return 1;
418 return 0;
421 int
422 find_first(char *name)
424 int i;
426 for (i = 0; language[i].name != 0; i++)
427 if (language[i].mode == First
428 && strcmp(language[i].name, name) == 0)
429 return i;
430 return -1;
433 void
434 print_utf(void)
436 int i, printed, j;
438 if(mime){
439 print(PLAIN);
440 return;
442 if (chkascii()) {
443 printed = 1;
444 print("Ascii");
445 } else
446 printed = 0;
447 for (i = 0; language[i].name; i++)
448 if (language[i].count) {
449 switch(language[i].mode) {
450 case Multi:
451 j = find_first(language[i].name);
452 if (j < 0)
453 break;
454 if (language[j].count > 0)
455 break;
456 /* Fall through */
457 case Normal:
458 case First:
459 if (printed)
460 print(" & ");
461 else printed = 1;
462 print("%s", language[i].name);
463 break;
464 case Shared:
465 default:
466 break;
469 if(!printed)
470 print("UTF");
471 print(" text\n");
474 void
475 wordfreq(void)
477 int low, high, mid, r;
478 uchar *p, *p2, c;
480 p = buf;
481 for(;;) {
482 while (p < buf+nbuf && !isalpha(*p))
483 p++;
484 if (p >= buf+nbuf)
485 return;
486 p2 = p;
487 while(p < buf+nbuf && isalpha(*p))
488 p++;
489 c = *p;
490 *p = 0;
491 high = sizeof(dict)/sizeof(dict[0]);
492 for(low = 0;low < high;) {
493 mid = (low+high)/2;
494 r = strcmp(dict[mid].word, (char*)p2);
495 if(r == 0) {
496 wfreq[dict[mid].class]++;
497 break;
499 if(r < 0)
500 low = mid+1;
501 else
502 high = mid;
504 *p++ = c;
508 typedef struct Filemagic Filemagic;
509 struct Filemagic {
510 ulong x;
511 ulong mask;
512 char *desc;
513 char *mime;
514 };
516 Filemagic long0tab[] = {
517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip",
523 070707, 0xFFFF, "cpio archive\n", OCTET,
524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi",
525 0xfffa0000, 0xfffe0000, "mp3 audio", "audio/mpeg",
526 };
528 int
529 filemagic(Filemagic *tab, int ntab, ulong x)
531 int i;
533 for(i=0; i<ntab; i++)
534 if((x&tab[i].mask) == tab[i].x){
535 print(mime ? tab[i].mime : tab[i].desc);
536 return 1;
538 return 0;
541 int
542 long0(void)
544 /* Fhdr *f; */
545 long x;
547 seek(fd, 0, 0); /* reposition to start of file */
548 /*
549 if(crackhdr(fd, &f)) {
550 print(mime ? OCTET : "%s\n", f.name);
551 return 1;
553 */
554 x = LENDIAN(buf);
555 if(filemagic(long0tab, nelem(long0tab), x))
556 return 1;
557 return 0;
/* from tar.c */
enum
{
	NAMSIZ	= 100,
	TBLOCK	= 512,
};

/* one tar block, viewed either as raw bytes or as an archive header */
union hblock
{
	char	dummy[TBLOCK];
	struct header
	{
		char	name[NAMSIZ];
		char	mode[8];
		char	uid[8];
		char	gid[8];
		char	size[12];
		char	mtime[12];
		char	chksum[8];
		char	linkflag;
		char	linkname[NAMSIZ];

		/* rest are defined by POSIX's ustar format; see p1003.2b */
		char	magic[6];	/* "ustar" */
		char	version[2];
		char	uname[32];
		char	gname[32];
		char	devmajor[8];
		char	devminor[8];
		char	prefix[155];	/* if non-null, path = prefix "/" name */
	} dbuf;
};
588 int
589 checksum(union hblock *hp)
591 int i;
592 char *cp;
593 struct header *hdr = &hp->dbuf;
595 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
596 *cp = ' ';
597 i = 0;
598 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
599 i += *cp & 0xff;
600 return i;
603 int
604 istar(void)
606 int chksum;
607 char tblock[TBLOCK];
608 union hblock *hp = (union hblock *)tblock;
609 struct header *hdr = &hp->dbuf;
611 seek(fd, 0, 0); /* reposition to start of file */
612 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
613 return 0;
614 chksum = strtol(hdr->chksum, 0, 8);
615 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
616 if (strcmp(hdr->magic, "ustar") == 0)
617 print(mime? "application/x-ustar\n":
618 "posix tar archive\n");
619 else
620 print(mime? "application/x-tar\n": "tar archive\n");
621 return 1;
623 return 0;
/*
 * initial words to classify file:
 * istring() compares the first 'length' bytes of buf with 'key'
 * and takes the first entry that matches, so a longer key must
 * come before any key that is a prefix of it.
 */
struct	FILE_STRING
{
	char 	*key;
	char	*filetype;
	int	length;
	char	*mime;
} file_string[] =
{
	{ "!<arch>\n__.SYMDEF",	"archive random library",	16,	"application/octet-stream" },
	{ "!<arch>\n",		"archive",			8,	"application/octet-stream" },
	{ "070707",		"cpio archive - ascii header",	6,	"application/octet-stream" },
	{ "%!",			"postscript",			2,	"application/postscript" },
	{ "\004%!",		"postscript",			3,	"application/postscript" },
	{ "x T post",		"troff output for post",	8,	"application/troff" },
	{ "x T Latin1",		"troff output for Latin1",	10,	"application/troff" },
	{ "x T utf",		"troff output for UTF",		7,	"application/troff" },
	{ "x T 202",		"troff output for 202",		7,	"application/troff" },
	{ "x T aps",		"troff output for aps",		7,	"application/troff" },
	{ "GIF",		"GIF image", 			3,	"image/gif" },
	{ "\0PC Research, Inc\0", "ghostscript fax file",	18,	"application/ghostscript" },
	{ "%PDF",		"PDF",				4,	"application/pdf" },
	{ "<html>\n",		"HTML file",			7,	"text/html" },
	{ "<HTML>\n",		"HTML file",			7,	"text/html" },
	{ "compressed\n",	"Compressed image or subfont",	11,	"application/octet-stream" },
	{ "\111\111\052\000",	"tiff",				4,	"image/tiff" },
	{ "\115\115\000\052",	"tiff",				4,	"image/tiff" },
	{ "\377\330\377\340",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\341",	"jpeg",				4,	"image/jpeg" },
	{ "\377\330\377\333",	"jpeg",				4,	"image/jpeg" },
	{ "\106\117\126\142",	"x3f",				4,	"image/x3f" },
	{ "BM",			"bmp",				2,	"image/bmp" },
	{ "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream" },
	{ "<MakerFile ",	"FrameMaker file",		11,	"application/framemaker" },
	{ "\033%-12345X",	"HPJCL file",			9,	"application/hpjcl" },
	{ "ID3",		"mp3 audio with id3",		3,	"audio/mpeg" },
	{ 0, 0, 0, 0 }
};
667 int
668 istring(void)
670 int i, j;
671 struct FILE_STRING *p;
673 for(p = file_string; p->key; p++) {
674 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
675 if(mime)
676 print("%s\n", p->mime);
677 else
678 print("%s\n", p->filetype);
679 return 1;
682 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
683 for(i = 5; i < nbuf; i++)
684 if(buf[i] == '\n')
685 break;
686 if(mime)
687 print(OCTET);
688 else
689 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
690 return 1;
692 if(buf[0]=='#' && buf[1]=='!'){
693 i=2;
694 for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++)
695 if(buf[j] == '/')
696 i = j+1;
697 if(mime)
698 print(PLAIN);
699 else
700 print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i);
701 return 1;
703 return 0;
706 int
707 iff(void)
709 if (strncmp((char*)buf, "FORM", 4) == 0 &&
710 strncmp((char*)buf+8, "AIFF", 4) == 0) {
711 print("%s\n", mime? "audio/x-aiff": "aiff audio");
712 return 1;
714 return 0;
/* tag names counted by ishtml(); the list ends with a nil entry */
char	*html_string[] =
{
	"title", "body", "head", "strong",
	"h1", "h2", "h3", "h4", "h5", "h6",
	"ul", "li", "dl", "br", "em",
	0,
};
737 int
738 ishtml(void)
740 uchar *p, *q;
741 int i, count;
743 /* compare strings between '<' and '>' to html table */
744 count = 0;
745 p = buf;
746 for(;;) {
747 while (p < buf+nbuf && *p != '<')
748 p++;
749 p++;
750 if (p >= buf+nbuf)
751 break;
752 if(*p == '/')
753 p++;
754 q = p;
755 while(p < buf+nbuf && *p != '>')
756 p++;
757 if (p >= buf+nbuf)
758 break;
759 for(i = 0; html_string[i]; i++) {
760 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
761 if(count++ > 4) {
762 print(mime ? "text/html\n" : "HTML file\n");
763 return 1;
765 break;
768 p++;
770 return 0;
/*
 * Header field names counted by isrfc822(); compared without
 * regard to case.  The list ends with a nil entry.
 * "reply-to:" is the spelling used in mail headers; the older
 * "reply to:" entry is kept so nothing that matched before is lost.
 */
char*	rfc822_string[] =
{
	"from:",
	"date:",
	"to:",
	"subject:",
	"received:",
	"reply to:",
	"reply-to:",
	"sender:",
	0,
};
785 int
786 isrfc822(void)
789 char *p, *q, *r;
790 int i, count;
792 count = 0;
793 p = (char*)buf;
794 for(;;) {
795 q = strchr(p, '\n');
796 if(q == nil)
797 break;
798 *q = 0;
799 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
800 count++;
801 *q = '\n';
802 p = q+1;
803 continue;
805 *q = '\n';
806 if(*p != '\t' && *p != ' '){
807 r = strchr(p, ':');
808 if(r == 0 || r > q)
809 break;
810 for(i = 0; rfc822_string[i]; i++) {
811 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
812 count++;
813 break;
817 p = q+1;
819 if(count >= 3){
820 print(mime ? "message/rfc822\n" : "email file\n");
821 return 1;
823 return 0;
826 int
827 ismbox(void)
829 char *p, *q;
831 p = (char*)buf;
832 q = strchr(p, '\n');
833 if(q == nil)
834 return 0;
835 *q = 0;
836 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
837 print(mime ? "text/plain\n" : "mail box\n");
838 return 1;
840 *q = '\n';
841 return 0;
844 int
845 isc(void)
847 int n;
849 n = wfreq[I1];
850 /*
851 * includes
852 */
853 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
854 goto yes;
855 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
856 goto yes;
857 /*
858 * declarations
859 */
860 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
861 goto yes;
862 /*
863 * assignments
864 */
865 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
866 goto yes;
867 return 0;
869 yes:
870 if(mime){
871 print(PLAIN);
872 return 1;
874 if(wfreq[Alword] > 0)
875 print("alef program\n");
876 else
877 print("c program\n");
878 return 1;
881 int
882 islimbo(void)
885 /*
886 * includes
887 */
888 if(wfreq[Lword] < 4)
889 return 0;
890 print(mime ? PLAIN : "limbo program\n");
891 return 1;
894 int
895 isas(void)
898 /*
899 * includes
900 */
901 if(wfreq[Aword] < 2)
902 return 0;
903 print(mime ? PLAIN : "as program\n");
904 return 1;
907 /*
908 * low entropy means encrypted
909 */
910 int
911 ismung(void)
913 int i, bucket[8];
914 float cs;
916 if(nbuf < 64)
917 return 0;
918 memset(bucket, 0, sizeof(bucket));
919 for(i=0; i<64; i++)
920 bucket[(buf[i]>>5)&07] += 1;
922 cs = 0.;
923 for(i=0; i<8; i++)
924 cs += (bucket[i]-8)*(bucket[i]-8);
925 cs /= 8.;
926 if(cs <= 24.322) {
927 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
928 print(mime ? OCTET : "compressed\n");
929 else
930 print(mime ? OCTET : "encrypted\n");
931 return 1;
933 return 0;
936 /*
937 * english by punctuation and frequencies
938 */
939 int
940 isenglish(void)
942 int vow, comm, rare, badpun, punct;
943 char *p;
945 if(guess != Fascii && guess != Feascii)
946 return 0;
947 badpun = 0;
948 punct = 0;
949 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
950 switch(*p) {
951 case '.':
952 case ',':
953 case ')':
954 case '%':
955 case ';':
956 case ':':
957 case '?':
958 punct++;
959 if(p[1] != ' ' && p[1] != '\n')
960 badpun++;
962 if(badpun*5 > punct)
963 return 0;
964 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
965 return 0;
966 if(2*cfreq[';'] > cfreq['e'])
967 return 0;
969 vow = 0;
970 for(p="AEIOU"; *p; p++) {
971 vow += cfreq[(uchar)*p];
972 vow += cfreq[tolower((uchar)*p)];
974 comm = 0;
975 for(p="ETAION"; *p; p++) {
976 comm += cfreq[(uchar)*p];
977 comm += cfreq[tolower((uchar)*p)];
979 rare = 0;
980 for(p="VJKQXZ"; *p; p++) {
981 rare += cfreq[(uchar)*p];
982 rare += cfreq[tolower((uchar)*p)];
984 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
985 print(mime ? PLAIN : "English text\n");
986 return 1;
988 return 0;
991 /*
992 * pick up a number with
993 * syntax _*[0-9]+_
994 */
995 #define P9BITLEN 12
996 int
997 p9bitnum(uchar *bp)
999 int n, c, len;
1001 len = P9BITLEN;
1002 while(*bp == ' ') {
1003 bp++;
1004 len--;
1005 if(len <= 0)
1006 return -1;
1008 n = 0;
1009 while(len > 1) {
1010 c = *bp++;
1011 if(!isdigit(c))
1012 return -1;
1013 n = n*10 + c-'0';
1014 len--;
1016 if(*bp != ' ')
1017 return -1;
1018 return n;
/*
 * Parse the 12-byte depth/channel field at the start of an image
 * header and return the pixel depth in bits, or -1 if the field
 * is not acceptable.
 *
 * Old format: a decimal number n; the depth is 1<<n.  Values of n
 * that cannot be shifted in an int are rejected (they used to
 * produce an undefined shift).
 * New format: a channel string of letter/number pairs; the depth
 * is the sum of the numbers and must be 8, 16, 24 or 32.
 *
 * *newp is set to 1 for the new format and 0 otherwise.
 */
int
depthof(char *s, int *newp)
{
	char *es;
	int d, ld;

	*newp = 0;
	es = s+12;
	while(s<es && *s==' ')
		s++;
	if(s == es)
		return -1;
	if('0'<=*s && *s<='9'){
		ld = atoi(s);
		if(ld < 0 || ld > 30)
			return -1;
		return 1<<ld;
	}

	*newp = 1;
	d = 0;
	while(s<es && *s!=' '){
		s++;			/* skip letter */
		d += strtoul(s, &s, 10);
	}

	switch(d){
	case 32:
	case 24:
	case 16:
	case 8:
		return d;
	}
	return -1;
}
1053 int
1054 isp9bit(void)
1056 int dep, lox, loy, hix, hiy, px, new;
1057 ulong t;
1058 long len;
1059 char *newlabel;
1061 newlabel = "old ";
1063 dep = depthof((char*)buf + 0*P9BITLEN, &new);
1064 if(new)
1065 newlabel = "";
1066 lox = p9bitnum(buf + 1*P9BITLEN);
1067 loy = p9bitnum(buf + 2*P9BITLEN);
1068 hix = p9bitnum(buf + 3*P9BITLEN);
1069 hiy = p9bitnum(buf + 4*P9BITLEN);
1070 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1071 return 0;
1073 if(dep < 8){
1074 px = 8/dep; /* pixels per byte */
1075 /* set l to number of bytes of data per scan line */
1076 if(lox >= 0)
1077 len = (hix+px-1)/px - lox/px;
1078 else{ /* make positive before divide */
1079 t = (-lox)+px-1;
1080 t = (t/px)*px;
1081 len = (t+hix+px-1)/px;
1083 }else
1084 len = (hix-lox)*dep/8;
1085 len *= (hiy-loy); /* col length */
1086 len += 5*P9BITLEN; /* size of initial ascii */
1089 * for image file, length is non-zero and must match calculation above
1090 * for /dev/window and /dev/screen the length is always zero
1091 * for subfont, the subfont header should follow immediately.
1093 if (len != 0 && mbuf->length == 0) {
1094 print("%splan 9 image\n", newlabel);
1095 return 1;
1097 if (mbuf->length == len) {
1098 print("%splan 9 image\n", newlabel);
1099 return 1;
1101 /* Ghostscript sometimes produces a little extra on the end */
1102 if (mbuf->length < len+P9BITLEN) {
1103 print("%splan 9 image\n", newlabel);
1104 return 1;
1106 if (p9subfont(buf+len)) {
1107 print("%ssubfont file\n", newlabel);
1108 return 1;
1110 return 0;
1113 int
1114 p9subfont(uchar *p)
1116 int n, h, a;
1118 /* if image too big, assume it's a subfont */
1119 if (p+3*P9BITLEN > buf+sizeof(buf))
1120 return 1;
1122 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1123 if (n < 0)
1124 return 0;
1125 h = p9bitnum(p + 1*P9BITLEN); /* height */
1126 if (h < 0)
1127 return 0;
1128 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
1129 if (a < 0)
1130 return 0;
1131 return 1;
1134 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
1136 int
1137 isp9font(void)
1139 uchar *cp, *p;
1140 int i, n;
1141 char pathname[1024];
1143 cp = buf;
1144 if (!getfontnum(cp, &cp)) /* height */
1145 return 0;
1146 if (!getfontnum(cp, &cp)) /* ascent */
1147 return 0;
1148 for (i = 0; 1; i++) {
1149 if (!getfontnum(cp, &cp)) /* min */
1150 break;
1151 if (!getfontnum(cp, &cp)) /* max */
1152 return 0;
1153 while (WHITESPACE(*cp))
1154 cp++;
1155 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1157 /* construct a path name, if needed */
1158 n = 0;
1159 if (*p != '/' && slash) {
1160 n = slash-fname+1;
1161 if (n < sizeof(pathname))
1162 memcpy(pathname, fname, n);
1163 else n = 0;
1165 if (n+cp-p < sizeof(pathname)) {
1166 memcpy(pathname+n, p, cp-p);
1167 n += cp-p;
1168 pathname[n] = 0;
1169 if (access(pathname, AEXIST) < 0)
1170 return 0;
1173 if (i) {
1174 print(mime ? "text/plain\n" : "font file\n");
1175 return 1;
1177 return 0;
1180 int
1181 getfontnum(uchar *cp, uchar **rp)
1183 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1184 cp++;
1185 if (*cp < '0' || *cp > '9')
1186 return 0;
1187 strtoul((char *)cp, (char **)rp, 0);
1188 if (!WHITESPACE(**rp))
1189 return 0;
1190 return 1;
1193 int
1194 isrtf(void)
1196 if(strstr((char *)buf, "\\rtf1")){
1197 print(mime ? "application/rtf\n" : "rich text format\n");
1198 return 1;
1200 return 0;
1203 int
1204 ismsdos(void)
1206 if (buf[0] == 0x4d && buf[1] == 0x5a){
1207 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1208 return 1;
1210 return 0;
1213 int
1214 iself(void)
1216 static char *cpu[] = { /* NB: incomplete and arbitary list */
1217 nil,
1218 /*1*/ "WE32100",
1219 /*2*/ "SPARC",
1220 /*3*/ "i386",
1221 /*4*/ "M68000",
1222 /*5*/ "M88000",
1223 /*6*/ "i486",
1224 /*7*/ "i860",
1225 /*8*/ "R3000",
1226 /*9*/ "S370",
1227 /*10*/ "R4000",
1228 nil, nil, nil, nil,
1229 /*15*/ "HP-PA",
1230 nil,
1231 nil,
1232 /*18*/ "sparc v8+",
1233 /*19*/ "i960",
1234 /*20*/ "PPC-32",
1235 /*21*/ "PPC-64",
1236 nil, nil, nil, nil,
1237 nil, nil, nil, nil, nil,
1238 nil, nil, nil, nil, nil,
1239 nil, nil, nil, nil,
1240 /*40*/ "ARM",
1241 /*41*/ "Alpha",
1242 nil,
1243 /*43*/ "sparc v9",
1244 nil, nil,
1245 nil, nil, nil, nil,
1246 /*50*/ "IA-64",
1247 nil, nil, nil, nil, nil,
1248 nil, nil, nil, nil, nil,
1249 nil,
1250 /*62*/ "AMD64",
1251 nil, nil, nil,
1252 nil, nil, nil, nil, nil,
1253 nil, nil, nil, nil,
1254 /*75*/ "VAX",
1258 if (memcmp(buf, "\177ELF", 4) == 0){
1259 /* gcc misparses \x7FELF as \x7FE L F */
1260 if (!mime){
1261 int n = (buf[19] << 8) | buf[18];
1262 char *p = "unknown";
1264 if (n > 0 && n < nelem(cpu) && cpu[n])
1265 p = cpu[n];
1266 else {
1267 /* try the other byte order */
1268 n = (buf[18] << 8) | buf[19];
1269 if (n > 0 && n < nelem(cpu) && cpu[n])
1270 p = cpu[n];
1272 print("%s ELF executable\n", p);
1274 else
1275 print("application/x-elf-executable");
1276 return 1;
1279 return 0;