8 * file - determine type of file
/*
 * Assemble a 32-bit value from the four bytes at p, least significant
 * byte first.  Each byte is widened to unsigned long before it is shifted:
 * left as a promoted int, a top byte of 0x80 or more shifted left by 24
 * overflows a signed int (undefined behaviour) and sign-extends wherever
 * long is wider than 32 bits.  p is expanded four times, so it must be an
 * expression without side effects.
 * NOTE(review): assumes p points at unsigned bytes -- confirm at call sites.
 */
#define LENDIAN(p) ((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
	((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
85 /* codes for 'mode' field in language structure */
88 First, /* first entry for language spanning several ranges */
89 Multi, /* later entries " " " ... */
90 Shared, /* codes used in several languages */
95 int mode; /* see enum above */
103 Normal, 0, 0x0080, 0x0080, "Extended Latin",
104 Normal, 0, 0x0100, 0x01FF, "Extended Latin",
105 Normal, 0, 0x0370, 0x03FF, "Greek",
106 Normal, 0, 0x0400, 0x04FF, "Cyrillic",
107 Normal, 0, 0x0530, 0x058F, "Armenian",
108 Normal, 0, 0x0590, 0x05FF, "Hebrew",
109 Normal, 0, 0x0600, 0x06FF, "Arabic",
110 Normal, 0, 0x0900, 0x097F, "Devanagari",
111 Normal, 0, 0x0980, 0x09FF, "Bengali",
112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi",
113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati",
114 Normal, 0, 0x0B00, 0x0B7F, "Oriya",
115 Normal, 0, 0x0B80, 0x0BFF, "Tamil",
116 Normal, 0, 0x0C00, 0x0C7F, "Telugu",
117 Normal, 0, 0x0C80, 0x0CFF, "Kannada",
118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam",
119 Normal, 0, 0x0E00, 0x0E7F, "Thai",
120 Normal, 0, 0x0E80, 0x0EFF, "Lao",
121 Normal, 0, 0x1000, 0x105F, "Tibetan",
122 Normal, 0, 0x10A0, 0x10FF, "Georgian",
123 Normal, 0, 0x3040, 0x30FF, "Japanese",
124 Normal, 0, 0x3100, 0x312F, "Chinese",
125 First, 0, 0x3130, 0x318F, "Korean",
126 Multi, 0, 0x3400, 0x3D2F, "Korean",
127 Shared, 0, 0x4e00, 0x9fff, "CJK",
128 Normal, 0, 0, 0, 0, /* terminal entry */
134 Fascii, /* printable ascii */
136 Futf, /* UTF character set */
137 Fbinary, /* binary */
138 Feascii, /* ASCII with control chars */
139 Fnull, /* NULL in file */
142 void bump_utf_count(Rune);
143 int cistrncmp(char*, char*, int);
145 int getfontnum(uchar*, uchar**);
164 int p9bitnum(uchar*);
165 int p9subfont(uchar*);
166 void print_utf(void);
167 void type(char*, int);
171 int (*call[])(void) =
173 long0, /* recognizable by first 4 bytes */
174 istring, /* recognizable by first string */
175 iff, /* interchange file format (strings) */
176 isrfc822, /* email file */
177 ismbox, /* mail box */
178 istar, /* recognizable by tar checksum */
179 ishtml, /* html keywords */
180 /* iscint, /* compiler/assembler intermediate */
181 islimbo, /* limbo source */
182 isc, /* c & alef compiler key words */
183 isas, /* assembler key words */
184 ismung, /* entropy compressed/encrypted */
185 isp9font, /* plan 9 font */
186 isp9bit, /* plan 9 image (as from /dev/window) */
187 isenglish, /* char frequency English */
188 isrtf, /* rich text format */
189 ismsdos, /* msdos exe (virus file attachment) */
190 iself, /* ELF (foreign) executable */
/*
 * MIME type strings printed in place of a description when mime is set.
 * Each already ends in a newline, so it can be handed to print() as is.
 */
196 #define OCTET "application/octet-stream\n"
197 #define PLAIN "text/plain\n"
200 main(int argc, char *argv[])
211 fprint(2, "usage: file [-m] [file...]\n");
216 if(mime == 0 || argc > 1){
217 for(i = 0; i < argc; i++) {
218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
230 for(i = 0; i < argc; i++)
231 type(argv[i], maxlen);
237 type(char *file, int nlen)
245 for (i = 0, p = file; *p; i++) {
246 if (*p == '/') /* find rightmost slash */
248 p += chartorune(&r, p); /* count runes */
250 print("%s:%*s",file, nlen-i+1, "");
253 if ((fd = open(file, OREAD)) < 0) {
254 print("cannot open\n");
271 print("cannot stat: %r\n");
274 if(mbuf->mode & DMDIR) {
275 print(mime ? "text/directory\n" : "directory\n");
278 if(mbuf->type != 'M' && mbuf->type != '|') {
279 print(mime ? OCTET : "special file #%c/%s\n",
280 mbuf->type, mbuf->name);
283 nbuf = read(fd, buf, sizeof(buf)-1);
286 print("cannot read\n");
290 print(mime ? PLAIN : "empty file\n");
296 * build histogram table
298 memset(cfreq, 0, sizeof(cfreq));
299 for (i = 0; language[i].name; i++)
300 language[i].count = 0;
301 eob = (char *)buf+nbuf;
302 for(n = 0, p = (char *)buf; p < eob; n++) {
303 if (!fullrune(p, eob-p) && eob-p < UTFmax)
305 p += chartorune(&r, p);
308 else if (r <= 0x7f) {
309 if (!isprint(r) && !isspace(r))
310 f = Ceascii; /* ASCII control char */
312 } else if (r == 0x080) {
316 f = Cbinary; /* Invalid Runes */
318 f = Clatin; /* Latin 1 */
321 f = Cutf; /* UTF extension */
323 cfreq[f]++; /* ASCII chars peg directly */
330 else if (cfreq[Cutf])
332 else if (cfreq[Clatin])
334 else if (cfreq[Ceascii])
336 else if (cfreq[Cnull] == n) {
337 print(mime ? OCTET : "first block all null bytes\n");
342 * lookup dictionary words
344 memset(wfreq, 0, sizeof(wfreq));
345 if(guess == Fascii || guess == Flatin || guess == Futf)
348 * call individual classify routines
350 for(i=0; call[i]; i++)
356 * print out gross classification
358 if (nbuf < 100 && !mime)
359 print(mime ? PLAIN : "short ");
361 print(mime ? PLAIN : "Ascii\n");
362 else if (guess == Feascii)
363 print(mime ? PLAIN : "extended ascii\n");
364 else if (guess == Flatin)
365 print(mime ? PLAIN : "latin ascii\n");
366 else if (guess == Futf && utf_count() < 4)
368 else print(mime ? OCTET : "binary\n");
372 bump_utf_count(Rune r)
376 high = sizeof(language)/sizeof(language[0])-1;
377 for (low = 0; low < high;) {
379 if (r >=language[mid].low) {
380 if (r <= language[mid].high) {
381 language[mid].count++;
394 for (i = 0; language[i].name; i++)
395 if (language[i].count > 0)
396 switch (language[i].mode) {
412 for (i = 'a'; i < 'z'; i++)
415 for (i = 'A'; i < 'Z'; i++)
422 find_first(char *name)
426 for (i = 0; language[i].name != 0; i++)
427 if (language[i].mode == First
428 && strcmp(language[i].name, name) == 0)
447 for (i = 0; language[i].name; i++)
448 if (language[i].count) {
449 switch(language[i].mode) {
451 j = find_first(language[i].name);
454 if (language[j].count > 0)
462 print("%s", language[i].name);
477 int low, high, mid, r;
482 while (p < buf+nbuf && !isalpha(*p))
487 while(p < buf+nbuf && isalpha(*p))
491 high = sizeof(dict)/sizeof(dict[0]);
492 for(low = 0;low < high;) {
494 r = strcmp(dict[mid].word, (char*)p2);
496 wfreq[dict[mid].class]++;
508 typedef struct Filemagic Filemagic;
516 Filemagic long0tab[] = {
517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip",
523 070707, 0xFFFF, "cpio archive\n", OCTET,
524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi",
525 0xfffa0000, 0xfffe0000, "mp3 audio", "audio/mpeg",
529 filemagic(Filemagic *tab, int ntab, ulong x)
533 for(i=0; i<ntab; i++)
534 if((x&tab[i].mask) == tab[i].x){
535 print(mime ? tab[i].mime : tab[i].desc);
547 seek(fd, 0, 0); /* reposition to start of file */
549 if(crackhdr(fd, &f)) {
550 print(mime ? OCTET : "%s\n", f.name);
555 if(filemagic(long0tab, nelem(long0tab), x))
/*
 * Sizes used by the tar header layout: NAMSIZ is the length of the
 * linkname field, TBLOCK the length of the block that checksum() walks.
 */
561 enum { NAMSIZ = 100, TBLOCK = 512 };
576 char linkname[NAMSIZ];
577 /* rest are defined by POSIX's ustar format; see p1003.2b */
578 char magic[6]; /* "ustar" */
584 char prefix[155]; /* if non-null, path = prefix "/" name */
589 checksum(union hblock *hp)
593 struct header *hdr = &hp->dbuf;
595 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
598 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
608 union hblock *hp = (union hblock *)tblock;
609 struct header *hdr = &hp->dbuf;
611 seek(fd, 0, 0); /* reposition to start of file */
612 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
614 chksum = strtol(hdr->chksum, 0, 8);
615 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
616 if (strcmp(hdr->magic, "ustar") == 0)
617 print(mime? "application/x-ustar\n":
618 "posix tar archive\n");
620 print(mime? "application/x-tar\n": "tar archive\n");
627 * initial words to classify file
637 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream",
638 "!<arch>\n", "archive", 8, "application/octet-stream",
639 "070707", "cpio archive - ascii header", 6, "application/octet-stream",
640 "#!/bin/rc", "rc executable file", 9, "text/plain",
641 "#!/bin/sh", "sh executable file", 9, "text/plain",
642 "%!", "postscript", 2, "application/postscript",
643 "\004%!", "postscript", 3, "application/postscript",
644 "x T post", "troff output for post", 8, "application/troff",
645 "x T Latin1", "troff output for Latin1", 10, "application/troff",
646 "x T utf", "troff output for UTF", 7, "application/troff",
647 "x T 202", "troff output for 202", 7, "application/troff",
648 "x T aps", "troff output for aps", 7, "application/troff",
649 "GIF", "GIF image", 3, "image/gif",
650 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript",
651 "%PDF", "PDF", 4, "application/pdf",
652 "<html>\n", "HTML file", 7, "text/html",
653 "<HTML>\n", "HTML file", 7, "text/html",
654 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream",
655 "\111\111\052\000", "tiff", 4, "image/tiff",
656 "\115\115\000\052", "tiff", 4, "image/tiff",
657 "\377\330\377\340", "jpeg", 4, "image/jpeg",
658 "\377\330\377\341", "jpeg", 4, "image/jpeg",
659 "\377\330\377\333", "jpeg", 4, "image/jpeg",
660 "BM", "bmp", 2, "image/bmp",
661 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream",
662 "<MakerFile ", "FrameMaker file", 11, "application/framemaker",
663 "\033%-12345X", "HPJCL file", 9, "application/hpjcl",
664 "ID3", "mp3 audio with id3", 3, "audio/mpeg",
672 struct FILE_STRING *p;
674 for(p = file_string; p->key; p++) {
675 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
677 print("%s\n", p->mime);
679 print("%s\n", p->filetype);
683 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
684 for(i = 5; i < nbuf; i++)
690 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
699 if (strncmp((char*)buf, "FORM", 4) == 0 &&
700 strncmp((char*)buf+8, "AIFF", 4) == 0) {
701 print("%s\n", mime? "audio/x-aiff": "aiff audio");
707 char* html_string[] =
733 /* compare strings between '<' and '>' to html table */
737 while (p < buf+nbuf && *p != '<')
745 while(p < buf+nbuf && *p != '>')
749 for(i = 0; html_string[i]; i++) {
750 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
752 print(mime ? "text/html\n" : "HTML file\n");
763 char* rfc822_string[] =
789 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
796 if(*p != '\t' && *p != ' '){
800 for(i = 0; rfc822_string[i]; i++) {
801 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
810 print(mime ? "message/rfc822\n" : "email file\n");
826 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
827 print(mime ? "text/plain\n" : "mail box\n");
843 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
845 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
850 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
855 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
864 if(wfreq[Alword] > 0)
865 print("alef program\n");
867 print("c program\n");
880 print(mime ? PLAIN : "limbo program\n");
893 print(mime ? PLAIN : "as program\n");
898 * high entropy (byte values spread evenly across the buckets) means compressed or encrypted
908 memset(bucket, 0, sizeof(bucket));
910 bucket[(buf[i]>>5)&07] += 1;
914 cs += (bucket[i]-8)*(bucket[i]-8);
917 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
918 print(mime ? OCTET : "compressed\n");
920 print(mime ? OCTET : "encrypted\n");
927 * english by punctuation and frequencies
932 int vow, comm, rare, badpun, punct;
935 if(guess != Fascii && guess != Feascii)
939 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
949 if(p[1] != ' ' && p[1] != '\n')
954 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
956 if(2*cfreq[';'] > cfreq['e'])
960 for(p="AEIOU"; *p; p++) {
961 vow += cfreq[(uchar)*p];
962 vow += cfreq[tolower((uchar)*p)];
965 for(p="ETAION"; *p; p++) {
966 comm += cfreq[(uchar)*p];
967 comm += cfreq[tolower((uchar)*p)];
970 for(p="VJKQXZ"; *p; p++) {
971 rare += cfreq[(uchar)*p];
972 rare += cfreq[tolower((uchar)*p)];
974 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
975 print(mime ? PLAIN : "English text\n");
982 * pick up a number with
1012 depthof(char *s, int *newp)
1019 while(s<es && *s==' ')
1023 if('0'<=*s && *s<='9')
1028 while(s<es && *s!=' '){
1029 s++; /* skip letter */
1030 d += strtoul(s, &s, 10);
1046 int dep, lox, loy, hix, hiy, px, new;
1053 dep = depthof((char*)buf + 0*P9BITLEN, &new);
1056 lox = p9bitnum(buf + 1*P9BITLEN);
1057 loy = p9bitnum(buf + 2*P9BITLEN);
1058 hix = p9bitnum(buf + 3*P9BITLEN);
1059 hiy = p9bitnum(buf + 4*P9BITLEN);
1060 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1064 px = 8/dep; /* pixels per byte */
1065 /* set len to the number of bytes of data per scan line */
1067 len = (hix+px-1)/px - lox/px;
1068 else{ /* make positive before divide */
1071 len = (t+hix+px-1)/px;
1074 len = (hix-lox)*dep/8;
1075 len *= (hiy-loy); /* col length */
1076 len += 5*P9BITLEN; /* size of initial ascii */
1079 * for image file, length is non-zero and must match calculation above
1080 * for /dev/window and /dev/screen the length is always zero
1081 * for subfont, the subfont header should follow immediately.
1083 if (len != 0 && mbuf->length == 0) {
1084 print("%splan 9 image\n", newlabel);
1087 if (mbuf->length == len) {
1088 print("%splan 9 image\n", newlabel);
1091 /* Ghostscript sometimes produces a little extra on the end */
1092 if (mbuf->length < len+P9BITLEN) {
1093 print("%splan 9 image\n", newlabel);
1096 if (p9subfont(buf+len)) {
1097 print("%ssubfont file\n", newlabel);
1108 /* if image too big, assume it's a subfont */
1109 if (p+3*P9BITLEN > buf+sizeof(buf))
1112 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1115 h = p9bitnum(p + 1*P9BITLEN); /* height */
1118 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
/*
 * True when c is a space, tab or newline.  c is expanded up to three
 * times, so pass an expression without side effects.
 */
1124 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
1131 char pathname[1024];
1134 if (!getfontnum(cp, &cp)) /* height */
1136 if (!getfontnum(cp, &cp)) /* ascent */
1138 for (i = 0; 1; i++) {
1139 if (!getfontnum(cp, &cp)) /* min */
1141 if (!getfontnum(cp, &cp)) /* max */
1143 while (WHITESPACE(*cp))
1145 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1147 /* construct a path name, if needed */
1149 if (*p != '/' && slash) {
1151 if (n < sizeof(pathname))
1152 memcpy(pathname, fname, n);
1155 if (n+cp-p < sizeof(pathname)) {
1156 memcpy(pathname+n, p, cp-p);
1159 if (access(pathname, AEXIST) < 0)
1164 print(mime ? "text/plain\n" : "font file\n");
1171 getfontnum(uchar *cp, uchar **rp)
1173 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1175 if (*cp < '0' || *cp > '9')
1177 strtoul((char *)cp, (char **)rp, 0);
1178 if (!WHITESPACE(**rp))
1186 if(strstr((char *)buf, "\\rtf1")){
1187 print(mime ? "application/rtf\n" : "rich text format\n");
1196 if (buf[0] == 0x4d && buf[1] == 0x5a){
1197 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1206 static char *cpu[] = { /* NB: incomplete and arbitrary list */
1227 nil, nil, nil, nil, nil,
1228 nil, nil, nil, nil, nil,
1237 nil, nil, nil, nil, nil,
1238 nil, nil, nil, nil, nil,
1242 nil, nil, nil, nil, nil,
1248 if (memcmp(buf, "\0177ELF", 4) == 0){
1249 /* gcc misparses \x7FELF as \x7FE L F */
1251 int n = (buf[19] << 8) | buf[18];
1252 char *p = "unknown";
1254 if (n > 0 && n < nelem(cpu) && cpu[n])
1257 /* try the other byte order */
1258 n = (buf[18] << 8) | buf[19];
1259 if (n > 0 && n < nelem(cpu) && cpu[n])
1262 print("%s ELF executable\n", p);
1265 print("application/x-elf-executable");