8 * file - determine type of file
/*
 * LENDIAN assembles the four bytes at p into one little-endian value
 * (p[0] is the least significant byte).
 * Each byte is widened to unsigned long before it is shifted: left as an
 * int, a top byte >= 0x80 would be shifted into the sign bit, and the
 * negative result would sign-extend when stored in a wider unsigned type.
 * p is evaluated four times, so it must not have side effects.
 */
#define LENDIAN(p) ((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | ((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
85 /* codes for 'mode' field in language structure */
88 First, /* first entry for language spanning several ranges */
89 Multi, /* later entries " " " ... */
90 Shared, /* codes used in several languages */
95 int mode; /* see enum above */
103 Normal, 0, 0x0080, 0x0080, "Extended Latin",
104 Normal, 0, 0x0100, 0x01FF, "Extended Latin",
105 Normal, 0, 0x0370, 0x03FF, "Greek",
106 Normal, 0, 0x0400, 0x04FF, "Cyrillic",
107 Normal, 0, 0x0530, 0x058F, "Armenian",
108 Normal, 0, 0x0590, 0x05FF, "Hebrew",
109 Normal, 0, 0x0600, 0x06FF, "Arabic",
110 Normal, 0, 0x0900, 0x097F, "Devanagari",
111 Normal, 0, 0x0980, 0x09FF, "Bengali",
112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi",
113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati",
114 Normal, 0, 0x0B00, 0x0B7F, "Oriya",
115 Normal, 0, 0x0B80, 0x0BFF, "Tamil",
116 Normal, 0, 0x0C00, 0x0C7F, "Telugu",
117 Normal, 0, 0x0C80, 0x0CFF, "Kannada",
118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam",
119 Normal, 0, 0x0E00, 0x0E7F, "Thai",
120 Normal, 0, 0x0E80, 0x0EFF, "Lao",
121 Normal, 0, 0x1000, 0x105F, "Tibetan",
122 Normal, 0, 0x10A0, 0x10FF, "Georgian",
123 Normal, 0, 0x3040, 0x30FF, "Japanese",
124 Normal, 0, 0x3100, 0x312F, "Chinese",
125 First, 0, 0x3130, 0x318F, "Korean",
126 Multi, 0, 0x3400, 0x3D2F, "Korean",
127 Shared, 0, 0x4e00, 0x9fff, "CJK",
128 Normal, 0, 0, 0, 0, /* terminal entry */
134 Fascii, /* printable ascii */
136 Futf, /* UTF character set */
137 Fbinary, /* binary */
138 Feascii, /* ASCII with control chars */
139 Fnull, /* NULL in file */
142 void bump_utf_count(Rune);
143 int cistrncmp(char*, char*, int);
145 int getfontnum(uchar*, uchar**);
164 int p9bitnum(uchar*);
165 int p9subfont(uchar*);
166 void print_utf(void);
167 void type(char*, int);
171 int (*call[])(void) =
173 long0, /* recognizable by first 4 bytes */
174 istring, /* recognizable by first string */
175 iff, /* interchange file format (strings) */
176 isrfc822, /* email file */
177 ismbox, /* mail box */
178 istar, /* recognizable by tar checksum */
179 ishtml, /* html keywords */
180 /* iscint, /* compiler/assembler intermediate */
181 islimbo, /* limbo source */
182 isc, /* c & alef compiler key words */
183 isas, /* assembler key words */
184 ismung, /* entropy compressed/encrypted */
185 isp9font, /* plan 9 font */
186 isp9bit, /* plan 9 image (as from /dev/window) */
187 isenglish, /* char frequency English */
188 isrtf, /* rich text format */
189 ismsdos, /* msdos exe (virus file attachment) */
190 iself, /* ELF (foreign) executable */
/*
 * MIME type lines, each already ending in a newline, that are passed to
 * print() in place of the descriptive text when the mime flag is set.
 */
196 #define OCTET "application/octet-stream\n"
197 #define PLAIN "text/plain\n"
200 main(int argc, char *argv[])
211 fprint(2, "usage: file [-m] [file...]\n");
216 if(mime == 0 || argc > 1){
217 for(i = 0; i < argc; i++) {
218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
230 for(i = 0; i < argc; i++)
231 type(argv[i], maxlen);
237 type(char *file, int nlen)
245 for (i = 0, p = file; *p; i++) {
246 if (*p == '/') /* find rightmost slash */
248 p += chartorune(&r, p); /* count runes */
250 print("%s:%*s",file, nlen-i+1, "");
253 if ((fd = open(file, OREAD)) < 0) {
254 print("cannot open\n");
271 print("cannot stat: %r\n");
274 if(mbuf->mode & DMDIR) {
275 print(mime ? "text/directory\n" : "directory\n");
278 if(mbuf->type != 'M' && mbuf->type != '|') {
279 print(mime ? OCTET : "special file #%c/%s\n",
280 mbuf->type, mbuf->name);
283 nbuf = read(fd, buf, sizeof(buf)-1);
286 print("cannot read\n");
290 print(mime ? PLAIN : "empty file\n");
296 * build histogram table
298 memset(cfreq, 0, sizeof(cfreq));
299 for (i = 0; language[i].name; i++)
300 language[i].count = 0;
301 eob = (char *)buf+nbuf;
302 for(n = 0, p = (char *)buf; p < eob; n++) {
303 if (!fullrune(p, eob-p) && eob-p < UTFmax)
305 p += chartorune(&r, p);
308 else if (r <= 0x7f) {
309 if (!isprint(r) && !isspace(r))
310 f = Ceascii; /* ASCII control char */
312 } else if (r == 0x080) {
316 f = Cbinary; /* Invalid Runes */
318 f = Clatin; /* Latin 1 */
321 f = Cutf; /* UTF extension */
323 cfreq[f]++; /* ASCII chars peg directly */
330 else if (cfreq[Cutf])
332 else if (cfreq[Clatin])
334 else if (cfreq[Ceascii])
336 else if (cfreq[Cnull] == n) {
337 print(mime ? OCTET : "first block all null bytes\n");
342 * lookup dictionary words
344 memset(wfreq, 0, sizeof(wfreq));
345 if(guess == Fascii || guess == Flatin || guess == Futf)
348 * call individual classify routines
350 for(i=0; call[i]; i++)
356 * print out gross classification
358 if (nbuf < 100 && !mime)
359 print(mime ? PLAIN : "short ");
361 print(mime ? PLAIN : "Ascii\n");
362 else if (guess == Feascii)
363 print(mime ? PLAIN : "extended ascii\n");
364 else if (guess == Flatin)
365 print(mime ? PLAIN : "latin ascii\n");
366 else if (guess == Futf && utf_count() < 4)
368 else print(mime ? OCTET : "binary\n");
372 bump_utf_count(Rune r)
376 high = sizeof(language)/sizeof(language[0])-1;
377 for (low = 0; low < high;) {
379 if (r >=language[mid].low) {
380 if (r <= language[mid].high) {
381 language[mid].count++;
394 for (i = 0; language[i].name; i++)
395 if (language[i].count > 0)
396 switch (language[i].mode) {
412 for (i = 'a'; i < 'z'; i++)
415 for (i = 'A'; i < 'Z'; i++)
422 find_first(char *name)
426 for (i = 0; language[i].name != 0; i++)
427 if (language[i].mode == First
428 && strcmp(language[i].name, name) == 0)
447 for (i = 0; language[i].name; i++)
448 if (language[i].count) {
449 switch(language[i].mode) {
451 j = find_first(language[i].name);
454 if (language[j].count > 0)
462 print("%s", language[i].name);
477 int low, high, mid, r;
482 while (p < buf+nbuf && !isalpha(*p))
487 while(p < buf+nbuf && isalpha(*p))
491 high = sizeof(dict)/sizeof(dict[0]);
492 for(low = 0;low < high;) {
494 r = strcmp(dict[mid].word, (char*)p2);
496 wfreq[dict[mid].class]++;
508 typedef struct Filemagic Filemagic;
516 Filemagic long0tab[] = {
517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip\n",
523 070707, 0xFFFF, "cpio archive\n", OCTET,
524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi\n",
525 0xfffa0000, 0xfffe0000, "mp3 audio\n", "audio/mpeg\n",
526 0xcafebabe, 0xFFFFFFFF, "Mach-O fat executable\n", "application/x-mach-binary\n",
527 0xfeedface, 0xFFFFFFFE, "Mach-O executable\n", "application/x-mach-binary\n",
528 0xbebafeca, 0xFFFFFFFF, "Java class\n", "application/x-java-applet\n",
532 filemagic(Filemagic *tab, int ntab, ulong x)
536 for(i=0; i<ntab; i++)
537 if((x&tab[i].mask) == tab[i].x){
538 print(mime ? tab[i].mime : tab[i].desc);
550 seek(fd, 0, 0); /* reposition to start of file */
552 if(crackhdr(fd, &f)) {
553 print(mime ? OCTET : "%s\n", f.name);
558 if(filemagic(long0tab, nelem(long0tab), x))
/*
 * tar header sizes: NAMSIZ is the length of the header's linkname field;
 * TBLOCK is the number of bytes of a header block that checksum() walks.
 */
564 enum { NAMSIZ = 100, TBLOCK = 512 };
579 char linkname[NAMSIZ];
580 /* rest are defined by POSIX's ustar format; see p1003.2b */
581 char magic[6]; /* "ustar" */
587 char prefix[155]; /* if non-null, path = prefix "/" name */
592 checksum(union hblock *hp)
596 struct header *hdr = &hp->dbuf;
598 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
601 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
611 union hblock *hp = (union hblock *)tblock;
612 struct header *hdr = &hp->dbuf;
614 seek(fd, 0, 0); /* reposition to start of file */
615 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
617 chksum = strtol(hdr->chksum, 0, 8);
618 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
619 if (strcmp(hdr->magic, "ustar") == 0)
620 print(mime? "application/x-ustar\n":
621 "posix tar archive\n");
623 print(mime? "application/x-tar\n": "tar archive\n");
630 * initial words to classify file
640 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream",
641 "!<arch>\n", "archive", 8, "application/octet-stream",
642 "070707", "cpio archive - ascii header", 6, "application/octet-stream",
643 "%!", "postscript", 2, "application/postscript",
644 "\004%!", "postscript", 3, "application/postscript",
645 "x T post", "troff output for post", 8, "application/troff",
646 "x T Latin1", "troff output for Latin1", 10, "application/troff",
647 "x T utf", "troff output for UTF", 7, "application/troff",
648 "x T 202", "troff output for 202", 7, "application/troff",
649 "x T aps", "troff output for aps", 7, "application/troff",
650 "GIF", "GIF image", 3, "image/gif",
651 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript",
652 "%PDF", "PDF", 4, "application/pdf",
653 "<html>\n", "HTML file", 7, "text/html",
654 "<HTML>\n", "HTML file", 7, "text/html",
655 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream",
656 "\111\111\052\000", "tiff", 4, "image/tiff",
657 "\115\115\000\052", "tiff", 4, "image/tiff",
658 "\377\330\377\340", "jpeg", 4, "image/jpeg",
659 "\377\330\377\341", "jpeg", 4, "image/jpeg",
660 "\377\330\377\333", "jpeg", 4, "image/jpeg",
661 "\106\117\126\142", "x3f", 4, "image/x3f",
662 "BM", "bmp", 2, "image/bmp",
663 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream",
664 "<MakerFile ", "FrameMaker file", 11, "application/framemaker",
665 "\033%-12345X", "HPJCL file", 9, "application/hpjcl",
666 "ID3", "mp3 audio with id3", 3, "audio/mpeg",
674 struct FILE_STRING *p;
676 for(p = file_string; p->key; p++) {
677 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
679 print("%s\n", p->mime);
681 print("%s\n", p->filetype);
685 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
686 for(i = 5; i < nbuf; i++)
692 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
695 if(buf[0]=='#' && buf[1]=='!'){
697 for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++)
703 print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i);
712 if (strncmp((char*)buf, "FORM", 4) == 0 &&
713 strncmp((char*)buf+8, "AIFF", 4) == 0) {
714 print("%s\n", mime? "audio/x-aiff": "aiff audio");
720 char* html_string[] =
746 /* compare strings between '<' and '>' to html table */
750 while (p < buf+nbuf && *p != '<')
758 while(p < buf+nbuf && *p != '>')
762 for(i = 0; html_string[i]; i++) {
763 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
765 print(mime ? "text/html\n" : "HTML file\n");
776 char* rfc822_string[] =
802 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
809 if(*p != '\t' && *p != ' '){
813 for(i = 0; rfc822_string[i]; i++) {
814 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
823 print(mime ? "message/rfc822\n" : "email file\n");
839 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
840 print(mime ? "text/plain\n" : "mail box\n");
856 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
858 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
863 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
868 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
877 if(wfreq[Alword] > 0)
878 print("alef program\n");
880 print("c program\n");
893 print(mime ? PLAIN : "limbo program\n");
906 print(mime ? PLAIN : "as program\n");
911 * low entropy means encrypted
921 memset(bucket, 0, sizeof(bucket));
923 bucket[(buf[i]>>5)&07] += 1;
927 cs += (bucket[i]-8)*(bucket[i]-8);
930 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
931 print(mime ? OCTET : "compressed\n");
933 print(mime ? OCTET : "encrypted\n");
940 * english by punctuation and frequencies
945 int vow, comm, rare, badpun, punct;
948 if(guess != Fascii && guess != Feascii)
952 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
962 if(p[1] != ' ' && p[1] != '\n')
967 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
969 if(2*cfreq[';'] > cfreq['e'])
973 for(p="AEIOU"; *p; p++) {
974 vow += cfreq[(uchar)*p];
975 vow += cfreq[tolower((uchar)*p)];
978 for(p="ETAION"; *p; p++) {
979 comm += cfreq[(uchar)*p];
980 comm += cfreq[tolower((uchar)*p)];
983 for(p="VJKQXZ"; *p; p++) {
984 rare += cfreq[(uchar)*p];
985 rare += cfreq[tolower((uchar)*p)];
987 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
988 print(mime ? PLAIN : "English text\n");
995 * pick up a number with
1025 depthof(char *s, int *newp)
1032 while(s<es && *s==' ')
1036 if('0'<=*s && *s<='9')
1041 while(s<es && *s!=' '){
1042 s++; /* skip letter */
1043 d += strtoul(s, &s, 10);
1059 int dep, lox, loy, hix, hiy, px, new;
1066 dep = depthof((char*)buf + 0*P9BITLEN, &new);
1069 lox = p9bitnum(buf + 1*P9BITLEN);
1070 loy = p9bitnum(buf + 2*P9BITLEN);
1071 hix = p9bitnum(buf + 3*P9BITLEN);
1072 hiy = p9bitnum(buf + 4*P9BITLEN);
1073 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1077 px = 8/dep; /* pixels per byte */
1078 /* set len to number of bytes of data per scan line */
1080 len = (hix+px-1)/px - lox/px;
1081 else{ /* make positive before divide */
1084 len = (t+hix+px-1)/px;
1087 len = (hix-lox)*dep/8;
1088 len *= (hiy-loy); /* col length */
1089 len += 5*P9BITLEN; /* size of initial ascii */
1092 * for image file, length is non-zero and must match calculation above
1093 * for /dev/window and /dev/screen the length is always zero
1094 * for subfont, the subfont header should follow immediately.
1096 if (len != 0 && mbuf->length == 0) {
1097 print("%splan 9 image\n", newlabel);
1100 if (mbuf->length == len) {
1101 print("%splan 9 image\n", newlabel);
1104 /* Ghostscript sometimes produces a little extra on the end */
1105 if (mbuf->length < len+P9BITLEN) {
1106 print("%splan 9 image\n", newlabel);
1109 if (p9subfont(buf+len)) {
1110 print("%ssubfont file\n", newlabel);
1121 /* if image too big, assume it's a subfont */
1122 if (p+3*P9BITLEN > buf+sizeof(buf))
1125 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1128 h = p9bitnum(p + 1*P9BITLEN); /* height */
1131 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
/*
 * WHITESPACE(c) is true when c is a space, a tab or a newline.
 * c may be evaluated up to three times, so pass an expression
 * without side effects.
 */
1137 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
1144 char pathname[1024];
1147 if (!getfontnum(cp, &cp)) /* height */
1149 if (!getfontnum(cp, &cp)) /* ascent */
1151 for (i = 0; 1; i++) {
1152 if (!getfontnum(cp, &cp)) /* min */
1154 if (!getfontnum(cp, &cp)) /* max */
1156 while (WHITESPACE(*cp))
1158 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1160 /* construct a path name, if needed */
1162 if (*p != '/' && slash) {
1164 if (n < sizeof(pathname))
1165 memcpy(pathname, fname, n);
1168 if (n+cp-p < sizeof(pathname)) {
1169 memcpy(pathname+n, p, cp-p);
1172 if (access(pathname, AEXIST) < 0)
1177 print(mime ? "text/plain\n" : "font file\n");
1184 getfontnum(uchar *cp, uchar **rp)
1186 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1188 if (*cp < '0' || *cp > '9')
1190 strtoul((char *)cp, (char **)rp, 0);
1191 if (!WHITESPACE(**rp))
1199 if(strstr((char *)buf, "\\rtf1")){
1200 print(mime ? "application/rtf\n" : "rich text format\n");
1209 if (buf[0] == 0x4d && buf[1] == 0x5a){
1210 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1219 static char *cpu[] = { /* NB: incomplete and arbitrary list */
1240 nil, nil, nil, nil, nil,
1241 nil, nil, nil, nil, nil,
1250 nil, nil, nil, nil, nil,
1251 nil, nil, nil, nil, nil,
1255 nil, nil, nil, nil, nil,
1261 if (memcmp(buf, "\177ELF", 4) == 0){
1262 /* magic is written in octal because gcc parses \x7FELF as \x7FE L F (a hex escape takes every hex digit that follows) */
1264 int n = (buf[19] << 8) | buf[18];
1265 char *p = "unknown";
1267 if (n > 0 && n < nelem(cpu) && cpu[n])
1270 /* try the other byte order */
1271 n = (buf[18] << 8) | buf[19];
1272 if (n > 0 && n < nelem(cpu) && cpu[n])
1275 print("%s ELF executable\n", p);
1278 print("application/x-elf-executable");