8 * file - determine type of file
/*
 * LENDIAN(p): assemble the four bytes p[0]..p[3] into one value,
 * with p[0] as the least significant byte.
 *
 * Each byte is widened to unsigned long before it is shifted, so a
 * top byte >= 0x80 is never shifted into the sign bit of an int.
 * p is expected to point at unsigned bytes and is evaluated four
 * times; do not pass an expression with side effects.
 */
#define LENDIAN(p)	((unsigned long)(p)[0] | ((unsigned long)(p)[1]<<8) | \
			((unsigned long)(p)[2]<<16) | ((unsigned long)(p)[3]<<24))
85 /* codes for 'mode' field in language structure */
88 First, /* first entry for language spanning several ranges */
89 Multi, /* later entries " " " ... */
90 Shared, /* codes used in several languages */
95 int mode; /* see enum above */
103 Normal, 0, 0x0080, 0x0080, "Extended Latin",
104 Normal, 0, 0x0100, 0x01FF, "Extended Latin",
105 Normal, 0, 0x0370, 0x03FF, "Greek",
106 Normal, 0, 0x0400, 0x04FF, "Cyrillic",
107 Normal, 0, 0x0530, 0x058F, "Armenian",
108 Normal, 0, 0x0590, 0x05FF, "Hebrew",
109 Normal, 0, 0x0600, 0x06FF, "Arabic",
110 Normal, 0, 0x0900, 0x097F, "Devanagari",
111 Normal, 0, 0x0980, 0x09FF, "Bengali",
112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi",
113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati",
114 Normal, 0, 0x0B00, 0x0B7F, "Oriya",
115 Normal, 0, 0x0B80, 0x0BFF, "Tamil",
116 Normal, 0, 0x0C00, 0x0C7F, "Telugu",
117 Normal, 0, 0x0C80, 0x0CFF, "Kannada",
118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam",
119 Normal, 0, 0x0E00, 0x0E7F, "Thai",
120 Normal, 0, 0x0E80, 0x0EFF, "Lao",
121 Normal, 0, 0x1000, 0x105F, "Tibetan",
122 Normal, 0, 0x10A0, 0x10FF, "Georgian",
123 Normal, 0, 0x3040, 0x30FF, "Japanese",
124 Normal, 0, 0x3100, 0x312F, "Chinese",
125 First, 0, 0x3130, 0x318F, "Korean",
126 Multi, 0, 0x3400, 0x3D2F, "Korean",
127 Shared, 0, 0x4e00, 0x9fff, "CJK",
128 Normal, 0, 0, 0, 0, /* terminal entry */
134 Fascii, /* printable ascii */
136 Futf, /* UTF character set */
137 Fbinary, /* binary */
138 Feascii, /* ASCII with control chars */
139 Fnull, /* NULL in file */
142 void bump_utf_count(Rune);
143 int cistrncmp(char*, char*, int);
145 int getfontnum(uchar*, uchar**);
164 int p9bitnum(uchar*);
165 int p9subfont(uchar*);
166 void print_utf(void);
167 void type(char*, int);
171 int (*call[])(void) =
173 long0, /* recognizable by first 4 bytes */
174 istring, /* recognizable by first string */
175 iff, /* interchange file format (strings) */
176 isrfc822, /* email file */
177 ismbox, /* mail box */
178 istar, /* recognizable by tar checksum */
179 ishtml, /* html keywords */
180 /* iscint, /* compiler/assembler intermediate */
181 islimbo, /* limbo source */
182 isc, /* c & alef compiler key words */
183 isas, /* assembler key words */
184 ismung, /* entropy compressed/encrypted */
185 isp9font, /* plan 9 font */
186 isp9bit, /* plan 9 image (as from /dev/window) */
187 isenglish, /* char frequency English */
188 isrtf, /* rich text format */
189 ismsdos, /* msdos exe (virus file attachment) */
190 iself, /* ELF (foreign) executable */
/* MIME type strings, newline included, printed in place of the descriptive text when `mime` is set */
196 #define OCTET "application/octet-stream\n"
197 #define PLAIN "text/plain\n"
200 main(int argc, char *argv[])
211 fprint(2, "usage: file [-m] [file...]\n");
216 if(mime == 0 || argc > 1){
217 for(i = 0; i < argc; i++) {
218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
230 for(i = 0; i < argc; i++)
231 type(argv[i], maxlen);
237 type(char *file, int nlen)
245 for (i = 0, p = file; *p; i++) {
246 if (*p == '/') /* find rightmost slash */
248 p += chartorune(&r, p); /* count runes */
250 print("%s:%*s",file, nlen-i+1, "");
253 if ((fd = open(file, OREAD)) < 0) {
254 print("cannot open\n");
271 print("cannot stat: %r\n");
274 if(mbuf->mode & DMDIR) {
275 print(mime ? "text/directory\n" : "directory\n");
278 if(mbuf->type != 'M' && mbuf->type != '|') {
279 print(mime ? OCTET : "special file #%c/%s\n",
280 mbuf->type, mbuf->name);
283 nbuf = read(fd, buf, sizeof(buf)-1);
286 print("cannot read\n");
290 print(mime ? PLAIN : "empty file\n");
296 * build histogram table
298 memset(cfreq, 0, sizeof(cfreq));
299 for (i = 0; language[i].name; i++)
300 language[i].count = 0;
301 eob = (char *)buf+nbuf;
302 for(n = 0, p = (char *)buf; p < eob; n++) {
303 if (!fullrune(p, eob-p) && eob-p < UTFmax)
305 p += chartorune(&r, p);
308 else if (r <= 0x7f) {
309 if (!isprint(r) && !isspace(r))
310 f = Ceascii; /* ASCII control char */
312 } else if (r == 0x080) {
316 f = Cbinary; /* Invalid Runes */
318 f = Clatin; /* Latin 1 */
321 f = Cutf; /* UTF extension */
323 cfreq[f]++; /* ASCII chars peg directly */
330 else if (cfreq[Cutf])
332 else if (cfreq[Clatin])
334 else if (cfreq[Ceascii])
336 else if (cfreq[Cnull] == n) {
337 print(mime ? OCTET : "first block all null bytes\n");
342 * lookup dictionary words
344 memset(wfreq, 0, sizeof(wfreq));
345 if(guess == Fascii || guess == Flatin || guess == Futf)
348 * call individual classify routines
350 for(i=0; call[i]; i++)
356 * print out gross classification
358 if (nbuf < 100 && !mime)
359 print(mime ? PLAIN : "short ");
361 print(mime ? PLAIN : "Ascii\n");
362 else if (guess == Feascii)
363 print(mime ? PLAIN : "extended ascii\n");
364 else if (guess == Flatin)
365 print(mime ? PLAIN : "latin ascii\n");
366 else if (guess == Futf && utf_count() < 4)
368 else print(mime ? OCTET : "binary\n");
372 bump_utf_count(Rune r)
376 high = sizeof(language)/sizeof(language[0])-1;
377 for (low = 0; low < high;) {
379 if (r >=language[mid].low) {
380 if (r <= language[mid].high) {
381 language[mid].count++;
394 for (i = 0; language[i].name; i++)
395 if (language[i].count > 0)
396 switch (language[i].mode) {
412 for (i = 'a'; i < 'z'; i++)
415 for (i = 'A'; i < 'Z'; i++)
422 find_first(char *name)
426 for (i = 0; language[i].name != 0; i++)
427 if (language[i].mode == First
428 && strcmp(language[i].name, name) == 0)
447 for (i = 0; language[i].name; i++)
448 if (language[i].count) {
449 switch(language[i].mode) {
451 j = find_first(language[i].name);
454 if (language[j].count > 0)
462 print("%s", language[i].name);
477 int low, high, mid, r;
482 while (p < buf+nbuf && !isalpha(*p))
487 while(p < buf+nbuf && isalpha(*p))
491 high = sizeof(dict)/sizeof(dict[0]);
492 for(low = 0;low < high;) {
494 r = strcmp(dict[mid].word, (char*)p2);
496 wfreq[dict[mid].class]++;
508 typedef struct Filemagic Filemagic;
516 Filemagic long0tab[] = {
517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip",
523 070707, 0xFFFF, "cpio archive\n", OCTET,
524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi",
525 0xfffa0000, 0xfffe0000, "mp3 audio", "audio/mpeg",
529 filemagic(Filemagic *tab, int ntab, ulong x)
533 for(i=0; i<ntab; i++)
534 if((x&tab[i].mask) == tab[i].x){
535 print(mime ? tab[i].mime : tab[i].desc);
547 seek(fd, 0, 0); /* reposition to start of file */
549 if(crackhdr(fd, &f)) {
550 print(mime ? OCTET : "%s\n", f.name);
555 if(filemagic(long0tab, nelem(long0tab), x))
561 enum { NAMSIZ = 100, TBLOCK = 512 };
576 char linkname[NAMSIZ];
577 /* rest are defined by POSIX's ustar format; see p1003.2b */
578 char magic[6]; /* "ustar" */
584 char prefix[155]; /* if non-null, path = prefix "/" name */
589 checksum(union hblock *hp)
593 struct header *hdr = &hp->dbuf;
595 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
598 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
608 union hblock *hp = (union hblock *)tblock;
609 struct header *hdr = &hp->dbuf;
611 seek(fd, 0, 0); /* reposition to start of file */
612 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
614 chksum = strtol(hdr->chksum, 0, 8);
615 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
616 if (strcmp(hdr->magic, "ustar") == 0)
617 print(mime? "application/x-ustar\n":
618 "posix tar archive\n");
620 print(mime? "application/x-tar\n": "tar archive\n");
627 * initial words to classify file
637 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream",
638 "!<arch>\n", "archive", 8, "application/octet-stream",
639 "070707", "cpio archive - ascii header", 6, "application/octet-stream",
640 "%!", "postscript", 2, "application/postscript",
641 "\004%!", "postscript", 3, "application/postscript",
642 "x T post", "troff output for post", 8, "application/troff",
643 "x T Latin1", "troff output for Latin1", 10, "application/troff",
644 "x T utf", "troff output for UTF", 7, "application/troff",
645 "x T 202", "troff output for 202", 7, "application/troff",
646 "x T aps", "troff output for aps", 7, "application/troff",
647 "GIF", "GIF image", 3, "image/gif",
648 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript",
649 "%PDF", "PDF", 4, "application/pdf",
650 "<html>\n", "HTML file", 7, "text/html",
651 "<HTML>\n", "HTML file", 7, "text/html",
652 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream",
653 "\111\111\052\000", "tiff", 4, "image/tiff",
654 "\115\115\000\052", "tiff", 4, "image/tiff",
655 "\377\330\377\340", "jpeg", 4, "image/jpeg",
656 "\377\330\377\341", "jpeg", 4, "image/jpeg",
657 "\377\330\377\333", "jpeg", 4, "image/jpeg",
658 "\106\117\126\142", "x3f", 4, "image/x3f",
659 "BM", "bmp", 2, "image/bmp",
660 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream",
661 "<MakerFile ", "FrameMaker file", 11, "application/framemaker",
662 "\033%-12345X", "HPJCL file", 9, "application/hpjcl",
663 "ID3", "mp3 audio with id3", 3, "audio/mpeg",
671 struct FILE_STRING *p;
673 for(p = file_string; p->key; p++) {
674 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
676 print("%s\n", p->mime);
678 print("%s\n", p->filetype);
682 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
683 for(i = 5; i < nbuf; i++)
689 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
692 if(buf[0]=='#' && buf[1]=='!'){
694 for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++)
700 print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i);
709 if (strncmp((char*)buf, "FORM", 4) == 0 &&
710 strncmp((char*)buf+8, "AIFF", 4) == 0) {
711 print("%s\n", mime? "audio/x-aiff": "aiff audio");
717 char* html_string[] =
743 /* compare strings between '<' and '>' to html table */
747 while (p < buf+nbuf && *p != '<')
755 while(p < buf+nbuf && *p != '>')
759 for(i = 0; html_string[i]; i++) {
760 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
762 print(mime ? "text/html\n" : "HTML file\n");
773 char* rfc822_string[] =
799 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
806 if(*p != '\t' && *p != ' '){
810 for(i = 0; rfc822_string[i]; i++) {
811 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
820 print(mime ? "message/rfc822\n" : "email file\n");
836 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
837 print(mime ? "text/plain\n" : "mail box\n");
853 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
855 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
860 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
865 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
874 if(wfreq[Alword] > 0)
875 print("alef program\n");
877 print("c program\n");
890 print(mime ? PLAIN : "limbo program\n");
903 print(mime ? PLAIN : "as program\n");
908 * high entropy (evenly spread byte values) means compressed or encrypted
918 memset(bucket, 0, sizeof(bucket));
920 bucket[(buf[i]>>5)&07] += 1;
924 cs += (bucket[i]-8)*(bucket[i]-8);
927 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
928 print(mime ? OCTET : "compressed\n");
930 print(mime ? OCTET : "encrypted\n");
937 * english by punctuation and frequencies
942 int vow, comm, rare, badpun, punct;
945 if(guess != Fascii && guess != Feascii)
949 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
959 if(p[1] != ' ' && p[1] != '\n')
964 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
966 if(2*cfreq[';'] > cfreq['e'])
970 for(p="AEIOU"; *p; p++) {
971 vow += cfreq[(uchar)*p];
972 vow += cfreq[tolower((uchar)*p)];
975 for(p="ETAION"; *p; p++) {
976 comm += cfreq[(uchar)*p];
977 comm += cfreq[tolower((uchar)*p)];
980 for(p="VJKQXZ"; *p; p++) {
981 rare += cfreq[(uchar)*p];
982 rare += cfreq[tolower((uchar)*p)];
984 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
985 print(mime ? PLAIN : "English text\n");
992 * pick up a number with
1022 depthof(char *s, int *newp)
1029 while(s<es && *s==' ')
1033 if('0'<=*s && *s<='9')
1038 while(s<es && *s!=' '){
1039 s++; /* skip letter */
1040 d += strtoul(s, &s, 10);
1056 int dep, lox, loy, hix, hiy, px, new;
1063 dep = depthof((char*)buf + 0*P9BITLEN, &new);
1066 lox = p9bitnum(buf + 1*P9BITLEN);
1067 loy = p9bitnum(buf + 2*P9BITLEN);
1068 hix = p9bitnum(buf + 3*P9BITLEN);
1069 hiy = p9bitnum(buf + 4*P9BITLEN);
1070 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1074 px = 8/dep; /* pixels per byte */
1075 /* set len to number of bytes of data per scan line */
1077 len = (hix+px-1)/px - lox/px;
1078 else{ /* make positive before divide */
1081 len = (t+hix+px-1)/px;
1084 len = (hix-lox)*dep/8;
1085 len *= (hiy-loy); /* col length */
1086 len += 5*P9BITLEN; /* size of initial ascii */
1089 * for image file, length is non-zero and must match calculation above
1090 * for /dev/window and /dev/screen the length is always zero
1091 * for subfont, the subfont header should follow immediately.
1093 if (len != 0 && mbuf->length == 0) {
1094 print("%splan 9 image\n", newlabel);
1097 if (mbuf->length == len) {
1098 print("%splan 9 image\n", newlabel);
1101 /* Ghostscript sometimes produces a little extra on the end */
1102 if (mbuf->length < len+P9BITLEN) {
1103 print("%splan 9 image\n", newlabel);
1106 if (p9subfont(buf+len)) {
1107 print("%ssubfont file\n", newlabel);
1118 /* if image too big, assume it's a subfont */
1119 if (p+3*P9BITLEN > buf+sizeof(buf))
1122 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1125 h = p9bitnum(p + 1*P9BITLEN); /* height */
1128 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
/* True when c is a space, tab or newline; c may be evaluated up to three times, so pass no side effects */
1134 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n')
1141 char pathname[1024];
1144 if (!getfontnum(cp, &cp)) /* height */
1146 if (!getfontnum(cp, &cp)) /* ascent */
1148 for (i = 0; 1; i++) {
1149 if (!getfontnum(cp, &cp)) /* min */
1151 if (!getfontnum(cp, &cp)) /* max */
1153 while (WHITESPACE(*cp))
1155 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1157 /* construct a path name, if needed */
1159 if (*p != '/' && slash) {
1161 if (n < sizeof(pathname))
1162 memcpy(pathname, fname, n);
1165 if (n+cp-p < sizeof(pathname)) {
1166 memcpy(pathname+n, p, cp-p);
1169 if (access(pathname, AEXIST) < 0)
1174 print(mime ? "text/plain\n" : "font file\n");
1181 getfontnum(uchar *cp, uchar **rp)
1183 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1185 if (*cp < '0' || *cp > '9')
1187 strtoul((char *)cp, (char **)rp, 0);
1188 if (!WHITESPACE(**rp))
1196 if(strstr((char *)buf, "\\rtf1")){
1197 print(mime ? "application/rtf\n" : "rich text format\n");
1206 if (buf[0] == 0x4d && buf[1] == 0x5a){
1207 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1216 static char *cpu[] = { /* NB: incomplete and arbitrary list */
1237 nil, nil, nil, nil, nil,
1238 nil, nil, nil, nil, nil,
1247 nil, nil, nil, nil, nil,
1248 nil, nil, nil, nil, nil,
1252 nil, nil, nil, nil, nil,
1258 if (memcmp(buf, "\177ELF", 4) == 0){
1259 /* gcc misparses \x7FELF as \x7FE L F */
1261 int n = (buf[19] << 8) | buf[18];
1262 char *p = "unknown";
1264 if (n > 0 && n < nelem(cpu) && cpu[n])
1267 /* try the other byte order */
1268 n = (buf[18] << 8) | buf[19];
1269 if (n > 0 && n < nelem(cpu) && cpu[n])
1272 print("%s ELF executable\n", p);
1275 print("application/x-elf-executable");