8 * file - determine type of file
/*
 * LENDIAN(p): assemble the four bytes at p, least significant byte
 * first, into an unsigned 32-bit quantity.
 *
 * Each byte is masked to 8 bits and widened to unsigned long before it
 * is shifted.  Without that, the byte is promoted to int, so a top byte
 * >= 0x80 is shifted into the sign bit (undefined in ISO C) and the
 * negative result sign-extends when stored in a wider unsigned type;
 * the mask also keeps a signed char buffer from smearing 1 bits over
 * the other bytes.
 *
 * p is evaluated four times: do not pass an expression with side effects.
 */
#define LENDIAN(p) (((unsigned long)(p)[0] & 0xFF) | \
	(((unsigned long)(p)[1] & 0xFF) << 8) | \
	(((unsigned long)(p)[2] & 0xFF) << 16) | \
	(((unsigned long)(p)[3] & 0xFF) << 24))
85 /* codes for 'mode' field in language structure */
88 First, /* first entry for language spanning several ranges */
89 Multi, /* later entries " " " ... */
90 Shared, /* codes used in several languages */
95 int mode; /* see enum above */
103 Normal, 0, 0x0080, 0x0080, "Extended Latin",
104 Normal, 0, 0x0100, 0x01FF, "Extended Latin",
105 Normal, 0, 0x0370, 0x03FF, "Greek",
106 Normal, 0, 0x0400, 0x04FF, "Cyrillic",
107 Normal, 0, 0x0530, 0x058F, "Armenian",
108 Normal, 0, 0x0590, 0x05FF, "Hebrew",
109 Normal, 0, 0x0600, 0x06FF, "Arabic",
110 Normal, 0, 0x0900, 0x097F, "Devanagari",
111 Normal, 0, 0x0980, 0x09FF, "Bengali",
112 Normal, 0, 0x0A00, 0x0A7F, "Gurmukhi",
113 Normal, 0, 0x0A80, 0x0AFF, "Gujarati",
114 Normal, 0, 0x0B00, 0x0B7F, "Oriya",
115 Normal, 0, 0x0B80, 0x0BFF, "Tamil",
116 Normal, 0, 0x0C00, 0x0C7F, "Telugu",
117 Normal, 0, 0x0C80, 0x0CFF, "Kannada",
118 Normal, 0, 0x0D00, 0x0D7F, "Malayalam",
119 Normal, 0, 0x0E00, 0x0E7F, "Thai",
120 Normal, 0, 0x0E80, 0x0EFF, "Lao",
121 Normal, 0, 0x1000, 0x105F, "Tibetan",
122 Normal, 0, 0x10A0, 0x10FF, "Georgian",
123 Normal, 0, 0x3040, 0x30FF, "Japanese",
124 Normal, 0, 0x3100, 0x312F, "Chinese",
125 First, 0, 0x3130, 0x318F, "Korean",
126 Multi, 0, 0x3400, 0x3D2F, "Korean",
127 Shared, 0, 0x4e00, 0x9fff, "CJK",
128 Normal, 0, 0, 0, 0, /* terminal entry */
134 Fascii, /* printable ascii */
136 Futf, /* UTF character set */
137 Fbinary, /* binary */
138 Feascii, /* ASCII with control chars */
139 Fnull, /* NULL in file */
142 void bump_utf_count(Rune);
143 int cistrncmp(char*, char*, int);
145 int getfontnum(uchar*, uchar**);
164 int p9bitnum(uchar*);
165 int p9subfont(uchar*);
166 void print_utf(void);
167 void type(char*, int);
171 int (*call[])(void) =
173 long0, /* recognizable by first 4 bytes */
174 istring, /* recognizable by first string */
175 iff, /* interchange file format (strings) */
176 isrfc822, /* email file */
177 ismbox, /* mail box */
178 istar, /* recognizable by tar checksum */
179 ishtml, /* html keywords */
180 /* iscint, /* compiler/assembler intermediate */
181 islimbo, /* limbo source */
182 isc, /* c & alef compiler key words */
183 isas, /* assembler key words */
184 ismung, /* entropy compressed/encrypted */
185 isp9font, /* plan 9 font */
186 isp9bit, /* plan 9 image (as from /dev/window) */
187 isenglish, /* char frequency English */
188 isrtf, /* rich text format */
189 ismsdos, /* msdos exe (virus file attachment) */
190 iself, /* ELF (foreign) executable */
196 #define OCTET "application/octet-stream\n" /* MIME type line printed, when mime is set, for binary or otherwise untyped content */
197 #define PLAIN "text/plain\n" /* MIME type line printed, when mime is set, for textual content */
200 main(int argc, char *argv[])
211 fprint(2, "usage: file [-m] [file...]\n");
216 if(mime == 0 || argc > 1){
217 for(i = 0; i < argc; i++) {
218 for (j = 0, cp = argv[i]; *cp; j++, cp += chartorune(&r, cp))
230 for(i = 0; i < argc; i++)
231 type(argv[i], maxlen);
237 type(char *file, int nlen)
245 for (i = 0, p = file; *p; i++) {
246 if (*p == '/') /* find rightmost slash */
248 p += chartorune(&r, p); /* count runes */
250 print("%s:%*s",file, nlen-i+1, "");
253 if ((fd = open(file, OREAD)) < 0) {
254 print("cannot open\n");
271 print("cannot stat: %r\n");
274 if(mbuf->mode & DMDIR) {
275 print(mime ? "text/directory\n" : "directory\n");
278 if(mbuf->type != 'M' && mbuf->type != '|') {
279 print(mime ? OCTET : "special file #%c/%s\n",
280 mbuf->type, mbuf->name);
283 nbuf = read(fd, buf, sizeof(buf)-1);
286 print("cannot read\n");
290 print(mime ? PLAIN : "empty file\n");
296 * build histogram table
298 memset(cfreq, 0, sizeof(cfreq));
299 for (i = 0; language[i].name; i++)
300 language[i].count = 0;
301 eob = (char *)buf+nbuf;
302 for(n = 0, p = (char *)buf; p < eob; n++) {
303 if (!fullrune(p, eob-p) && eob-p < UTFmax)
305 p += chartorune(&r, p);
308 else if (r <= 0x7f) {
309 if (!isprint(r) && !isspace(r))
310 f = Ceascii; /* ASCII control char */
312 } else if (r == 0x080) {
316 f = Cbinary; /* Invalid Runes */
318 f = Clatin; /* Latin 1 */
321 f = Cutf; /* UTF extension */
323 cfreq[f]++; /* ASCII chars peg directly */
330 else if (cfreq[Cutf])
332 else if (cfreq[Clatin])
334 else if (cfreq[Ceascii])
336 else if (cfreq[Cnull] == n) {
337 print(mime ? OCTET : "first block all null bytes\n");
342 * lookup dictionary words
344 memset(wfreq, 0, sizeof(wfreq));
345 if(guess == Fascii || guess == Flatin || guess == Futf)
348 * call individual classify routines
350 for(i=0; call[i]; i++)
356 * print out gross classification
358 if (nbuf < 100 && !mime)
359 print(mime ? PLAIN : "short ");
361 print(mime ? PLAIN : "Ascii\n");
362 else if (guess == Feascii)
363 print(mime ? PLAIN : "extended ascii\n");
364 else if (guess == Flatin)
365 print(mime ? PLAIN : "latin ascii\n");
366 else if (guess == Futf && utf_count() < 4)
368 else print(mime ? OCTET : "binary\n");
372 bump_utf_count(Rune r)
376 high = sizeof(language)/sizeof(language[0])-1;
377 for (low = 0; low < high;) {
379 if (r >=language[mid].low) {
380 if (r <= language[mid].high) {
381 language[mid].count++;
394 for (i = 0; language[i].name; i++)
395 if (language[i].count > 0)
396 switch (language[i].mode) {
412 for (i = 'a'; i < 'z'; i++)
415 for (i = 'A'; i < 'Z'; i++)
422 find_first(char *name)
426 for (i = 0; language[i].name != 0; i++)
427 if (language[i].mode == First
428 && strcmp(language[i].name, name) == 0)
447 for (i = 0; language[i].name; i++)
448 if (language[i].count) {
449 switch(language[i].mode) {
451 j = find_first(language[i].name);
454 if (language[j].count > 0)
462 print("%s", language[i].name);
477 int low, high, mid, r;
482 while (p < buf+nbuf && !isalpha(*p))
487 while(p < buf+nbuf && isalpha(*p))
491 high = sizeof(dict)/sizeof(dict[0]);
492 for(low = 0;low < high;) {
494 r = strcmp(dict[mid].word, (char*)p2);
496 wfreq[dict[mid].class]++;
508 typedef struct Filemagic Filemagic;
516 Filemagic long0tab[] = {
517 0xF16DF16D, 0xFFFFFFFF, "pac1 audio file\n", OCTET,
518 0x31636170, 0xFFFFFFFF, "pac3 audio file\n", OCTET,
519 0x32636170, 0xFFFF00FF, "pac4 audio file\n", OCTET,
520 0xBA010000, 0xFFFFFFFF, "mpeg system stream\n", OCTET,
521 0x30800CC0, 0xFFFFFFFF, "inferno .dis executable\n", OCTET,
522 0x04034B50, 0xFFFFFFFF, "zip archive\n", "application/zip",
523 070707, 0xFFFF, "cpio archive\n", OCTET,
524 0x2F7, 0xFFFF, "tex dvi\n", "application/dvi",
525 0xfffa0000, 0xfffe0000, "mp3 audio", "audio/mpeg",
529 filemagic(Filemagic *tab, int ntab, ulong x)
533 for(i=0; i<ntab; i++)
534 if((x&tab[i].mask) == tab[i].x){
535 print(mime ? tab[i].mime : tab[i].desc);
547 seek(fd, 0, 0); /* reposition to start of file */
549 if(crackhdr(fd, &f)) {
550 print(mime ? OCTET : "%s\n", f.name);
555 if(filemagic(long0tab, nelem(long0tab), x))
561 enum { NAMSIZ = 100, TBLOCK = 512 }; /* tar header: NAMSIZ sizes the name fields (e.g. linkname), TBLOCK is the byte length of the header block that checksum() sums */
576 char linkname[NAMSIZ];
577 /* rest are defined by POSIX's ustar format; see p1003.2b */
578 char magic[6]; /* "ustar" */
584 char prefix[155]; /* if non-null, path = prefix "/" name */
589 checksum(union hblock *hp)
593 struct header *hdr = &hp->dbuf;
595 for (cp = hdr->chksum; cp < &hdr->chksum[sizeof hdr->chksum]; cp++)
598 for (cp = hp->dummy; cp < &hp->dummy[TBLOCK]; cp++)
608 union hblock *hp = (union hblock *)tblock;
609 struct header *hdr = &hp->dbuf;
611 seek(fd, 0, 0); /* reposition to start of file */
612 if (readn(fd, tblock, sizeof tblock) != sizeof tblock)
614 chksum = strtol(hdr->chksum, 0, 8);
615 if (hdr->name[0] != '\0' && checksum(hp) == chksum) {
616 if (strcmp(hdr->magic, "ustar") == 0)
617 print(mime? "application/x-ustar\n":
618 "posix tar archive\n");
620 print(mime? "application/x-tar\n": "tar archive\n");
627 * initial words to classify file
637 "!<arch>\n__.SYMDEF", "archive random library", 16, "application/octet-stream",
638 "!<arch>\n", "archive", 8, "application/octet-stream",
639 "070707", "cpio archive - ascii header", 6, "application/octet-stream",
640 "%!", "postscript", 2, "application/postscript",
641 "\004%!", "postscript", 3, "application/postscript",
642 "x T post", "troff output for post", 8, "application/troff",
643 "x T Latin1", "troff output for Latin1", 10, "application/troff",
644 "x T utf", "troff output for UTF", 7, "application/troff",
645 "x T 202", "troff output for 202", 7, "application/troff",
646 "x T aps", "troff output for aps", 7, "application/troff",
647 "GIF", "GIF image", 3, "image/gif",
648 "\0PC Research, Inc\0", "ghostscript fax file", 18, "application/ghostscript",
649 "%PDF", "PDF", 4, "application/pdf",
650 "<html>\n", "HTML file", 7, "text/html",
651 "<HTML>\n", "HTML file", 7, "text/html",
652 "compressed\n", "Compressed image or subfont", 11, "application/octet-stream",
653 "\111\111\052\000", "tiff", 4, "image/tiff",
654 "\115\115\000\052", "tiff", 4, "image/tiff",
655 "\377\330\377\340", "jpeg", 4, "image/jpeg",
656 "\377\330\377\341", "jpeg", 4, "image/jpeg",
657 "\377\330\377\333", "jpeg", 4, "image/jpeg",
658 "BM", "bmp", 2, "image/bmp",
659 "\xD0\xCF\x11\xE0\xA1\xB1\x1A\xE1", "microsoft office document", 8, "application/octet-stream",
660 "<MakerFile ", "FrameMaker file", 11, "application/framemaker",
661 "\033%-12345X", "HPJCL file", 9, "application/hpjcl",
662 "ID3", "mp3 audio with id3", 3, "audio/mpeg",
670 struct FILE_STRING *p;
672 for(p = file_string; p->key; p++) {
673 if(nbuf >= p->length && !memcmp(buf, p->key, p->length)) {
675 print("%s\n", p->mime);
677 print("%s\n", p->filetype);
681 if(strncmp((char*)buf, "TYPE=", 5) == 0) { /* td */
682 for(i = 5; i < nbuf; i++)
688 print("%.*s picture\n", utfnlen((char*)buf+5, i-5), (char*)buf+5);
691 if(buf[0]=='#' && buf[1]=='!'){
693 for(j=2; j < nbuf && buf[j] != ' ' && buf[j] != '\n' && buf[j] != '\r'; j++)
699 print("%.*s executable file script\n", utfnlen((char*)buf+i, j-i), (char*)buf+i);
708 if (strncmp((char*)buf, "FORM", 4) == 0 &&
709 strncmp((char*)buf+8, "AIFF", 4) == 0) {
710 print("%s\n", mime? "audio/x-aiff": "aiff audio");
716 char* html_string[] =
742 /* compare strings between '<' and '>' to html table */
746 while (p < buf+nbuf && *p != '<')
754 while(p < buf+nbuf && *p != '>')
758 for(i = 0; html_string[i]; i++) {
759 if(cistrncmp(html_string[i], (char*)q, p-q) == 0) {
761 print(mime ? "text/html\n" : "HTML file\n");
772 char* rfc822_string[] =
798 if(p == (char*)buf && strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ")){
805 if(*p != '\t' && *p != ' '){
809 for(i = 0; rfc822_string[i]; i++) {
810 if(cistrncmp(p, rfc822_string[i], strlen(rfc822_string[i])) == 0){
819 print(mime ? "message/rfc822\n" : "email file\n");
835 if(strncmp(p, "From ", 5) == 0 && strstr(p, " remote from ") == nil){
836 print(mime ? "text/plain\n" : "mail box\n");
852 if(n >= 2 && wfreq[I2] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
854 if(n >= 1 && wfreq[Alword] >= n && wfreq[I3] >= n && cfreq['.'] >= n)
859 if(wfreq[Cword] >= 5 && cfreq[';'] >= 5)
864 if(cfreq[';'] >= 10 && cfreq['='] >= 10 && wfreq[Cword] >= 1)
873 if(wfreq[Alword] > 0)
874 print("alef program\n");
876 print("c program\n");
889 print(mime ? PLAIN : "limbo program\n");
902 print(mime ? PLAIN : "as program\n");
907 * low entropy means encrypted
917 memset(bucket, 0, sizeof(bucket));
919 bucket[(buf[i]>>5)&07] += 1;
923 cs += (bucket[i]-8)*(bucket[i]-8);
926 if(buf[0]==0x1f && (buf[1]==0x8b || buf[1]==0x9d))
927 print(mime ? OCTET : "compressed\n");
929 print(mime ? OCTET : "encrypted\n");
936 * english by punctuation and frequencies
941 int vow, comm, rare, badpun, punct;
944 if(guess != Fascii && guess != Feascii)
948 for(p = (char *)buf; p < (char *)buf+nbuf-1; p++)
958 if(p[1] != ' ' && p[1] != '\n')
963 if(cfreq['>']+cfreq['<']+cfreq['/'] > cfreq['e']) /* shell file test */
965 if(2*cfreq[';'] > cfreq['e'])
969 for(p="AEIOU"; *p; p++) {
970 vow += cfreq[(uchar)*p];
971 vow += cfreq[tolower((uchar)*p)];
974 for(p="ETAION"; *p; p++) {
975 comm += cfreq[(uchar)*p];
976 comm += cfreq[tolower((uchar)*p)];
979 for(p="VJKQXZ"; *p; p++) {
980 rare += cfreq[(uchar)*p];
981 rare += cfreq[tolower((uchar)*p)];
983 if(vow*5 >= nbuf-cfreq[' '] && comm >= 10*rare) {
984 print(mime ? PLAIN : "English text\n");
991 * pick up a number with
1021 depthof(char *s, int *newp)
1028 while(s<es && *s==' ')
1032 if('0'<=*s && *s<='9')
1037 while(s<es && *s!=' '){
1038 s++; /* skip letter */
1039 d += strtoul(s, &s, 10);
1055 int dep, lox, loy, hix, hiy, px, new;
1062 dep = depthof((char*)buf + 0*P9BITLEN, &new);
1065 lox = p9bitnum(buf + 1*P9BITLEN);
1066 loy = p9bitnum(buf + 2*P9BITLEN);
1067 hix = p9bitnum(buf + 3*P9BITLEN);
1068 hiy = p9bitnum(buf + 4*P9BITLEN);
1069 if(dep < 0 || lox < 0 || loy < 0 || hix < 0 || hiy < 0)
1073 px = 8/dep; /* pixels per byte */
1074 /* set len to number of bytes of data per scan line */
1076 len = (hix+px-1)/px - lox/px;
1077 else{ /* make positive before divide */
1080 len = (t+hix+px-1)/px;
1083 len = (hix-lox)*dep/8;
1084 len *= (hiy-loy); /* col length */
1085 len += 5*P9BITLEN; /* size of initial ascii */
1088 * for image file, length is non-zero and must match calculation above
1089 * for /dev/window and /dev/screen the length is always zero
1090 * for subfont, the subfont header should follow immediately.
1092 if (len != 0 && mbuf->length == 0) {
1093 print("%splan 9 image\n", newlabel);
1096 if (mbuf->length == len) {
1097 print("%splan 9 image\n", newlabel);
1100 /* Ghostscript sometimes produces a little extra on the end */
1101 if (mbuf->length < len+P9BITLEN) {
1102 print("%splan 9 image\n", newlabel);
1105 if (p9subfont(buf+len)) {
1106 print("%ssubfont file\n", newlabel);
1117 /* if image too big, assume it's a subfont */
1118 if (p+3*P9BITLEN > buf+sizeof(buf))
1121 n = p9bitnum(p + 0*P9BITLEN); /* char count */
1124 h = p9bitnum(p + 1*P9BITLEN); /* height */
1127 a = p9bitnum(p + 2*P9BITLEN); /* ascent */
1133 #define WHITESPACE(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') /* true for space, tab or newline only (not '\r'); c may be evaluated up to three times */
1140 char pathname[1024];
1143 if (!getfontnum(cp, &cp)) /* height */
1145 if (!getfontnum(cp, &cp)) /* ascent */
1147 for (i = 0; 1; i++) {
1148 if (!getfontnum(cp, &cp)) /* min */
1150 if (!getfontnum(cp, &cp)) /* max */
1152 while (WHITESPACE(*cp))
1154 for (p = cp; *cp && !WHITESPACE(*cp); cp++)
1156 /* construct a path name, if needed */
1158 if (*p != '/' && slash) {
1160 if (n < sizeof(pathname))
1161 memcpy(pathname, fname, n);
1164 if (n+cp-p < sizeof(pathname)) {
1165 memcpy(pathname+n, p, cp-p);
1168 if (access(pathname, AEXIST) < 0)
1173 print(mime ? "text/plain\n" : "font file\n");
1180 getfontnum(uchar *cp, uchar **rp)
1182 while (WHITESPACE(*cp)) /* extract ulong delimited by whitespace */
1184 if (*cp < '0' || *cp > '9')
1186 strtoul((char *)cp, (char **)rp, 0);
1187 if (!WHITESPACE(**rp))
1195 if(strstr((char *)buf, "\\rtf1")){
1196 print(mime ? "application/rtf\n" : "rich text format\n");
1205 if (buf[0] == 0x4d && buf[1] == 0x5a){
1206 print(mime ? "application/x-msdownload\n" : "MSDOS executable\n");
1215 static char *cpu[] = { /* NB: incomplete and arbitrary list */
1236 nil, nil, nil, nil, nil,
1237 nil, nil, nil, nil, nil,
1246 nil, nil, nil, nil, nil,
1247 nil, nil, nil, nil, nil,
1251 nil, nil, nil, nil, nil,
1257 if (memcmp(buf, "\177ELF", 4) == 0){
1258 /* gcc misparses \x7FELF as \x7FE L F */
1260 int n = (buf[19] << 8) | buf[18];
1261 char *p = "unknown";
1263 if (n > 0 && n < nelem(cpu) && cpu[n])
1266 /* try the other byte order */
1267 n = (buf[18] << 8) | buf[19];
1268 if (n > 0 && n < nelem(cpu) && cpu[n])
1271 print("%s ELF executable\n", p);
1274 print("application/x-elf-executable");