Blob
1 #ifdef PLAN92 #include <u.h>3 #include <libc.h>4 #include <bio.h>5 #else6 #include <sys/types.h>7 #include <stdio.h>8 #include <stdlib.h>9 #include <string.h>10 #include <unistd.h>11 #include <errno.h>12 #include "plan9.h"13 #endif14 #include "hdr.h"16 /*17 the our_* routines are implementations for the corresponding library18 routines. for a while, i tried to actually name them wctomb etc19 but stopped that after i found a system which made wchar_t an20 unsigned char.21 */23 #ifdef PLAN924 long getrune(Biobuf *);25 long getisorune(Biobuf *);26 #else27 long getrune(FILE *);28 long getisorune(FILE *);29 #endif30 int our_wctomb(char *s, unsigned long wc);31 int our_mbtowc(unsigned long *p, char *s, unsigned n);32 int runetoisoutf(char *str, Rune *rune);33 int fullisorune(char *str, int n);34 int isochartorune(Rune *rune, char *str);36 void37 utf_in(int fd, long *notused, struct convert *out)38 {39 #ifndef PLAN940 FILE *fp;41 #else /* PLAN9 */42 Biobuf b;43 #endif /* PLAN9 */44 Rune *r;45 long l;47 USED(notused);48 #ifndef PLAN949 if((fp = fdopen(fd, "r")) == NULL){50 EPR "%s: input setup error: %s\n", argv0, strerror(errno));51 #else /* PLAN9 */52 if(Binit(&b, fd, OREAD) < 0){53 EPR "%s: input setup error: %r\n", argv0);54 #endif /* PLAN9 */55 EXIT(1, "input error");56 }57 r = runes;58 for(;;)59 #ifndef PLAN960 switch(l = getrune(fp))61 #else /* PLAN9 */62 switch(l = getrune(&b))63 #endif /* PLAN9 */64 {65 case -1:66 goto done;67 case -2:68 if(squawk)69 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput);70 if(clean)71 continue;72 nerrors++;73 l = Runeerror;74 default:75 *r++ = l;76 if(r >= &runes[N]){77 OUT(out, runes, r-runes);78 r = runes;79 }80 }81 done:82 if(r > runes)83 OUT(out, runes, r-runes);84 }86 void87 utf_out(Rune *base, int n, long *notused)88 {89 char *p;90 Rune *r;92 USED(notused);93 nrunes += n;94 for(r = base, p = obuf; n-- > 0; r++){95 p += our_wctomb(p, *r);96 }97 noutput += p-obuf;98 write(1, obuf, p-obuf);99 }101 void102 isoutf_in(int fd, long *notused, struct convert *out)103 {104 #ifndef PLAN9105 FILE *fp;106 #else /* PLAN9 */107 Biobuf b;108 #endif /* PLAN9 */109 Rune *r;110 long l;112 USED(notused);113 #ifndef PLAN9114 if((fp = fdopen(fd, "r")) == 0){115 EPR "%s: input setup error: %s\n", argv0, strerror(errno));116 #else /* PLAN9 */117 if(Binit(&b, fd, OREAD) < 0){118 EPR "%s: input setup error: %r\n", argv0);119 #endif /* PLAN9 */120 EXIT(1, "input error");121 }122 r = runes;123 for(;;)124 #ifndef PLAN9125 switch(l = getisorune(fp))126 #else /* PLAN9 */127 switch(l = getisorune(&b))128 #endif /* PLAN9 */129 {130 case -1:131 goto done;132 case -2:133 if(squawk)134 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput);135 if(clean)136 continue;137 nerrors++;138 l = Runeerror;139 default:140 *r++ = l;141 if(r >= &runes[N]){142 OUT(out, runes, r-runes);143 r = runes;144 }145 }146 done:147 if(r > runes)148 OUT(out, runes, r-runes);149 }151 void152 isoutf_out(Rune *base, int n, long *notused)153 {154 char *p;155 Rune *r;157 USED(notused);158 nrunes += n;159 for(r = base, p = obuf; n-- > 0; r++)160 p += runetoisoutf(p, r);161 noutput += p-obuf;162 write(1, obuf, p-obuf);163 }165 long166 #ifndef PLAN9167 getrune(FILE *fp)168 #else /* PLAN9 */169 getrune(Biobuf *bp)170 #endif /* PLAN9 */171 {172 int c, i;173 char str[UTFmax]; /* MB_LEN_MAX really */174 unsigned long l;175 int n;177 for(i = 0;;){178 #ifndef PLAN9179 c = getc(fp);180 #else /* PLAN9 */181 c = Bgetc(bp);182 #endif /* PLAN9 */183 if(c < 0)184 return(c);185 ninput++;186 str[i++] = c;187 n = our_mbtowc(&l, str, i);188 if(n == -1)189 return(-2);190 if(n > 0)191 return(l);192 }193 }195 long196 #ifndef PLAN9197 getisorune(FILE *fp)198 #else /* PLAN9 */199 getisorune(Biobuf *bp)200 #endif /* PLAN9 */201 {202 int c, i;203 Rune rune;204 char str[UTFmax]; /* MB_LEN_MAX really */206 for(i = 0;;){207 #ifndef PLAN9208 c = getc(fp);209 #else /* PLAN9 */210 c = Bgetc(bp);211 #endif /* PLAN9 */212 if(c < 0)213 return(c);214 ninput++;215 str[i++] = c;216 if(fullisorune(str, i))217 break;218 }219 isochartorune(&rune, str);220 if(rune == Runeerror)221 return -2;222 return(rune);223 }225 enum226 {227 Char1 = Runeself, Rune1 = Runeself,228 Char21 = 0xA1, Rune21 = 0x0100,229 Char22 = 0xF6, Rune22 = 0x4016,230 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */231 Esc = 0xBE, Bad = Runeerror232 };234 static uchar U[256];235 static uchar T[256];237 static238 void239 mktable(void)240 {241 int i, u;243 for(i=0; i<256; i++) {244 u = i + (0x5E - 0xA0);245 if(i < 0xA0)246 u = i + (0xDF - 0x7F);247 if(i < 0x7F)248 u = i + (0x00 - 0x21);249 if(i < 0x21)250 u = i + (0xBE - 0x00);251 U[i] = u;252 T[u] = i;253 }254 }256 int257 isochartorune(Rune *rune, char *str)258 {259 int c, c1, c2;260 long l;262 if(U[0] == 0)263 mktable();265 /*266 * one character sequence267 * 00000-0009F => 00-9F268 */269 c = *(uchar*)str;270 if(c < Char1) {271 *rune = c;272 return 1;273 }275 /*276 * two character sequence277 * 000A0-000FF => A0; A0-FF278 */279 c1 = *(uchar*)(str+1);280 if(c < Char21) {281 if(c1 >= Rune1 && c1 < Rune21) {282 *rune = c1;283 return 2;284 }285 goto bad;286 }288 /*289 * two character sequence290 * 00100-04015 => A1-F5; 21-7E/A0-FF291 */292 c1 = U[c1];293 if(c1 >= Esc)294 goto bad;295 if(c < Char22) {296 *rune = (c-Char21)*Esc + c1 + Rune21;297 return 2;298 }300 /*301 * three character sequence302 * 04016-38E2D => A6-FB; 21-7E/A0-FF303 */304 c2 = U[*(uchar*)(str+2)];305 if(c2 >= Esc)306 goto bad;307 if(c < Char3) {308 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;309 if(l >= Rune3)310 goto bad;311 *rune = l;312 return 3;313 }315 /*316 * bad decoding317 */318 bad:319 *rune = Bad;320 return 1;321 }323 int324 runetoisoutf(char *str, Rune *rune)325 {326 long c;328 if(T[0] == 0)329 mktable();331 /*332 * one character sequence333 * 00000-0009F => 00-9F334 */335 c = *rune;336 if(c < Rune1) {337 str[0] = c;338 return 1;339 }341 /*342 * two character sequence343 * 000A0-000FF => A0; A0-FF344 */345 if(c < Rune21) {346 str[0] = (uchar)Char1;347 str[1] = c;348 return 2;349 }351 /*352 * two character sequence353 * 00100-04015 => A1-F5; 21-7E/A0-FF354 */355 if(c < Rune22) {356 c -= Rune21;357 str[0] = c/Esc + Char21;358 str[1] = T[c%Esc];359 return 2;360 }362 /*363 * three character sequence364 * 04016-38E2D => A6-FB; 21-7E/A0-FF365 */366 c -= Rune22;367 str[0] = c/(Esc*Esc) + Char22;368 str[1] = T[c/Esc%Esc];369 str[2] = T[c%Esc];370 return 3;371 }373 int374 fullisorune(char *str, int n)375 {376 int c;378 if(n > 0) {379 c = *(uchar*)str;380 if(c < Char1)381 return 1;382 if(n > 1)383 if(c < Char22 || n > 2)384 return 1;385 }386 return 0;387 }389 #ifdef PLAN9390 int errno;391 #endif393 enum394 {395 T1 = 0x00,396 Tx = 0x80,397 T2 = 0xC0,398 T3 = 0xE0,399 T4 = 0xF0,400 T5 = 0xF8,401 T6 = 0xFC,403 Bit1 = 7,404 Bitx = 6,405 Bit2 = 5,406 Bit3 = 4,407 Bit4 = 3,408 Bit5 = 2,409 Bit6 = 2,411 Mask1 = (1<<Bit1)-1,412 Maskx = (1<<Bitx)-1,413 Mask2 = (1<<Bit2)-1,414 Mask3 = (1<<Bit3)-1,415 Mask4 = (1<<Bit4)-1,416 Mask5 = (1<<Bit5)-1,417 Mask6 = (1<<Bit6)-1,419 Wchar1 = (1UL<<Bit1)-1,420 Wchar2 = (1UL<<(Bit2+Bitx))-1,421 Wchar3 = (1UL<<(Bit3+2*Bitx))-1,422 Wchar4 = (1UL<<(Bit4+3*Bitx))-1,423 Wchar5 = (1UL<<(Bit5+4*Bitx))-1425 #ifndef EILSEQ426 , /* we hate ansi c's comma rules */427 EILSEQ = 123428 #endif /* PLAN9 */429 };431 int432 our_wctomb(char *s, unsigned long wc)433 {434 if(s == 0)435 return 0; /* no shift states */436 if(wc & ~Wchar2) {437 if(wc & ~Wchar4) {438 if(wc & ~Wchar5) {439 /* 6 bytes */440 s[0] = T6 | ((wc >> 5*Bitx) & Mask6);441 s[1] = Tx | ((wc >> 4*Bitx) & Maskx);442 s[2] = Tx | ((wc >> 3*Bitx) & Maskx);443 s[3] = Tx | ((wc >> 2*Bitx) & Maskx);444 s[4] = Tx | ((wc >> 1*Bitx) & Maskx);445 s[5] = Tx | (wc & Maskx);446 return 6;447 }448 /* 5 bytes */449 s[0] = T5 | (wc >> 4*Bitx);450 s[1] = Tx | ((wc >> 3*Bitx) & Maskx);451 s[2] = Tx | ((wc >> 2*Bitx) & Maskx);452 s[3] = Tx | ((wc >> 1*Bitx) & Maskx);453 s[4] = Tx | (wc & Maskx);454 return 5;455 }456 if(wc & ~Wchar3) {457 /* 4 bytes */458 s[0] = T4 | (wc >> 3*Bitx);459 s[1] = Tx | ((wc >> 2*Bitx) & Maskx);460 s[2] = Tx | ((wc >> 1*Bitx) & Maskx);461 s[3] = Tx | (wc & Maskx);462 return 4;463 }464 /* 3 bytes */465 s[0] = T3 | (wc >> 2*Bitx);466 s[1] = Tx | ((wc >> 1*Bitx) & Maskx);467 s[2] = Tx | (wc & Maskx);468 return 3;469 }470 if(wc & ~Wchar1) {471 /* 2 bytes */472 s[0] = T2 | (wc >> 1*Bitx);473 s[1] = Tx | (wc & Maskx);474 return 2;475 }476 /* 1 byte */477 s[0] = T1 | wc;478 return 1;479 }481 int482 our_mbtowc(unsigned long *p, char *s, unsigned n)483 {484 uchar *us;485 int c0, c1, c2, c3, c4, c5;486 unsigned long wc;488 if(s == 0)489 return 0; /* no shift states */491 if(n < 1)492 goto badlen;493 us = (uchar*)s;494 c0 = us[0];495 if(c0 >= T3) {496 if(n < 3)497 goto badlen;498 c1 = us[1] ^ Tx;499 c2 = us[2] ^ Tx;500 if((c1|c2) & T2)501 goto bad;502 if(c0 >= T5) {503 if(n < 5)504 goto badlen;505 c3 = us[3] ^ Tx;506 c4 = us[4] ^ Tx;507 if((c3|c4) & T2)508 goto bad;509 if(c0 >= T6) {510 /* 6 bytes */511 if(n < 6)512 goto badlen;513 c5 = us[5] ^ Tx;514 if(c5 & T2)515 goto bad;516 wc = ((((((((((c0 & Mask6) << Bitx) |517 c1) << Bitx) | c2) << Bitx) |518 c3) << Bitx) | c4) << Bitx) | c5;519 if(wc <= Wchar5)520 goto bad;521 *p = wc;522 return 6;523 }524 /* 5 bytes */525 wc = ((((((((c0 & Mask5) << Bitx) |526 c1) << Bitx) | c2) << Bitx) |527 c3) << Bitx) | c4;528 if(wc <= Wchar4)529 goto bad;530 *p = wc;531 return 5;532 }533 if(c0 >= T4) {534 /* 4 bytes */535 if(n < 4)536 goto badlen;537 c3 = us[3] ^ Tx;538 if(c3 & T2)539 goto bad;540 wc = ((((((c0 & Mask4) << Bitx) |541 c1) << Bitx) | c2) << Bitx) |542 c3;543 if(wc <= Wchar3)544 goto bad;545 *p = wc;546 return 4;547 }548 /* 3 bytes */549 wc = ((((c0 & Mask3) << Bitx) |550 c1) << Bitx) | c2;551 if(wc <= Wchar2)552 goto bad;553 *p = wc;554 return 3;555 }556 if(c0 >= T2) {557 /* 2 bytes */558 if(n < 2)559 goto badlen;560 c1 = us[1] ^ Tx;561 if(c1 & T2)562 goto bad;563 wc = ((c0 & Mask2) << Bitx) |564 c1;565 if(wc <= Wchar1)566 goto bad;567 *p = wc;568 return 2;569 }570 /* 1 byte */571 if(c0 >= Tx)572 goto bad;573 *p = c0;574 return 1;576 bad:577 errno = EILSEQ;578 return -1;579 badlen:580 return -2;581 }