Blob
1 #ifdef PLAN92 #include <u.h>3 #include <libc.h>4 #include <bio.h>5 #ifdef PLAN9PORT6 #include <errno.h>7 #else8 extern int errno;9 #endif10 #else11 #include <sys/types.h>12 #include <stdio.h>13 #include <stdlib.h>14 #include <string.h>15 #include <unistd.h>16 #include <errno.h>17 #include "plan9.h"18 #endif19 #include "hdr.h"20 #ifndef EILSEQ21 #define EILSEQ 999822 #endif24 /*25 the our_* routines are implementations for the corresponding library26 routines. for a while, i tried to actually name them wctomb etc27 but stopped that after i found a system which made wchar_t an28 unsigned char.29 */31 int our_wctomb(char *s, unsigned long wc);32 int our_mbtowc(unsigned long *p, char *s, unsigned n);33 int runetoisoutf(char *str, Rune *rune);34 int fullisorune(char *str, int n);35 int isochartorune(Rune *rune, char *str);37 void38 utf_in(int fd, long *notused, struct convert *out)39 {40 char buf[N];41 int i, j, c, n, tot;42 ulong l;44 USED(notused);45 tot = 0;46 while((n = read(fd, buf+tot, N-tot)) >= 0){47 tot += n;48 for(i=j=0; i<tot; ){49 c = our_mbtowc(&l, buf+i, tot-i);50 if(c == -2)51 break;52 if(c == -1){53 if(squawk)54 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);55 if(clean)56 continue;57 nerrors++;58 l = Runeerror;59 c = 1;60 }61 runes[j++] = l;62 i += c;63 }64 OUT(out, runes, j);65 tot -= i;66 ninput += i;67 if(tot)68 memmove(buf, buf+i, tot);69 if(n == 0)70 break;71 }72 }74 void75 utf_out(Rune *base, int n, long *notused)76 {77 char *p;78 Rune *r;80 USED(notused);81 nrunes += n;82 for(r = base, p = obuf; n-- > 0; r++){83 p += our_wctomb(p, *r);84 }85 noutput += p-obuf;86 write(1, obuf, p-obuf);87 }89 void90 isoutf_in(int fd, long *notused, struct convert *out)91 {92 char buf[N];93 int i, j, c, n, tot;95 USED(notused);96 tot = 0;97 while((n = read(fd, buf+tot, N-tot)) >= 0){98 tot += n;99 for(i=j=0; i<tot; ){100 if(!fullisorune(buf+i, tot-i))101 break;102 c = isochartorune(&runes[j], buf+i);103 if(runes[j] == Runeerror){104 if(squawk)105 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);106 if(clean)107 continue;108 nerrors++;109 }110 j++;111 i += c;112 }113 OUT(out, runes, j);114 tot -= i;115 ninput += i;116 if(tot)117 memmove(buf, buf+i, tot);118 if(n == 0)119 break;120 }121 }123 void124 isoutf_out(Rune *base, int n, long *notused)125 {126 char *p;127 Rune *r;129 USED(notused);130 nrunes += n;131 for(r = base, p = obuf; n-- > 0; r++)132 p += runetoisoutf(p, r);133 noutput += p-obuf;134 write(1, obuf, p-obuf);135 }138 enum139 {140 Char1 = Runeself, Rune1 = Runeself,141 Char21 = 0xA1, Rune21 = 0x0100,142 Char22 = 0xF6, Rune22 = 0x4016,143 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */144 Esc = 0xBE, Bad = Runeerror145 };147 static uchar U[256];148 static uchar T[256];150 static151 void152 mktable(void)153 {154 int i, u;156 for(i=0; i<256; i++) {157 u = i + (0x5E - 0xA0);158 if(i < 0xA0)159 u = i + (0xDF - 0x7F);160 if(i < 0x7F)161 u = i + (0x00 - 0x21);162 if(i < 0x21)163 u = i + (0xBE - 0x00);164 U[i] = u;165 T[u] = i;166 }167 }169 int170 isochartorune(Rune *rune, char *str)171 {172 int c, c1, c2;173 long l;175 if(U[0] == 0)176 mktable();178 /*179 * one character sequence180 * 00000-0009F => 00-9F181 */182 c = *(uchar*)str;183 if(c < Char1) {184 *rune = c;185 return 1;186 }188 /*189 * two character sequence190 * 000A0-000FF => A0; A0-FF191 */192 c1 = *(uchar*)(str+1);193 if(c < Char21) {194 if(c1 >= Rune1 && c1 < Rune21) {195 *rune = c1;196 return 2;197 }198 goto bad;199 }201 /*202 * two character sequence203 * 00100-04015 => A1-F5; 21-7E/A0-FF204 */205 c1 = U[c1];206 if(c1 >= Esc)207 goto bad;208 if(c < Char22) {209 *rune = (c-Char21)*Esc + c1 + Rune21;210 return 2;211 }213 /*214 * three character sequence215 * 04016-38E2D => A6-FB; 21-7E/A0-FF216 */217 c2 = U[*(uchar*)(str+2)];218 if(c2 >= Esc)219 goto bad;220 if(c < Char3) {221 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;222 if(l >= Rune3)223 goto bad;224 *rune = l;225 return 3;226 }228 /*229 * bad decoding230 */231 bad:232 *rune = Bad;233 return 1;234 }236 int237 runetoisoutf(char *str, Rune *rune)238 {239 long c;241 if(T[0] == 0)242 mktable();244 /*245 * one character sequence246 * 00000-0009F => 00-9F247 */248 c = *rune;249 if(c < Rune1) {250 str[0] = c;251 return 1;252 }254 /*255 * two character sequence256 * 000A0-000FF => A0; A0-FF257 */258 if(c < Rune21) {259 str[0] = (char)Char1;260 str[1] = c;261 return 2;262 }264 /*265 * two character sequence266 * 00100-04015 => A1-F5; 21-7E/A0-FF267 */268 if(c < Rune22) {269 c -= Rune21;270 str[0] = c/Esc + Char21;271 str[1] = T[c%Esc];272 return 2;273 }275 /*276 * three character sequence277 * 04016-38E2D => A6-FB; 21-7E/A0-FF278 */279 c -= Rune22;280 str[0] = c/(Esc*Esc) + Char22;281 str[1] = T[c/Esc%Esc];282 str[2] = T[c%Esc];283 return 3;284 }286 int287 fullisorune(char *str, int n)288 {289 int c;291 if(n > 0) {292 c = *(uchar*)str;293 if(c < Char1)294 return 1;295 if(n > 1)296 if(c < Char22 || n > 2)297 return 1;298 }299 return 0;300 }302 enum303 {304 T1 = 0x00,305 Tx = 0x80,306 T2 = 0xC0,307 T3 = 0xE0,308 T4 = 0xF0,309 T5 = 0xF8,310 T6 = 0xFC,312 Bit1 = 7,313 Bitx = 6,314 Bit2 = 5,315 Bit3 = 4,316 Bit4 = 3,317 Bit5 = 2,318 Bit6 = 2,320 Mask1 = (1<<Bit1)-1,321 Maskx = (1<<Bitx)-1,322 Mask2 = (1<<Bit2)-1,323 Mask3 = (1<<Bit3)-1,324 Mask4 = (1<<Bit4)-1,325 Mask5 = (1<<Bit5)-1,326 Mask6 = (1<<Bit6)-1,328 Wchar1 = (1UL<<Bit1)-1,329 Wchar2 = (1UL<<(Bit2+Bitx))-1,330 Wchar3 = (1UL<<(Bit3+2*Bitx))-1,331 Wchar4 = (1UL<<(Bit4+3*Bitx))-1,332 Wchar5 = (1UL<<(Bit5+4*Bitx))-1333 };335 int336 our_wctomb(char *s, unsigned long wc)337 {338 if(s == 0)339 return 0; /* no shift states */340 if(wc & ~Wchar2) {341 if(wc & ~Wchar4) {342 if(wc & ~Wchar5) {343 /* 6 bytes */344 s[0] = T6 | ((wc >> 5*Bitx) & Mask6);345 s[1] = Tx | ((wc >> 4*Bitx) & Maskx);346 s[2] = Tx | ((wc >> 3*Bitx) & Maskx);347 s[3] = Tx | ((wc >> 2*Bitx) & Maskx);348 s[4] = Tx | ((wc >> 1*Bitx) & Maskx);349 s[5] = Tx | (wc & Maskx);350 return 6;351 }352 /* 5 bytes */353 s[0] = T5 | (wc >> 4*Bitx);354 s[1] = Tx | ((wc >> 3*Bitx) & Maskx);355 s[2] = Tx | ((wc >> 2*Bitx) & Maskx);356 s[3] = Tx | ((wc >> 1*Bitx) & Maskx);357 s[4] = Tx | (wc & Maskx);358 return 5;359 }360 if(wc & ~Wchar3) {361 /* 4 bytes */362 s[0] = T4 | (wc >> 3*Bitx);363 s[1] = Tx | ((wc >> 2*Bitx) & Maskx);364 s[2] = Tx | ((wc >> 1*Bitx) & Maskx);365 s[3] = Tx | (wc & Maskx);366 return 4;367 }368 /* 3 bytes */369 s[0] = T3 | (wc >> 2*Bitx);370 s[1] = Tx | ((wc >> 1*Bitx) & Maskx);371 s[2] = Tx | (wc & Maskx);372 return 3;373 }374 if(wc & ~Wchar1) {375 /* 2 bytes */376 s[0] = T2 | (wc >> 1*Bitx);377 s[1] = Tx | (wc & Maskx);378 return 2;379 }380 /* 1 byte */381 s[0] = T1 | wc;382 return 1;383 }385 int386 our_mbtowc(unsigned long *p, char *s, unsigned n)387 {388 uchar *us;389 int c0, c1, c2, c3, c4, c5;390 unsigned long wc;392 if(s == 0)393 return 0; /* no shift states */395 if(n < 1)396 goto badlen;397 us = (uchar*)s;398 c0 = us[0];399 if(c0 >= T3) {400 if(n < 3)401 goto badlen;402 c1 = us[1] ^ Tx;403 c2 = us[2] ^ Tx;404 if((c1|c2) & T2)405 goto bad;406 if(c0 >= T5) {407 if(n < 5)408 goto badlen;409 c3 = us[3] ^ Tx;410 c4 = us[4] ^ Tx;411 if((c3|c4) & T2)412 goto bad;413 if(c0 >= T6) {414 /* 6 bytes */415 if(n < 6)416 goto badlen;417 c5 = us[5] ^ Tx;418 if(c5 & T2)419 goto bad;420 wc = ((((((((((c0 & Mask6) << Bitx) |421 c1) << Bitx) | c2) << Bitx) |422 c3) << Bitx) | c4) << Bitx) | c5;423 if(wc <= Wchar5)424 goto bad;425 *p = wc;426 return 6;427 }428 /* 5 bytes */429 wc = ((((((((c0 & Mask5) << Bitx) |430 c1) << Bitx) | c2) << Bitx) |431 c3) << Bitx) | c4;432 if(wc <= Wchar4)433 goto bad;434 *p = wc;435 return 5;436 }437 if(c0 >= T4) {438 /* 4 bytes */439 if(n < 4)440 goto badlen;441 c3 = us[3] ^ Tx;442 if(c3 & T2)443 goto bad;444 wc = ((((((c0 & Mask4) << Bitx) |445 c1) << Bitx) | c2) << Bitx) |446 c3;447 if(wc <= Wchar3)448 goto bad;449 *p = wc;450 return 4;451 }452 /* 3 bytes */453 wc = ((((c0 & Mask3) << Bitx) |454 c1) << Bitx) | c2;455 if(wc <= Wchar2)456 goto bad;457 *p = wc;458 return 3;459 }460 if(c0 >= T2) {461 /* 2 bytes */462 if(n < 2)463 goto badlen;464 c1 = us[1] ^ Tx;465 if(c1 & T2)466 goto bad;467 wc = ((c0 & Mask2) << Bitx) |468 c1;469 if(wc <= Wchar1)470 goto bad;471 *p = wc;472 return 2;473 }474 /* 1 byte */475 if(c0 >= Tx)476 goto bad;477 *p = c0;478 return 1;480 bad:481 errno = EILSEQ;482 return -1;483 badlen:484 return -2;485 }