Blob
1 #ifdef PLAN92 #include <u.h>3 #include <libc.h>4 #include <bio.h>5 #ifdef PLAN9PORT6 #include <errno.h>7 #else8 extern int errno;9 #endif10 #else11 #include <sys/types.h>12 #include <stdio.h>13 #include <stdlib.h>14 #include <string.h>15 #include <unistd.h>16 #include <errno.h>17 #include "plan9.h"18 #endif19 #include "hdr.h"20 #ifndef EILSEQ21 #define EILSEQ 999822 #endif24 /*25 the our_* routines are implementations for the corresponding library26 routines. for a while, i tried to actually name them wctomb etc27 but stopped that after i found a system which made wchar_t an28 unsigned char.29 */31 int our_wctomb(char *s, unsigned long wc);32 int our_mbtowc(unsigned long *p, char *s, unsigned n);33 int runetoisoutf(char *str, Rune *rune);34 int fullisorune(char *str, int n);35 int isochartorune(Rune *rune, char *str);37 void38 utf_in(int fd, long *notused, struct convert *out)39 {40 char buf[N];41 int i, j, c, n, tot;42 ulong l;44 USED(notused);45 tot = 0;46 while((n = read(fd, buf+tot, N-tot)) >= 0){47 tot += n;48 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){49 c = our_mbtowc(&l, buf+i, tot-i);50 if(c == -1){51 if(squawk)52 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);53 if(clean){54 i++;55 continue;56 }57 nerrors++;58 l = Runeerror;59 c = 1;60 }61 runes[j++] = l;62 i += c;63 }64 OUT(out, runes, j);65 tot -= i;66 ninput += i;67 if(tot)68 memmove(buf, buf+i, tot);69 if(n == 0)70 break;71 }72 OUT(out, runes, 0);73 }75 void76 utf_out(Rune *base, int n, long *notused)77 {78 char *p;79 Rune *r;81 USED(notused);82 nrunes += n;83 for(r = base, p = obuf; n-- > 0; r++){84 p += our_wctomb(p, *r);85 }86 noutput += p-obuf;87 write(1, obuf, p-obuf);88 }90 void91 isoutf_in(int fd, long *notused, struct convert *out)92 {93 char buf[N];94 int i, j, c, n, tot;96 USED(notused);97 tot = 0;98 while((n = read(fd, buf+tot, N-tot)) >= 0){99 tot += n;100 for(i=j=0; i<tot; ){101 if(!fullisorune(buf+i, tot-i))102 break;103 c = isochartorune(&runes[j], buf+i);104 if(runes[j] == Runeerror && c == 1){105 if(squawk)106 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);107 if(clean){108 i++;109 continue;110 }111 nerrors++;112 }113 j++;114 i += c;115 }116 OUT(out, runes, j);117 tot -= i;118 ninput += i;119 if(tot)120 memmove(buf, buf+i, tot);121 if(n == 0)122 break;123 }124 OUT(out, runes, 0);125 }127 void128 isoutf_out(Rune *base, int n, long *notused)129 {130 char *p;131 Rune *r;133 USED(notused);134 nrunes += n;135 for(r = base, p = obuf; n-- > 0; r++)136 p += runetoisoutf(p, r);137 noutput += p-obuf;138 write(1, obuf, p-obuf);139 }142 enum143 {144 Char1 = Runeself, Rune1 = Runeself,145 Char21 = 0xA1, Rune21 = 0x0100,146 Char22 = 0xF6, Rune22 = 0x4016,147 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */148 Esc = 0xBE, Bad = Runeerror149 };151 static uchar U[256];152 static uchar T[256];154 static155 void156 mktable(void)157 {158 int i, u;160 for(i=0; i<256; i++) {161 u = i + (0x5E - 0xA0);162 if(i < 0xA0)163 u = i + (0xDF - 0x7F);164 if(i < 0x7F)165 u = i + (0x00 - 0x21);166 if(i < 0x21)167 u = i + (0xBE - 0x00);168 U[i] = u;169 T[u] = i;170 }171 }173 int174 isochartorune(Rune *rune, char *str)175 {176 int c, c1, c2;177 long l;179 if(U[0] == 0)180 mktable();182 /*183 * one character sequence184 * 00000-0009F => 00-9F185 */186 c = *(uchar*)str;187 if(c < Char1) {188 *rune = c;189 return 1;190 }192 /*193 * two character sequence194 * 000A0-000FF => A0; A0-FF195 */196 c1 = *(uchar*)(str+1);197 if(c < Char21) {198 if(c1 >= Rune1 && c1 < Rune21) {199 *rune = c1;200 return 2;201 }202 goto bad;203 }205 /*206 * two character sequence207 * 00100-04015 => A1-F5; 21-7E/A0-FF208 */209 c1 = U[c1];210 if(c1 >= Esc)211 goto bad;212 if(c < Char22) {213 *rune = (c-Char21)*Esc + c1 + Rune21;214 return 2;215 }217 /*218 * three character sequence219 * 04016-38E2D => A6-FB; 21-7E/A0-FF220 */221 c2 = U[*(uchar*)(str+2)];222 if(c2 >= Esc)223 goto bad;224 if(c < Char3) {225 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;226 if(l >= Rune3)227 goto bad;228 *rune = l;229 return 3;230 }232 /*233 * bad decoding234 */235 bad:236 *rune = Bad;237 return 1;238 }240 int241 runetoisoutf(char *str, Rune *rune)242 {243 long c;245 if(T[0] == 0)246 mktable();248 /*249 * one character sequence250 * 00000-0009F => 00-9F251 */252 c = *rune;253 if(c < Rune1) {254 str[0] = c;255 return 1;256 }258 /*259 * two character sequence260 * 000A0-000FF => A0; A0-FF261 */262 if(c < Rune21) {263 str[0] = (char)Char1;264 str[1] = c;265 return 2;266 }268 /*269 * two character sequence270 * 00100-04015 => A1-F5; 21-7E/A0-FF271 */272 if(c < Rune22) {273 c -= Rune21;274 str[0] = c/Esc + Char21;275 str[1] = T[c%Esc];276 return 2;277 }279 /*280 * three character sequence281 * 04016-38E2D => A6-FB; 21-7E/A0-FF282 */283 c -= Rune22;284 str[0] = c/(Esc*Esc) + Char22;285 str[1] = T[c/Esc%Esc];286 str[2] = T[c%Esc];287 return 3;288 }290 int291 fullisorune(char *str, int n)292 {293 int c;295 if(n > 0) {296 c = *(uchar*)str;297 if(c < Char1)298 return 1;299 if(n > 1)300 if(c < Char22 || n > 2)301 return 1;302 }303 return 0;304 }306 enum307 {308 T1 = 0x00,309 Tx = 0x80,310 T2 = 0xC0,311 T3 = 0xE0,312 T4 = 0xF0,313 T5 = 0xF8,314 T6 = 0xFC,316 Bit1 = 7,317 Bitx = 6,318 Bit2 = 5,319 Bit3 = 4,320 Bit4 = 3,321 Bit5 = 2,322 Bit6 = 2,324 Mask1 = (1<<Bit1)-1,325 Maskx = (1<<Bitx)-1,326 Mask2 = (1<<Bit2)-1,327 Mask3 = (1<<Bit3)-1,328 Mask4 = (1<<Bit4)-1,329 Mask5 = (1<<Bit5)-1,330 Mask6 = (1<<Bit6)-1,332 Wchar1 = (1UL<<Bit1)-1,333 Wchar2 = (1UL<<(Bit2+Bitx))-1,334 Wchar3 = (1UL<<(Bit3+2*Bitx))-1,335 Wchar4 = (1UL<<(Bit4+3*Bitx))-1,336 Wchar5 = (1UL<<(Bit5+4*Bitx))-1337 };339 int340 our_wctomb(char *s, unsigned long wc)341 {342 if(s == 0)343 return 0; /* no shift states */344 if(wc & ~Wchar2) {345 if(wc & ~Wchar4) {346 if(wc & ~Wchar5) {347 /* 6 bytes */348 s[0] = T6 | ((wc >> 5*Bitx) & Mask6);349 s[1] = Tx | ((wc >> 4*Bitx) & Maskx);350 s[2] = Tx | ((wc >> 3*Bitx) & Maskx);351 s[3] = Tx | ((wc >> 2*Bitx) & Maskx);352 s[4] = Tx | ((wc >> 1*Bitx) & Maskx);353 s[5] = Tx | (wc & Maskx);354 return 6;355 }356 /* 5 bytes */357 s[0] = T5 | (wc >> 4*Bitx);358 s[1] = Tx | ((wc >> 3*Bitx) & Maskx);359 s[2] = Tx | ((wc >> 2*Bitx) & Maskx);360 s[3] = Tx | ((wc >> 1*Bitx) & Maskx);361 s[4] = Tx | (wc & Maskx);362 return 5;363 }364 if(wc & ~Wchar3) {365 /* 4 bytes */366 s[0] = T4 | (wc >> 3*Bitx);367 s[1] = Tx | ((wc >> 2*Bitx) & Maskx);368 s[2] = Tx | ((wc >> 1*Bitx) & Maskx);369 s[3] = Tx | (wc & Maskx);370 return 4;371 }372 /* 3 bytes */373 s[0] = T3 | (wc >> 2*Bitx);374 s[1] = Tx | ((wc >> 1*Bitx) & Maskx);375 s[2] = Tx | (wc & Maskx);376 return 3;377 }378 if(wc & ~Wchar1) {379 /* 2 bytes */380 s[0] = T2 | (wc >> 1*Bitx);381 s[1] = Tx | (wc & Maskx);382 return 2;383 }384 /* 1 byte */385 s[0] = T1 | wc;386 return 1;387 }389 int390 our_mbtowc(unsigned long *p, char *s, unsigned n)391 {392 uchar *us;393 int c0, c1, c2, c3, c4, c5;394 unsigned long wc;396 if(s == 0)397 return 0; /* no shift states */399 if(n < 1)400 goto bad;401 us = (uchar*)s;402 c0 = us[0];403 if(c0 >= T3) {404 if(n < 3)405 goto bad;406 c1 = us[1] ^ Tx;407 c2 = us[2] ^ Tx;408 if((c1|c2) & T2)409 goto bad;410 if(c0 >= T5) {411 if(n < 5)412 goto bad;413 c3 = us[3] ^ Tx;414 c4 = us[4] ^ Tx;415 if((c3|c4) & T2)416 goto bad;417 if(c0 >= T6) {418 /* 6 bytes */419 if(n < 6)420 goto bad;421 c5 = us[5] ^ Tx;422 if(c5 & T2)423 goto bad;424 wc = ((((((((((c0 & Mask6) << Bitx) |425 c1) << Bitx) | c2) << Bitx) |426 c3) << Bitx) | c4) << Bitx) | c5;427 if(wc <= Wchar5)428 goto bad;429 *p = wc;430 return 6;431 }432 /* 5 bytes */433 wc = ((((((((c0 & Mask5) << Bitx) |434 c1) << Bitx) | c2) << Bitx) |435 c3) << Bitx) | c4;436 if(wc <= Wchar4)437 goto bad;438 *p = wc;439 return 5;440 }441 if(c0 >= T4) {442 /* 4 bytes */443 if(n < 4)444 goto bad;445 c3 = us[3] ^ Tx;446 if(c3 & T2)447 goto bad;448 wc = ((((((c0 & Mask4) << Bitx) |449 c1) << Bitx) | c2) << Bitx) |450 c3;451 if(wc <= Wchar3)452 goto bad;453 *p = wc;454 return 4;455 }456 /* 3 bytes */457 wc = ((((c0 & Mask3) << Bitx) |458 c1) << Bitx) | c2;459 if(wc <= Wchar2)460 goto bad;461 *p = wc;462 return 3;463 }464 if(c0 >= T2) {465 /* 2 bytes */466 if(n < 2)467 goto bad;468 c1 = us[1] ^ Tx;469 if(c1 & T2)470 goto bad;471 wc = ((c0 & Mask2) << Bitx) |472 c1;473 if(wc <= Wchar1)474 goto bad;475 *p = wc;476 return 2;477 }478 /* 1 byte */479 if(c0 >= Tx)480 goto bad;481 *p = c0;482 return 1;484 bad:485 errno = EILSEQ;486 return -1;487 }