Blob
1 #ifdef PLAN92 #include <u.h>3 #include <libc.h>4 #include <bio.h>5 #ifdef PLAN9PORT6 #include <errno.h>7 #else8 extern int errno;9 #define EILSEQ 12310 #endif11 #else12 #include <sys/types.h>13 #include <stdio.h>14 #include <stdlib.h>15 #include <string.h>16 #include <unistd.h>17 #include <errno.h>18 #include "plan9.h"19 #endif20 #include "hdr.h"22 /*23 the our_* routines are implementations for the corresponding library24 routines. for a while, i tried to actually name them wctomb etc25 but stopped that after i found a system which made wchar_t an26 unsigned char.27 */29 int our_wctomb(char *s, unsigned long wc);30 int our_mbtowc(unsigned long *p, char *s, unsigned n);31 int runetoisoutf(char *str, Rune *rune);32 int fullisorune(char *str, int n);33 int isochartorune(Rune *rune, char *str);35 void36 utf_in(int fd, long *notused, struct convert *out)37 {38 char buf[N];39 int i, j, c, n, tot;40 ulong l;42 USED(notused);43 tot = 0;44 while((n = read(fd, buf+tot, N-tot)) >= 0){45 tot += n;46 for(i=j=0; i<tot; ){47 c = our_mbtowc(&l, buf+i, tot-i);48 if(c == -2)49 break;50 if(c == -1){51 if(squawk)52 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);53 if(clean)54 continue;55 nerrors++;56 l = Runeerror;57 c = 1;58 }59 runes[j++] = l;60 i += c;61 }62 OUT(out, runes, j);63 tot -= i;64 ninput += i;65 if(tot)66 memmove(buf, buf+i, tot);67 if(n == 0)68 break;69 }70 }72 void73 utf_out(Rune *base, int n, long *notused)74 {75 char *p;76 Rune *r;78 USED(notused);79 nrunes += n;80 for(r = base, p = obuf; n-- > 0; r++){81 p += our_wctomb(p, *r);82 }83 noutput += p-obuf;84 write(1, obuf, p-obuf);85 }87 void88 isoutf_in(int fd, long *notused, struct convert *out)89 {90 char buf[N];91 int i, j, c, n, tot;93 USED(notused);94 tot = 0;95 while((n = read(fd, buf+tot, N-tot)) >= 0){96 tot += n;97 for(i=j=0; i<tot; ){98 if(!fullisorune(buf+i, tot-i))99 break;100 c = isochartorune(&runes[j], buf+i);101 if(runes[j] == Runeerror){102 if(squawk)103 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);104 if(clean)105 continue;106 nerrors++;107 }108 j++;109 i += c;110 }111 OUT(out, runes, j);112 tot -= i;113 ninput += i;114 if(tot)115 memmove(buf, buf+i, tot);116 if(n == 0)117 break;118 }119 }121 void122 isoutf_out(Rune *base, int n, long *notused)123 {124 char *p;125 Rune *r;127 USED(notused);128 nrunes += n;129 for(r = base, p = obuf; n-- > 0; r++)130 p += runetoisoutf(p, r);131 noutput += p-obuf;132 write(1, obuf, p-obuf);133 }136 enum137 {138 Char1 = Runeself, Rune1 = Runeself,139 Char21 = 0xA1, Rune21 = 0x0100,140 Char22 = 0xF6, Rune22 = 0x4016,141 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */142 Esc = 0xBE, Bad = Runeerror143 };145 static uchar U[256];146 static uchar T[256];148 static149 void150 mktable(void)151 {152 int i, u;154 for(i=0; i<256; i++) {155 u = i + (0x5E - 0xA0);156 if(i < 0xA0)157 u = i + (0xDF - 0x7F);158 if(i < 0x7F)159 u = i + (0x00 - 0x21);160 if(i < 0x21)161 u = i + (0xBE - 0x00);162 U[i] = u;163 T[u] = i;164 }165 }167 int168 isochartorune(Rune *rune, char *str)169 {170 int c, c1, c2;171 long l;173 if(U[0] == 0)174 mktable();176 /*177 * one character sequence178 * 00000-0009F => 00-9F179 */180 c = *(uchar*)str;181 if(c < Char1) {182 *rune = c;183 return 1;184 }186 /*187 * two character sequence188 * 000A0-000FF => A0; A0-FF189 */190 c1 = *(uchar*)(str+1);191 if(c < Char21) {192 if(c1 >= Rune1 && c1 < Rune21) {193 *rune = c1;194 return 2;195 }196 goto bad;197 }199 /*200 * two character sequence201 * 00100-04015 => A1-F5; 21-7E/A0-FF202 */203 c1 = U[c1];204 if(c1 >= Esc)205 goto bad;206 if(c < Char22) {207 *rune = (c-Char21)*Esc + c1 + Rune21;208 return 2;209 }211 /*212 * three character sequence213 * 04016-38E2D => A6-FB; 21-7E/A0-FF214 */215 c2 = U[*(uchar*)(str+2)];216 if(c2 >= Esc)217 goto bad;218 if(c < Char3) {219 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;220 if(l >= Rune3)221 goto bad;222 *rune = l;223 return 3;224 }226 /*227 * bad decoding228 */229 bad:230 *rune = Bad;231 return 1;232 }234 int235 runetoisoutf(char *str, Rune *rune)236 {237 long c;239 if(T[0] == 0)240 mktable();242 /*243 * one character sequence244 * 00000-0009F => 00-9F245 */246 c = *rune;247 if(c < Rune1) {248 str[0] = c;249 return 1;250 }252 /*253 * two character sequence254 * 000A0-000FF => A0; A0-FF255 */256 if(c < Rune21) {257 str[0] = (char)Char1;258 str[1] = c;259 return 2;260 }262 /*263 * two character sequence264 * 00100-04015 => A1-F5; 21-7E/A0-FF265 */266 if(c < Rune22) {267 c -= Rune21;268 str[0] = c/Esc + Char21;269 str[1] = T[c%Esc];270 return 2;271 }273 /*274 * three character sequence275 * 04016-38E2D => A6-FB; 21-7E/A0-FF276 */277 c -= Rune22;278 str[0] = c/(Esc*Esc) + Char22;279 str[1] = T[c/Esc%Esc];280 str[2] = T[c%Esc];281 return 3;282 }284 int285 fullisorune(char *str, int n)286 {287 int c;289 if(n > 0) {290 c = *(uchar*)str;291 if(c < Char1)292 return 1;293 if(n > 1)294 if(c < Char22 || n > 2)295 return 1;296 }297 return 0;298 }300 enum301 {302 T1 = 0x00,303 Tx = 0x80,304 T2 = 0xC0,305 T3 = 0xE0,306 T4 = 0xF0,307 T5 = 0xF8,308 T6 = 0xFC,310 Bit1 = 7,311 Bitx = 6,312 Bit2 = 5,313 Bit3 = 4,314 Bit4 = 3,315 Bit5 = 2,316 Bit6 = 2,318 Mask1 = (1<<Bit1)-1,319 Maskx = (1<<Bitx)-1,320 Mask2 = (1<<Bit2)-1,321 Mask3 = (1<<Bit3)-1,322 Mask4 = (1<<Bit4)-1,323 Mask5 = (1<<Bit5)-1,324 Mask6 = (1<<Bit6)-1,326 Wchar1 = (1UL<<Bit1)-1,327 Wchar2 = (1UL<<(Bit2+Bitx))-1,328 Wchar3 = (1UL<<(Bit3+2*Bitx))-1,329 Wchar4 = (1UL<<(Bit4+3*Bitx))-1,330 Wchar5 = (1UL<<(Bit5+4*Bitx))-1331 };333 int334 our_wctomb(char *s, unsigned long wc)335 {336 if(s == 0)337 return 0; /* no shift states */338 if(wc & ~Wchar2) {339 if(wc & ~Wchar4) {340 if(wc & ~Wchar5) {341 /* 6 bytes */342 s[0] = T6 | ((wc >> 5*Bitx) & Mask6);343 s[1] = Tx | ((wc >> 4*Bitx) & Maskx);344 s[2] = Tx | ((wc >> 3*Bitx) & Maskx);345 s[3] = Tx | ((wc >> 2*Bitx) & Maskx);346 s[4] = Tx | ((wc >> 1*Bitx) & Maskx);347 s[5] = Tx | (wc & Maskx);348 return 6;349 }350 /* 5 bytes */351 s[0] = T5 | (wc >> 4*Bitx);352 s[1] = Tx | ((wc >> 3*Bitx) & Maskx);353 s[2] = Tx | ((wc >> 2*Bitx) & Maskx);354 s[3] = Tx | ((wc >> 1*Bitx) & Maskx);355 s[4] = Tx | (wc & Maskx);356 return 5;357 }358 if(wc & ~Wchar3) {359 /* 4 bytes */360 s[0] = T4 | (wc >> 3*Bitx);361 s[1] = Tx | ((wc >> 2*Bitx) & Maskx);362 s[2] = Tx | ((wc >> 1*Bitx) & Maskx);363 s[3] = Tx | (wc & Maskx);364 return 4;365 }366 /* 3 bytes */367 s[0] = T3 | (wc >> 2*Bitx);368 s[1] = Tx | ((wc >> 1*Bitx) & Maskx);369 s[2] = Tx | (wc & Maskx);370 return 3;371 }372 if(wc & ~Wchar1) {373 /* 2 bytes */374 s[0] = T2 | (wc >> 1*Bitx);375 s[1] = Tx | (wc & Maskx);376 return 2;377 }378 /* 1 byte */379 s[0] = T1 | wc;380 return 1;381 }383 int384 our_mbtowc(unsigned long *p, char *s, unsigned n)385 {386 uchar *us;387 int c0, c1, c2, c3, c4, c5;388 unsigned long wc;390 if(s == 0)391 return 0; /* no shift states */393 if(n < 1)394 goto badlen;395 us = (uchar*)s;396 c0 = us[0];397 if(c0 >= T3) {398 if(n < 3)399 goto badlen;400 c1 = us[1] ^ Tx;401 c2 = us[2] ^ Tx;402 if((c1|c2) & T2)403 goto bad;404 if(c0 >= T5) {405 if(n < 5)406 goto badlen;407 c3 = us[3] ^ Tx;408 c4 = us[4] ^ Tx;409 if((c3|c4) & T2)410 goto bad;411 if(c0 >= T6) {412 /* 6 bytes */413 if(n < 6)414 goto badlen;415 c5 = us[5] ^ Tx;416 if(c5 & T2)417 goto bad;418 wc = ((((((((((c0 & Mask6) << Bitx) |419 c1) << Bitx) | c2) << Bitx) |420 c3) << Bitx) | c4) << Bitx) | c5;421 if(wc <= Wchar5)422 goto bad;423 *p = wc;424 return 6;425 }426 /* 5 bytes */427 wc = ((((((((c0 & Mask5) << Bitx) |428 c1) << Bitx) | c2) << Bitx) |429 c3) << Bitx) | c4;430 if(wc <= Wchar4)431 goto bad;432 *p = wc;433 return 5;434 }435 if(c0 >= T4) {436 /* 4 bytes */437 if(n < 4)438 goto badlen;439 c3 = us[3] ^ Tx;440 if(c3 & T2)441 goto bad;442 wc = ((((((c0 & Mask4) << Bitx) |443 c1) << Bitx) | c2) << Bitx) |444 c3;445 if(wc <= Wchar3)446 goto bad;447 *p = wc;448 return 4;449 }450 /* 3 bytes */451 wc = ((((c0 & Mask3) << Bitx) |452 c1) << Bitx) | c2;453 if(wc <= Wchar2)454 goto bad;455 *p = wc;456 return 3;457 }458 if(c0 >= T2) {459 /* 2 bytes */460 if(n < 2)461 goto badlen;462 c1 = us[1] ^ Tx;463 if(c1 & T2)464 goto bad;465 wc = ((c0 & Mask2) << Bitx) |466 c1;467 if(wc <= Wchar1)468 goto bad;469 *p = wc;470 return 2;471 }472 /* 1 byte */473 if(c0 >= Tx)474 goto bad;475 *p = c0;476 return 1;478 bad:479 errno = EILSEQ;480 return -1;481 badlen:482 return -2;483 }