Blob


1 #ifdef PLAN9
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #ifdef PLAN9PORT
6 #include <errno.h>
7 #else
8 extern int errno;
9 #define EILSEQ 123
10 #endif
11 #else
12 #include <sys/types.h>
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <string.h>
16 #include <unistd.h>
17 #include <errno.h>
18 #include "plan9.h"
19 #endif
20 #include "hdr.h"
22 /*
23 the our_* routines are implementations for the corresponding library
24 routines. for a while, i tried to actually name them wctomb etc
25 but stopped that after i found a system which made wchar_t an
26 unsigned char.
27 */
29 int our_wctomb(char *s, unsigned long wc);
30 int our_mbtowc(unsigned long *p, char *s, unsigned n);
31 int runetoisoutf(char *str, Rune *rune);
32 int fullisorune(char *str, int n);
33 int isochartorune(Rune *rune, char *str);
35 void
36 utf_in(int fd, long *notused, struct convert *out)
37 {
38 char buf[N];
39 int i, j, c, n, tot;
40 ulong l;
42 USED(notused);
43 tot = 0;
44 while((n = read(fd, buf+tot, N-tot)) >= 0){
45 tot += n;
46 for(i=j=0; i<tot; ){
47 c = our_mbtowc(&l, buf+i, tot-i);
48 if(c == -2)
49 break;
50 if(c == -1){
51 if(squawk)
52 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
53 if(clean)
54 continue;
55 nerrors++;
56 l = Runeerror;
57 c = 1;
58 }
59 runes[j++] = l;
60 i += c;
61 }
62 OUT(out, runes, j);
63 tot -= i;
64 ninput += i;
65 if(tot)
66 memmove(buf, buf+i, tot);
67 if(n == 0)
68 break;
69 }
70 }
72 void
73 utf_out(Rune *base, int n, long *notused)
74 {
75 char *p;
76 Rune *r;
78 USED(notused);
79 nrunes += n;
80 for(r = base, p = obuf; n-- > 0; r++){
81 p += our_wctomb(p, *r);
82 }
83 noutput += p-obuf;
84 write(1, obuf, p-obuf);
85 }
87 void
88 isoutf_in(int fd, long *notused, struct convert *out)
89 {
90 char buf[N];
91 int i, j, c, n, tot;
93 USED(notused);
94 tot = 0;
95 while((n = read(fd, buf+tot, N-tot)) >= 0){
96 tot += n;
97 for(i=j=0; i<tot; ){
98 if(!fullisorune(buf+i, tot-i))
99 break;
100 c = isochartorune(&runes[j], buf+i);
101 if(runes[j] == Runeerror){
102 if(squawk)
103 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
104 if(clean)
105 continue;
106 nerrors++;
108 j++;
109 i += c;
111 OUT(out, runes, j);
112 tot -= i;
113 ninput += i;
114 if(tot)
115 memmove(buf, buf+i, tot);
116 if(n == 0)
117 break;
121 void
122 isoutf_out(Rune *base, int n, long *notused)
124 char *p;
125 Rune *r;
127 USED(notused);
128 nrunes += n;
129 for(r = base, p = obuf; n-- > 0; r++)
130 p += runetoisoutf(p, r);
131 noutput += p-obuf;
132 write(1, obuf, p-obuf);
136 enum
138 Char1 = Runeself, Rune1 = Runeself,
139 Char21 = 0xA1, Rune21 = 0x0100,
140 Char22 = 0xF6, Rune22 = 0x4016,
141 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
142 Esc = 0xBE, Bad = Runeerror
143 };
145 static uchar U[256];
146 static uchar T[256];
148 static
149 void
150 mktable(void)
152 int i, u;
154 for(i=0; i<256; i++) {
155 u = i + (0x5E - 0xA0);
156 if(i < 0xA0)
157 u = i + (0xDF - 0x7F);
158 if(i < 0x7F)
159 u = i + (0x00 - 0x21);
160 if(i < 0x21)
161 u = i + (0xBE - 0x00);
162 U[i] = u;
163 T[u] = i;
167 int
168 isochartorune(Rune *rune, char *str)
170 int c, c1, c2;
171 long l;
173 if(U[0] == 0)
174 mktable();
176 /*
177 * one character sequence
178 * 00000-0009F => 00-9F
179 */
180 c = *(uchar*)str;
181 if(c < Char1) {
182 *rune = c;
183 return 1;
186 /*
187 * two character sequence
188 * 000A0-000FF => A0; A0-FF
189 */
190 c1 = *(uchar*)(str+1);
191 if(c < Char21) {
192 if(c1 >= Rune1 && c1 < Rune21) {
193 *rune = c1;
194 return 2;
196 goto bad;
199 /*
200 * two character sequence
201 * 00100-04015 => A1-F5; 21-7E/A0-FF
202 */
203 c1 = U[c1];
204 if(c1 >= Esc)
205 goto bad;
206 if(c < Char22) {
207 *rune = (c-Char21)*Esc + c1 + Rune21;
208 return 2;
211 /*
212 * three character sequence
213 * 04016-38E2D => A6-FB; 21-7E/A0-FF
214 */
215 c2 = U[*(uchar*)(str+2)];
216 if(c2 >= Esc)
217 goto bad;
218 if(c < Char3) {
219 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
220 if(l >= Rune3)
221 goto bad;
222 *rune = l;
223 return 3;
226 /*
227 * bad decoding
228 */
229 bad:
230 *rune = Bad;
231 return 1;
234 int
235 runetoisoutf(char *str, Rune *rune)
237 long c;
239 if(T[0] == 0)
240 mktable();
242 /*
243 * one character sequence
244 * 00000-0009F => 00-9F
245 */
246 c = *rune;
247 if(c < Rune1) {
248 str[0] = c;
249 return 1;
252 /*
253 * two character sequence
254 * 000A0-000FF => A0; A0-FF
255 */
256 if(c < Rune21) {
257 str[0] = (char)Char1;
258 str[1] = c;
259 return 2;
262 /*
263 * two character sequence
264 * 00100-04015 => A1-F5; 21-7E/A0-FF
265 */
266 if(c < Rune22) {
267 c -= Rune21;
268 str[0] = c/Esc + Char21;
269 str[1] = T[c%Esc];
270 return 2;
273 /*
274 * three character sequence
275 * 04016-38E2D => A6-FB; 21-7E/A0-FF
276 */
277 c -= Rune22;
278 str[0] = c/(Esc*Esc) + Char22;
279 str[1] = T[c/Esc%Esc];
280 str[2] = T[c%Esc];
281 return 3;
284 int
285 fullisorune(char *str, int n)
287 int c;
289 if(n > 0) {
290 c = *(uchar*)str;
291 if(c < Char1)
292 return 1;
293 if(n > 1)
294 if(c < Char22 || n > 2)
295 return 1;
297 return 0;
/*
 * Constants for the UTF codec (our_wctomb, our_mbtowc).
 *
 *	Tn	tag bits of the lead byte of an n-byte sequence
 *		(Tx: tag of a continuation byte)
 *	Bitn	number of payload bits in that byte
 *	Maskn	mask selecting those payload bits
 *	Wcharn	largest value that fits in an n-byte sequence
 *
 * Bit6 is 1: a T6 lead byte is 1111110x and carries a single payload
 * bit.  A two-bit mask would let the encoder emit 0xFE/0xFF as lead
 * bytes and let the decoder shift a two-bit value up into bit 31.
 */
enum
{
	T1	= 0x00,
	Tx	= 0x80,
	T2	= 0xC0,
	T3	= 0xE0,
	T4	= 0xF0,
	T5	= 0xF8,
	T6	= 0xFC,

	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 1,

	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,

	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1
};
333 int
334 our_wctomb(char *s, unsigned long wc)
336 if(s == 0)
337 return 0; /* no shift states */
338 if(wc & ~Wchar2) {
339 if(wc & ~Wchar4) {
340 if(wc & ~Wchar5) {
341 /* 6 bytes */
342 s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
343 s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
344 s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
345 s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
346 s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
347 s[5] = Tx | (wc & Maskx);
348 return 6;
350 /* 5 bytes */
351 s[0] = T5 | (wc >> 4*Bitx);
352 s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
353 s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
354 s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
355 s[4] = Tx | (wc & Maskx);
356 return 5;
358 if(wc & ~Wchar3) {
359 /* 4 bytes */
360 s[0] = T4 | (wc >> 3*Bitx);
361 s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
362 s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
363 s[3] = Tx | (wc & Maskx);
364 return 4;
366 /* 3 bytes */
367 s[0] = T3 | (wc >> 2*Bitx);
368 s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
369 s[2] = Tx | (wc & Maskx);
370 return 3;
372 if(wc & ~Wchar1) {
373 /* 2 bytes */
374 s[0] = T2 | (wc >> 1*Bitx);
375 s[1] = Tx | (wc & Maskx);
376 return 2;
378 /* 1 byte */
379 s[0] = T1 | wc;
380 return 1;
383 int
384 our_mbtowc(unsigned long *p, char *s, unsigned n)
386 uchar *us;
387 int c0, c1, c2, c3, c4, c5;
388 unsigned long wc;
390 if(s == 0)
391 return 0; /* no shift states */
393 if(n < 1)
394 goto badlen;
395 us = (uchar*)s;
396 c0 = us[0];
397 if(c0 >= T3) {
398 if(n < 3)
399 goto badlen;
400 c1 = us[1] ^ Tx;
401 c2 = us[2] ^ Tx;
402 if((c1|c2) & T2)
403 goto bad;
404 if(c0 >= T5) {
405 if(n < 5)
406 goto badlen;
407 c3 = us[3] ^ Tx;
408 c4 = us[4] ^ Tx;
409 if((c3|c4) & T2)
410 goto bad;
411 if(c0 >= T6) {
412 /* 6 bytes */
413 if(n < 6)
414 goto badlen;
415 c5 = us[5] ^ Tx;
416 if(c5 & T2)
417 goto bad;
418 wc = ((((((((((c0 & Mask6) << Bitx) |
419 c1) << Bitx) | c2) << Bitx) |
420 c3) << Bitx) | c4) << Bitx) | c5;
421 if(wc <= Wchar5)
422 goto bad;
423 *p = wc;
424 return 6;
426 /* 5 bytes */
427 wc = ((((((((c0 & Mask5) << Bitx) |
428 c1) << Bitx) | c2) << Bitx) |
429 c3) << Bitx) | c4;
430 if(wc <= Wchar4)
431 goto bad;
432 *p = wc;
433 return 5;
435 if(c0 >= T4) {
436 /* 4 bytes */
437 if(n < 4)
438 goto badlen;
439 c3 = us[3] ^ Tx;
440 if(c3 & T2)
441 goto bad;
442 wc = ((((((c0 & Mask4) << Bitx) |
443 c1) << Bitx) | c2) << Bitx) |
444 c3;
445 if(wc <= Wchar3)
446 goto bad;
447 *p = wc;
448 return 4;
450 /* 3 bytes */
451 wc = ((((c0 & Mask3) << Bitx) |
452 c1) << Bitx) | c2;
453 if(wc <= Wchar2)
454 goto bad;
455 *p = wc;
456 return 3;
458 if(c0 >= T2) {
459 /* 2 bytes */
460 if(n < 2)
461 goto badlen;
462 c1 = us[1] ^ Tx;
463 if(c1 & T2)
464 goto bad;
465 wc = ((c0 & Mask2) << Bitx) |
466 c1;
467 if(wc <= Wchar1)
468 goto bad;
469 *p = wc;
470 return 2;
472 /* 1 byte */
473 if(c0 >= Tx)
474 goto bad;
475 *p = c0;
476 return 1;
478 bad:
479 errno = EILSEQ;
480 return -1;
481 badlen:
482 return -2;