Blob


1 #ifdef PLAN9
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #ifdef PLAN9PORT
6 #include <errno.h>
7 #else
8 extern int errno;
9 #endif
10 #else
11 #include <sys/types.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include "plan9.h"
18 #endif
19 #include "hdr.h"
20 #ifndef EILSEQ
21 #define EILSEQ 9998
22 #endif
24 /*
25 the our_* routines are implementations for the corresponding library
26 routines. for a while, i tried to actually name them wctomb etc
27 but stopped that after i found a system which made wchar_t an
28 unsigned char.
29 */
31 int our_wctomb(char *s, unsigned long wc);
32 int our_mbtowc(unsigned long *p, char *s, unsigned n);
33 int runetoisoutf(char *str, Rune *rune);
34 int fullisorune(char *str, int n);
35 int isochartorune(Rune *rune, char *str);
37 void
38 utf_in(int fd, long *notused, struct convert *out)
39 {
40 char buf[N];
41 int i, j, c, n, tot;
42 ulong l;
44 USED(notused);
45 tot = 0;
46 while((n = read(fd, buf+tot, N-tot)) >= 0){
47 tot += n;
48 for(i=j=0; i<tot; ){
49 c = our_mbtowc(&l, buf+i, tot-i);
50 if(c == -2)
51 break;
52 if(c == -1){
53 if(squawk)
54 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
55 if(clean)
56 continue;
57 nerrors++;
58 l = Runeerror;
59 c = 1;
60 }
61 runes[j++] = l;
62 i += c;
63 }
64 OUT(out, runes, j);
65 tot -= i;
66 ninput += i;
67 if(tot)
68 memmove(buf, buf+i, tot);
69 if(n == 0)
70 break;
71 }
72 }
74 void
75 utf_out(Rune *base, int n, long *notused)
76 {
77 char *p;
78 Rune *r;
80 USED(notused);
81 nrunes += n;
82 for(r = base, p = obuf; n-- > 0; r++){
83 p += our_wctomb(p, *r);
84 }
85 noutput += p-obuf;
86 write(1, obuf, p-obuf);
87 }
89 void
90 isoutf_in(int fd, long *notused, struct convert *out)
91 {
92 char buf[N];
93 int i, j, c, n, tot;
95 USED(notused);
96 tot = 0;
97 while((n = read(fd, buf+tot, N-tot)) >= 0){
98 tot += n;
99 for(i=j=0; i<tot; ){
100 if(!fullisorune(buf+i, tot-i))
101 break;
102 c = isochartorune(&runes[j], buf+i);
103 if(runes[j] == Runeerror){
104 if(squawk)
105 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
106 if(clean)
107 continue;
108 nerrors++;
110 j++;
111 i += c;
113 OUT(out, runes, j);
114 tot -= i;
115 ninput += i;
116 if(tot)
117 memmove(buf, buf+i, tot);
118 if(n == 0)
119 break;
123 void
124 isoutf_out(Rune *base, int n, long *notused)
126 char *p;
127 Rune *r;
129 USED(notused);
130 nrunes += n;
131 for(r = base, p = obuf; n-- > 0; r++)
132 p += runetoisoutf(p, r);
133 noutput += p-obuf;
134 write(1, obuf, p-obuf);
138 enum
140 Char1 = Runeself, Rune1 = Runeself,
141 Char21 = 0xA1, Rune21 = 0x0100,
142 Char22 = 0xF6, Rune22 = 0x4016,
143 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
144 Esc = 0xBE, Bad = Runeerror
145 };
147 static uchar U[256];
148 static uchar T[256];
150 static
151 void
152 mktable(void)
154 int i, u;
156 for(i=0; i<256; i++) {
157 u = i + (0x5E - 0xA0);
158 if(i < 0xA0)
159 u = i + (0xDF - 0x7F);
160 if(i < 0x7F)
161 u = i + (0x00 - 0x21);
162 if(i < 0x21)
163 u = i + (0xBE - 0x00);
164 U[i] = u;
165 T[u] = i;
169 int
170 isochartorune(Rune *rune, char *str)
172 int c, c1, c2;
173 long l;
175 if(U[0] == 0)
176 mktable();
178 /*
179 * one character sequence
180 * 00000-0009F => 00-9F
181 */
182 c = *(uchar*)str;
183 if(c < Char1) {
184 *rune = c;
185 return 1;
188 /*
189 * two character sequence
190 * 000A0-000FF => A0; A0-FF
191 */
192 c1 = *(uchar*)(str+1);
193 if(c < Char21) {
194 if(c1 >= Rune1 && c1 < Rune21) {
195 *rune = c1;
196 return 2;
198 goto bad;
201 /*
202 * two character sequence
203 * 00100-04015 => A1-F5; 21-7E/A0-FF
204 */
205 c1 = U[c1];
206 if(c1 >= Esc)
207 goto bad;
208 if(c < Char22) {
209 *rune = (c-Char21)*Esc + c1 + Rune21;
210 return 2;
213 /*
214 * three character sequence
215 * 04016-38E2D => A6-FB; 21-7E/A0-FF
216 */
217 c2 = U[*(uchar*)(str+2)];
218 if(c2 >= Esc)
219 goto bad;
220 if(c < Char3) {
221 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
222 if(l >= Rune3)
223 goto bad;
224 *rune = l;
225 return 3;
228 /*
229 * bad decoding
230 */
231 bad:
232 *rune = Bad;
233 return 1;
236 int
237 runetoisoutf(char *str, Rune *rune)
239 long c;
241 if(T[0] == 0)
242 mktable();
244 /*
245 * one character sequence
246 * 00000-0009F => 00-9F
247 */
248 c = *rune;
249 if(c < Rune1) {
250 str[0] = c;
251 return 1;
254 /*
255 * two character sequence
256 * 000A0-000FF => A0; A0-FF
257 */
258 if(c < Rune21) {
259 str[0] = (char)Char1;
260 str[1] = c;
261 return 2;
264 /*
265 * two character sequence
266 * 00100-04015 => A1-F5; 21-7E/A0-FF
267 */
268 if(c < Rune22) {
269 c -= Rune21;
270 str[0] = c/Esc + Char21;
271 str[1] = T[c%Esc];
272 return 2;
275 /*
276 * three character sequence
277 * 04016-38E2D => A6-FB; 21-7E/A0-FF
278 */
279 c -= Rune22;
280 str[0] = c/(Esc*Esc) + Char22;
281 str[1] = T[c/Esc%Esc];
282 str[2] = T[c%Esc];
283 return 3;
286 int
287 fullisorune(char *str, int n)
289 int c;
291 if(n > 0) {
292 c = *(uchar*)str;
293 if(c < Char1)
294 return 1;
295 if(n > 1)
296 if(c < Char22 || n > 2)
297 return 1;
299 return 0;
/*
 * constants for the UTF-8 codec, grouped by sequence length.
 *   TN     tag bits of the lead byte of an N-byte sequence
 *   BitN   payload bits taken from that lead byte
 *   MaskN  mask selecting those payload bits
 *   WcharN largest value that fits in an N-byte sequence
 * the x variants describe continuation bytes.
 */
enum
{
	/* continuation byte */
	Tx	= 0x80,
	Bitx	= 6,
	Maskx	= (1<<Bitx)-1,

	/* one byte */
	T1	= 0x00,
	Bit1	= 7,
	Mask1	= (1<<Bit1)-1,
	Wchar1	= (1UL<<Bit1)-1,

	/* two bytes */
	T2	= 0xC0,
	Bit2	= 5,
	Mask2	= (1<<Bit2)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,

	/* three bytes */
	T3	= 0xE0,
	Bit3	= 4,
	Mask3	= (1<<Bit3)-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,

	/* four bytes */
	T4	= 0xF0,
	Bit4	= 3,
	Mask4	= (1<<Bit4)-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,

	/* five bytes */
	T5	= 0xF8,
	Bit5	= 2,
	Mask5	= (1<<Bit5)-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1,

	/*
	 * six bytes.  two payload bits are taken from the lead byte, so
	 * lead bytes up to 0xFF can occur (presumably to cover 32-bit
	 * values -- confirm before changing).
	 */
	T6	= 0xFC,
	Bit6	= 2,
	Mask6	= (1<<Bit6)-1
};
335 int
336 our_wctomb(char *s, unsigned long wc)
338 if(s == 0)
339 return 0; /* no shift states */
340 if(wc & ~Wchar2) {
341 if(wc & ~Wchar4) {
342 if(wc & ~Wchar5) {
343 /* 6 bytes */
344 s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
345 s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
346 s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
347 s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
348 s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
349 s[5] = Tx | (wc & Maskx);
350 return 6;
352 /* 5 bytes */
353 s[0] = T5 | (wc >> 4*Bitx);
354 s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
355 s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
356 s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
357 s[4] = Tx | (wc & Maskx);
358 return 5;
360 if(wc & ~Wchar3) {
361 /* 4 bytes */
362 s[0] = T4 | (wc >> 3*Bitx);
363 s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
364 s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
365 s[3] = Tx | (wc & Maskx);
366 return 4;
368 /* 3 bytes */
369 s[0] = T3 | (wc >> 2*Bitx);
370 s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
371 s[2] = Tx | (wc & Maskx);
372 return 3;
374 if(wc & ~Wchar1) {
375 /* 2 bytes */
376 s[0] = T2 | (wc >> 1*Bitx);
377 s[1] = Tx | (wc & Maskx);
378 return 2;
380 /* 1 byte */
381 s[0] = T1 | wc;
382 return 1;
385 int
386 our_mbtowc(unsigned long *p, char *s, unsigned n)
388 uchar *us;
389 int c0, c1, c2, c3, c4, c5;
390 unsigned long wc;
392 if(s == 0)
393 return 0; /* no shift states */
395 if(n < 1)
396 goto badlen;
397 us = (uchar*)s;
398 c0 = us[0];
399 if(c0 >= T3) {
400 if(n < 3)
401 goto badlen;
402 c1 = us[1] ^ Tx;
403 c2 = us[2] ^ Tx;
404 if((c1|c2) & T2)
405 goto bad;
406 if(c0 >= T5) {
407 if(n < 5)
408 goto badlen;
409 c3 = us[3] ^ Tx;
410 c4 = us[4] ^ Tx;
411 if((c3|c4) & T2)
412 goto bad;
413 if(c0 >= T6) {
414 /* 6 bytes */
415 if(n < 6)
416 goto badlen;
417 c5 = us[5] ^ Tx;
418 if(c5 & T2)
419 goto bad;
420 wc = ((((((((((c0 & Mask6) << Bitx) |
421 c1) << Bitx) | c2) << Bitx) |
422 c3) << Bitx) | c4) << Bitx) | c5;
423 if(wc <= Wchar5)
424 goto bad;
425 *p = wc;
426 return 6;
428 /* 5 bytes */
429 wc = ((((((((c0 & Mask5) << Bitx) |
430 c1) << Bitx) | c2) << Bitx) |
431 c3) << Bitx) | c4;
432 if(wc <= Wchar4)
433 goto bad;
434 *p = wc;
435 return 5;
437 if(c0 >= T4) {
438 /* 4 bytes */
439 if(n < 4)
440 goto badlen;
441 c3 = us[3] ^ Tx;
442 if(c3 & T2)
443 goto bad;
444 wc = ((((((c0 & Mask4) << Bitx) |
445 c1) << Bitx) | c2) << Bitx) |
446 c3;
447 if(wc <= Wchar3)
448 goto bad;
449 *p = wc;
450 return 4;
452 /* 3 bytes */
453 wc = ((((c0 & Mask3) << Bitx) |
454 c1) << Bitx) | c2;
455 if(wc <= Wchar2)
456 goto bad;
457 *p = wc;
458 return 3;
460 if(c0 >= T2) {
461 /* 2 bytes */
462 if(n < 2)
463 goto badlen;
464 c1 = us[1] ^ Tx;
465 if(c1 & T2)
466 goto bad;
467 wc = ((c0 & Mask2) << Bitx) |
468 c1;
469 if(wc <= Wchar1)
470 goto bad;
471 *p = wc;
472 return 2;
474 /* 1 byte */
475 if(c0 >= Tx)
476 goto bad;
477 *p = c0;
478 return 1;
480 bad:
481 errno = EILSEQ;
482 return -1;
483 badlen:
484 return -2;