Blob


1 #ifdef PLAN9
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #ifdef PLAN9PORT
6 #include <errno.h>
7 #else
8 extern int errno;
9 #endif
10 #else
11 #include <sys/types.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include "plan9.h"
18 #endif
19 #include "hdr.h"
20 #ifndef EILSEQ
21 #define EILSEQ 9998
22 #endif
24 /*
25 the our_* routines are implementations for the corresponding library
26 routines. for a while, i tried to actually name them wctomb etc
27 but stopped that after i found a system which made wchar_t an
28 unsigned char.
29 */
31 int our_wctomb(char *s, unsigned long wc);
32 int our_mbtowc(unsigned long *p, char *s, unsigned n);
33 int runetoisoutf(char *str, Rune *rune);
34 int fullisorune(char *str, int n);
35 int isochartorune(Rune *rune, char *str);
37 void
38 utf_in(int fd, long *notused, struct convert *out)
39 {
40 char buf[N];
41 int i, j, c, n, tot;
42 ulong l;
44 USED(notused);
45 tot = 0;
46 while((n = read(fd, buf+tot, N-tot)) >= 0){
47 tot += n;
48 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
49 c = our_mbtowc(&l, buf+i, tot-i);
50 if(c == -1){
51 if(squawk)
52 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
53 if(clean){
54 i++;
55 continue;
56 }
57 nerrors++;
58 l = Runeerror;
59 c = 1;
60 }
61 runes[j++] = l;
62 i += c;
63 }
64 OUT(out, runes, j);
65 tot -= i;
66 ninput += i;
67 if(tot)
68 memmove(buf, buf+i, tot);
69 if(n == 0)
70 break;
71 }
72 OUT(out, runes, 0);
73 }
75 void
76 utf_out(Rune *base, int n, long *notused)
77 {
78 char *p;
79 Rune *r;
81 USED(notused);
82 nrunes += n;
83 for(r = base, p = obuf; n-- > 0; r++){
84 p += our_wctomb(p, *r);
85 }
86 noutput += p-obuf;
87 write(1, obuf, p-obuf);
88 }
90 void
91 isoutf_in(int fd, long *notused, struct convert *out)
92 {
93 char buf[N];
94 int i, j, c, n, tot;
96 USED(notused);
97 tot = 0;
98 while((n = read(fd, buf+tot, N-tot)) >= 0){
99 tot += n;
100 for(i=j=0; i<tot; ){
101 if(!fullisorune(buf+i, tot-i))
102 break;
103 c = isochartorune(&runes[j], buf+i);
104 if(runes[j] == Runeerror && c == 1){
105 if(squawk)
106 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
107 if(clean){
108 i++;
109 continue;
111 nerrors++;
113 j++;
114 i += c;
116 OUT(out, runes, j);
117 tot -= i;
118 ninput += i;
119 if(tot)
120 memmove(buf, buf+i, tot);
121 if(n == 0)
122 break;
124 OUT(out, runes, 0);
127 void
128 isoutf_out(Rune *base, int n, long *notused)
130 char *p;
131 Rune *r;
133 USED(notused);
134 nrunes += n;
135 for(r = base, p = obuf; n-- > 0; r++)
136 p += runetoisoutf(p, r);
137 noutput += p-obuf;
138 write(1, obuf, p-obuf);
142 enum
144 Char1 = Runeself, Rune1 = Runeself,
145 Char21 = 0xA1, Rune21 = 0x0100,
146 Char22 = 0xF6, Rune22 = 0x4016,
147 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
148 Esc = 0xBE, Bad = Runeerror
149 };
151 static uchar U[256];
152 static uchar T[256];
154 static
155 void
156 mktable(void)
158 int i, u;
160 for(i=0; i<256; i++) {
161 u = i + (0x5E - 0xA0);
162 if(i < 0xA0)
163 u = i + (0xDF - 0x7F);
164 if(i < 0x7F)
165 u = i + (0x00 - 0x21);
166 if(i < 0x21)
167 u = i + (0xBE - 0x00);
168 U[i] = u;
169 T[u] = i;
173 int
174 isochartorune(Rune *rune, char *str)
176 int c, c1, c2;
177 long l;
179 if(U[0] == 0)
180 mktable();
182 /*
183 * one character sequence
184 * 00000-0009F => 00-9F
185 */
186 c = *(uchar*)str;
187 if(c < Char1) {
188 *rune = c;
189 return 1;
192 /*
193 * two character sequence
194 * 000A0-000FF => A0; A0-FF
195 */
196 c1 = *(uchar*)(str+1);
197 if(c < Char21) {
198 if(c1 >= Rune1 && c1 < Rune21) {
199 *rune = c1;
200 return 2;
202 goto bad;
205 /*
206 * two character sequence
207 * 00100-04015 => A1-F5; 21-7E/A0-FF
208 */
209 c1 = U[c1];
210 if(c1 >= Esc)
211 goto bad;
212 if(c < Char22) {
213 *rune = (c-Char21)*Esc + c1 + Rune21;
214 return 2;
217 /*
218 * three character sequence
219 * 04016-38E2D => A6-FB; 21-7E/A0-FF
220 */
221 c2 = U[*(uchar*)(str+2)];
222 if(c2 >= Esc)
223 goto bad;
224 if(c < Char3) {
225 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
226 if(l >= Rune3)
227 goto bad;
228 *rune = l;
229 return 3;
232 /*
233 * bad decoding
234 */
235 bad:
236 *rune = Bad;
237 return 1;
240 int
241 runetoisoutf(char *str, Rune *rune)
243 long c;
245 if(T[0] == 0)
246 mktable();
248 /*
249 * one character sequence
250 * 00000-0009F => 00-9F
251 */
252 c = *rune;
253 if(c < Rune1) {
254 str[0] = c;
255 return 1;
258 /*
259 * two character sequence
260 * 000A0-000FF => A0; A0-FF
261 */
262 if(c < Rune21) {
263 str[0] = (char)Char1;
264 str[1] = c;
265 return 2;
268 /*
269 * two character sequence
270 * 00100-04015 => A1-F5; 21-7E/A0-FF
271 */
272 if(c < Rune22) {
273 c -= Rune21;
274 str[0] = c/Esc + Char21;
275 str[1] = T[c%Esc];
276 return 2;
279 /*
280 * three character sequence
281 * 04016-38E2D => A6-FB; 21-7E/A0-FF
282 */
283 c -= Rune22;
284 str[0] = c/(Esc*Esc) + Char22;
285 str[1] = T[c/Esc%Esc];
286 str[2] = T[c%Esc];
287 return 3;
290 int
291 fullisorune(char *str, int n)
293 int c;
295 if(n > 0) {
296 c = *(uchar*)str;
297 if(c < Char1)
298 return 1;
299 if(n > 1)
300 if(c < Char22 || n > 2)
301 return 1;
303 return 0;
/*
 * Constants for the UTF-8 codec (our_wctomb / our_mbtowc).
 *
 *	Tn	marker bits of the first byte of an n-byte sequence
 *	Tx	marker bits of a continuation byte
 *	Bitn	number of payload bits in the first byte of an n-byte sequence
 *	Bitx	number of payload bits in a continuation byte
 *	Maskn	mask selecting those payload bits
 *	Wcharn	largest value that fits in an n-byte sequence
 *
 * Bit6 is 2, so a six byte sequence carries 2+5*6 = 32 bits and the
 * first byte may be anything from 0xFC to 0xFF.
 */
enum
{
	T1	= 0x00,
	Tx	= 0x80,
	T2	= 0xC0,
	T3	= 0xE0,
	T4	= 0xF0,
	T5	= 0xF8,
	T6	= 0xFC,

	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 2,

	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,

	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1
};

/*
 * our_wctomb: store the UTF-8 encoding of wc at s and return its
 * length in bytes (1 to 6).  The shortest possible form is used.
 * s must have room for six bytes.
 *
 * With s == 0 nothing is stored and 0 is returned (the encoding has
 * no shift states).  Only the low 32 bits of wc are representable;
 * higher bits are discarded by the six byte form.
 */
int
our_wctomb(char *s, unsigned long wc)
{
	int len, lead, k;

	if(s == 0)
		return 0;		/* no shift states */

	if(wc <= Wchar1){
		s[0] = T1 | wc;
		return 1;
	}

	/* pick the shortest form that can hold wc */
	if(wc <= Wchar2){
		len = 2;
		lead = T2;
	}else if(wc <= Wchar3){
		len = 3;
		lead = T3;
	}else if(wc <= Wchar4){
		len = 4;
		lead = T4;
	}else if(wc <= Wchar5){
		len = 5;
		lead = T5;
	}else{
		len = 6;
		lead = T6;
	}

	/* continuation bytes, least significant six bits last */
	for(k = len-1; k > 0; k--){
		s[k] = Tx | (wc & Maskx);
		wc >>= Bitx;
	}

	/*
	 * what is left of wc fits the first byte by construction, except
	 * in the six byte form where it has to be cut down to Bit6 bits.
	 */
	if(len == 6)
		wc &= Mask6;
	s[0] = lead | wc;
	return len;
}

/*
 * our_mbtowc: decode one UTF-8 sequence from the first n bytes at s.
 *
 * On success the value is stored in *p and the sequence length (1 to
 * 6) is returned.  With s == 0, 0 is returned (no shift states).
 * Otherwise -1 is returned, errno is set to EILSEQ and *p is left
 * alone.  That happens when
 *	- n is 0 or the sequence needs more than n bytes,
 *	- the first byte is a continuation byte,
 *	- a following byte is not a continuation byte,
 *	- the value would have fitted a shorter sequence (overlong form).
 * No other range checks are made on the decoded value.
 *
 * The value is accumulated in an unsigned long.  Building it in an
 * int overflows for six byte sequences starting with 0xFE or 0xFF and
 * yields a sign-extended result where unsigned long is 64 bits wide.
 */
int
our_mbtowc(unsigned long *p, char *s, unsigned n)
{
	const unsigned char *us;
	unsigned long wc, floor;
	int c0, cx, len, k;

	if(s == 0)
		return 0;		/* no shift states */

	if(n < 1)
		goto bad;
	us = (const unsigned char*)s;
	c0 = us[0];

	/* 1 byte */
	if(c0 < Tx){
		*p = c0;
		return 1;
	}

	/*
	 * classify the first byte: sequence length, payload bits and
	 * the largest value a shorter sequence could have expressed.
	 */
	if(c0 < T2)
		goto bad;		/* continuation byte cannot start a sequence */
	if(c0 < T3){
		len = 2;
		wc = c0 & Mask2;
		floor = Wchar1;
	}else if(c0 < T4){
		len = 3;
		wc = c0 & Mask3;
		floor = Wchar2;
	}else if(c0 < T5){
		len = 4;
		wc = c0 & Mask4;
		floor = Wchar3;
	}else if(c0 < T6){
		len = 5;
		wc = c0 & Mask5;
		floor = Wchar4;
	}else{
		len = 6;
		wc = c0 & Mask6;
		floor = Wchar5;
	}
	if(n < (unsigned)len)
		goto bad;

	for(k = 1; k < len; k++){
		/* a continuation byte is 10xxxxxx: after ^Tx the top two bits are clear */
		cx = us[k] ^ Tx;
		if(cx & T2)
			goto bad;
		wc = (wc << Bitx) | (unsigned long)cx;
	}

	if(wc <= floor)
		goto bad;		/* overlong form */
	*p = wc;
	return len;

bad:
	errno = EILSEQ;
	return -1;
}