Blob


1 #ifdef PLAN9
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #else
6 #include <sys/types.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <string.h>
10 #include <unistd.h>
11 #include <errno.h>
12 #include "plan9.h"
13 #endif
14 #include "hdr.h"
/*
	The our_* routines are implementations of the corresponding library
	routines. For a while, I tried to actually name them wctomb etc.,
	but stopped that after I found a system which made wchar_t an
	unsigned char.
*/
23 #ifdef PLAN9
24 long getrune(Biobuf *);
25 long getisorune(Biobuf *);
26 #else
27 long getrune(FILE *);
28 long getisorune(FILE *);
29 #endif
30 int our_wctomb(char *s, unsigned long wc);
31 int our_mbtowc(unsigned long *p, char *s, unsigned n);
32 int runetoisoutf(char *str, Rune *rune);
33 int fullisorune(char *str, int n);
34 int isochartorune(Rune *rune, char *str);
36 void
37 utf_in(int fd, long *notused, struct convert *out)
38 {
39 #ifndef PLAN9
40 FILE *fp;
41 #else /* PLAN9 */
42 Biobuf b;
43 #endif /* PLAN9 */
44 Rune *r;
45 long l;
47 USED(notused);
48 #ifndef PLAN9
49 if((fp = fdopen(fd, "r")) == NULL){
50 EPR "%s: input setup error: %s\n", argv0, strerror(errno));
51 #else /* PLAN9 */
52 if(Binit(&b, fd, OREAD) < 0){
53 EPR "%s: input setup error: %r\n", argv0);
54 #endif /* PLAN9 */
55 EXIT(1, "input error");
56 }
57 r = runes;
58 for(;;)
59 #ifndef PLAN9
60 switch(l = getrune(fp))
61 #else /* PLAN9 */
62 switch(l = getrune(&b))
63 #endif /* PLAN9 */
64 {
65 case -1:
66 goto done;
67 case -2:
68 if(squawk)
69 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput);
70 if(clean)
71 continue;
72 nerrors++;
73 l = Runeerror;
74 default:
75 *r++ = l;
76 if(r >= &runes[N]){
77 OUT(out, runes, r-runes);
78 r = runes;
79 }
80 }
81 done:
82 if(r > runes)
83 OUT(out, runes, r-runes);
84 }
86 void
87 utf_out(Rune *base, int n, long *notused)
88 {
89 char *p;
90 Rune *r;
92 USED(notused);
93 nrunes += n;
94 for(r = base, p = obuf; n-- > 0; r++){
95 p += our_wctomb(p, *r);
96 }
97 noutput += p-obuf;
98 write(1, obuf, p-obuf);
99 }
101 void
102 isoutf_in(int fd, long *notused, struct convert *out)
104 #ifndef PLAN9
105 FILE *fp;
106 #else /* PLAN9 */
107 Biobuf b;
108 #endif /* PLAN9 */
109 Rune *r;
110 long l;
112 USED(notused);
113 #ifndef PLAN9
114 if((fp = fdopen(fd, "r")) == 0){
115 EPR "%s: input setup error: %s\n", argv0, strerror(errno));
116 #else /* PLAN9 */
117 if(Binit(&b, fd, OREAD) < 0){
118 EPR "%s: input setup error: %r\n", argv0);
119 #endif /* PLAN9 */
120 EXIT(1, "input error");
122 r = runes;
123 for(;;)
124 #ifndef PLAN9
125 switch(l = getisorune(fp))
126 #else /* PLAN9 */
127 switch(l = getisorune(&b))
128 #endif /* PLAN9 */
130 case -1:
131 goto done;
132 case -2:
133 if(squawk)
134 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput);
135 if(clean)
136 continue;
137 nerrors++;
138 l = Runeerror;
139 default:
140 *r++ = l;
141 if(r >= &runes[N]){
142 OUT(out, runes, r-runes);
143 r = runes;
146 done:
147 if(r > runes)
148 OUT(out, runes, r-runes);
151 void
152 isoutf_out(Rune *base, int n, long *notused)
154 char *p;
155 Rune *r;
157 USED(notused);
158 nrunes += n;
159 for(r = base, p = obuf; n-- > 0; r++)
160 p += runetoisoutf(p, r);
161 noutput += p-obuf;
162 write(1, obuf, p-obuf);
165 long
166 #ifndef PLAN9
167 getrune(FILE *fp)
168 #else /* PLAN9 */
169 getrune(Biobuf *bp)
170 #endif /* PLAN9 */
172 int c, i;
173 char str[UTFmax]; /* MB_LEN_MAX really */
174 unsigned long l;
175 int n;
177 for(i = 0;;){
178 #ifndef PLAN9
179 c = getc(fp);
180 #else /* PLAN9 */
181 c = Bgetc(bp);
182 #endif /* PLAN9 */
183 if(c < 0)
184 return(c);
185 ninput++;
186 str[i++] = c;
187 n = our_mbtowc(&l, str, i);
188 if(n == -1)
189 return(-2);
190 if(n > 0)
191 return(l);
195 long
196 #ifndef PLAN9
197 getisorune(FILE *fp)
198 #else /* PLAN9 */
199 getisorune(Biobuf *bp)
200 #endif /* PLAN9 */
202 int c, i;
203 Rune rune;
204 char str[UTFmax]; /* MB_LEN_MAX really */
206 for(i = 0;;){
207 #ifndef PLAN9
208 c = getc(fp);
209 #else /* PLAN9 */
210 c = Bgetc(bp);
211 #endif /* PLAN9 */
212 if(c < 0)
213 return(c);
214 ninput++;
215 str[i++] = c;
216 if(fullisorune(str, i))
217 break;
219 isochartorune(&rune, str);
220 if(rune == Runeerror)
221 return -2;
222 return(rune);
225 enum
227 Char1 = Runeself, Rune1 = Runeself,
228 Char21 = 0xA1, Rune21 = 0x0100,
229 Char22 = 0xF6, Rune22 = 0x4016,
230 Char3 = 0xFC, Rune3 = 0x10000, /* really 0x38E2E */
231 Esc = 0xBE, Bad = Runeerror
232 };
/*
 * Trailing-byte tables for the isoutf codec.
 *	U[byte]  -> digit value carried by a trailing byte
 *	T[digit] -> trailing byte that carries that digit
 * Both are filled by mktable(); until then they are all zero.  After
 * mktable() neither U[0] nor T[0] is zero, which the codec uses as its
 * "tables are built" test.
 */
static unsigned char U[256];
static unsigned char T[256];

/*
 * mktable: fill U and T.  The byte -> digit map is a permutation of
 * 0..255 built from four ranges:
 *	21-7E -> 00-5D
 *	A0-FF -> 5E-BD
 *	00-20 -> BE-DE
 *	7F-9F -> DF-FF
 * T is the inverse map.
 */
static
void
mktable(void)
{
	int i, u;

	for(i = 0; i < 256; i++) {
		if(i < 0x21)
			u = i + (0xBE - 0x00);
		else if(i < 0x7F)
			u = i + (0x00 - 0x21);
		else if(i < 0xA0)
			u = i + (0xDF - 0x7F);
		else
			u = i + (0x5E - 0xA0);
		U[i] = u;
		T[u] = i;
	}
}
256 int
257 isochartorune(Rune *rune, char *str)
259 int c, c1, c2;
260 long l;
262 if(U[0] == 0)
263 mktable();
265 /*
266 * one character sequence
267 * 00000-0009F => 00-9F
268 */
269 c = *(uchar*)str;
270 if(c < Char1) {
271 *rune = c;
272 return 1;
275 /*
276 * two character sequence
277 * 000A0-000FF => A0; A0-FF
278 */
279 c1 = *(uchar*)(str+1);
280 if(c < Char21) {
281 if(c1 >= Rune1 && c1 < Rune21) {
282 *rune = c1;
283 return 2;
285 goto bad;
288 /*
289 * two character sequence
290 * 00100-04015 => A1-F5; 21-7E/A0-FF
291 */
292 c1 = U[c1];
293 if(c1 >= Esc)
294 goto bad;
295 if(c < Char22) {
296 *rune = (c-Char21)*Esc + c1 + Rune21;
297 return 2;
300 /*
301 * three character sequence
302 * 04016-38E2D => A6-FB; 21-7E/A0-FF
303 */
304 c2 = U[*(uchar*)(str+2)];
305 if(c2 >= Esc)
306 goto bad;
307 if(c < Char3) {
308 l = (c-Char22)*Esc*Esc + c1*Esc + c2 + Rune22;
309 if(l >= Rune3)
310 goto bad;
311 *rune = l;
312 return 3;
315 /*
316 * bad decoding
317 */
318 bad:
319 *rune = Bad;
320 return 1;
323 int
324 runetoisoutf(char *str, Rune *rune)
326 long c;
328 if(T[0] == 0)
329 mktable();
331 /*
332 * one character sequence
333 * 00000-0009F => 00-9F
334 */
335 c = *rune;
336 if(c < Rune1) {
337 str[0] = c;
338 return 1;
341 /*
342 * two character sequence
343 * 000A0-000FF => A0; A0-FF
344 */
345 if(c < Rune21) {
346 str[0] = (uchar)Char1;
347 str[1] = c;
348 return 2;
351 /*
352 * two character sequence
353 * 00100-04015 => A1-F5; 21-7E/A0-FF
354 */
355 if(c < Rune22) {
356 c -= Rune21;
357 str[0] = c/Esc + Char21;
358 str[1] = T[c%Esc];
359 return 2;
362 /*
363 * three character sequence
364 * 04016-38E2D => A6-FB; 21-7E/A0-FF
365 */
366 c -= Rune22;
367 str[0] = c/(Esc*Esc) + Char22;
368 str[1] = T[c/Esc%Esc];
369 str[2] = T[c%Esc];
370 return 3;
373 int
374 fullisorune(char *str, int n)
376 int c;
378 if(n > 0) {
379 c = *(uchar*)str;
380 if(c < Char1)
381 return 1;
382 if(n > 1)
383 if(c < Char22 || n > 2)
384 return 1;
386 return 0;
#ifdef PLAN9
/*
 * Plan 9 builds do not include <errno.h>, so define the variable that
 * our_mbtowc sets to EILSEQ when it rejects a sequence.
 */
int errno;
#endif
/*
 * Constants for the multi-byte UTF codec below.
 *	Tn	lead-byte tag of an n-byte sequence (Tx: continuation byte)
 *	Bitn	payload bits carried by that byte
 *	Maskn	mask selecting those payload bits
 *	Wcharn	largest value that fits in an n-byte sequence
 *
 * Bit6 is 2, so a six-byte lead byte carries two payload bits and can
 * take the values FC..FF; that lets six bytes cover a full 32-bit value.
 * NOTE(review): standard UTF-8 gives the FC lead byte one payload bit --
 * confirm the wider form is intended.
 */
enum
{
	T1	= 0x00,
	Tx	= 0x80,
	T2	= 0xC0,
	T3	= 0xE0,
	T4	= 0xF0,
	T5	= 0xF8,
	T6	= 0xFC,

	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 2,

	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,

	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1

#ifndef EILSEQ
	,	/* we hate ansi c's comma rules */
	EILSEQ	= 123
#endif /* EILSEQ */
};

/*
 * our_wctomb: encode wc as a multi-byte sequence at s.
 *
 * Returns the number of bytes stored (1 to 6); s must have room for six.
 * With s == 0 nothing is stored and 0 is returned (the encoding has no
 * shift states).
 */
int
our_wctomb(char *s, unsigned long wc)
{
	if(s == 0)
		return 0;		/* no shift states */

	if(wc <= Wchar1) {
		/* 1 byte */
		s[0] = T1 | wc;
		return 1;
	}
	if(wc <= Wchar2) {
		/* 2 bytes */
		s[0] = T2 | (wc >> 1*Bitx);
		s[1] = Tx | (wc & Maskx);
		return 2;
	}
	if(wc <= Wchar3) {
		/* 3 bytes */
		s[0] = T3 | (wc >> 2*Bitx);
		s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[2] = Tx | (wc & Maskx);
		return 3;
	}
	if(wc <= Wchar4) {
		/* 4 bytes */
		s[0] = T4 | (wc >> 3*Bitx);
		s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
		s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[3] = Tx | (wc & Maskx);
		return 4;
	}
	if(wc <= Wchar5) {
		/* 5 bytes */
		s[0] = T5 | (wc >> 4*Bitx);
		s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
		s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
		s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[4] = Tx | (wc & Maskx);
		return 5;
	}
	/* 6 bytes; the lead byte keeps only Bit6 bits of what is left */
	s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
	s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
	s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
	s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
	s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
	s[5] = Tx | (wc & Maskx);
	return 6;
}
481 int
482 our_mbtowc(unsigned long *p, char *s, unsigned n)
484 uchar *us;
485 int c0, c1, c2, c3, c4, c5;
486 unsigned long wc;
488 if(s == 0)
489 return 0; /* no shift states */
491 if(n < 1)
492 goto badlen;
493 us = (uchar*)s;
494 c0 = us[0];
495 if(c0 >= T3) {
496 if(n < 3)
497 goto badlen;
498 c1 = us[1] ^ Tx;
499 c2 = us[2] ^ Tx;
500 if((c1|c2) & T2)
501 goto bad;
502 if(c0 >= T5) {
503 if(n < 5)
504 goto badlen;
505 c3 = us[3] ^ Tx;
506 c4 = us[4] ^ Tx;
507 if((c3|c4) & T2)
508 goto bad;
509 if(c0 >= T6) {
510 /* 6 bytes */
511 if(n < 6)
512 goto badlen;
513 c5 = us[5] ^ Tx;
514 if(c5 & T2)
515 goto bad;
516 wc = ((((((((((c0 & Mask6) << Bitx) |
517 c1) << Bitx) | c2) << Bitx) |
518 c3) << Bitx) | c4) << Bitx) | c5;
519 if(wc <= Wchar5)
520 goto bad;
521 *p = wc;
522 return 6;
524 /* 5 bytes */
525 wc = ((((((((c0 & Mask5) << Bitx) |
526 c1) << Bitx) | c2) << Bitx) |
527 c3) << Bitx) | c4;
528 if(wc <= Wchar4)
529 goto bad;
530 *p = wc;
531 return 5;
533 if(c0 >= T4) {
534 /* 4 bytes */
535 if(n < 4)
536 goto badlen;
537 c3 = us[3] ^ Tx;
538 if(c3 & T2)
539 goto bad;
540 wc = ((((((c0 & Mask4) << Bitx) |
541 c1) << Bitx) | c2) << Bitx) |
542 c3;
543 if(wc <= Wchar3)
544 goto bad;
545 *p = wc;
546 return 4;
548 /* 3 bytes */
549 wc = ((((c0 & Mask3) << Bitx) |
550 c1) << Bitx) | c2;
551 if(wc <= Wchar2)
552 goto bad;
553 *p = wc;
554 return 3;
556 if(c0 >= T2) {
557 /* 2 bytes */
558 if(n < 2)
559 goto badlen;
560 c1 = us[1] ^ Tx;
561 if(c1 & T2)
562 goto bad;
563 wc = ((c0 & Mask2) << Bitx) |
564 c1;
565 if(wc <= Wchar1)
566 goto bad;
567 *p = wc;
568 return 2;
570 /* 1 byte */
571 if(c0 >= Tx)
572 goto bad;
573 *p = c0;
574 return 1;
576 bad:
577 errno = EILSEQ;
578 return -1;
579 badlen:
580 return -2;