Blob


1 #ifdef PLAN9
2 #include <u.h>
3 #include <libc.h>
4 #include <bio.h>
5 #ifdef PLAN9PORT
6 #include <errno.h>
7 #else
8 extern int errno;
9 #endif
10 #else
11 #include <sys/types.h>
12 #include <stdio.h>
13 #include <stdlib.h>
14 #include <string.h>
15 #include <unistd.h>
16 #include <errno.h>
17 #include "plan9.h"
18 #endif
19 #include "hdr.h"
20 #ifndef EILSEQ
21 #define EILSEQ 9998
22 #endif
24 /*
25 the our_* routines are implementations for the corresponding library
26 routines. for a while, i tried to actually name them wctomb etc
27 but stopped that after i found a system which made wchar_t an
28 unsigned char.
29 */
31 int our_wctomb(char *s, unsigned long wc);
32 int our_mbtowc(unsigned long *p, char *s, unsigned n);
33 int runetoisoutf(char *str, Rune *rune);
34 int fullisorune(char *str, int n);
35 int isochartorune(Rune *rune, char *str);
37 void
38 utf_in(int fd, long *notused, struct convert *out)
39 {
40 char buf[N];
41 int i, j, c, n, tot;
42 ulong l;
44 USED(notused);
45 tot = 0;
46 while((n = read(fd, buf+tot, N-tot)) >= 0){
47 tot += n;
48 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
49 c = our_mbtowc(&l, buf+i, tot-i);
50 if(c == -1){
51 if(squawk)
52 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
53 if(clean){
54 i++;
55 continue;
56 }
57 nerrors++;
58 l = Runeerror;
59 c = 1;
60 }
61 runes[j++] = l;
62 i += c;
63 }
64 OUT(out, runes, j);
65 tot -= i;
66 ninput += i;
67 if(tot)
68 memmove(buf, buf+i, tot);
69 if(n == 0)
70 break;
71 }
72 OUT(out, runes, 0);
73 }
75 void
76 utf_out(Rune *base, int n, long *notused)
77 {
78 char *p;
79 Rune *r;
81 USED(notused);
82 nrunes += n;
83 for(r = base, p = obuf; n-- > 0; r++){
84 p += our_wctomb(p, *r);
85 }
86 noutput += p-obuf;
87 write(1, obuf, p-obuf);
88 }
90 void
91 isoutf_in(int fd, long *notused, struct convert *out)
92 {
93 char buf[N];
94 int i, j, c, n, tot;
96 USED(notused);
97 tot = 0;
98 while((n = read(fd, buf+tot, N-tot)) >= 0){
99 tot += n;
100 for(i=j=0; i<tot; ){
101 if(!fullisorune(buf+i, tot-i))
102 break;
103 c = isochartorune(&runes[j], buf+i);
104 if(runes[j] == Runeerror && c == 1){
105 if(squawk)
106 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
107 if(clean){
108 i++;
109 continue;
111 nerrors++;
113 j++;
114 i += c;
116 OUT(out, runes, j);
117 tot -= i;
118 ninput += i;
119 if(tot)
120 memmove(buf, buf+i, tot);
121 if(n == 0)
122 break;
124 OUT(out, runes, 0);
127 void
128 isoutf_out(Rune *base, int n, long *notused)
130 char *p;
131 Rune *r;
133 USED(notused);
134 nrunes += n;
135 for(r = base, p = obuf; n-- > 0; r++)
136 p += runetoisoutf(p, r);
137 noutput += p-obuf;
138 write(1, obuf, p-obuf);
142 int
143 isochartorune(Rune *rune, char *str)
145 return chartorune(rune, str);
148 int
149 runetoisoutf(char *str, Rune *rune)
151 return runetochar(str, rune);
/*
 * ask fullrune whether the n bytes at str hold a complete character and
 * pass its answer back unchanged.
 */
int
fullisorune(char *str, int n)
{
	int complete;

	complete = fullrune(str, n);
	return complete;
}
/*
 * constants for the extended UTF encoding handled by our_wctomb and
 * our_mbtowc.
 *	Tn	tag bits of the lead byte of an n-byte sequence
 *	Tx	tag bits of a continuation byte
 *	Bitn	number of payload bits carried by that byte
 *	Maskn	mask selecting those payload bits
 *	Wcharn	largest value that fits in an n-byte sequence
 * Bit6 is 2, so a 6-byte sequence carries 2 + 5*6 = 32 bits and its
 * lead byte ranges over 0xFC-0xFF.
 */
enum
{
	T1	= 0x00,
	Tx	= 0x80,
	T2	= 0xC0,
	T3	= 0xE0,
	T4	= 0xF0,
	T5	= 0xF8,
	T6	= 0xFC,

	Bit1	= 7,
	Bitx	= 6,
	Bit2	= 5,
	Bit3	= 4,
	Bit4	= 3,
	Bit5	= 2,
	Bit6	= 2,

	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1,

	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1
};

/*
 * encode wc at s using the shortest sequence that holds it (1 to 6
 * bytes) and return the number of bytes stored.  s must have room for
 * 6 bytes; no terminator is written.  a nil s returns 0 (the encoding
 * has no shift states).  only the low 32 bits of wc are representable:
 * anything above them is masked off in the 6-byte form.
 */
int
our_wctomb(char *s, unsigned long wc)
{
	if(s == 0)
		return 0;		/* no shift states */

	if(wc <= Wchar1){
		/* 1 byte */
		s[0] = T1 | wc;
		return 1;
	}
	if(wc <= Wchar2){
		/* 2 bytes */
		s[0] = T2 | (wc >> 1*Bitx);
		s[1] = Tx | (wc & Maskx);
		return 2;
	}
	if(wc <= Wchar3){
		/* 3 bytes */
		s[0] = T3 | (wc >> 2*Bitx);
		s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[2] = Tx | (wc & Maskx);
		return 3;
	}
	if(wc <= Wchar4){
		/* 4 bytes */
		s[0] = T4 | (wc >> 3*Bitx);
		s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
		s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[3] = Tx | (wc & Maskx);
		return 4;
	}
	if(wc <= Wchar5){
		/* 5 bytes */
		s[0] = T5 | (wc >> 4*Bitx);
		s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
		s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
		s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
		s[4] = Tx | (wc & Maskx);
		return 5;
	}
	/* 6 bytes */
	s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
	s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
	s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
	s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
	s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
	s[5] = Tx | (wc & Maskx);
	return 6;
}

/*
 * decode the sequence at s, of which at most n bytes may be examined,
 * into *p and return its length in bytes (1 to 6).  a nil s returns 0
 * (no shift states).  on failure -1 is returned, errno is set to EILSEQ
 * and *p is left alone; failures are:
 *	n too small for the sequence announced by the lead byte;
 *	a lead byte that is really a continuation byte;
 *	a following byte that is not a continuation byte;
 *	a value that would have fitted in a shorter sequence.
 */
int
our_mbtowc(unsigned long *p, char *s, unsigned n)
{
	unsigned char *us;
	unsigned long wc, least;
	unsigned len, i;
	int c;

	if(s == 0)
		return 0;		/* no shift states */

	if(n < 1)
		goto bad;
	us = (unsigned char*)s;
	c = us[0];
	if(c < Tx){
		/* 1 byte */
		*p = c;
		return 1;
	}
	if(c < T2)
		goto bad;		/* continuation byte in lead position */

	/*
	 * the lead byte fixes the length, its own payload bits, and the
	 * largest value the next shorter form could already hold.
	 */
	if(c < T3){
		len = 2;
		wc = c & Mask2;
		least = Wchar1;
	}else if(c < T4){
		len = 3;
		wc = c & Mask3;
		least = Wchar2;
	}else if(c < T5){
		len = 4;
		wc = c & Mask4;
		least = Wchar3;
	}else if(c < T6){
		len = 5;
		wc = c & Mask5;
		least = Wchar4;
	}else{
		len = 6;
		wc = c & Mask6;
		least = Wchar5;
	}
	if(n < len)
		goto bad;

	/*
	 * gather 6 bits from each continuation byte.  wc is unsigned long
	 * so the top two bits of a 6-byte value are shifted into place
	 * without overflowing a signed int or being sign-extended.
	 */
	for(i = 1; i < len; i++){
		c = us[i] ^ Tx;
		if(c & T2)
			goto bad;
		wc = (wc << Bitx) | (unsigned long)c;
	}
	if(wc <= least)
		goto bad;		/* not the shortest encoding */
	*p = wc;
	return len;

bad:
	errno = EILSEQ;
	return -1;
}