Blame


1 a31db67d 2004-04-21 devnull #ifdef PLAN9
2 a31db67d 2004-04-21 devnull #include <u.h>
3 a31db67d 2004-04-21 devnull #include <libc.h>
4 a31db67d 2004-04-21 devnull #include <bio.h>
5 fa059a4e 2005-12-17 devnull #ifdef PLAN9PORT
6 fa059a4e 2005-12-17 devnull #include <errno.h>
7 a31db67d 2004-04-21 devnull #else
8 fa059a4e 2005-12-17 devnull extern int errno;
9 fa059a4e 2005-12-17 devnull #endif
10 fa059a4e 2005-12-17 devnull #else
11 a31db67d 2004-04-21 devnull #include <sys/types.h>
12 a31db67d 2004-04-21 devnull #include <stdio.h>
13 a31db67d 2004-04-21 devnull #include <stdlib.h>
14 a31db67d 2004-04-21 devnull #include <string.h>
15 a31db67d 2004-04-21 devnull #include <unistd.h>
16 a31db67d 2004-04-21 devnull #include <errno.h>
17 a31db67d 2004-04-21 devnull #include "plan9.h"
18 a31db67d 2004-04-21 devnull #endif
19 a31db67d 2004-04-21 devnull #include "hdr.h"
20 35d26aa3 2005-12-26 devnull #ifndef EILSEQ
21 35d26aa3 2005-12-26 devnull #define EILSEQ 9998
22 35d26aa3 2005-12-26 devnull #endif
23 a31db67d 2004-04-21 devnull
24 a31db67d 2004-04-21 devnull /*
25 a31db67d 2004-04-21 devnull the our_* routines are implementations for the corresponding library
26 a31db67d 2004-04-21 devnull routines. for a while, i tried to actually name them wctomb etc
27 a31db67d 2004-04-21 devnull but stopped that after i found a system which made wchar_t an
28 a31db67d 2004-04-21 devnull unsigned char.
29 a31db67d 2004-04-21 devnull */
30 a31db67d 2004-04-21 devnull
31 a31db67d 2004-04-21 devnull int our_wctomb(char *s, unsigned long wc);
32 a31db67d 2004-04-21 devnull int our_mbtowc(unsigned long *p, char *s, unsigned n);
33 a31db67d 2004-04-21 devnull int runetoisoutf(char *str, Rune *rune);
34 a31db67d 2004-04-21 devnull int fullisorune(char *str, int n);
35 a31db67d 2004-04-21 devnull int isochartorune(Rune *rune, char *str);
36 a31db67d 2004-04-21 devnull
37 a31db67d 2004-04-21 devnull void
38 a31db67d 2004-04-21 devnull utf_in(int fd, long *notused, struct convert *out)
39 a31db67d 2004-04-21 devnull {
40 7551b2ec 2005-03-09 devnull char buf[N];
41 7551b2ec 2005-03-09 devnull int i, j, c, n, tot;
42 7551b2ec 2005-03-09 devnull ulong l;
43 a31db67d 2004-04-21 devnull
44 a31db67d 2004-04-21 devnull USED(notused);
45 7551b2ec 2005-03-09 devnull tot = 0;
46 7551b2ec 2005-03-09 devnull while((n = read(fd, buf+tot, N-tot)) >= 0){
47 7551b2ec 2005-03-09 devnull tot += n;
48 2b03bf69 2006-10-12 devnull for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){
49 7551b2ec 2005-03-09 devnull c = our_mbtowc(&l, buf+i, tot-i);
50 7f0d675d 2005-09-13 devnull if(c == -1){
51 7551b2ec 2005-03-09 devnull if(squawk)
52 7551b2ec 2005-03-09 devnull EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
53 536f9b83 2006-05-21 devnull if(clean){
54 536f9b83 2006-05-21 devnull i++;
55 7551b2ec 2005-03-09 devnull continue;
56 536f9b83 2006-05-21 devnull }
57 7551b2ec 2005-03-09 devnull nerrors++;
58 7551b2ec 2005-03-09 devnull l = Runeerror;
59 7f0d675d 2005-09-13 devnull c = 1;
60 a31db67d 2004-04-21 devnull }
61 7551b2ec 2005-03-09 devnull runes[j++] = l;
62 7551b2ec 2005-03-09 devnull i += c;
63 a31db67d 2004-04-21 devnull }
64 7551b2ec 2005-03-09 devnull OUT(out, runes, j);
65 7551b2ec 2005-03-09 devnull tot -= i;
66 7551b2ec 2005-03-09 devnull ninput += i;
67 7551b2ec 2005-03-09 devnull if(tot)
68 7551b2ec 2005-03-09 devnull memmove(buf, buf+i, tot);
69 7551b2ec 2005-03-09 devnull if(n == 0)
70 7551b2ec 2005-03-09 devnull break;
71 7551b2ec 2005-03-09 devnull }
72 536f9b83 2006-05-21 devnull OUT(out, runes, 0);
73 a31db67d 2004-04-21 devnull }
74 a31db67d 2004-04-21 devnull
75 a31db67d 2004-04-21 devnull void
76 a31db67d 2004-04-21 devnull utf_out(Rune *base, int n, long *notused)
77 a31db67d 2004-04-21 devnull {
78 a31db67d 2004-04-21 devnull char *p;
79 a31db67d 2004-04-21 devnull Rune *r;
80 a31db67d 2004-04-21 devnull
81 a31db67d 2004-04-21 devnull USED(notused);
82 a31db67d 2004-04-21 devnull nrunes += n;
83 a31db67d 2004-04-21 devnull for(r = base, p = obuf; n-- > 0; r++){
84 a31db67d 2004-04-21 devnull p += our_wctomb(p, *r);
85 a31db67d 2004-04-21 devnull }
86 a31db67d 2004-04-21 devnull noutput += p-obuf;
87 a31db67d 2004-04-21 devnull write(1, obuf, p-obuf);
88 a31db67d 2004-04-21 devnull }
89 a31db67d 2004-04-21 devnull
90 a31db67d 2004-04-21 devnull void
91 a31db67d 2004-04-21 devnull isoutf_in(int fd, long *notused, struct convert *out)
92 a31db67d 2004-04-21 devnull {
93 7551b2ec 2005-03-09 devnull char buf[N];
94 7551b2ec 2005-03-09 devnull int i, j, c, n, tot;
95 a31db67d 2004-04-21 devnull
96 a31db67d 2004-04-21 devnull USED(notused);
97 7551b2ec 2005-03-09 devnull tot = 0;
98 7551b2ec 2005-03-09 devnull while((n = read(fd, buf+tot, N-tot)) >= 0){
99 7551b2ec 2005-03-09 devnull tot += n;
100 7551b2ec 2005-03-09 devnull for(i=j=0; i<tot; ){
101 7551b2ec 2005-03-09 devnull if(!fullisorune(buf+i, tot-i))
102 7551b2ec 2005-03-09 devnull break;
103 7551b2ec 2005-03-09 devnull c = isochartorune(&runes[j], buf+i);
104 536f9b83 2006-05-21 devnull if(runes[j] == Runeerror && c == 1){
105 7551b2ec 2005-03-09 devnull if(squawk)
106 7551b2ec 2005-03-09 devnull EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);
107 536f9b83 2006-05-21 devnull if(clean){
108 536f9b83 2006-05-21 devnull i++;
109 7551b2ec 2005-03-09 devnull continue;
110 536f9b83 2006-05-21 devnull }
111 7551b2ec 2005-03-09 devnull nerrors++;
112 a31db67d 2004-04-21 devnull }
113 7551b2ec 2005-03-09 devnull j++;
114 7551b2ec 2005-03-09 devnull i += c;
115 a31db67d 2004-04-21 devnull }
116 7551b2ec 2005-03-09 devnull OUT(out, runes, j);
117 7551b2ec 2005-03-09 devnull tot -= i;
118 7551b2ec 2005-03-09 devnull ninput += i;
119 7551b2ec 2005-03-09 devnull if(tot)
120 7551b2ec 2005-03-09 devnull memmove(buf, buf+i, tot);
121 7551b2ec 2005-03-09 devnull if(n == 0)
122 7551b2ec 2005-03-09 devnull break;
123 7551b2ec 2005-03-09 devnull }
124 536f9b83 2006-05-21 devnull OUT(out, runes, 0);
125 a31db67d 2004-04-21 devnull }
126 a31db67d 2004-04-21 devnull
127 a31db67d 2004-04-21 devnull void
128 a31db67d 2004-04-21 devnull isoutf_out(Rune *base, int n, long *notused)
129 a31db67d 2004-04-21 devnull {
130 a31db67d 2004-04-21 devnull char *p;
131 a31db67d 2004-04-21 devnull Rune *r;
132 a31db67d 2004-04-21 devnull
133 a31db67d 2004-04-21 devnull USED(notused);
134 a31db67d 2004-04-21 devnull nrunes += n;
135 a31db67d 2004-04-21 devnull for(r = base, p = obuf; n-- > 0; r++)
136 a31db67d 2004-04-21 devnull p += runetoisoutf(p, r);
137 a31db67d 2004-04-21 devnull noutput += p-obuf;
138 a31db67d 2004-04-21 devnull write(1, obuf, p-obuf);
139 a31db67d 2004-04-21 devnull }
140 a31db67d 2004-04-21 devnull
141 a31db67d 2004-04-21 devnull
142 a31db67d 2004-04-21 devnull int
143 a31db67d 2004-04-21 devnull isochartorune(Rune *rune, char *str)
144 a31db67d 2004-04-21 devnull {
145 a0583cf2 2009-09-13 rsc return chartorune(rune, str);
146 a31db67d 2004-04-21 devnull }
147 a31db67d 2004-04-21 devnull
148 a31db67d 2004-04-21 devnull int
149 a31db67d 2004-04-21 devnull runetoisoutf(char *str, Rune *rune)
150 a31db67d 2004-04-21 devnull {
151 a0583cf2 2009-09-13 rsc return runetochar(str, rune);
152 a31db67d 2004-04-21 devnull }
153 a31db67d 2004-04-21 devnull
/*
 * fullisorune: report whether the n bytes at str are enough to decode
 * a rune.  Forwards to fullrune(); isoutf_in stops decoding the current
 * batch when the result is zero.
 */
int
fullisorune(char *str, int n)
{
	int complete;

	complete = fullrune(str, n);
	return complete;
}
159 a31db67d 2004-04-21 devnull
/*
 * Constants for the multibyte encoding used by our_wctomb/our_mbtowc.
 */

/* number of payload bits carried by each kind of byte */
enum
{
	Bit1	= 7,	/* single-byte form */
	Bitx	= 6,	/* continuation byte */
	Bit2	= 5,	/* lead byte of a 2-byte sequence */
	Bit3	= 4,	/* lead byte of a 3-byte sequence */
	Bit4	= 3,	/* lead byte of a 4-byte sequence */
	Bit5	= 2,	/* lead byte of a 5-byte sequence */
	Bit6	= 2	/* lead byte of a 6-byte sequence: two bits, so T6|payload spans 0xFC-0xFF */
};

/* tag bits: the fixed high-order bits of each kind of byte */
enum
{
	T1	= 0x00,	/* 0xxxxxxx */
	Tx	= 0x80,	/* 10xxxxxx */
	T2	= 0xC0,	/* 110xxxxx */
	T3	= 0xE0,	/* 1110xxxx */
	T4	= 0xF0,	/* 11110xxx */
	T5	= 0xF8,	/* 111110xx */
	T6	= 0xFC	/* 111111xx */
};

/* masks selecting the payload bits of each kind of byte */
enum
{
	Mask1	= (1<<Bit1)-1,
	Maskx	= (1<<Bitx)-1,
	Mask2	= (1<<Bit2)-1,
	Mask3	= (1<<Bit3)-1,
	Mask4	= (1<<Bit4)-1,
	Mask5	= (1<<Bit5)-1,
	Mask6	= (1<<Bit6)-1
};

/* largest value representable in a sequence of the given length */
enum
{
	Wchar1	= (1UL<<Bit1)-1,
	Wchar2	= (1UL<<(Bit2+Bitx))-1,
	Wchar3	= (1UL<<(Bit3+2*Bitx))-1,
	Wchar4	= (1UL<<(Bit4+3*Bitx))-1,
	Wchar5	= (1UL<<(Bit5+4*Bitx))-1
};
192 a31db67d 2004-04-21 devnull
193 a31db67d 2004-04-21 devnull int
194 a31db67d 2004-04-21 devnull our_wctomb(char *s, unsigned long wc)
195 a31db67d 2004-04-21 devnull {
196 a31db67d 2004-04-21 devnull if(s == 0)
197 a31db67d 2004-04-21 devnull return 0; /* no shift states */
198 a31db67d 2004-04-21 devnull if(wc & ~Wchar2) {
199 a31db67d 2004-04-21 devnull if(wc & ~Wchar4) {
200 a31db67d 2004-04-21 devnull if(wc & ~Wchar5) {
201 a31db67d 2004-04-21 devnull /* 6 bytes */
202 a31db67d 2004-04-21 devnull s[0] = T6 | ((wc >> 5*Bitx) & Mask6);
203 a31db67d 2004-04-21 devnull s[1] = Tx | ((wc >> 4*Bitx) & Maskx);
204 a31db67d 2004-04-21 devnull s[2] = Tx | ((wc >> 3*Bitx) & Maskx);
205 a31db67d 2004-04-21 devnull s[3] = Tx | ((wc >> 2*Bitx) & Maskx);
206 a31db67d 2004-04-21 devnull s[4] = Tx | ((wc >> 1*Bitx) & Maskx);
207 a31db67d 2004-04-21 devnull s[5] = Tx | (wc & Maskx);
208 a31db67d 2004-04-21 devnull return 6;
209 a31db67d 2004-04-21 devnull }
210 a31db67d 2004-04-21 devnull /* 5 bytes */
211 a31db67d 2004-04-21 devnull s[0] = T5 | (wc >> 4*Bitx);
212 a31db67d 2004-04-21 devnull s[1] = Tx | ((wc >> 3*Bitx) & Maskx);
213 a31db67d 2004-04-21 devnull s[2] = Tx | ((wc >> 2*Bitx) & Maskx);
214 a31db67d 2004-04-21 devnull s[3] = Tx | ((wc >> 1*Bitx) & Maskx);
215 a31db67d 2004-04-21 devnull s[4] = Tx | (wc & Maskx);
216 a31db67d 2004-04-21 devnull return 5;
217 a31db67d 2004-04-21 devnull }
218 a31db67d 2004-04-21 devnull if(wc & ~Wchar3) {
219 a31db67d 2004-04-21 devnull /* 4 bytes */
220 a31db67d 2004-04-21 devnull s[0] = T4 | (wc >> 3*Bitx);
221 a31db67d 2004-04-21 devnull s[1] = Tx | ((wc >> 2*Bitx) & Maskx);
222 a31db67d 2004-04-21 devnull s[2] = Tx | ((wc >> 1*Bitx) & Maskx);
223 a31db67d 2004-04-21 devnull s[3] = Tx | (wc & Maskx);
224 a31db67d 2004-04-21 devnull return 4;
225 a31db67d 2004-04-21 devnull }
226 a31db67d 2004-04-21 devnull /* 3 bytes */
227 a31db67d 2004-04-21 devnull s[0] = T3 | (wc >> 2*Bitx);
228 a31db67d 2004-04-21 devnull s[1] = Tx | ((wc >> 1*Bitx) & Maskx);
229 a31db67d 2004-04-21 devnull s[2] = Tx | (wc & Maskx);
230 a31db67d 2004-04-21 devnull return 3;
231 a31db67d 2004-04-21 devnull }
232 a31db67d 2004-04-21 devnull if(wc & ~Wchar1) {
233 a31db67d 2004-04-21 devnull /* 2 bytes */
234 a31db67d 2004-04-21 devnull s[0] = T2 | (wc >> 1*Bitx);
235 a31db67d 2004-04-21 devnull s[1] = Tx | (wc & Maskx);
236 a31db67d 2004-04-21 devnull return 2;
237 a31db67d 2004-04-21 devnull }
238 a31db67d 2004-04-21 devnull /* 1 byte */
239 a31db67d 2004-04-21 devnull s[0] = T1 | wc;
240 a31db67d 2004-04-21 devnull return 1;
241 a31db67d 2004-04-21 devnull }
242 a31db67d 2004-04-21 devnull
243 a31db67d 2004-04-21 devnull int
244 a31db67d 2004-04-21 devnull our_mbtowc(unsigned long *p, char *s, unsigned n)
245 a31db67d 2004-04-21 devnull {
246 a31db67d 2004-04-21 devnull uchar *us;
247 a31db67d 2004-04-21 devnull int c0, c1, c2, c3, c4, c5;
248 a31db67d 2004-04-21 devnull unsigned long wc;
249 a31db67d 2004-04-21 devnull
250 a31db67d 2004-04-21 devnull if(s == 0)
251 a31db67d 2004-04-21 devnull return 0; /* no shift states */
252 a31db67d 2004-04-21 devnull
253 a31db67d 2004-04-21 devnull if(n < 1)
254 536f9b83 2006-05-21 devnull goto bad;
255 a31db67d 2004-04-21 devnull us = (uchar*)s;
256 a31db67d 2004-04-21 devnull c0 = us[0];
257 a31db67d 2004-04-21 devnull if(c0 >= T3) {
258 a31db67d 2004-04-21 devnull if(n < 3)
259 536f9b83 2006-05-21 devnull goto bad;
260 a31db67d 2004-04-21 devnull c1 = us[1] ^ Tx;
261 a31db67d 2004-04-21 devnull c2 = us[2] ^ Tx;
262 a31db67d 2004-04-21 devnull if((c1|c2) & T2)
263 a31db67d 2004-04-21 devnull goto bad;
264 a31db67d 2004-04-21 devnull if(c0 >= T5) {
265 a31db67d 2004-04-21 devnull if(n < 5)
266 536f9b83 2006-05-21 devnull goto bad;
267 a31db67d 2004-04-21 devnull c3 = us[3] ^ Tx;
268 a31db67d 2004-04-21 devnull c4 = us[4] ^ Tx;
269 a31db67d 2004-04-21 devnull if((c3|c4) & T2)
270 a31db67d 2004-04-21 devnull goto bad;
271 a31db67d 2004-04-21 devnull if(c0 >= T6) {
272 a31db67d 2004-04-21 devnull /* 6 bytes */
273 a31db67d 2004-04-21 devnull if(n < 6)
274 536f9b83 2006-05-21 devnull goto bad;
275 a31db67d 2004-04-21 devnull c5 = us[5] ^ Tx;
276 a31db67d 2004-04-21 devnull if(c5 & T2)
277 a31db67d 2004-04-21 devnull goto bad;
278 a31db67d 2004-04-21 devnull wc = ((((((((((c0 & Mask6) << Bitx) |
279 a31db67d 2004-04-21 devnull c1) << Bitx) | c2) << Bitx) |
280 a31db67d 2004-04-21 devnull c3) << Bitx) | c4) << Bitx) | c5;
281 a31db67d 2004-04-21 devnull if(wc <= Wchar5)
282 a31db67d 2004-04-21 devnull goto bad;
283 a31db67d 2004-04-21 devnull *p = wc;
284 a31db67d 2004-04-21 devnull return 6;
285 a31db67d 2004-04-21 devnull }
286 a31db67d 2004-04-21 devnull /* 5 bytes */
287 a31db67d 2004-04-21 devnull wc = ((((((((c0 & Mask5) << Bitx) |
288 a31db67d 2004-04-21 devnull c1) << Bitx) | c2) << Bitx) |
289 a31db67d 2004-04-21 devnull c3) << Bitx) | c4;
290 a31db67d 2004-04-21 devnull if(wc <= Wchar4)
291 a31db67d 2004-04-21 devnull goto bad;
292 a31db67d 2004-04-21 devnull *p = wc;
293 a31db67d 2004-04-21 devnull return 5;
294 a31db67d 2004-04-21 devnull }
295 a31db67d 2004-04-21 devnull if(c0 >= T4) {
296 a31db67d 2004-04-21 devnull /* 4 bytes */
297 a31db67d 2004-04-21 devnull if(n < 4)
298 536f9b83 2006-05-21 devnull goto bad;
299 a31db67d 2004-04-21 devnull c3 = us[3] ^ Tx;
300 a31db67d 2004-04-21 devnull if(c3 & T2)
301 a31db67d 2004-04-21 devnull goto bad;
302 a31db67d 2004-04-21 devnull wc = ((((((c0 & Mask4) << Bitx) |
303 a31db67d 2004-04-21 devnull c1) << Bitx) | c2) << Bitx) |
304 a31db67d 2004-04-21 devnull c3;
305 a31db67d 2004-04-21 devnull if(wc <= Wchar3)
306 a31db67d 2004-04-21 devnull goto bad;
307 a31db67d 2004-04-21 devnull *p = wc;
308 a31db67d 2004-04-21 devnull return 4;
309 a31db67d 2004-04-21 devnull }
310 a31db67d 2004-04-21 devnull /* 3 bytes */
311 a31db67d 2004-04-21 devnull wc = ((((c0 & Mask3) << Bitx) |
312 a31db67d 2004-04-21 devnull c1) << Bitx) | c2;
313 a31db67d 2004-04-21 devnull if(wc <= Wchar2)
314 a31db67d 2004-04-21 devnull goto bad;
315 a31db67d 2004-04-21 devnull *p = wc;
316 a31db67d 2004-04-21 devnull return 3;
317 a31db67d 2004-04-21 devnull }
318 a31db67d 2004-04-21 devnull if(c0 >= T2) {
319 a31db67d 2004-04-21 devnull /* 2 bytes */
320 a31db67d 2004-04-21 devnull if(n < 2)
321 536f9b83 2006-05-21 devnull goto bad;
322 a31db67d 2004-04-21 devnull c1 = us[1] ^ Tx;
323 a31db67d 2004-04-21 devnull if(c1 & T2)
324 a31db67d 2004-04-21 devnull goto bad;
325 a31db67d 2004-04-21 devnull wc = ((c0 & Mask2) << Bitx) |
326 a31db67d 2004-04-21 devnull c1;
327 a31db67d 2004-04-21 devnull if(wc <= Wchar1)
328 a31db67d 2004-04-21 devnull goto bad;
329 a31db67d 2004-04-21 devnull *p = wc;
330 a31db67d 2004-04-21 devnull return 2;
331 a31db67d 2004-04-21 devnull }
332 a31db67d 2004-04-21 devnull /* 1 byte */
333 a31db67d 2004-04-21 devnull if(c0 >= Tx)
334 a31db67d 2004-04-21 devnull goto bad;
335 a31db67d 2004-04-21 devnull *p = c0;
336 a31db67d 2004-04-21 devnull return 1;
337 a31db67d 2004-04-21 devnull
338 a31db67d 2004-04-21 devnull bad:
339 a31db67d 2004-04-21 devnull errno = EILSEQ;
340 a31db67d 2004-04-21 devnull return -1;
341 a31db67d 2004-04-21 devnull }