Blob
1 #ifdef PLAN92 #include <u.h>3 #include <libc.h>4 #include <bio.h>5 #ifdef PLAN9PORT6 #include <errno.h>7 #else8 extern int errno;9 #endif10 #else11 #include <sys/types.h>12 #include <stdio.h>13 #include <stdlib.h>14 #include <string.h>15 #include <unistd.h>16 #include <errno.h>17 #include "plan9.h"18 #endif19 #include "hdr.h"20 #ifndef EILSEQ21 #define EILSEQ 999822 #endif24 /*25 the our_* routines are implementations for the corresponding library26 routines. for a while, i tried to actually name them wctomb etc27 but stopped that after i found a system which made wchar_t an28 unsigned char.29 */31 int our_wctomb(char *s, unsigned long wc);32 int our_mbtowc(unsigned long *p, char *s, unsigned n);33 int runetoisoutf(char *str, Rune *rune);34 int fullisorune(char *str, int n);35 int isochartorune(Rune *rune, char *str);37 void38 utf_in(int fd, long *notused, struct convert *out)39 {40 char buf[N];41 int i, j, c, n, tot;42 ulong l;44 USED(notused);45 tot = 0;46 while((n = read(fd, buf+tot, N-tot)) >= 0){47 tot += n;48 for(i=j=0; i<=tot-UTFmax || (i<tot && (n==0 || fullrune(buf+i, tot-i))); ){49 c = our_mbtowc(&l, buf+i, tot-i);50 if(c == -1){51 if(squawk)52 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);53 if(clean){54 i++;55 continue;56 }57 nerrors++;58 l = Runeerror;59 c = 1;60 }61 runes[j++] = l;62 i += c;63 }64 OUT(out, runes, j);65 tot -= i;66 ninput += i;67 if(tot)68 memmove(buf, buf+i, tot);69 if(n == 0)70 break;71 }72 OUT(out, runes, 0);73 }75 void76 utf_out(Rune *base, int n, long *notused)77 {78 char *p;79 Rune *r;81 USED(notused);82 nrunes += n;83 for(r = base, p = obuf; n-- > 0; r++){84 p += our_wctomb(p, *r);85 }86 noutput += p-obuf;87 write(1, obuf, p-obuf);88 }90 void91 isoutf_in(int fd, long *notused, struct convert *out)92 {93 char buf[N];94 int i, j, c, n, tot;96 USED(notused);97 tot = 0;98 while((n = read(fd, buf+tot, N-tot)) >= 0){99 tot += n;100 for(i=j=0; i<tot; ){101 if(!fullisorune(buf+i, tot-i))102 break;103 c = isochartorune(&runes[j], buf+i);104 if(runes[j] == Runeerror && c == 1){105 if(squawk)106 EPR "%s: bad UTF sequence near byte %ld in input\n", argv0, ninput+i);107 if(clean){108 i++;109 continue;110 }111 nerrors++;112 }113 j++;114 i += c;115 }116 OUT(out, runes, j);117 tot -= i;118 ninput += i;119 if(tot)120 memmove(buf, buf+i, tot);121 if(n == 0)122 break;123 }124 OUT(out, runes, 0);125 }127 void128 isoutf_out(Rune *base, int n, long *notused)129 {130 char *p;131 Rune *r;133 USED(notused);134 nrunes += n;135 for(r = base, p = obuf; n-- > 0; r++)136 p += runetoisoutf(p, r);137 noutput += p-obuf;138 write(1, obuf, p-obuf);139 }142 int143 isochartorune(Rune *rune, char *str)144 {145 return chartorune(rune, str);146 }148 int149 runetoisoutf(char *str, Rune *rune)150 {151 return runetochar(str, rune);152 }154 int155 fullisorune(char *str, int n)156 {157 return fullrune(str, n);158 }160 enum161 {162 T1 = 0x00,163 Tx = 0x80,164 T2 = 0xC0,165 T3 = 0xE0,166 T4 = 0xF0,167 T5 = 0xF8,168 T6 = 0xFC,170 Bit1 = 7,171 Bitx = 6,172 Bit2 = 5,173 Bit3 = 4,174 Bit4 = 3,175 Bit5 = 2,176 Bit6 = 2,178 Mask1 = (1<<Bit1)-1,179 Maskx = (1<<Bitx)-1,180 Mask2 = (1<<Bit2)-1,181 Mask3 = (1<<Bit3)-1,182 Mask4 = (1<<Bit4)-1,183 Mask5 = (1<<Bit5)-1,184 Mask6 = (1<<Bit6)-1,186 Wchar1 = (1UL<<Bit1)-1,187 Wchar2 = (1UL<<(Bit2+Bitx))-1,188 Wchar3 = (1UL<<(Bit3+2*Bitx))-1,189 Wchar4 = (1UL<<(Bit4+3*Bitx))-1,190 Wchar5 = (1UL<<(Bit5+4*Bitx))-1191 };193 int194 our_wctomb(char *s, unsigned long wc)195 {196 if(s == 0)197 return 0; /* no shift states */198 if(wc & ~Wchar2) {199 if(wc & ~Wchar4) {200 if(wc & ~Wchar5) {201 /* 6 bytes */202 s[0] = T6 | ((wc >> 5*Bitx) & Mask6);203 s[1] = Tx | ((wc >> 4*Bitx) & Maskx);204 s[2] = Tx | ((wc >> 3*Bitx) & Maskx);205 s[3] = Tx | ((wc >> 2*Bitx) & Maskx);206 s[4] = Tx | ((wc >> 1*Bitx) & Maskx);207 s[5] = Tx | (wc & Maskx);208 return 6;209 }210 /* 5 bytes */211 s[0] = T5 | (wc >> 4*Bitx);212 s[1] = Tx | ((wc >> 3*Bitx) & Maskx);213 s[2] = Tx | ((wc >> 2*Bitx) & Maskx);214 s[3] = Tx | ((wc >> 1*Bitx) & Maskx);215 s[4] = Tx | (wc & Maskx);216 return 5;217 }218 if(wc & ~Wchar3) {219 /* 4 bytes */220 s[0] = T4 | (wc >> 3*Bitx);221 s[1] = Tx | ((wc >> 2*Bitx) & Maskx);222 s[2] = Tx | ((wc >> 1*Bitx) & Maskx);223 s[3] = Tx | (wc & Maskx);224 return 4;225 }226 /* 3 bytes */227 s[0] = T3 | (wc >> 2*Bitx);228 s[1] = Tx | ((wc >> 1*Bitx) & Maskx);229 s[2] = Tx | (wc & Maskx);230 return 3;231 }232 if(wc & ~Wchar1) {233 /* 2 bytes */234 s[0] = T2 | (wc >> 1*Bitx);235 s[1] = Tx | (wc & Maskx);236 return 2;237 }238 /* 1 byte */239 s[0] = T1 | wc;240 return 1;241 }243 int244 our_mbtowc(unsigned long *p, char *s, unsigned n)245 {246 uchar *us;247 int c0, c1, c2, c3, c4, c5;248 unsigned long wc;250 if(s == 0)251 return 0; /* no shift states */253 if(n < 1)254 goto bad;255 us = (uchar*)s;256 c0 = us[0];257 if(c0 >= T3) {258 if(n < 3)259 goto bad;260 c1 = us[1] ^ Tx;261 c2 = us[2] ^ Tx;262 if((c1|c2) & T2)263 goto bad;264 if(c0 >= T5) {265 if(n < 5)266 goto bad;267 c3 = us[3] ^ Tx;268 c4 = us[4] ^ Tx;269 if((c3|c4) & T2)270 goto bad;271 if(c0 >= T6) {272 /* 6 bytes */273 if(n < 6)274 goto bad;275 c5 = us[5] ^ Tx;276 if(c5 & T2)277 goto bad;278 wc = ((((((((((c0 & Mask6) << Bitx) |279 c1) << Bitx) | c2) << Bitx) |280 c3) << Bitx) | c4) << Bitx) | c5;281 if(wc <= Wchar5)282 goto bad;283 *p = wc;284 return 6;285 }286 /* 5 bytes */287 wc = ((((((((c0 & Mask5) << Bitx) |288 c1) << Bitx) | c2) << Bitx) |289 c3) << Bitx) | c4;290 if(wc <= Wchar4)291 goto bad;292 *p = wc;293 return 5;294 }295 if(c0 >= T4) {296 /* 4 bytes */297 if(n < 4)298 goto bad;299 c3 = us[3] ^ Tx;300 if(c3 & T2)301 goto bad;302 wc = ((((((c0 & Mask4) << Bitx) |303 c1) << Bitx) | c2) << Bitx) |304 c3;305 if(wc <= Wchar3)306 goto bad;307 *p = wc;308 return 4;309 }310 /* 3 bytes */311 wc = ((((c0 & Mask3) << Bitx) |312 c1) << Bitx) | c2;313 if(wc <= Wchar2)314 goto bad;315 *p = wc;316 return 3;317 }318 if(c0 >= T2) {319 /* 2 bytes */320 if(n < 2)321 goto bad;322 c1 = us[1] ^ Tx;323 if(c1 & T2)324 goto bad;325 wc = ((c0 & Mask2) << Bitx) |326 c1;327 if(wc <= Wchar1)328 goto bad;329 *p = wc;330 return 2;331 }332 /* 1 byte */333 if(c0 >= Tx)334 goto bad;335 *p = c0;336 return 1;338 bad:339 errno = EILSEQ;340 return -1;341 }