commit 31bffaba60d1bdce33fcee001f8bfe718871e714 from: Russ Cox date: Fri Feb 04 20:41:39 2011 UTC eqn: add working mbtowc and wctomb Fixes UTF-8 input files. R=rsc http://codereview.appspot.com/4132042 commit - ff262e102983021abc2223e83131eea1f7eb5d58 commit + 31bffaba60d1bdce33fcee001f8bfe718871e714 blob - e66212ee99820b96da39b4c336eb204e73207ec8 blob + 0ef1b9ea73165b984822a2cbe7854ff4d4f672f7 --- src/cmd/eqn/mkfile +++ src/cmd/eqn/mkfile @@ -15,6 +15,7 @@ OFILES=main.$O\ lookup.$O\ mark.$O\ matrix.$O\ + mbwc.$O\ move.$O\ over.$O\ paren.$O\ blob - /dev/null blob + c97b036ca8ee2f520857f95c466271d0e88efc66 (mode 644) --- /dev/null +++ src/cmd/eqn/mbwc.c @@ -0,0 +1,165 @@ +#include + +/* + * Use the FSS-UTF transformation proposed by posix. + * We define 7 byte types: + * T0 0xxxxxxx 7 free bits + * Tx 10xxxxxx 6 free bits + * T1 110xxxxx 5 free bits + * T2 1110xxxx 4 free bits + * + * Encoding is as follows. + * From hex Thru hex Sequence Bits + * 00000000 0000007F T0 7 + * 00000080 000007FF T1 Tx 11 + * 00000800 0000FFFF T2 Tx Tx 16 + */ + +int +mblen(const char *s, size_t n) +{ + + return mbtowc(0, s, n); +} + +int +mbtowc(wchar_t *pwc, const char *s, size_t n) +{ + int c, c1, c2; + long l; + + if(!s) + return 0; + + if(n < 1) + goto bad; + c = s[0] & 0xff; + if((c & 0x80) == 0x00) { + if(pwc) + *pwc = c; + if(c == 0) + return 0; + return 1; + } + + if(n < 2) + goto bad; + c1 = (s[1] ^ 0x80) & 0xff; + if((c1 & 0xC0) != 0x00) + goto bad; + if((c & 0xE0) == 0xC0) { + l = ((c << 6) | c1) & 0x7FF; + if(l < 0x080) + goto bad; + if(pwc) + *pwc = l; + return 2; + } + + if(n < 3) + goto bad; + c2 = (s[2] ^ 0x80) & 0xff; + if((c2 & 0xC0) != 0x00) + goto bad; + if((c & 0xF0) == 0xE0) { + l = ((((c << 6) | c1) << 6) | c2) & 0xFFFF; + if(l < 0x0800) + goto bad; + if(pwc) + *pwc = l; + return 3; + } + + /* + * bad decoding + */ +bad: + return -1; + +} + +int +wctomb(char *s, wchar_t wchar) +{ + long c; + + if(!s) + return 0; + + c = wchar & 0xFFFF; + if(c < 0x80) { + s[0] = c; + return 1; + } + + if(c < 0x800) { + s[0] = 0xC0 | (c >> 6); + s[1] = 0x80 | (c & 0x3F); + return 2; + } + + s[0] = 0xE0 | (c >> 12); + s[1] = 0x80 | ((c >> 6) & 0x3F); + s[2] = 0x80 | (c & 0x3F); + return 3; +} + +size_t +mbstowcs(wchar_t *pwcs, const char *s, size_t n) +{ + int i, d, c; + + for(i=0; i < n; i++) { + c = *s & 0xff; + if(c < 0x80) { + *pwcs = c; + if(c == 0) + break; + s++; + } else { + d = mbtowc(pwcs, s, 3); + if(d <= 0) + return (size_t)((d<0) ? -1 : i); + s += d; + } + pwcs++; + } + return i; +} + +size_t +wcstombs(char *s, const wchar_t *pwcs, size_t n) +{ + int d; + long c; + char *p, *pe; + char buf[3]; + + p = s; + pe = p+n-3; + while(p < pe) { + c = *pwcs++; + if(c < 0x80) + *p++ = c; + else + p += wctomb(p, c); + if(c == 0) + return p-s; + } + while(p < pe+3) { + c = *pwcs++; + d = wctomb(buf, c); + if(p+d <= pe+3) { + *p++ = buf[0]; + if(d > 1) { + *p++ = buf[1]; + if(d > 2) + *p++ = buf[2]; + } + } + if(c == 0) + break; + } + return p-s; +} +