commit 0cadb4301d18724e7513d7489cb5bebd262c82f1 from: Russ Cox date: Fri Sep 11 21:03:06 2009 UTC convert to 4-byte UTF-8 and 32-bit Rune http://codereview.appspot.com/116075 commit - 4dbf255619efac4f0a00e4216d6c999128910df2 commit + 0cadb4301d18724e7513d7489cb5bebd262c82f1 blob - 480ccad58d4538b5bf912b209231d4b41849e925 blob + 795e83e84fd9969bc146792074860d49546530f8 --- include/fmt.h +++ include/fmt.h @@ -30,7 +30,7 @@ struct Fmt{ void *farg; /* to make flush a closure */ int nfmt; /* num chars formatted so far */ va_list args; /* args passed to dofmt */ - int r; /* % format Rune */ + Rune r; /* % format Rune */ int width; int prec; unsigned long flags; blob - eb5ea42a8fb35a599cf3b1be760a3142fc5f7f34 blob + 44052f417281099c9095d6ffd26991018b6d84b9 --- include/utf.h +++ include/utf.h @@ -4,14 +4,15 @@ extern "C" { #endif -typedef unsigned short Rune; /* 16 bits */ +typedef unsigned int Rune; /* 32 bits */ enum { - UTFmax = 3, /* maximum bytes per rune */ + UTFmax = 4, /* maximum bytes per rune */ Runesync = 0x80, /* cannot represent part of a UTF sequence (<) */ Runeself = 0x80, /* rune and UTF sequences are the same (<) */ - Runeerror = 0xFFFD /* decoding error in UTF */ + Runeerror = 0xFFFD, /* decoding error in UTF */ + Runemax = 0x10FFFF /* maximum rune value */ }; /* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */ blob - d77b52ee4e1db06c4a7074c57608adf683880a90 blob + 6f04595ded3c14c65c884bdb6433b9ab64124284 --- src/cmd/9term/wind.c +++ src/cmd/9term/wind.c @@ -193,7 +193,7 @@ winctl(void *arg) Rune *rp, *bp, *up, *kbdr; uint qh; int nr, nb, c, wid, i, npart, initial, lastb; - char *s, *t, part[3]; + char *s, *t, part[UTFmax]; Window *w; Mousestate *mp, m; enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT }; blob - 5197e8a45f7c4a3bc4b6df6167eab470469588cc blob + a58437ba2178b17fe7fd955297dce4fe1a720f38 --- src/cmd/acme/regx.c +++ src/cmd/acme/regx.c @@ -488,7 +488,7 @@ bldcclass(void) exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -510,7 +510,7 @@ classmatch(int classno, int c, int negate) p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; blob - cdf00b9b087beb0eacef7c37ceb9620033d8d728 blob + 12510689daf638433011cf7f481a585ad46f4024 --- src/cmd/sam/cmd.c +++ src/cmd/sam/cmd.c @@ -71,7 +71,7 @@ int inputc(void) { int n, nbuf; - char buf[3]; + char buf[UTFmax]; Rune r; Again: blob - 3477baeff0492dad47afedb8dc3eb580ade8b44c blob + 3234d73133414b8f6f6745979575e438c6d575e0 --- src/cmd/sam/regexp.c +++ src/cmd/sam/regexp.c @@ -494,7 +494,7 @@ bldcclass(void) exprp++; /* eat '-' */ if((c2 = nextrec()) == ']') goto Error; - classp[n+0] = 0xFFFF; + classp[n+0] = Runemax; classp[n+1] = c1; classp[n+2] = c2; n += 3; @@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate) p = class[classno]; while(*p){ - if(*p == 0xFFFF){ + if(*p == Runemax){ if(p[1]<=c && c<=p[2]) return !negate; p += 3; blob - 5849c83fb955a0f2b4534c6904f42c8a12dd19c4 blob + a78c9c66496d54118c6f150daf516610e6982faf --- src/cmd/sed.c +++ src/cmd/sed.c @@ -615,7 +615,7 @@ compsub(Rune *rhs, Rune *end) while ((r = *cp++) != '\0') { if(r == '\\') { if (rhs < end) - *rhs++ = 0xFFFF; + *rhs++ = Runemax; else return 0; r = *cp++; @@ -1050,7 +1050,7 @@ dosub(Rune *rhsbuf) sp = place(sp, loc1, loc2); continue; } - if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB+'0') { + if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB+'0') { n = c-'0'; if (subexp[n].s.rsp && subexp[n].e.rep) { sp = place(sp, subexp[n].s.rsp, subexp[n].e.rep); blob - 39ba747c0b8870a85c3ca588f28a3de7a16a15dc blob + 46e98e15d182be9044d7862a9555dbb5c351acba --- src/cmd/tr.c +++ src/cmd/tr.c @@ -15,7 +15,7 @@ uchar bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 }; #define CLEARBIT(a,c) ((a)[(c)/8] &= ~bits[(c)&07]) #define BITSET(a,c) ((a)[(c)/8] & bits[(c)&07]) -#define MAXRUNE 0xFFFF +#define MAXRUNE Runemax uchar f[(MAXRUNE+1)/8]; uchar t[(MAXRUNE+1)/8]; blob - e52348966da5e4df420195df07ac936bc487555e blob + c97b036ca8ee2f520857f95c466271d0e88efc66 --- src/cmd/troff/mbwc.c +++ src/cmd/troff/mbwc.c @@ -152,9 +152,9 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n) if(p+d <= pe+3) { *p++ = buf[0]; if(d > 1) { - *p++ = buf[2]; + *p++ = buf[1]; if(d > 2) - *p++ = buf[3]; + *p++ = buf[2]; } } if(c == 0) blob - a04472711b6590b8aa95e52960bfdfd447c453b9 blob + aec44b75005e57862fb5e82257ddafe69421addc --- src/cmd/unicode.c +++ src/cmd/unicode.c @@ -51,13 +51,13 @@ range(char *argv[]) return "bad range"; } min = strtoul(q, &q, 16); - if(min<0 || min>0xFFFF || *q!='-') + if(min<0 || min>Runemax || *q!='-') goto err; q++; if(strchr(hex, *q) == 0) goto err; max = strtoul(q, &q, 16); - if(max<0 || max>0xFFFF || maxRunemax || max0xFFFF || *q!=0) + if(m<0 || m>Runemax || *q!=0) goto err; Bprint(&bout, "%C", m); if(!text) blob - 214b71db0e0fc30eb357ed107b3a3f25249509d2 blob + 353c76e37f25e0af8707c5c72199b28bd663eddb --- src/lib9/fmt/dofmt.c +++ src/lib9/fmt/dofmt.c @@ -605,12 +605,13 @@ __flagfmt(Fmt *f) int __badfmt(Fmt *f) { - char x[3]; + char x[2+UTFmax]; + int n; x[0] = '%'; - x[1] = f->r; - x[2] = '%'; - f->prec = 3; - __fmtcpy(f, (const void*)x, 3, 3); + n = 1 + runetochar(x+1, &f->r); + x[n++] = '%'; + f->prec = n; + __fmtcpy(f, (const void*)x, n, n); return 0; } blob - 3d6831b029f61522f122c51bebecf0e488ff88ef blob + f5944806f28d457c6d0875524ca20b338733e440 --- src/lib9/utf/rune.c +++ src/lib9/utf/rune.c @@ -23,16 +23,19 @@ enum Bit2 = 5, Bit3 = 4, Bit4 = 3, + Bit5 = 2, T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ - Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0111 1111 */ - Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0111 1111 1111 */ - Rune3 = (1<<(Bit3+2*Bitx))-1, /* 1111 1111 1111 1111 */ + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ Maskx = (1< T4 Tx Tx Tx + */ + if(UTFmax >= 4) { + c3 = *(uchar*)(str+3) ^ Tx; + if(c3 & Testx) + goto bad; + if(c < T5) { + l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4; + if(l <= Rune3) + goto bad; + if(l > Runemax) + goto bad; + *rune = l; + return 4; + } + } + + /* * bad decoding */ bad: @@ -113,7 +135,7 @@ runetochar(char *str, Rune *rune) /* * two character sequence - * 0080-07FF => T2 Tx + * 00080-007FF => T2 Tx */ if(c <= Rune2) { str[0] = T2 | (c >> 1*Bitx); @@ -123,12 +145,26 @@ runetochar(char *str, Rune *rune) /* * three character sequence - * 0800-FFFF => T3 Tx Tx + * 00800-0FFFF => T3 Tx Tx */ - str[0] = T3 | (c >> 2*Bitx); - str[1] = Tx | ((c >> 1*Bitx) & Maskx); - str[2] = Tx | (c & Maskx); - return 3; + if(c > Runemax) + c = Runeerror; + if(c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* + * four character sequence + * 010000-1FFFFF => T4 Tx Tx Tx + */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; } int @@ -155,7 +191,10 @@ runenlen(Rune *r, int nrune) if(c <= Rune2) nb += 2; else + if(c <= Rune3 || c > Runemax) nb += 3; + else + nb += 4; } return nb; } @@ -165,13 +204,14 @@ fullrune(char *str, int n) { int c; - if(n > 0) { - c = *(uchar*)str; - if(c < Tx) - return 1; - if(n > 1) - if(c < T3 || n > 2) - return 1; - } - return 0; + if(n <= 0) + return 0; + c = *(uchar*)str; + if(c < Tx) + return 1; + if(c < T3) + return n >= 2; + if(UTFmax == 3 || c < T4) + return n >= 3; + return n >= 4; } blob - f1b065bcee408c25c163162aed6da34eb61cc815 blob + 46991e161f4ecc5c6958c9b84c7bc207d9b519f3 --- src/libbio/bgetrune.c +++ src/libbio/bgetrune.c @@ -7,7 +7,7 @@ Bgetrune(Biobuf *bp) { int c, i; Rune rune; - char str[4]; + char str[UTFmax]; c = Bgetc(bp); if(c < Runeself) { /* one char */ blob - a2eaa83ec28212f175c199290c7931c7ed11671f blob + 651ae7b762e78ef77b7d78e06e174deb8c5a8ee0 --- src/libbio/bputrune.c +++ src/libbio/bputrune.c @@ -6,7 +6,7 @@ int Bputrune(Biobuf *bp, long c) { Rune rune; - char str[4]; + char str[UTFmax]; int n; rune = c;