Commit Diff


commit - 4dbf255619efac4f0a00e4216d6c999128910df2
commit + 0cadb4301d18724e7513d7489cb5bebd262c82f1
blob - 480ccad58d4538b5bf912b209231d4b41849e925
blob + 795e83e84fd9969bc146792074860d49546530f8
--- include/fmt.h
+++ include/fmt.h
@@ -30,7 +30,7 @@ struct Fmt{
 	void	*farg;			/* to make flush a closure */
 	int	nfmt;			/* num chars formatted so far */
 	va_list	args;			/* args passed to dofmt */
-	int	r;			/* % format Rune */
+	Rune	r;			/* % format Rune */
 	int	width;
 	int	prec;
 	unsigned long	flags;
blob - eb5ea42a8fb35a599cf3b1be760a3142fc5f7f34
blob + 44052f417281099c9095d6ffd26991018b6d84b9
--- include/utf.h
+++ include/utf.h
@@ -4,14 +4,15 @@
 extern "C" { 
 #endif
 
-typedef unsigned short Rune;	/* 16 bits */
+typedef unsigned int Rune;	/* 32 bits */
 
 enum
 {
-	UTFmax		= 3,		/* maximum bytes per rune */
+	UTFmax		= 4,		/* maximum bytes per rune */
 	Runesync	= 0x80,		/* cannot represent part of a UTF sequence (<) */
 	Runeself	= 0x80,		/* rune and UTF sequences are the same (<) */
-	Runeerror	= 0xFFFD		/* decoding error in UTF */
+	Runeerror	= 0xFFFD,	/* decoding error in UTF */
+	Runemax = 0x10FFFF	/* maximum rune value */
 };
 
 /* Edit .+1,/^$/ | cfn $PLAN9/src/lib9/utf/?*.c | grep -v static |grep -v __ */
blob - d77b52ee4e1db06c4a7074c57608adf683880a90
blob + 6f04595ded3c14c65c884bdb6433b9ab64124284
--- src/cmd/9term/wind.c
+++ src/cmd/9term/wind.c
@@ -193,7 +193,7 @@ winctl(void *arg)
 	Rune *rp, *bp, *up, *kbdr;
 	uint qh;
 	int nr, nb, c, wid, i, npart, initial, lastb;
-	char *s, *t, part[3];
+	char *s, *t, part[UTFmax];
 	Window *w;
 	Mousestate *mp, m;
 	enum { WKey, WMouse, WMouseread, WCtl, WCwrite, WCread, WWread, NWALT };
blob - 5197e8a45f7c4a3bc4b6df6167eab470469588cc
blob + a58437ba2178b17fe7fd955297dce4fe1a720f38
--- src/cmd/acme/regx.c
+++ src/cmd/acme/regx.c
@@ -488,7 +488,7 @@ bldcclass(void)
 			exprp++;	/* eat '-' */
 			if((c2 = nextrec()) == ']')
 				goto Error;
-			classp[n+0] = 0xFFFF;
+			classp[n+0] = Runemax;
 			classp[n+1] = c1;
 			classp[n+2] = c2;
 			n += 3;
@@ -510,7 +510,7 @@ classmatch(int classno, int c, int negate)
 
 	p = class[classno];
 	while(*p){
-		if(*p == 0xFFFF){
+		if(*p == Runemax){
 			if(p[1]<=c && c<=p[2])
 				return !negate;
 			p += 3;
blob - cdf00b9b087beb0eacef7c37ceb9620033d8d728
blob + 12510689daf638433011cf7f481a585ad46f4024
--- src/cmd/sam/cmd.c
+++ src/cmd/sam/cmd.c
@@ -71,7 +71,7 @@ int
 inputc(void)
 {
 	int n, nbuf;
-	char buf[3];
+	char buf[UTFmax];
 	Rune r;
 
     Again:
blob - 3477baeff0492dad47afedb8dc3eb580ade8b44c
blob + 3234d73133414b8f6f6745979575e438c6d575e0
--- src/cmd/sam/regexp.c
+++ src/cmd/sam/regexp.c
@@ -494,7 +494,7 @@ bldcclass(void)
 			exprp++;	/* eat '-' */
 			if((c2 = nextrec()) == ']')
 				goto Error;
-			classp[n+0] = 0xFFFF;
+			classp[n+0] = Runemax;
 			classp[n+1] = c1;
 			classp[n+2] = c2;
 			n += 3;
@@ -516,7 +516,7 @@ classmatch(int classno, int c, int negate)
 
 	p = class[classno];
 	while(*p){
-		if(*p == 0xFFFF){
+		if(*p == Runemax){
 			if(p[1]<=c && c<=p[2])
 				return !negate;
 			p += 3;
blob - 5849c83fb955a0f2b4534c6904f42c8a12dd19c4
blob + a78c9c66496d54118c6f150daf516610e6982faf
--- src/cmd/sed.c
+++ src/cmd/sed.c
@@ -615,7 +615,7 @@ compsub(Rune *rhs, Rune *end)
 	while ((r = *cp++) != '\0') {
 		if(r == '\\') {
 			if (rhs < end)
-				*rhs++ = 0xFFFF;
+				*rhs++ = Runemax;
 			else
 				return 0;
 			r = *cp++;
@@ -1050,7 +1050,7 @@ dosub(Rune *rhsbuf)
 			sp = place(sp, loc1, loc2);
 			continue;
 		}
-		if (c == 0xFFFF && (c = *rp++) >= '1' && c < MAXSUB+'0') {
+		if (c == Runemax && (c = *rp++) >= '1' && c < MAXSUB+'0') {
 			n = c-'0';
 			if (subexp[n].s.rsp && subexp[n].e.rep) {
 				sp = place(sp, subexp[n].s.rsp, subexp[n].e.rep);
blob - 39ba747c0b8870a85c3ca588f28a3de7a16a15dc
blob + 46e98e15d182be9044d7862a9555dbb5c351acba
--- src/cmd/tr.c
+++ src/cmd/tr.c
@@ -15,7 +15,7 @@ uchar	bits[] = { 1, 2, 4, 8, 16, 32, 64, 128 };
 #define	CLEARBIT(a,c)		((a)[(c)/8] &= ~bits[(c)&07])
 #define	BITSET(a,c)		((a)[(c)/8] & bits[(c)&07])
 
-#define	MAXRUNE	0xFFFF
+#define	MAXRUNE	Runemax
 
 uchar	f[(MAXRUNE+1)/8];
 uchar	t[(MAXRUNE+1)/8];
blob - e52348966da5e4df420195df07ac936bc487555e
blob + c97b036ca8ee2f520857f95c466271d0e88efc66
--- src/cmd/troff/mbwc.c
+++ src/cmd/troff/mbwc.c
@@ -152,9 +152,9 @@ wcstombs(char *s, const wchar_t *pwcs, size_t n)
 		if(p+d <= pe+3) {
 			*p++ = buf[0];
 			if(d > 1) {
-				*p++ = buf[2];
+				*p++ = buf[1];
 				if(d > 2)
-					*p++ = buf[3];
+					*p++ = buf[2];
 			}
 		}
 		if(c == 0)
blob - a04472711b6590b8aa95e52960bfdfd447c453b9
blob + aec44b75005e57862fb5e82257ddafe69421addc
--- src/cmd/unicode.c
+++ src/cmd/unicode.c
@@ -51,13 +51,13 @@ range(char *argv[])
 			return "bad range";
 		}
 		min = strtoul(q, &q, 16);
-		if(min<0 || min>0xFFFF || *q!='-')
+		if(min<0 || min>Runemax || *q!='-')
 			goto err;
 		q++;
 		if(strchr(hex, *q) == 0)
 			goto err;
 		max = strtoul(q, &q, 16);
-		if(max<0 || max>0xFFFF || max<min || *q!=0)
+		if(max<0 || max>Runemax || max<min || *q!=0)
 			goto err;
 		i = 0;
 		do{
@@ -111,7 +111,7 @@ chars(char *argv[])
 			return "bad char";
 		}
 		m = strtoul(q, &q, 16);
-		if(m<0 || m>0xFFFF || *q!=0)
+		if(m<0 || m>Runemax || *q!=0)
 			goto err;
 		Bprint(&bout, "%C", m);
 		if(!text)
blob - 214b71db0e0fc30eb357ed107b3a3f25249509d2
blob + 353c76e37f25e0af8707c5c72199b28bd663eddb
--- src/lib9/fmt/dofmt.c
+++ src/lib9/fmt/dofmt.c
@@ -605,12 +605,13 @@ __flagfmt(Fmt *f)
 int
 __badfmt(Fmt *f)
 {
-	char x[3];
+	char x[2+UTFmax];
+	int n;
 
 	x[0] = '%';
-	x[1] = f->r;
-	x[2] = '%';
-	f->prec = 3;
-	__fmtcpy(f, (const void*)x, 3, 3);
+	n = 1 + runetochar(x+1, &f->r);
+	x[n++] = '%';
+	f->prec = n;
+	__fmtcpy(f, (const void*)x, n, n);
 	return 0;
 }
blob - 3d6831b029f61522f122c51bebecf0e488ff88ef
blob + f5944806f28d457c6d0875524ca20b338733e440
--- src/lib9/utf/rune.c
+++ src/lib9/utf/rune.c
@@ -23,16 +23,19 @@ enum
 	Bit2	= 5,
 	Bit3	= 4,
 	Bit4	= 3,
+	Bit5	= 2,
 
 	T1	= ((1<<(Bit1+1))-1) ^ 0xFF,	/* 0000 0000 */
 	Tx	= ((1<<(Bitx+1))-1) ^ 0xFF,	/* 1000 0000 */
 	T2	= ((1<<(Bit2+1))-1) ^ 0xFF,	/* 1100 0000 */
 	T3	= ((1<<(Bit3+1))-1) ^ 0xFF,	/* 1110 0000 */
 	T4	= ((1<<(Bit4+1))-1) ^ 0xFF,	/* 1111 0000 */
+	T5	= ((1<<(Bit5+1))-1) ^ 0xFF,	/* 1111 1000 */
 
-	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0111 1111 */
-	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0111 1111 1111 */
-	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 1111 1111 1111 1111 */
+	Rune1	= (1<<(Bit1+0*Bitx))-1,		/* 0000 0000 0000 0000 0111 1111 */
+	Rune2	= (1<<(Bit2+1*Bitx))-1,		/* 0000 0000 0000 0111 1111 1111 */
+	Rune3	= (1<<(Bit3+2*Bitx))-1,		/* 0000 0000 1111 1111 1111 1111 */
+	Rune4	= (1<<(Bit4+3*Bitx))-1,		/* 0001 1111 1111 1111 1111 1111 */
 
 	Maskx	= (1<<Bitx)-1,			/* 0011 1111 */
 	Testx	= Maskx ^ 0xFF,			/* 1100 0000 */
@@ -43,7 +46,7 @@ enum
 int
 chartorune(Rune *rune, char *str)
 {
-	int c, c1, c2;
+	int c, c1, c2, c3;
 	long l;
 
 	/*
@@ -89,6 +92,25 @@ chartorune(Rune *rune, char *str)
 	}
 
 	/*
+	 * four character sequence
+	 *	10000-10FFFF => T4 Tx Tx Tx
+	 */
+	if(UTFmax >= 4) {
+		c3 = *(uchar*)(str+3) ^ Tx;
+		if(c3 & Testx)
+			goto bad;
+		if(c < T5) {
+			l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
+			if(l <= Rune3)
+				goto bad;
+			if(l > Runemax)
+				goto bad;
+			*rune = l;
+			return 4;
+		}
+	}
+
+	/*
 	 * bad decoding
 	 */
 bad:
@@ -113,7 +135,7 @@ runetochar(char *str, Rune *rune)
 
 	/*
 	 * two character sequence
-	 *	0080-07FF => T2 Tx
+	 *	00080-007FF => T2 Tx
 	 */
 	if(c <= Rune2) {
 		str[0] = T2 | (c >> 1*Bitx);
@@ -123,12 +145,26 @@ runetochar(char *str, Rune *rune)
 
 	/*
 	 * three character sequence
-	 *	0800-FFFF => T3 Tx Tx
+	 *	00800-0FFFF => T3 Tx Tx
 	 */
-	str[0] = T3 |  (c >> 2*Bitx);
-	str[1] = Tx | ((c >> 1*Bitx) & Maskx);
-	str[2] = Tx |  (c & Maskx);
-	return 3;
+	if(c > Runemax)
+		c = Runeerror;
+	if(c <= Rune3) {
+		str[0] = T3 |  (c >> 2*Bitx);
+		str[1] = Tx | ((c >> 1*Bitx) & Maskx);
+		str[2] = Tx |  (c & Maskx);
+		return 3;
+	}
+	
+	/*
+	 * four character sequence
+	 *	010000-10FFFF => T4 Tx Tx Tx
+	 */
+	str[0] = T4 |  (c >> 3*Bitx);
+	str[1] = Tx | ((c >> 2*Bitx) & Maskx);
+	str[2] = Tx | ((c >> 1*Bitx) & Maskx);
+	str[3] = Tx |  (c & Maskx);
+	return 4;
 }
 
 int
@@ -155,7 +191,10 @@ runenlen(Rune *r, int nrune)
 		if(c <= Rune2)
 			nb += 2;
 		else
+		if(c <= Rune3 || c > Runemax)
 			nb += 3;
+		else
+			nb += 4;
 	}
 	return nb;
 }
@@ -165,13 +204,14 @@ fullrune(char *str, int n)
 {
 	int c;
 
-	if(n > 0) {
-		c = *(uchar*)str;
-		if(c < Tx)
-			return 1;
-		if(n > 1)
-			if(c < T3 || n > 2)
-				return 1;
-	}
-	return 0;
+	if(n <= 0)
+		return 0;
+	c = *(uchar*)str;
+	if(c < Tx)
+		return 1;
+	if(c < T3)
+		return n >= 2;
+	if(UTFmax == 3 || c < T4)
+		return n >= 3;
+	return n >= 4;
 }
blob - f1b065bcee408c25c163162aed6da34eb61cc815
blob + 46991e161f4ecc5c6958c9b84c7bc207d9b519f3
--- src/libbio/bgetrune.c
+++ src/libbio/bgetrune.c
@@ -7,7 +7,7 @@ Bgetrune(Biobuf *bp)
 {
 	int c, i;
 	Rune rune;
-	char str[4];
+	char str[UTFmax];
 
 	c = Bgetc(bp);
 	if(c < Runeself) {		/* one char */
blob - a2eaa83ec28212f175c199290c7931c7ed11671f
blob + 651ae7b762e78ef77b7d78e06e174deb8c5a8ee0
--- src/libbio/bputrune.c
+++ src/libbio/bputrune.c
@@ -6,7 +6,7 @@ int
 Bputrune(Biobuf *bp, long c)
 {
 	Rune rune;
-	char str[4];
+	char str[UTFmax];
 	int n;
 
 	rune = c;