/*
 * The authors of this software are Rob Pike and Ken Thompson.
 *              Copyright (c) 2002 by Lucent Technologies.
 * Permission to use, copy, modify, and distribute this software for any
 * purpose without fee is hereby granted, provided that this entire notice
 * is included in all copies of any software which is or includes a copy
 * or modification of this software and in all copies of the supporting
 * documentation for such software.
 * THIS SOFTWARE IS BEING PROVIDED "AS IS", WITHOUT ANY EXPRESS OR IMPLIED
 * WARRANTY.  IN PARTICULAR, NEITHER THE AUTHORS NOR LUCENT TECHNOLOGIES MAKE
 * ANY REPRESENTATION OR WARRANTY OF ANY KIND CONCERNING THE MERCHANTABILITY
 * OF THIS SOFTWARE OR ITS FITNESS FOR ANY PARTICULAR PURPOSE.
 */
14 91c13e54 2004-02-29 devnull #include <stdarg.h>
15 91c13e54 2004-02-29 devnull #include <string.h>
16 e5aa96ac 2004-12-26 devnull #include "plan9.h"
17 91c13e54 2004-02-29 devnull #include "utf.h"
21 91c13e54 2004-02-29 devnull Bit1 = 7,
22 91c13e54 2004-02-29 devnull Bitx = 6,
23 91c13e54 2004-02-29 devnull Bit2 = 5,
24 91c13e54 2004-02-29 devnull Bit3 = 4,
25 91c13e54 2004-02-29 devnull Bit4 = 3,
26 0cadb430 2009-09-11 russcox Bit5 = 2,
28 91c13e54 2004-02-29 devnull T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */
29 91c13e54 2004-02-29 devnull Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */
30 91c13e54 2004-02-29 devnull T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */
31 91c13e54 2004-02-29 devnull T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */
32 91c13e54 2004-02-29 devnull T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */
33 0cadb430 2009-09-11 russcox T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */
35 0cadb430 2009-09-11 russcox Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */
36 0cadb430 2009-09-11 russcox Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */
37 0cadb430 2009-09-11 russcox Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */
38 0cadb430 2009-09-11 russcox Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */
40 91c13e54 2004-02-29 devnull Maskx = (1<<Bitx)-1, /* 0011 1111 */
41 91c13e54 2004-02-29 devnull Testx = Maskx ^ 0xFF, /* 1100 0000 */
43 cbeb0b26 2006-04-01 devnull Bad = Runeerror
47 91c13e54 2004-02-29 devnull chartorune(Rune *rune, char *str)
49 0cadb430 2009-09-11 russcox int c, c1, c2, c3;
53 91c13e54 2004-02-29 devnull * one character sequence
54 91c13e54 2004-02-29 devnull * 00000-0007F => T1
56 91c13e54 2004-02-29 devnull c = *(uchar*)str;
57 91c13e54 2004-02-29 devnull if(c < Tx) {
58 91c13e54 2004-02-29 devnull *rune = c;
59 91c13e54 2004-02-29 devnull return 1;
63 91c13e54 2004-02-29 devnull * two character sequence
64 91c13e54 2004-02-29 devnull * 0080-07FF => T2 Tx
66 91c13e54 2004-02-29 devnull c1 = *(uchar*)(str+1) ^ Tx;
67 91c13e54 2004-02-29 devnull if(c1 & Testx)
68 91c13e54 2004-02-29 devnull goto bad;
69 91c13e54 2004-02-29 devnull if(c < T3) {
70 91c13e54 2004-02-29 devnull if(c < T2)
71 91c13e54 2004-02-29 devnull goto bad;
72 91c13e54 2004-02-29 devnull l = ((c << Bitx) | c1) & Rune2;
73 91c13e54 2004-02-29 devnull if(l <= Rune1)
74 91c13e54 2004-02-29 devnull goto bad;
75 91c13e54 2004-02-29 devnull *rune = l;
76 91c13e54 2004-02-29 devnull return 2;
80 91c13e54 2004-02-29 devnull * three character sequence
81 91c13e54 2004-02-29 devnull * 0800-FFFF => T3 Tx Tx
83 91c13e54 2004-02-29 devnull c2 = *(uchar*)(str+2) ^ Tx;
84 91c13e54 2004-02-29 devnull if(c2 & Testx)
85 91c13e54 2004-02-29 devnull goto bad;
86 91c13e54 2004-02-29 devnull if(c < T4) {
87 91c13e54 2004-02-29 devnull l = ((((c << Bitx) | c1) << Bitx) | c2) & Rune3;
88 91c13e54 2004-02-29 devnull if(l <= Rune2)
89 91c13e54 2004-02-29 devnull goto bad;
90 91c13e54 2004-02-29 devnull *rune = l;
91 91c13e54 2004-02-29 devnull return 3;
95 0cadb430 2009-09-11 russcox * four character sequence
96 0cadb430 2009-09-11 russcox * 10000-10FFFF => T4 Tx Tx Tx
98 0cadb430 2009-09-11 russcox if(UTFmax >= 4) {
99 0cadb430 2009-09-11 russcox c3 = *(uchar*)(str+3) ^ Tx;
100 0cadb430 2009-09-11 russcox if(c3 & Testx)
101 0cadb430 2009-09-11 russcox goto bad;
102 0cadb430 2009-09-11 russcox if(c < T5) {
103 0cadb430 2009-09-11 russcox l = ((((((c << Bitx) | c1) << Bitx) | c2) << Bitx) | c3) & Rune4;
104 0cadb430 2009-09-11 russcox if(l <= Rune3)
105 0cadb430 2009-09-11 russcox goto bad;
106 0cadb430 2009-09-11 russcox if(l > Runemax)
107 0cadb430 2009-09-11 russcox goto bad;
108 0cadb430 2009-09-11 russcox *rune = l;
109 0cadb430 2009-09-11 russcox return 4;
114 91c13e54 2004-02-29 devnull * bad decoding
117 91c13e54 2004-02-29 devnull *rune = Bad;
118 91c13e54 2004-02-29 devnull return 1;
122 91c13e54 2004-02-29 devnull runetochar(char *str, Rune *rune)
127 91c13e54 2004-02-29 devnull * one character sequence
128 91c13e54 2004-02-29 devnull * 00000-0007F => 00-7F
130 91c13e54 2004-02-29 devnull c = *rune;
131 91c13e54 2004-02-29 devnull if(c <= Rune1) {
132 91c13e54 2004-02-29 devnull str[0] = c;
133 91c13e54 2004-02-29 devnull return 1;
137 91c13e54 2004-02-29 devnull * two character sequence
138 0cadb430 2009-09-11 russcox * 00080-007FF => T2 Tx
140 91c13e54 2004-02-29 devnull if(c <= Rune2) {
141 91c13e54 2004-02-29 devnull str[0] = T2 | (c >> 1*Bitx);
142 91c13e54 2004-02-29 devnull str[1] = Tx | (c & Maskx);
143 91c13e54 2004-02-29 devnull return 2;
147 91c13e54 2004-02-29 devnull * three character sequence
148 0cadb430 2009-09-11 russcox * 00800-0FFFF => T3 Tx Tx
150 0cadb430 2009-09-11 russcox if(c > Runemax)
151 0cadb430 2009-09-11 russcox c = Runeerror;
152 0cadb430 2009-09-11 russcox if(c <= Rune3) {
153 0cadb430 2009-09-11 russcox str[0] = T3 | (c >> 2*Bitx);
154 0cadb430 2009-09-11 russcox str[1] = Tx | ((c >> 1*Bitx) & Maskx);
155 0cadb430 2009-09-11 russcox str[2] = Tx | (c & Maskx);
156 0cadb430 2009-09-11 russcox return 3;
160 0cadb430 2009-09-11 russcox * four character sequence
161 0cadb430 2009-09-11 russcox * 010000-1FFFFF => T4 Tx Tx Tx
163 0cadb430 2009-09-11 russcox str[0] = T4 | (c >> 3*Bitx);
164 0cadb430 2009-09-11 russcox str[1] = Tx | ((c >> 2*Bitx) & Maskx);
165 0cadb430 2009-09-11 russcox str[2] = Tx | ((c >> 1*Bitx) & Maskx);
166 0cadb430 2009-09-11 russcox str[3] = Tx | (c & Maskx);
167 0cadb430 2009-09-11 russcox return 4;
171 91c13e54 2004-02-29 devnull runelen(long c)
173 91c13e54 2004-02-29 devnull Rune rune;
174 91c13e54 2004-02-29 devnull char str[10];
176 91c13e54 2004-02-29 devnull rune = c;
177 91c13e54 2004-02-29 devnull return runetochar(str, &rune);
181 91c13e54 2004-02-29 devnull runenlen(Rune *r, int nrune)
183 91c13e54 2004-02-29 devnull int nb, c;
186 91c13e54 2004-02-29 devnull while(nrune--) {
187 91c13e54 2004-02-29 devnull c = *r++;
188 91c13e54 2004-02-29 devnull if(c <= Rune1)
191 91c13e54 2004-02-29 devnull if(c <= Rune2)
192 91c13e54 2004-02-29 devnull nb += 2;
194 0cadb430 2009-09-11 russcox if(c <= Rune3 || c > Runemax)
195 91c13e54 2004-02-29 devnull nb += 3;
197 0cadb430 2009-09-11 russcox nb += 4;
199 91c13e54 2004-02-29 devnull return nb;
203 91c13e54 2004-02-29 devnull fullrune(char *str, int n)
207 0cadb430 2009-09-11 russcox if(n <= 0)
208 0cadb430 2009-09-11 russcox return 0;
209 0cadb430 2009-09-11 russcox c = *(uchar*)str;
210 0cadb430 2009-09-11 russcox if(c < Tx)
211 0cadb430 2009-09-11 russcox return 1;
212 0cadb430 2009-09-11 russcox if(c < T3)
213 0cadb430 2009-09-11 russcox return n >= 2;
214 0cadb430 2009-09-11 russcox if(UTFmax == 3 || c < T4)
215 0cadb430 2009-09-11 russcox return n >= 3;
216 0cadb430 2009-09-11 russcox return n >= 4;