op public repos

Blob

Date:: Thu Jul 15 18:55:31 2021 UTC
Message:: keep an inline version of decode
Actions:: History | Blame | Raw File
1 /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include <assert.h>
25 #include <stddef.h>
26 #include <stdint.h>
27 #include <wchar.h>
28 
29 #include "telescope.h"
30 #include "utf8.h"
31 
32 #define UTF8_ACCEPT 0
33 #define UTF8_REJECT 1
34 
/*
 * Lookup table driving decode().
 *
 * Entries 0..255 map an input byte to its character class; decode()
 * reads them as utf8d[byte].
 *
 * Entries 256..399 are the transition table: nine rows (states 0..8)
 * of sixteen columns (one per character class), read by decode() as
 * utf8d[256 + state*16 + class].  State 0 is UTF8_ACCEPT and state 1
 * is UTF8_REJECT.
 */
static const uint8_t utf8d[] = {
	/* byte -> character class */
	/* 00..0f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 10..1f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 20..2f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 30..3f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 40..4f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 50..5f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 60..6f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 70..7f */  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	/* 80..8f */  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* 90..9f */  9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9, 9,
	/* a0..af */  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	/* b0..bf */  7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
	/* c0..cf */  8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* d0..df */  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	/* e0..ef */ 10, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 3, 3,
	/* f0..ff */ 11, 6, 6, 6, 5, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,

	/* (state, class) -> next state */
	/* s0 */      0, 1, 2, 3, 5, 8, 7, 1, 1, 1, 4, 6, 1, 1, 1, 1,
	/* s1 */      1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	/* s2 */      1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1,
	/* s3 */      1, 2, 1, 1, 1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1,
	/* s4 */      1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
	/* s5 */      1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1,
	/* s6 */      1, 1, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
	/* s7 */      1, 3, 1, 1, 1, 1, 1, 3, 1, 3, 1, 1, 1, 1, 1, 1,
	/* s8 */      1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
};
51 
52 static inline uint32_t
53 decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
54 {
55 	uint32_t type = utf8d[byte];
56 
57 	*codep = (*state != UTF8_ACCEPT) ?
58 		(byte & 0x3fu) | (*codep << 6) :
59 		(0xff >> type) & (byte);
60 
61 	*state = utf8d[256 + *state*16 + type];
62 	return *state;
63 }
64 
65 
66 /* end of the converter, utility functions ahead */
67 
/*
 * Exported entry point for the decoder: forwards its arguments to the
 * file-local decode() unchanged and returns the state it reports.
 */
uint32_t
utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
{
	const uint32_t next = decode(state, codep, byte);

	return next;
}
74 
/*
 * Encode the code point cp as UTF-8 into s.  s must be at least 4
 * bytes wide; no NUL terminator is appended after the sequence.
 *
 * Returns the number of bytes written (1..4).  If cp cannot be
 * encoded (greater than 0x10FFFF, or a UTF-16 surrogate in
 * 0xD800..0xDFFF, which is not valid in UTF-8) s[0] is set to '\0'
 * and 0 is returned.
 */
size_t
utf8_encode(uint32_t cp, char *s)
{
	/* surrogates would otherwise fall in the 3-byte range below */
	if (cp >= 0xD800 && cp <= 0xDFFF) {
		s[0] = '\0';
		return 0;
	}

	if (cp <= 0x7F) {
		s[0] = (uint8_t)cp;
		return 1;
	} else if (cp <= 0x7FF) {
		s[1] = (uint8_t)(( cp        & 0x3F) + 0x80);
		s[0] = (uint8_t)(((cp >>  6) & 0x1F) + 0xC0);
		return 2;
	} else if (cp <= 0xFFFF) {
		s[2] = (uint8_t)(( cp        & 0x3F) + 0x80);
		s[1] = (uint8_t)(((cp >>  6) & 0x3F) + 0x80);
		s[0] = (uint8_t)(((cp >> 12) & 0x0F) + 0xE0);
		return 3;
	} else if (cp <= 0x10FFFF) {
		s[3] = (uint8_t)(( cp        & 0x3F) + 0x80);
		s[2] = (uint8_t)(((cp >>  6) & 0x3F) + 0x80);
		s[1] = (uint8_t)(((cp >> 12) & 0x3F) + 0x80);
		s[0] = (uint8_t)(((cp >> 18) & 0x07) + 0xF0);
		return 4;
	} else {
		s[0] = '\0';
		return 0;
	}
}
102 
103 char *
104 utf8_nth(char *s, size_t n)
105 {
106 	size_t i;
107 	uint32_t cp = 0, state = 0;
108 
109 	for (i = 0; *s && i < n; ++s)
110 		if (!decode(&state, &cp, *s))
111 			++i;
112 
113 	if (state != UTF8_ACCEPT)
114 		return NULL;
115 	if (i == n)
116 		return s;
117 	return NULL;
118 }
119 
/*
 * Count the code points in the NUL-terminated string s: every byte
 * is fed to the decoder and each time it reports the accepting state
 * one code point is counted.
 */
size_t
utf8_cplen(char *s)
{
	uint32_t state = 0, cp = 0;
	size_t count = 0;

	while (*s) {
		if (!decode(&state, &cp, *s))
			count++;
		s++;
	}

	return count;
}
132 
/*
 * Display width, in columns, of the code point cp.
 *
 * Returns only 0, 1, 2 or 8.  Assumes sizeof(wchar_t) is 4.
 *
 * A tab is reported as 8 columns.  Everything else is delegated to
 * wcwidth(); when wcwidth() reports -1 (non-printable character) the
 * width is 0, so that the negative value is never converted to a
 * huge size_t.
 */
size_t
utf8_chwidth(uint32_t cp)
{
	int w;

	/* XXX: if we're running on a platform where sizeof(wchar_t)
	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
	 * clear about the encoding, but if it's 16 bit wide I assume
	 * it must use UTF-16... right? */
	assert(sizeof(wchar_t) == 4);

	/*
	 * quick and dirty fix for the tabs.  In the future we may
	 * want to expand tabs into N spaces, but for the time being
	 * this seems to be good enough (tm).
	 */
	if (cp == '\t')
		return 8;

	w = wcwidth((wchar_t)cp);
	if (w < 0)
		return 0;
	return (size_t)w;
}
153 
/*
 * Total display width of the first n code points of s, as reported
 * by utf8_chwidth().
 *
 * NOTE: n counts code points, NOT bytes; the scan also stops at the
 * terminating NUL, so s MUST be NUL-terminated.
 */
size_t
utf8_snwidth(const char *s, size_t n)
{
	uint32_t state = 0, cp = 0;
	size_t seen = 0, width = 0;

	while (*s && seen < n) {
		if (!decode(&state, &cp, *s)) {
			seen++;
			width += utf8_chwidth(cp);
		}
		s++;
	}

	return width;
}
171 
/*
 * Total display width of the NUL-terminated string s: the sum of
 * utf8_chwidth() over every code point the decoder completes.
 */
size_t
utf8_swidth(const char *s)
{
	uint32_t state = 0, cp = 0;
	size_t width = 0;

	while (*s) {
		if (!decode(&state, &cp, *s))
			width += utf8_chwidth(cp);
		s++;
	}

	return width;
}
185 
/*
 * Total display width of the code points decoded from the bytes in
 * [str, end), stopping early at a NUL byte.
 *
 * The bound is tested before the byte is read, so the byte at end is
 * never dereferenced.
 */
size_t
utf8_swidth_between(const char *str, const char *end)
{
	size_t tot;
	uint32_t cp = 0, state = 0;

	tot = 0;
	for (; str < end && *str; ++str)
		if (!decode(&state, &cp, *str))
			tot += utf8_chwidth(cp);
	return tot;
}
198 
/*
 * Return a pointer to the byte following the first code point of s.
 *
 * If the terminating NUL is reached before the decoder completes a
 * code point (empty string, truncated or invalid sequence) the
 * returned pointer is the terminator itself, never the byte past it.
 */
char *
utf8_next_cp(const char *s)
{
	uint32_t cp = 0, state = 0;

	for (; *s; ++s)
		if (!decode(&state, &cp, *s))
			return (char*)s+1;

	/* s is at the NUL here: do not step over it */
	return (char*)s;
}
209 
/*
 * Walk backwards from start, without going below base, to the
 * nearest byte that is not a UTF-8 continuation byte (10xxxxxx).
 *
 * The byte at start is examined first, so start itself is returned
 * when it is not a continuation byte.  base is returned when no such
 * byte is found above it.
 */
char *
utf8_prev_cp(const char *start, const char *base)
{
	const char *p = start;

	while (p > base && ((uint8_t)*p & 0xC0) == 0x80)
		p--;

	return (char*)(p > base ? p : base);
}