Blame


1 1ac119fb 2024-01-23 op /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
2 1ac119fb 2024-01-23 op *
3 1ac119fb 2024-01-23 op * Permission is hereby granted, free of charge, to any person
4 1ac119fb 2024-01-23 op * obtaining a copy of this software and associated documentation
5 1ac119fb 2024-01-23 op * files (the "Software"), to deal in the Software without
6 1ac119fb 2024-01-23 op * restriction, including without limitation the rights to use, copy,
7 1ac119fb 2024-01-23 op * modify, merge, publish, distribute, sublicense, and/or sell copies
8 1ac119fb 2024-01-23 op * of the Software, and to permit persons to whom the Software is
9 1ac119fb 2024-01-23 op * furnished to do so, subject to the following conditions:
10 1ac119fb 2024-01-23 op *
11 1ac119fb 2024-01-23 op * The above copyright notice and this permission notice shall be
12 1ac119fb 2024-01-23 op * included in all copies or substantial portions of the Software.
13 1ac119fb 2024-01-23 op *
14 1ac119fb 2024-01-23 op * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 1ac119fb 2024-01-23 op * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 1ac119fb 2024-01-23 op * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 1ac119fb 2024-01-23 op * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 1ac119fb 2024-01-23 op * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 1ac119fb 2024-01-23 op * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 1ac119fb 2024-01-23 op * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 1ac119fb 2024-01-23 op * SOFTWARE.
22 1ac119fb 2024-01-23 op */
23 1ac119fb 2024-01-23 op
24 1ac119fb 2024-01-23 op #include "compat.h"
25 1ac119fb 2024-01-23 op
26 1ac119fb 2024-01-23 op #include <assert.h>
27 1ac119fb 2024-01-23 op #include <stddef.h>
28 1ac119fb 2024-01-23 op #include <stdint.h>
29 1ac119fb 2024-01-23 op #include <wchar.h>
30 1ac119fb 2024-01-23 op
31 1ac119fb 2024-01-23 op #include "utf8.h"
32 1ac119fb 2024-01-23 op
#define UTF8_ACCEPT 0	/* decoder state: a whole code point was decoded */
#define UTF8_REJECT 1	/* decoder state: the input is not valid UTF-8 */

/*
 * Lookup table driving the DFA-based UTF-8 decoder.
 *
 * Entries 0..255 give the character class (0..11) of every possible
 * byte value.  Entries from 256 onwards form the transition table:
 * 16 slots per state, indexed by character class, holding the next
 * state.
 */
static const uint8_t utf8d[] = {
	/* byte value -> character class */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* 00..0f */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* 10..1f */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* 20..2f */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* 30..3f */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* 40..4f */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* 50..5f */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* 60..6f */
	0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,	/* 70..7f */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,	/* 80..8f */
	9,9,9,9,9,9,9,9, 9,9,9,9,9,9,9,9,	/* 90..9f */
	7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,	/* a0..af */
	7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,	/* b0..bf */
	8,8,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,	/* c0..cf */
	2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,	/* d0..df */
	10,3,3,3,3,3,3,3, 3,3,3,3,3,4,3,3,	/* e0..ef */
	11,6,6,6,5,8,8,8, 8,8,8,8,8,8,8,8,	/* f0..ff */

	/* (state, character class) -> next state */
	0,1,2,3,5,8,7,1, 1,1,4,6,1,1,1,1,	/* state 0 */
	1,1,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,	/* state 1 */
	1,0,1,1,1,1,1,0, 1,0,1,1,1,1,1,1,	/* state 2 */
	1,2,1,1,1,1,1,2, 1,2,1,1,1,1,1,1,	/* state 3 */
	1,1,1,1,1,1,1,2, 1,1,1,1,1,1,1,1,	/* state 4 */
	1,2,1,1,1,1,1,1, 1,2,1,1,1,1,1,1,	/* state 5 */
	1,1,1,1,1,1,1,3, 1,3,1,1,1,1,1,1,	/* state 6 */
	1,3,1,1,1,1,1,3, 1,3,1,1,1,1,1,1,	/* state 7 */
	1,3,1,1,1,1,1,1, 1,1,1,1,1,1,1,1,	/* state 8 */
};
52 1ac119fb 2024-01-23 op
53 1ac119fb 2024-01-23 op static inline uint32_t
54 1ac119fb 2024-01-23 op decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
55 1ac119fb 2024-01-23 op {
56 1ac119fb 2024-01-23 op uint32_t type = utf8d[byte];
57 1ac119fb 2024-01-23 op
58 1ac119fb 2024-01-23 op *codep = (*state != UTF8_ACCEPT) ?
59 1ac119fb 2024-01-23 op (byte & 0x3fu) | (*codep << 6) :
60 1ac119fb 2024-01-23 op (0xff >> type) & (byte);
61 1ac119fb 2024-01-23 op
62 1ac119fb 2024-01-23 op *state = utf8d[256 + *state*16 + type];
63 1ac119fb 2024-01-23 op return *state;
64 1ac119fb 2024-01-23 op }
65 1ac119fb 2024-01-23 op
66 1ac119fb 2024-01-23 op
67 1ac119fb 2024-01-23 op /* end of the converter, utility functions ahead */
68 1ac119fb 2024-01-23 op
#define ZERO_WIDTH_SPACE 0x200B	/* code point U+200B */

/*
 * Exported entry point of the decoder: hands its arguments over to
 * the file-local decode() and returns the state it reports.
 */
uint32_t
utf8_decode(uint32_t *restrict state, uint32_t *restrict codep, uint8_t byte)
{
	const uint32_t next = decode(state, codep, byte);

	return next;
}
77 1ac119fb 2024-01-23 op
/*
 * Encode the code point cp as UTF-8 into s, which must be at least
 * 4 bytes wide.  No NUL terminator is appended.
 *
 * Returns the number of bytes written (1 to 4).  When cp has no
 * UTF-8 encoding -- it is greater than U+10FFFF or it is a surrogate
 * (U+D800..U+DFFF) -- s[0] is set to '\0' and 0 is returned.
 */
size_t
utf8_encode(uint32_t cp, char *s)
{
	if (cp <= 0x7F) {
		s[0] = (char)cp;
		return 1;
	}

	if (cp <= 0x7FF) {
		s[0] = (char)(0xC0 | (cp >> 6));
		s[1] = (char)(0x80 | (cp & 0x3F));
		return 2;
	}

	/*
	 * Surrogates are not valid in UTF-8 and the decoder in this
	 * file rejects their encoding, so refuse to produce it.
	 */
	if (cp >= 0xD800 && cp <= 0xDFFF) {
		s[0] = '\0';
		return 0;
	}

	if (cp <= 0xFFFF) {
		s[0] = (char)(0xE0 | (cp >> 12));
		s[1] = (char)(0x80 | ((cp >> 6) & 0x3F));
		s[2] = (char)(0x80 | (cp & 0x3F));
		return 3;
	}

	if (cp <= 0x10FFFF) {
		s[0] = (char)(0xF0 | (cp >> 18));
		s[1] = (char)(0x80 | ((cp >> 12) & 0x3F));
		s[2] = (char)(0x80 | ((cp >> 6) & 0x3F));
		s[3] = (char)(0x80 | (cp & 0x3F));
		return 4;
	}

	s[0] = '\0';
	return 0;
}
105 1ac119fb 2024-01-23 op
106 1ac119fb 2024-01-23 op char *
107 1ac119fb 2024-01-23 op utf8_nth(char *s, size_t n)
108 1ac119fb 2024-01-23 op {
109 1ac119fb 2024-01-23 op size_t i;
110 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
111 1ac119fb 2024-01-23 op
112 1ac119fb 2024-01-23 op for (i = 0; *s && i < n; ++s)
113 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *s))
114 1ac119fb 2024-01-23 op ++i;
115 1ac119fb 2024-01-23 op
116 1ac119fb 2024-01-23 op if (state != UTF8_ACCEPT)
117 1ac119fb 2024-01-23 op return NULL;
118 1ac119fb 2024-01-23 op if (i == n)
119 1ac119fb 2024-01-23 op return s;
120 1ac119fb 2024-01-23 op return NULL;
121 1ac119fb 2024-01-23 op }
122 1ac119fb 2024-01-23 op
123 1ac119fb 2024-01-23 op size_t
124 1ac119fb 2024-01-23 op utf8_cplen(char *s)
125 1ac119fb 2024-01-23 op {
126 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
127 1ac119fb 2024-01-23 op size_t len;
128 1ac119fb 2024-01-23 op
129 1ac119fb 2024-01-23 op len = 0;
130 1ac119fb 2024-01-23 op for (; *s; ++s)
131 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *s))
132 1ac119fb 2024-01-23 op len++;
133 1ac119fb 2024-01-23 op return len;
134 1ac119fb 2024-01-23 op }
135 1ac119fb 2024-01-23 op
136 1ac119fb 2024-01-23 op size_t
137 1ac119fb 2024-01-23 op utf8_ncplen(const char *s, size_t slen)
138 1ac119fb 2024-01-23 op {
139 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
140 1ac119fb 2024-01-23 op size_t len = 0;
141 1ac119fb 2024-01-23 op
142 1ac119fb 2024-01-23 op for (; slen > 0 && *s; ++s, --slen)
143 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *s))
144 1ac119fb 2024-01-23 op len++;
145 1ac119fb 2024-01-23 op return len;
146 1ac119fb 2024-01-23 op }
147 1ac119fb 2024-01-23 op
/*
 * Number of terminal columns taken by the code point cp.
 *
 * Returns only 0, 1, 2 or 8.  Tabs are reported as 8 columns wide.
 * Code points for which wcwidth(3) reports a negative width (the
 * non-printable ones) count as 0 columns: handing the -1 back as a
 * size_t would yield SIZE_MAX and corrupt every sum the callers
 * build from this value.  Assumes sizeof(wchar_t) is 4.
 */
size_t
utf8_chwidth(uint32_t cp)
{
	int width;

	/* XXX: if we're running on a platform where sizeof(wchar_t)
	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
	 * clear about the encoding, but if it's 16 bit wide I assume
	 * it must use UTF-16... right? */
	assert(sizeof(wchar_t) == 4);

	/*
	 * quick and dirty fix for the tabs.  In the future we may
	 * want to expand tabs into N spaces, but for the time being
	 * this seems to be good enough (tm).
	 */
	if (cp == '\t')
		return 8;

	width = wcwidth((wchar_t)cp);
	if (width < 0)
		return 0;
	return (size_t)width;
}
168 1ac119fb 2024-01-23 op
169 1ac119fb 2024-01-23 op /* NOTE: n is the number of codepoints, NOT the byte length. In
170 1ac119fb 2024-01-23 op * other words, s MUST be NUL-terminated. */
171 1ac119fb 2024-01-23 op size_t
172 1ac119fb 2024-01-23 op utf8_snwidth(const char *s, size_t n)
173 1ac119fb 2024-01-23 op {
174 1ac119fb 2024-01-23 op size_t i, tot;
175 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
176 1ac119fb 2024-01-23 op
177 1ac119fb 2024-01-23 op tot = 0;
178 1ac119fb 2024-01-23 op for (i = 0; *s && i < n; ++s)
179 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *s)) {
180 1ac119fb 2024-01-23 op i++;
181 1ac119fb 2024-01-23 op tot += utf8_chwidth(cp);
182 1ac119fb 2024-01-23 op }
183 1ac119fb 2024-01-23 op
184 1ac119fb 2024-01-23 op return tot;
185 1ac119fb 2024-01-23 op }
186 1ac119fb 2024-01-23 op
187 1ac119fb 2024-01-23 op size_t
188 1ac119fb 2024-01-23 op utf8_swidth(const char *s)
189 1ac119fb 2024-01-23 op {
190 1ac119fb 2024-01-23 op size_t tot;
191 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
192 1ac119fb 2024-01-23 op
193 1ac119fb 2024-01-23 op tot = 0;
194 1ac119fb 2024-01-23 op for (; *s; ++s)
195 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *s))
196 1ac119fb 2024-01-23 op tot += utf8_chwidth(cp);
197 1ac119fb 2024-01-23 op
198 1ac119fb 2024-01-23 op return tot;
199 1ac119fb 2024-01-23 op }
200 1ac119fb 2024-01-23 op
201 1ac119fb 2024-01-23 op size_t
202 1ac119fb 2024-01-23 op utf8_swidth_between(const char *str, const char *end)
203 1ac119fb 2024-01-23 op {
204 1ac119fb 2024-01-23 op size_t tot;
205 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
206 1ac119fb 2024-01-23 op
207 1ac119fb 2024-01-23 op tot = 0;
208 1ac119fb 2024-01-23 op for (; *str && str < end; ++str)
209 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *str))
210 1ac119fb 2024-01-23 op tot += utf8_chwidth(cp);
211 1ac119fb 2024-01-23 op return tot;
212 1ac119fb 2024-01-23 op }
213 1ac119fb 2024-01-23 op
214 1ac119fb 2024-01-23 op char *
215 1ac119fb 2024-01-23 op utf8_next_cp(const char *s)
216 1ac119fb 2024-01-23 op {
217 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
218 1ac119fb 2024-01-23 op
219 1ac119fb 2024-01-23 op for (; *s; ++s)
220 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *s))
221 1ac119fb 2024-01-23 op break;
222 1ac119fb 2024-01-23 op return (char*)s+1;
223 1ac119fb 2024-01-23 op }
224 1ac119fb 2024-01-23 op
/*
 * Scan backwards from start, without going below base, and return
 * the first position whose byte is not a UTF-8 continuation byte
 * (10xxxxxx).  The byte at start is the first one examined.  base is
 * returned, without being examined, when the scan reaches it or when
 * start is not above it to begin with.
 */
char *
utf8_prev_cp(const char *start, const char *base)
{
	const char *p = start;

	while (p > base && ((uint8_t)*p & 0xC0) == 0x80)
		p--;

	return (char *)(p > base ? p : base);
}
238 1ac119fb 2024-01-23 op
239 1ac119fb 2024-01-23 op /*
240 1ac119fb 2024-01-23 op * XXX: This is not correct. There are codepoints classified as
241 1ac119fb 2024-01-23 op * "emoji", but these can be joined toghether to form more complex
242 1ac119fb 2024-01-23 op * emoji. There is an official list of what these valid combinations
243 1ac119fb 2024-01-23 op * are, but it would require a costly lookup (a trie can be used to
244 1ac119fb 2024-01-23 op * reduce the times, but...). The following approach is conceptually
245 1ac119fb 2024-01-23 op * simpler: if there is a sequence of "emoji codepoints" (or ZWS) and
246 1ac119fb 2024-01-23 op * then a space, consider everything before the space a single emoji.
247 1ac119fb 2024-01-23 op * It needs a special check for numbers (yes, 0..9 and # are
248 1ac119fb 2024-01-23 op * technically speaking emojis) but otherwise seems to work well in
249 1ac119fb 2024-01-23 op * practice.
250 1ac119fb 2024-01-23 op */
251 1ac119fb 2024-01-23 op int
252 1ac119fb 2024-01-23 op emojied_line(const char *s, const char **space_ret)
253 1ac119fb 2024-01-23 op {
254 1ac119fb 2024-01-23 op uint32_t cp = 0, state = 0;
255 1ac119fb 2024-01-23 op int only_numbers = 1;
256 1ac119fb 2024-01-23 op
257 1ac119fb 2024-01-23 op for (; *s; ++s) {
258 1ac119fb 2024-01-23 op if (!decode(&state, &cp, *s)) {
259 1ac119fb 2024-01-23 op if (cp == ZERO_WIDTH_SPACE)
260 1ac119fb 2024-01-23 op continue;
261 1ac119fb 2024-01-23 op if (cp == ' ') {
262 1ac119fb 2024-01-23 op *space_ret = s;
263 1ac119fb 2024-01-23 op return !only_numbers;
264 1ac119fb 2024-01-23 op }
265 1ac119fb 2024-01-23 op if (!is_emoji(cp))
266 1ac119fb 2024-01-23 op return 0;
267 1ac119fb 2024-01-23 op if (cp < '0' || cp > '9')
268 1ac119fb 2024-01-23 op only_numbers = 0;
269 1ac119fb 2024-01-23 op }
270 1ac119fb 2024-01-23 op }
271 1ac119fb 2024-01-23 op
272 1ac119fb 2024-01-23 op return 0;
273 1ac119fb 2024-01-23 op }