/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "compat.h"

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <wchar.h>

#include "utf8.h"

/* Decoder states visible to callers: a full codepoint was read / input is invalid. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table driving the DFA-based decoder below.
 *
 * The first 256 entries map every possible byte value to a character
 * class.  The entries that follow are the transition table: the next
 * state is found at index 256 + state*16 + class.
 */
static const uint8_t utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feed one byte to the decoder.
 *
 * state: in/out decoder state; must be UTF8_ACCEPT before the first byte.
 * codep: in/out accumulator; holds the decoded codepoint whenever the
 *        function returns UTF8_ACCEPT.
 * byte:  the next input byte.
 *
 * Returns the new state: UTF8_ACCEPT when a codepoint is complete,
 * UTF8_REJECT on invalid input, any other value when more bytes are
 * needed.  The UTF8_REJECT row of the table only leads back to
 * UTF8_REJECT, so the caller has to reset *state to resume decoding.
 */
static inline uint32_t
decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
{
	uint32_t type = utf8d[byte];

	/*
	 * In the middle of a sequence: append the six payload bits of a
	 * continuation byte.  At the start of a sequence: mask off the
	 * length marker of the lead byte (the class doubles as the shift).
	 */
	*codep = (*state != UTF8_ACCEPT) ?
	    (byte & 0x3fu) | (*codep << 6) :
	    (0xff >> type) & (byte);

	*state = utf8d[256 + *state*16 + type];
	return *state;
}
/* end of the converter, utility functions ahead */

/* Codepoint U+200B; skipped by emojied_line() when scanning a line. */
#define ZERO_WIDTH_SPACE 0x200B

/*
 * Public version of decode: feeds one byte to the UTF-8 decoder.
 * Arguments and return value are passed straight through to decode().
 */
uint32_t
utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
{
	return decode(state, codep, byte);
}

/*
 * Encode the codepoint cp as UTF-8 into s.  s must be at least 4 bytes
 * wide; no NUL terminator is appended after an encoded sequence.
 *
 * Returns the number of bytes written (1 to 4).  If cp cannot be
 * encoded -- it is above U+10FFFF or lies in the surrogate range
 * U+D800..U+DFFF, which UTF-8 forbids -- s[0] is set to '\0' and 0 is
 * returned.
 */
size_t
utf8_encode(uint32_t cp, char *s)
{
	if (cp <= 0x7F) {
		*s = (char)cp;
		return 1;
	} else if (cp <= 0x7FF) {
		s[1] = (char)(( cp        & 0x3F) + 0x80);
		s[0] = (char)(((cp >>  6) & 0x1F) + 0xC0);
		return 2;
	} else if (cp >= 0xD800 && cp <= 0xDFFF) {
		/* surrogates are not valid scalar values: refuse them */
		s[0] = '\0';
		return 0;
	} else if (cp <= 0xFFFF) {
		s[2] = (char)(( cp        & 0x3F) + 0x80);
		s[1] = (char)(((cp >>  6) & 0x3F) + 0x80);
		s[0] = (char)(((cp >> 12) & 0x0F) + 0xE0);
		return 3;
	} else if (cp <= 0x10FFFF) {
		s[3] = (char)(( cp        & 0x3F) + 0x80);
		s[2] = (char)(((cp >>  6) & 0x3F) + 0x80);
		s[1] = (char)(((cp >> 12) & 0x3F) + 0x80);
		s[0] = (char)(((cp >> 18) & 0x07) + 0xF0);
		return 4;
	} else {
		s[0] = '\0';
		return 0;
	}
}

106 char *
107 utf8_nth(char *s, size_t n)
109 size_t i;
110 uint32_t cp = 0, state = 0;
112 for (i = 0; *s && i < n; ++s)
113 if (!decode(&state, &cp, *s))
114 ++i;
116 if (state != UTF8_ACCEPT)
117 return NULL;
118 if (i == n)
119 return s;
120 return NULL;
/*
 * Count the codepoints in the NUL-terminated UTF-8 string s.
 * Only sequences that bring the decoder back to the accepting state
 * are counted.
 */
size_t
utf8_cplen(char *s)
{
	uint32_t cp = 0, state = 0;
	size_t len;

	len = 0;
	for (; *s; ++s)
		if (!decode(&state, &cp, *s))
			len++;
	return len;
}

/*
 * Count the codepoints found in the first slen bytes of s, stopping
 * early if a NUL byte is met.  Only sequences that bring the decoder
 * back to the accepting state are counted.
 */
size_t
utf8_ncplen(const char *s, size_t slen)
{
	uint32_t cp = 0, state = 0;
	size_t len = 0;

	for (; slen > 0 && *s; ++s, --slen)
		if (!decode(&state, &cp, *s))
			len++;
	return len;
}

/*
 * Width, in columns, of the codepoint cp.
 * Returns only 0, 1, 2 or 8.  Assumes sizeof(wchar_t) is 4.
 */
size_t
utf8_chwidth(uint32_t cp)
{
	int w;

	/* XXX: if we're running on a platform where sizeof(wchar_t)
	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
	 * clear about the encoding, but if it's 16 bit wide I assume
	 * it must use UTF-16... right? */
	assert(sizeof(wchar_t) == 4);

	/*
	 * quick and dirty fix for the tabs.  In the future we may
	 * want to expand tabs into N spaces, but for the time being
	 * this seems to be good enough (tm).
	 */
	if (cp == '\t')
		return 8;

	/*
	 * wcwidth() reports non-printable characters with -1.  Converted
	 * to size_t that would become a huge value and corrupt the sums
	 * computed by the callers, so treat those characters as taking
	 * no columns.
	 */
	w = wcwidth((wchar_t)cp);
	return w < 0 ? 0 : (size_t)w;
}

/*
 * Sum the widths (as given by utf8_chwidth) of the first n codepoints
 * of s.
 *
 * NOTE: n is the number of codepoints, NOT the byte length.  In
 * other words, s MUST be NUL-terminated.
 */
size_t
utf8_snwidth(const char *s, size_t n)
{
	size_t i, tot;
	uint32_t cp = 0, state = 0;

	tot = 0;
	for (i = 0; *s && i < n; ++s)
		if (!decode(&state, &cp, *s)) {
			i++;
			tot += utf8_chwidth(cp);
		}

	return tot;
}

/*
 * Sum the widths (as given by utf8_chwidth) of every codepoint of the
 * NUL-terminated UTF-8 string s.
 */
size_t
utf8_swidth(const char *s)
{
	size_t tot;
	uint32_t cp = 0, state = 0;

	tot = 0;
	for (; *s; ++s)
		if (!decode(&state, &cp, *s))
			tot += utf8_chwidth(cp);

	return tot;
}

/*
 * Sum the widths (as given by utf8_chwidth) of the codepoints found in
 * the byte range [str, end), stopping early if a NUL byte is met.
 */
size_t
utf8_swidth_between(const char *str, const char *end)
{
	size_t tot;
	uint32_t cp = 0, state = 0;

	tot = 0;
	/*
	 * Check the bound before dereferencing: when str == end the byte
	 * at *str lies outside the range and must not be read.
	 */
	for (; str < end && *str; ++str)
		if (!decode(&state, &cp, *str))
			tot += utf8_chwidth(cp);
	return tot;
}

/*
 * Return a pointer to the byte that follows the first complete
 * codepoint of the NUL-terminated UTF-8 string s.
 *
 * If the string ends before a codepoint is completed (empty string,
 * truncated or invalid sequence) the returned pointer is the
 * terminating NUL itself, never a position past it.
 */
char *
utf8_next_cp(const char *s)
{
	uint32_t cp = 0, state = 0;

	for (; *s; ++s)
		if (!decode(&state, &cp, *s))
			return (char*)s+1;

	/* ran into the terminator: stay on it */
	return (char*)s;
}

/*
 * Walk backwards from start, never going before base, and return a
 * pointer to the first byte met that is not a UTF-8 continuation byte
 * (10xxxxxx).  start itself is examined first.  If only continuation
 * bytes are found, base is returned.
 */
char *
utf8_prev_cp(const char *start, const char *base)
{
	uint8_t c;

	for (; start > base; start--) {
		c = *start;
		if ((c & 0xC0) != 0x80)
			return (char*)start;
	}

	return (char*)base;
}

239 /*
240 * XXX: This is not correct. There are codepoints classified as
241 * "emoji", but these can be joined toghether to form more complex
242 * emoji. There is an official list of what these valid combinations
243 * are, but it would require a costly lookup (a trie can be used to
244 * reduce the times, but...). The following approach is conceptually
245 * simpler: if there is a sequence of "emoji codepoints" (or ZWS) and
246 * then a space, consider everything before the space a single emoji.
247 * It needs a special check for numbers (yes, 0..9 and # are
248 * technically speaking emojis) but otherwise seems to work well in
249 * practice.
250 */
251 int
252 emojied_line(const char *s, const char **space_ret)
254 uint32_t cp = 0, state = 0;
255 int only_numbers = 1;
257 for (; *s; ++s) {
258 if (!decode(&state, &cp, *s)) {
259 if (cp == ZERO_WIDTH_SPACE)
260 continue;
261 if (cp == ' ') {
262 *space_ret = s;
263 return !only_numbers;
265 if (!is_emoji(cp))
266 return 0;
267 if (cp < '0' || cp > '9')
268 only_numbers = 0;
272 return 0;