Blame


1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stddef.h>
3 3448adb0 2022-11-02 op #include <stdint.h>
4 3448adb0 2022-11-02 op #include <stdio.h>
5 3448adb0 2022-11-02 op #include <string.h>
6 3448adb0 2022-11-02 op
7 3448adb0 2022-11-02 op #include "../grapheme.h"
8 3448adb0 2022-11-02 op #include "util.h"
9 3448adb0 2022-11-02 op
10 3448adb0 2022-11-02 op static const struct {
11 3448adb0 2022-11-02 op char *arr; /* UTF-8 byte sequence */
12 3448adb0 2022-11-02 op size_t len; /* length of UTF-8 byte sequence */
13 3448adb0 2022-11-02 op size_t exp_len; /* expected length returned */
14 3448adb0 2022-11-02 op uint_least32_t exp_cp; /* expected codepoint returned */
15 3448adb0 2022-11-02 op } dec_test[] = {
16 3448adb0 2022-11-02 op {
17 3448adb0 2022-11-02 op /* empty sequence
18 3448adb0 2022-11-02 op * [ ] ->
19 3448adb0 2022-11-02 op * INVALID
20 3448adb0 2022-11-02 op */
21 3448adb0 2022-11-02 op .arr = NULL,
22 3448adb0 2022-11-02 op .len = 0,
23 3448adb0 2022-11-02 op .exp_len = 0,
24 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
25 3448adb0 2022-11-02 op },
26 3448adb0 2022-11-02 op {
27 3448adb0 2022-11-02 op /* invalid lead byte
28 3448adb0 2022-11-02 op * [ 11111101 ] ->
29 3448adb0 2022-11-02 op * INVALID
30 3448adb0 2022-11-02 op */
31 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xFD },
32 3448adb0 2022-11-02 op .len = 1,
33 3448adb0 2022-11-02 op .exp_len = 1,
34 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
35 3448adb0 2022-11-02 op },
36 3448adb0 2022-11-02 op {
37 3448adb0 2022-11-02 op /* valid 1-byte sequence
38 3448adb0 2022-11-02 op * [ 00000001 ] ->
39 3448adb0 2022-11-02 op * 0000001
40 3448adb0 2022-11-02 op */
41 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0x01 },
42 3448adb0 2022-11-02 op .len = 1,
43 3448adb0 2022-11-02 op .exp_len = 1,
44 3448adb0 2022-11-02 op .exp_cp = 0x1,
45 3448adb0 2022-11-02 op },
46 3448adb0 2022-11-02 op {
47 3448adb0 2022-11-02 op /* valid 2-byte sequence
48 3448adb0 2022-11-02 op * [ 11000011 10111111 ] ->
49 3448adb0 2022-11-02 op * 00011111111
50 3448adb0 2022-11-02 op */
51 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
52 3448adb0 2022-11-02 op .len = 2,
53 3448adb0 2022-11-02 op .exp_len = 2,
54 3448adb0 2022-11-02 op .exp_cp = 0xFF,
55 3448adb0 2022-11-02 op },
56 3448adb0 2022-11-02 op {
57 3448adb0 2022-11-02 op /* invalid 2-byte sequence (second byte missing)
58 3448adb0 2022-11-02 op * [ 11000011 ] ->
59 3448adb0 2022-11-02 op * INVALID
60 3448adb0 2022-11-02 op */
61 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xC3 },
62 3448adb0 2022-11-02 op .len = 1,
63 3448adb0 2022-11-02 op .exp_len = 2,
64 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
65 3448adb0 2022-11-02 op },
66 3448adb0 2022-11-02 op {
67 3448adb0 2022-11-02 op /* invalid 2-byte sequence (second byte malformed)
68 3448adb0 2022-11-02 op * [ 11000011 11111111 ] ->
69 3448adb0 2022-11-02 op * INVALID
70 3448adb0 2022-11-02 op */
71 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xC3, 0xFF },
72 3448adb0 2022-11-02 op .len = 2,
73 3448adb0 2022-11-02 op .exp_len = 1,
74 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
75 3448adb0 2022-11-02 op },
76 3448adb0 2022-11-02 op {
77 3448adb0 2022-11-02 op /* invalid 2-byte sequence (overlong encoded)
78 3448adb0 2022-11-02 op * [ 11000001 10111111 ] ->
79 3448adb0 2022-11-02 op * INVALID
80 3448adb0 2022-11-02 op */
81 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xC1, 0xBF },
82 3448adb0 2022-11-02 op .len = 2,
83 3448adb0 2022-11-02 op .exp_len = 2,
84 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
85 3448adb0 2022-11-02 op },
86 3448adb0 2022-11-02 op {
87 3448adb0 2022-11-02 op /* valid 3-byte sequence
88 3448adb0 2022-11-02 op * [ 11100000 10111111 10111111 ] ->
89 3448adb0 2022-11-02 op * 0000111111111111
90 3448adb0 2022-11-02 op */
91 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
92 3448adb0 2022-11-02 op .len = 3,
93 3448adb0 2022-11-02 op .exp_len = 3,
94 3448adb0 2022-11-02 op .exp_cp = 0xFFF,
95 3448adb0 2022-11-02 op },
96 3448adb0 2022-11-02 op {
97 3448adb0 2022-11-02 op /* invalid 3-byte sequence (second byte missing)
98 3448adb0 2022-11-02 op * [ 11100000 ] ->
99 3448adb0 2022-11-02 op * INVALID
100 3448adb0 2022-11-02 op */
101 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0 },
102 3448adb0 2022-11-02 op .len = 1,
103 3448adb0 2022-11-02 op .exp_len = 3,
104 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
105 3448adb0 2022-11-02 op },
106 3448adb0 2022-11-02 op {
107 3448adb0 2022-11-02 op /* invalid 3-byte sequence (second byte malformed)
108 3448adb0 2022-11-02 op * [ 11100000 01111111 10111111 ] ->
109 3448adb0 2022-11-02 op * INVALID
110 3448adb0 2022-11-02 op */
111 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
112 3448adb0 2022-11-02 op .len = 3,
113 3448adb0 2022-11-02 op .exp_len = 1,
114 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
115 3448adb0 2022-11-02 op },
116 3448adb0 2022-11-02 op {
117 3448adb0 2022-11-02 op /* invalid 3-byte sequence (short string, second byte malformed)
118 3448adb0 2022-11-02 op * [ 11100000 01111111 ] ->
119 3448adb0 2022-11-02 op * INVALID
120 3448adb0 2022-11-02 op */
121 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0x7F },
122 3448adb0 2022-11-02 op .len = 2,
123 3448adb0 2022-11-02 op .exp_len = 1,
124 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
125 3448adb0 2022-11-02 op },
126 3448adb0 2022-11-02 op {
127 3448adb0 2022-11-02 op /* invalid 3-byte sequence (third byte missing)
128 3448adb0 2022-11-02 op * [ 11100000 10111111 ] ->
129 3448adb0 2022-11-02 op * INVALID
130 3448adb0 2022-11-02 op */
131 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0xBF },
132 3448adb0 2022-11-02 op .len = 2,
133 3448adb0 2022-11-02 op .exp_len = 3,
134 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
135 3448adb0 2022-11-02 op },
136 3448adb0 2022-11-02 op {
137 3448adb0 2022-11-02 op /* invalid 3-byte sequence (third byte malformed)
138 3448adb0 2022-11-02 op * [ 11100000 10111111 01111111 ] ->
139 3448adb0 2022-11-02 op * INVALID
140 3448adb0 2022-11-02 op */
141 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
142 3448adb0 2022-11-02 op .len = 3,
143 3448adb0 2022-11-02 op .exp_len = 2,
144 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
145 3448adb0 2022-11-02 op },
146 3448adb0 2022-11-02 op {
147 3448adb0 2022-11-02 op /* invalid 3-byte sequence (overlong encoded)
148 3448adb0 2022-11-02 op * [ 11100000 10011111 10111111 ] ->
149 3448adb0 2022-11-02 op * INVALID
150 3448adb0 2022-11-02 op */
151 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
152 3448adb0 2022-11-02 op .len = 3,
153 3448adb0 2022-11-02 op .exp_len = 3,
154 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
155 3448adb0 2022-11-02 op },
156 3448adb0 2022-11-02 op {
157 3448adb0 2022-11-02 op /* invalid 3-byte sequence (UTF-16 surrogate half)
158 3448adb0 2022-11-02 op * [ 11101101 10100000 10000000 ] ->
159 3448adb0 2022-11-02 op * INVALID
160 3448adb0 2022-11-02 op */
161 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
162 3448adb0 2022-11-02 op .len = 3,
163 3448adb0 2022-11-02 op .exp_len = 3,
164 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
165 3448adb0 2022-11-02 op },
166 3448adb0 2022-11-02 op {
167 3448adb0 2022-11-02 op /* valid 4-byte sequence
168 3448adb0 2022-11-02 op * [ 11110011 10111111 10111111 10111111 ] ->
169 3448adb0 2022-11-02 op * 011111111111111111111
170 3448adb0 2022-11-02 op */
171 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
172 3448adb0 2022-11-02 op .len = 4,
173 3448adb0 2022-11-02 op .exp_len = 4,
174 3448adb0 2022-11-02 op .exp_cp = UINT32_C(0xFFFFF),
175 3448adb0 2022-11-02 op },
176 3448adb0 2022-11-02 op {
177 3448adb0 2022-11-02 op /* invalid 4-byte sequence (second byte missing)
178 3448adb0 2022-11-02 op * [ 11110011 ] ->
179 3448adb0 2022-11-02 op * INVALID
180 3448adb0 2022-11-02 op */
181 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3 },
182 3448adb0 2022-11-02 op .len = 1,
183 3448adb0 2022-11-02 op .exp_len = 4,
184 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
185 3448adb0 2022-11-02 op },
186 3448adb0 2022-11-02 op {
187 3448adb0 2022-11-02 op /* invalid 4-byte sequence (second byte malformed)
188 3448adb0 2022-11-02 op * [ 11110011 01111111 10111111 10111111 ] ->
189 3448adb0 2022-11-02 op * INVALID
190 3448adb0 2022-11-02 op */
191 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
192 3448adb0 2022-11-02 op .len = 4,
193 3448adb0 2022-11-02 op .exp_len = 1,
194 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
195 3448adb0 2022-11-02 op },
196 3448adb0 2022-11-02 op {
197 3448adb0 2022-11-02 op /* invalid 4-byte sequence (short string 1, second byte malformed)
198 3448adb0 2022-11-02 op * [ 11110011 011111111 ] ->
199 3448adb0 2022-11-02 op * INVALID
200 3448adb0 2022-11-02 op */
201 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0x7F },
202 3448adb0 2022-11-02 op .len = 2,
203 3448adb0 2022-11-02 op .exp_len = 1,
204 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
205 3448adb0 2022-11-02 op },
206 3448adb0 2022-11-02 op {
207 3448adb0 2022-11-02 op /* invalid 4-byte sequence (short string 2, second byte malformed)
208 3448adb0 2022-11-02 op * [ 11110011 011111111 10111111 ] ->
209 3448adb0 2022-11-02 op * INVALID
210 3448adb0 2022-11-02 op */
211 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
212 3448adb0 2022-11-02 op .len = 3,
213 3448adb0 2022-11-02 op .exp_len = 1,
214 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
215 3448adb0 2022-11-02 op },
216 3448adb0 2022-11-02 op
217 3448adb0 2022-11-02 op {
218 3448adb0 2022-11-02 op /* invalid 4-byte sequence (third byte missing)
219 3448adb0 2022-11-02 op * [ 11110011 10111111 ] ->
220 3448adb0 2022-11-02 op * INVALID
221 3448adb0 2022-11-02 op */
222 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF },
223 3448adb0 2022-11-02 op .len = 2,
224 3448adb0 2022-11-02 op .exp_len = 4,
225 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
226 3448adb0 2022-11-02 op },
227 3448adb0 2022-11-02 op {
228 3448adb0 2022-11-02 op /* invalid 4-byte sequence (third byte malformed)
229 3448adb0 2022-11-02 op * [ 11110011 10111111 01111111 10111111 ] ->
230 3448adb0 2022-11-02 op * INVALID
231 3448adb0 2022-11-02 op */
232 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
233 3448adb0 2022-11-02 op .len = 4,
234 3448adb0 2022-11-02 op .exp_len = 2,
235 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
236 3448adb0 2022-11-02 op },
237 3448adb0 2022-11-02 op {
238 3448adb0 2022-11-02 op /* invalid 4-byte sequence (short string, third byte malformed)
239 3448adb0 2022-11-02 op * [ 11110011 10111111 01111111 ] ->
240 3448adb0 2022-11-02 op * INVALID
241 3448adb0 2022-11-02 op */
242 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
243 3448adb0 2022-11-02 op .len = 3,
244 3448adb0 2022-11-02 op .exp_len = 2,
245 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
246 3448adb0 2022-11-02 op },
247 3448adb0 2022-11-02 op {
248 3448adb0 2022-11-02 op /* invalid 4-byte sequence (fourth byte missing)
249 3448adb0 2022-11-02 op * [ 11110011 10111111 10111111 ] ->
250 3448adb0 2022-11-02 op * INVALID
251 3448adb0 2022-11-02 op */
252 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
253 3448adb0 2022-11-02 op .len = 3,
254 3448adb0 2022-11-02 op .exp_len = 4,
255 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
256 3448adb0 2022-11-02 op },
257 3448adb0 2022-11-02 op {
258 3448adb0 2022-11-02 op /* invalid 4-byte sequence (fourth byte malformed)
259 3448adb0 2022-11-02 op * [ 11110011 10111111 10111111 01111111 ] ->
260 3448adb0 2022-11-02 op * INVALID
261 3448adb0 2022-11-02 op */
262 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
263 3448adb0 2022-11-02 op .len = 4,
264 3448adb0 2022-11-02 op .exp_len = 3,
265 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
266 3448adb0 2022-11-02 op },
267 3448adb0 2022-11-02 op {
268 3448adb0 2022-11-02 op /* invalid 4-byte sequence (overlong encoded)
269 3448adb0 2022-11-02 op * [ 11110000 10000000 10000001 10111111 ] ->
270 3448adb0 2022-11-02 op * INVALID
271 3448adb0 2022-11-02 op */
272 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
273 3448adb0 2022-11-02 op .len = 4,
274 3448adb0 2022-11-02 op .exp_len = 4,
275 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
276 3448adb0 2022-11-02 op },
277 3448adb0 2022-11-02 op {
278 3448adb0 2022-11-02 op /* invalid 4-byte sequence (UTF-16-unrepresentable)
279 3448adb0 2022-11-02 op * [ 11110100 10010000 10000000 10000000 ] ->
280 3448adb0 2022-11-02 op * INVALID
281 3448adb0 2022-11-02 op */
282 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
283 3448adb0 2022-11-02 op .len = 4,
284 3448adb0 2022-11-02 op .exp_len = 4,
285 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
286 3448adb0 2022-11-02 op },
287 3448adb0 2022-11-02 op };
288 3448adb0 2022-11-02 op
289 3448adb0 2022-11-02 op int
290 3448adb0 2022-11-02 op main(int argc, char *argv[])
291 3448adb0 2022-11-02 op {
292 3448adb0 2022-11-02 op size_t i, failed;
293 3448adb0 2022-11-02 op
294 3448adb0 2022-11-02 op (void)argc;
295 3448adb0 2022-11-02 op
296 3448adb0 2022-11-02 op /* UTF-8 decoder test */
297 3448adb0 2022-11-02 op for (i = 0, failed = 0; i < LEN(dec_test); i++) {
298 3448adb0 2022-11-02 op size_t len;
299 3448adb0 2022-11-02 op uint_least32_t cp;
300 3448adb0 2022-11-02 op
301 3448adb0 2022-11-02 op len = grapheme_decode_utf8(dec_test[i].arr,
302 3448adb0 2022-11-02 op dec_test[i].len, &cp);
303 3448adb0 2022-11-02 op
304 3448adb0 2022-11-02 op if (len != dec_test[i].exp_len ||
305 3448adb0 2022-11-02 op cp != dec_test[i].exp_cp) {
306 3448adb0 2022-11-02 op fprintf(stderr, "%s: Failed test %zu: "
307 3448adb0 2022-11-02 op "Expected (%zx,%u), but got (%zx,%u).\n",
308 3448adb0 2022-11-02 op argv[0], i, dec_test[i].exp_len,
309 3448adb0 2022-11-02 op dec_test[i].exp_cp, len, cp);
310 3448adb0 2022-11-02 op failed++;
311 3448adb0 2022-11-02 op }
312 3448adb0 2022-11-02 op }
313 3448adb0 2022-11-02 op printf("%s: %zu/%zu unit tests passed.\n", argv[0],
314 3448adb0 2022-11-02 op LEN(dec_test) - failed, LEN(dec_test));
315 3448adb0 2022-11-02 op
316 3448adb0 2022-11-02 op return (failed > 0) ? 1 : 0;
317 3448adb0 2022-11-02 op }