1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stddef.h>
3 3448adb0 2022-11-02 op #include <stdint.h>
4 3448adb0 2022-11-02 op #include <stdio.h>
5 3448adb0 2022-11-02 op #include <string.h>
7 3448adb0 2022-11-02 op #include "../grapheme.h"
8 3448adb0 2022-11-02 op #include "util.h"
10 3448adb0 2022-11-02 op static const struct {
11 3448adb0 2022-11-02 op char *arr; /* UTF-8 byte sequence */
12 3448adb0 2022-11-02 op size_t len; /* length of UTF-8 byte sequence */
13 3448adb0 2022-11-02 op size_t exp_len; /* expected length returned */
14 3448adb0 2022-11-02 op uint_least32_t exp_cp; /* expected codepoint returned */
15 3448adb0 2022-11-02 op } dec_test[] = {
17 3448adb0 2022-11-02 op /* empty sequence
24 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
27 3448adb0 2022-11-02 op /* invalid lead byte
28 3448adb0 2022-11-02 op * [ 11111101 ] ->
31 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xFD },
34 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
37 3448adb0 2022-11-02 op /* valid 1-byte sequence
38 3448adb0 2022-11-02 op * [ 00000001 ] ->
41 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0x01 },
44 3448adb0 2022-11-02 op .exp_cp = 0x1,
47 3448adb0 2022-11-02 op /* valid 2-byte sequence
48 3448adb0 2022-11-02 op * [ 11000011 10111111 ] ->
51 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
54 3448adb0 2022-11-02 op .exp_cp = 0xFF,
57 3448adb0 2022-11-02 op /* invalid 2-byte sequence (second byte missing)
58 3448adb0 2022-11-02 op * [ 11000011 ] ->
61 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xC3 },
64 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
67 3448adb0 2022-11-02 op /* invalid 2-byte sequence (second byte malformed)
68 3448adb0 2022-11-02 op * [ 11000011 11111111 ] ->
71 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xC3, 0xFF },
74 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
77 3448adb0 2022-11-02 op /* invalid 2-byte sequence (overlong encoded)
78 3448adb0 2022-11-02 op * [ 11000001 10111111 ] ->
81 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xC1, 0xBF },
84 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
87 3448adb0 2022-11-02 op /* valid 3-byte sequence
88 3448adb0 2022-11-02 op * [ 11100000 10111111 10111111 ] ->
89 3448adb0 2022-11-02 op * 0000111111111111
91 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
94 3448adb0 2022-11-02 op .exp_cp = 0xFFF,
97 3448adb0 2022-11-02 op /* invalid 3-byte sequence (second byte missing)
98 3448adb0 2022-11-02 op * [ 11100000 ] ->
101 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0 },
103 3448adb0 2022-11-02 op .exp_len = 3,
104 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
107 3448adb0 2022-11-02 op /* invalid 3-byte sequence (second byte malformed)
108 3448adb0 2022-11-02 op * [ 11100000 01111111 10111111 ] ->
111 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
113 3448adb0 2022-11-02 op .exp_len = 1,
114 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
117 3448adb0 2022-11-02 op /* invalid 3-byte sequence (short string, second byte malformed)
118 3448adb0 2022-11-02 op * [ 11100000 01111111 ] ->
121 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0x7F },
123 3448adb0 2022-11-02 op .exp_len = 1,
124 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
127 3448adb0 2022-11-02 op /* invalid 3-byte sequence (third byte missing)
128 3448adb0 2022-11-02 op * [ 11100000 10111111 ] ->
131 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0xBF },
133 3448adb0 2022-11-02 op .exp_len = 3,
134 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
137 3448adb0 2022-11-02 op /* invalid 3-byte sequence (third byte malformed)
138 3448adb0 2022-11-02 op * [ 11100000 10111111 01111111 ] ->
141 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
143 3448adb0 2022-11-02 op .exp_len = 2,
144 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
147 3448adb0 2022-11-02 op /* invalid 3-byte sequence (overlong encoded)
148 3448adb0 2022-11-02 op * [ 11100000 10011111 10111111 ] ->
151 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
153 3448adb0 2022-11-02 op .exp_len = 3,
154 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
157 3448adb0 2022-11-02 op /* invalid 3-byte sequence (UTF-16 surrogate half)
158 3448adb0 2022-11-02 op * [ 11101101 10100000 10000000 ] ->
161 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
163 3448adb0 2022-11-02 op .exp_len = 3,
164 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
167 3448adb0 2022-11-02 op /* valid 4-byte sequence
168 3448adb0 2022-11-02 op * [ 11110011 10111111 10111111 10111111 ] ->
169 3448adb0 2022-11-02 op * 011111111111111111111
171 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
173 3448adb0 2022-11-02 op .exp_len = 4,
174 3448adb0 2022-11-02 op .exp_cp = UINT32_C(0xFFFFF),
177 3448adb0 2022-11-02 op /* invalid 4-byte sequence (second byte missing)
178 3448adb0 2022-11-02 op * [ 11110011 ] ->
181 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3 },
183 3448adb0 2022-11-02 op .exp_len = 4,
184 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
187 3448adb0 2022-11-02 op /* invalid 4-byte sequence (second byte malformed)
188 3448adb0 2022-11-02 op * [ 11110011 01111111 10111111 10111111 ] ->
191 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
193 3448adb0 2022-11-02 op .exp_len = 1,
194 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
197 3448adb0 2022-11-02 op /* invalid 4-byte sequence (short string 1, second byte malformed)
198 3448adb0 2022-11-02 op * [ 11110011 01111111 ] ->
201 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0x7F },
203 3448adb0 2022-11-02 op .exp_len = 1,
204 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
207 3448adb0 2022-11-02 op /* invalid 4-byte sequence (short string 2, second byte malformed)
208 3448adb0 2022-11-02 op * [ 11110011 01111111 10111111 ] ->
211 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
213 3448adb0 2022-11-02 op .exp_len = 1,
214 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
218 3448adb0 2022-11-02 op /* invalid 4-byte sequence (third byte missing)
219 3448adb0 2022-11-02 op * [ 11110011 10111111 ] ->
222 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF },
224 3448adb0 2022-11-02 op .exp_len = 4,
225 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
228 3448adb0 2022-11-02 op /* invalid 4-byte sequence (third byte malformed)
229 3448adb0 2022-11-02 op * [ 11110011 10111111 01111111 10111111 ] ->
232 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
234 3448adb0 2022-11-02 op .exp_len = 2,
235 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
238 3448adb0 2022-11-02 op /* invalid 4-byte sequence (short string, third byte malformed)
239 3448adb0 2022-11-02 op * [ 11110011 10111111 01111111 ] ->
242 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
244 3448adb0 2022-11-02 op .exp_len = 2,
245 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
248 3448adb0 2022-11-02 op /* invalid 4-byte sequence (fourth byte missing)
249 3448adb0 2022-11-02 op * [ 11110011 10111111 10111111 ] ->
252 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
254 3448adb0 2022-11-02 op .exp_len = 4,
255 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
258 3448adb0 2022-11-02 op /* invalid 4-byte sequence (fourth byte malformed)
259 3448adb0 2022-11-02 op * [ 11110011 10111111 10111111 01111111 ] ->
262 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
264 3448adb0 2022-11-02 op .exp_len = 3,
265 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
268 3448adb0 2022-11-02 op /* invalid 4-byte sequence (overlong encoded)
269 3448adb0 2022-11-02 op * [ 11110000 10000000 10000001 10111111 ] ->
272 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
274 3448adb0 2022-11-02 op .exp_len = 4,
275 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
278 3448adb0 2022-11-02 op /* invalid 4-byte sequence (UTF-16-unrepresentable)
279 3448adb0 2022-11-02 op * [ 11110100 10010000 10000000 10000000 ] ->
282 3448adb0 2022-11-02 op .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
284 3448adb0 2022-11-02 op .exp_len = 4,
285 3448adb0 2022-11-02 op .exp_cp = GRAPHEME_INVALID_CODEPOINT,
290 3448adb0 2022-11-02 op main(int argc, char *argv[])
292 3448adb0 2022-11-02 op size_t i, failed;
296 3448adb0 2022-11-02 op /* UTF-8 decoder test */
297 3448adb0 2022-11-02 op for (i = 0, failed = 0; i < LEN(dec_test); i++) {
299 3448adb0 2022-11-02 op uint_least32_t cp;
301 3448adb0 2022-11-02 op len = grapheme_decode_utf8(dec_test[i].arr,
302 3448adb0 2022-11-02 op dec_test[i].len, &cp);
304 3448adb0 2022-11-02 op if (len != dec_test[i].exp_len ||
305 3448adb0 2022-11-02 op cp != dec_test[i].exp_cp) {
306 3448adb0 2022-11-02 op fprintf(stderr, "%s: Failed test %zu: "
307 3448adb0 2022-11-02 op "Expected (%zx,%u), but got (%zx,%u).\n",
308 3448adb0 2022-11-02 op argv[0], i, dec_test[i].exp_len,
309 3448adb0 2022-11-02 op dec_test[i].exp_cp, len, cp);
313 3448adb0 2022-11-02 op printf("%s: %zu/%zu unit tests passed.\n", argv[0],
314 3448adb0 2022-11-02 op LEN(dec_test) - failed, LEN(dec_test));
316 3448adb0 2022-11-02 op return (failed > 0) ? 1 : 0;