1 /* See LICENSE file for copyright and license details. */
7 #include "../grapheme.h"
11 char *arr; /* UTF-8 byte sequence */
12 size_t len; /* length of UTF-8 byte sequence */
13 size_t exp_len; /* expected length returned */
14 uint_least32_t exp_cp; /* expected codepoint returned */
24 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
31 .arr = (char *)(unsigned char[]){ 0xFD },
34 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
37 /* valid 1-byte sequence
41 .arr = (char *)(unsigned char[]){ 0x01 },
47 /* valid 2-byte sequence
48 * [ 11000011 10111111 ] ->
51 .arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
57 /* invalid 2-byte sequence (second byte missing)
61 .arr = (char *)(unsigned char[]){ 0xC3 },
64 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
67 /* invalid 2-byte sequence (second byte malformed)
68 * [ 11000011 11111111 ] ->
71 .arr = (char *)(unsigned char[]){ 0xC3, 0xFF },
74 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
77 /* invalid 2-byte sequence (overlong encoded)
78 * [ 11000001 10111111 ] ->
81 .arr = (char *)(unsigned char[]){ 0xC1, 0xBF },
84 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
87 /* valid 3-byte sequence
88 * [ 11100000 10111111 10111111 ] ->
91 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
97 /* invalid 3-byte sequence (second byte missing)
101 .arr = (char *)(unsigned char[]){ 0xE0 },
104 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
107 /* invalid 3-byte sequence (second byte malformed)
108 * [ 11100000 01111111 10111111 ] ->
111 .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
114 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
117 /* invalid 3-byte sequence (short string, second byte malformed)
118 * [ 11100000 01111111 ] ->
121 .arr = (char *)(unsigned char[]){ 0xE0, 0x7F },
124 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
127 /* invalid 3-byte sequence (third byte missing)
128 * [ 11100000 10111111 ] ->
131 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF },
134 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
137 /* invalid 3-byte sequence (third byte malformed)
138 * [ 11100000 10111111 01111111 ] ->
141 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
144 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
147 /* invalid 3-byte sequence (overlong encoded)
148 * [ 11100000 10011111 10111111 ] ->
151 .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
154 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
157 /* invalid 3-byte sequence (UTF-16 surrogate half)
158 * [ 11101101 10100000 10000000 ] ->
161 .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
164 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
167 /* valid 4-byte sequence
168 * [ 11110011 10111111 10111111 10111111 ] ->
169 * 011111111111111111111
171 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
174 .exp_cp = UINT32_C(0xFFFFF),
177 /* invalid 4-byte sequence (second byte missing)
181 .arr = (char *)(unsigned char[]){ 0xF3 },
184 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
187 /* invalid 4-byte sequence (second byte malformed)
188 * [ 11110011 01111111 10111111 10111111 ] ->
191 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
194 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
197 /* invalid 4-byte sequence (short string 1, second byte malformed)
198 * [ 11110011 01111111 ] ->
201 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F },
204 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
207 /* invalid 4-byte sequence (short string 2, second byte malformed)
208 * [ 11110011 01111111 10111111 ] ->
211 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
214 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
218 /* invalid 4-byte sequence (third byte missing)
219 * [ 11110011 10111111 ] ->
222 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF },
225 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
228 /* invalid 4-byte sequence (third byte malformed)
229 * [ 11110011 10111111 01111111 10111111 ] ->
232 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
235 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
238 /* invalid 4-byte sequence (short string, third byte malformed)
239 * [ 11110011 10111111 01111111 ] ->
242 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
245 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
248 /* invalid 4-byte sequence (fourth byte missing)
249 * [ 11110011 10111111 10111111 ] ->
252 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
255 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
258 /* invalid 4-byte sequence (fourth byte malformed)
259 * [ 11110011 10111111 10111111 01111111 ] ->
262 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
265 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
268 /* invalid 4-byte sequence (overlong encoded)
269 * [ 11110000 10000000 10000001 10111111 ] ->
272 .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
275 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
278 /* invalid 4-byte sequence (UTF-16-unrepresentable)
279 * [ 11110100 10010000 10000000 10000000 ] ->
282 .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
285 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
290 main(int argc, char *argv[])
296 /* UTF-8 decoder test */
297 for (i = 0, failed = 0; i < LEN(dec_test); i++) {
301 len = grapheme_decode_utf8(dec_test[i].arr,
302 dec_test[i].len, &cp);
304 if (len != dec_test[i].exp_len ||
305 cp != dec_test[i].exp_cp) {
306 fprintf(stderr, "%s: Failed test %zu: "
307 "Expected (%zx,%u), but got (%zx,%u).\n",
308 argv[0], i, dec_test[i].exp_len,
309 dec_test[i].exp_cp, len, cp);
313 printf("%s: %zu/%zu unit tests passed.\n", argv[0],
314 LEN(dec_test) - failed, LEN(dec_test));
316 return (failed > 0) ? 1 : 0;