/* See LICENSE file for copyright and license details. */
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "../grapheme.h"
#include "util.h"
10 static const struct {
11 char *arr; /* UTF-8 byte sequence */
12 size_t len; /* length of UTF-8 byte sequence */
13 size_t exp_len; /* expected length returned */
14 uint_least32_t exp_cp; /* expected codepoint returned */
15 } dec_test[] = {
16 {
17 /* empty sequence
18 * [ ] ->
19 * INVALID
20 */
21 .arr = NULL,
22 .len = 0,
23 .exp_len = 0,
24 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
25 },
26 {
27 /* invalid lead byte
28 * [ 11111101 ] ->
29 * INVALID
30 */
31 .arr = (char *)(unsigned char[]){ 0xFD },
32 .len = 1,
33 .exp_len = 1,
34 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
35 },
36 {
37 /* valid 1-byte sequence
38 * [ 00000001 ] ->
39 * 0000001
40 */
41 .arr = (char *)(unsigned char[]){ 0x01 },
42 .len = 1,
43 .exp_len = 1,
44 .exp_cp = 0x1,
45 },
46 {
47 /* valid 2-byte sequence
48 * [ 11000011 10111111 ] ->
49 * 00011111111
50 */
51 .arr = (char *)(unsigned char[]){ 0xC3, 0xBF },
52 .len = 2,
53 .exp_len = 2,
54 .exp_cp = 0xFF,
55 },
56 {
57 /* invalid 2-byte sequence (second byte missing)
58 * [ 11000011 ] ->
59 * INVALID
60 */
61 .arr = (char *)(unsigned char[]){ 0xC3 },
62 .len = 1,
63 .exp_len = 2,
64 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
65 },
66 {
67 /* invalid 2-byte sequence (second byte malformed)
68 * [ 11000011 11111111 ] ->
69 * INVALID
70 */
71 .arr = (char *)(unsigned char[]){ 0xC3, 0xFF },
72 .len = 2,
73 .exp_len = 1,
74 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
75 },
76 {
77 /* invalid 2-byte sequence (overlong encoded)
78 * [ 11000001 10111111 ] ->
79 * INVALID
80 */
81 .arr = (char *)(unsigned char[]){ 0xC1, 0xBF },
82 .len = 2,
83 .exp_len = 2,
84 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
85 },
86 {
87 /* valid 3-byte sequence
88 * [ 11100000 10111111 10111111 ] ->
89 * 0000111111111111
90 */
91 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF },
92 .len = 3,
93 .exp_len = 3,
94 .exp_cp = 0xFFF,
95 },
96 {
97 /* invalid 3-byte sequence (second byte missing)
98 * [ 11100000 ] ->
99 * INVALID
100 */
101 .arr = (char *)(unsigned char[]){ 0xE0 },
102 .len = 1,
103 .exp_len = 3,
104 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
105 },
107 /* invalid 3-byte sequence (second byte malformed)
108 * [ 11100000 01111111 10111111 ] ->
109 * INVALID
110 */
111 .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF },
112 .len = 3,
113 .exp_len = 1,
114 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
115 },
117 /* invalid 3-byte sequence (short string, second byte malformed)
118 * [ 11100000 01111111 ] ->
119 * INVALID
120 */
121 .arr = (char *)(unsigned char[]){ 0xE0, 0x7F },
122 .len = 2,
123 .exp_len = 1,
124 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
125 },
127 /* invalid 3-byte sequence (third byte missing)
128 * [ 11100000 10111111 ] ->
129 * INVALID
130 */
131 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF },
132 .len = 2,
133 .exp_len = 3,
134 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
135 },
137 /* invalid 3-byte sequence (third byte malformed)
138 * [ 11100000 10111111 01111111 ] ->
139 * INVALID
140 */
141 .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F },
142 .len = 3,
143 .exp_len = 2,
144 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
145 },
147 /* invalid 3-byte sequence (overlong encoded)
148 * [ 11100000 10011111 10111111 ] ->
149 * INVALID
150 */
151 .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF },
152 .len = 3,
153 .exp_len = 3,
154 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
155 },
157 /* invalid 3-byte sequence (UTF-16 surrogate half)
158 * [ 11101101 10100000 10000000 ] ->
159 * INVALID
160 */
161 .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 },
162 .len = 3,
163 .exp_len = 3,
164 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
165 },
167 /* valid 4-byte sequence
168 * [ 11110011 10111111 10111111 10111111 ] ->
169 * 011111111111111111111
170 */
171 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF },
172 .len = 4,
173 .exp_len = 4,
174 .exp_cp = UINT32_C(0xFFFFF),
175 },
177 /* invalid 4-byte sequence (second byte missing)
178 * [ 11110011 ] ->
179 * INVALID
180 */
181 .arr = (char *)(unsigned char[]){ 0xF3 },
182 .len = 1,
183 .exp_len = 4,
184 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
185 },
187 /* invalid 4-byte sequence (second byte malformed)
188 * [ 11110011 01111111 10111111 10111111 ] ->
189 * INVALID
190 */
191 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF },
192 .len = 4,
193 .exp_len = 1,
194 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
195 },
197 /* invalid 4-byte sequence (short string 1, second byte malformed)
198 * [ 11110011 011111111 ] ->
199 * INVALID
200 */
201 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F },
202 .len = 2,
203 .exp_len = 1,
204 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
205 },
207 /* invalid 4-byte sequence (short string 2, second byte malformed)
208 * [ 11110011 011111111 10111111 ] ->
209 * INVALID
210 */
211 .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF },
212 .len = 3,
213 .exp_len = 1,
214 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
215 },
218 /* invalid 4-byte sequence (third byte missing)
219 * [ 11110011 10111111 ] ->
220 * INVALID
221 */
222 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF },
223 .len = 2,
224 .exp_len = 4,
225 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
226 },
228 /* invalid 4-byte sequence (third byte malformed)
229 * [ 11110011 10111111 01111111 10111111 ] ->
230 * INVALID
231 */
232 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF },
233 .len = 4,
234 .exp_len = 2,
235 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
236 },
238 /* invalid 4-byte sequence (short string, third byte malformed)
239 * [ 11110011 10111111 01111111 ] ->
240 * INVALID
241 */
242 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F },
243 .len = 3,
244 .exp_len = 2,
245 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
246 },
248 /* invalid 4-byte sequence (fourth byte missing)
249 * [ 11110011 10111111 10111111 ] ->
250 * INVALID
251 */
252 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF },
253 .len = 3,
254 .exp_len = 4,
255 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
256 },
258 /* invalid 4-byte sequence (fourth byte malformed)
259 * [ 11110011 10111111 10111111 01111111 ] ->
260 * INVALID
261 */
262 .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F },
263 .len = 4,
264 .exp_len = 3,
265 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
266 },
268 /* invalid 4-byte sequence (overlong encoded)
269 * [ 11110000 10000000 10000001 10111111 ] ->
270 * INVALID
271 */
272 .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF },
273 .len = 4,
274 .exp_len = 4,
275 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
276 },
278 /* invalid 4-byte sequence (UTF-16-unrepresentable)
279 * [ 11110100 10010000 10000000 10000000 ] ->
280 * INVALID
281 */
282 .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 },
283 .len = 4,
284 .exp_len = 4,
285 .exp_cp = GRAPHEME_INVALID_CODEPOINT,
286 },
287 };
289 int
290 main(int argc, char *argv[])
292 size_t i, failed;
294 (void)argc;
296 /* UTF-8 decoder test */
297 for (i = 0, failed = 0; i < LEN(dec_test); i++) {
298 size_t len;
299 uint_least32_t cp;
301 len = grapheme_decode_utf8(dec_test[i].arr,
302 dec_test[i].len, &cp);
304 if (len != dec_test[i].exp_len ||
305 cp != dec_test[i].exp_cp) {
306 fprintf(stderr, "%s: Failed test %zu: "
307 "Expected (%zx,%u), but got (%zx,%u).\n",
308 argv[0], i, dec_test[i].exp_len,
309 dec_test[i].exp_cp, len, cp);
310 failed++;
313 printf("%s: %zu/%zu unit tests passed.\n", argv[0],
314 LEN(dec_test) - failed, LEN(dec_test));
316 return (failed > 0) ? 1 : 0;