/* See LICENSE file for copyright and license details. */ #include #include #include #include #include "../grapheme.h" #include "util.h" static const struct { char *arr; /* UTF-8 byte sequence */ size_t len; /* length of UTF-8 byte sequence */ size_t exp_len; /* expected length returned */ uint_least32_t exp_cp; /* expected codepoint returned */ } dec_test[] = { { /* empty sequence * [ ] -> * INVALID */ .arr = NULL, .len = 0, .exp_len = 0, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid lead byte * [ 11111101 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xFD }, .len = 1, .exp_len = 1, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* valid 1-byte sequence * [ 00000001 ] -> * 0000001 */ .arr = (char *)(unsigned char[]){ 0x01 }, .len = 1, .exp_len = 1, .exp_cp = 0x1, }, { /* valid 2-byte sequence * [ 11000011 10111111 ] -> * 00011111111 */ .arr = (char *)(unsigned char[]){ 0xC3, 0xBF }, .len = 2, .exp_len = 2, .exp_cp = 0xFF, }, { /* invalid 2-byte sequence (second byte missing) * [ 11000011 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xC3 }, .len = 1, .exp_len = 2, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 2-byte sequence (second byte malformed) * [ 11000011 11111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xC3, 0xFF }, .len = 2, .exp_len = 1, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 2-byte sequence (overlong encoded) * [ 11000001 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xC1, 0xBF }, .len = 2, .exp_len = 2, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* valid 3-byte sequence * [ 11100000 10111111 10111111 ] -> * 0000111111111111 */ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0xBF }, .len = 3, .exp_len = 3, .exp_cp = 0xFFF, }, { /* invalid 3-byte sequence (second byte missing) * [ 11100000 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xE0 }, .len = 1, .exp_len = 3, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 3-byte sequence (second byte malformed) * [ 11100000 01111111 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xE0, 0x7F, 0xBF }, .len = 3, .exp_len = 1, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 3-byte sequence (short string, second byte malformed) * [ 11100000 01111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xE0, 0x7F }, .len = 2, .exp_len = 1, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 3-byte sequence (third byte missing) * [ 11100000 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF }, .len = 2, .exp_len = 3, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 3-byte sequence (third byte malformed) * [ 11100000 10111111 01111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xE0, 0xBF, 0x7F }, .len = 3, .exp_len = 2, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 3-byte sequence (overlong encoded) * [ 11100000 10011111 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xE0, 0x9F, 0xBF }, .len = 3, .exp_len = 3, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 3-byte sequence (UTF-16 surrogate half) * [ 11101101 10100000 10000000 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xED, 0xA0, 0x80 }, .len = 3, .exp_len = 3, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* valid 4-byte sequence * [ 11110011 10111111 10111111 10111111 ] -> * 011111111111111111111 */ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0xBF }, .len = 4, .exp_len = 4, .exp_cp = UINT32_C(0xFFFFF), }, { /* invalid 4-byte sequence (second byte missing) * [ 11110011 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3 }, .len = 1, .exp_len = 4, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (second byte malformed) * [ 11110011 01111111 10111111 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF, 0xBF }, .len = 4, .exp_len = 1, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (short string 1, second byte malformed) * [ 11110011 011111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3, 0x7F }, .len = 2, .exp_len = 1, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (short string 2, second byte malformed) * [ 11110011 011111111 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3, 0x7F, 0xBF }, .len = 3, .exp_len = 1, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (third byte missing) * [ 11110011 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF }, .len = 2, .exp_len = 4, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (third byte malformed) * [ 11110011 10111111 01111111 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F, 0xBF }, .len = 4, .exp_len = 2, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (short string, third byte malformed) * [ 11110011 10111111 01111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0x7F }, .len = 3, .exp_len = 2, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (fourth byte missing) * [ 11110011 10111111 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF }, .len = 3, .exp_len = 4, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (fourth byte malformed) * [ 11110011 10111111 10111111 01111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF3, 0xBF, 0xBF, 0x7F }, .len = 4, .exp_len = 3, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (overlong encoded) * [ 11110000 10000000 10000001 10111111 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF0, 0x80, 0x81, 0xBF }, .len = 4, .exp_len = 4, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, { /* invalid 4-byte sequence (UTF-16-unrepresentable) * [ 11110100 10010000 10000000 10000000 ] -> * INVALID */ .arr = (char *)(unsigned char[]){ 0xF4, 0x90, 0x80, 0x80 }, .len = 4, .exp_len = 4, .exp_cp = GRAPHEME_INVALID_CODEPOINT, }, }; int main(int argc, char *argv[]) { size_t i, failed; (void)argc; /* UTF-8 decoder test */ for (i = 0, failed = 0; i < LEN(dec_test); i++) { size_t len; uint_least32_t cp; len = grapheme_decode_utf8(dec_test[i].arr, dec_test[i].len, &cp); if (len != dec_test[i].exp_len || cp != dec_test[i].exp_cp) { fprintf(stderr, "%s: Failed test %zu: " "Expected (%zx,%u), but got (%zx,%u).\n", argv[0], i, dec_test[i].exp_len, dec_test[i].exp_cp, len, cp); failed++; } } printf("%s: %zu/%zu unit tests passed.\n", argv[0], LEN(dec_test) - failed, LEN(dec_test)); return (failed > 0) ? 1 : 0; }