/*
 * Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
 *
 * Permission is hereby granted, free of charge, to any person
 * obtaining a copy of this software and associated documentation
 * files (the "Software"), to deal in the Software without
 * restriction, including without limitation the rights to use, copy,
 * modify, merge, publish, distribute, sublicense, and/or sell copies
 * of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
24 ef04b551 2021-01-09 op
25 52418c8d 2021-02-12 op #include "gmid.h"
26 52418c8d 2021-02-12 op
27 ef04b551 2021-01-09 op #include <stddef.h>
28 ef04b551 2021-01-09 op #include <stdint.h>
29 ef04b551 2021-01-09 op
/* Decoder states tested by callers; every other state means "in progress". */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table driving utf8_decode().
 *
 * The first 256 entries map an input byte to its character class.
 * The remaining entries form the transition table: the next state
 * is found at index 256 + state * 16 + class.
 */
static const uint8_t utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};
49 ef04b551 2021-01-09 op
50 945d22d1 2021-01-10 op static inline uint32_t
51 ef04b551 2021-01-09 op utf8_decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
52 ef04b551 2021-01-09 op uint32_t type = utf8d[byte];
53 ef04b551 2021-01-09 op
54 ef04b551 2021-01-09 op *codep = (*state != UTF8_ACCEPT) ?
55 ef04b551 2021-01-09 op (byte & 0x3fu) | (*codep << 6) :
56 ef04b551 2021-01-09 op (0xff >> type) & (byte);
57 ef04b551 2021-01-09 op
58 ef04b551 2021-01-09 op *state = utf8d[256 + *state*16 + type];
59 ef04b551 2021-01-09 op return *state;
60 ef04b551 2021-01-09 op }
61 ef04b551 2021-01-09 op
62 ef04b551 2021-01-09 op /* for the iri parser. Modelled after printCodePoints */
63 ef04b551 2021-01-09 op int
64 ef04b551 2021-01-09 op valid_multibyte_utf8(struct parser *p)
65 ef04b551 2021-01-09 op {
66 ef04b551 2021-01-09 op uint32_t cp = 0, state = 0;
67 ef04b551 2021-01-09 op
68 4842c72d 2021-10-18 op for (; *p->iri; p->iri++)
69 3c1cf9d0 2021-01-11 op if (!utf8_decode(&state, &cp, *p->iri))
70 ef04b551 2021-01-09 op break;
71 ef04b551 2021-01-09 op
72 ef04b551 2021-01-09 op /* reject the ASCII range */
73 ef04b551 2021-01-09 op if (state || cp <= 0x7F) {
74 ef04b551 2021-01-09 op /* XXX: do some error recovery? */
75 ef04b551 2021-01-09 op if (state)
76 ef04b551 2021-01-09 op p->err = "invalid UTF-8 character";
77 ef04b551 2021-01-09 op return 0;
78 ef04b551 2021-01-09 op }
79 ef04b551 2021-01-09 op return 1;
80 ef04b551 2021-01-09 op }
81 3300cbe0 2021-01-27 op
82 3300cbe0 2021-01-27 op char *
83 3300cbe0 2021-01-27 op utf8_nth(char *s, size_t n)
84 3300cbe0 2021-01-27 op {
85 3300cbe0 2021-01-27 op size_t i;
86 3300cbe0 2021-01-27 op uint32_t cp = 0, state = 0;
87 3300cbe0 2021-01-27 op
88 3300cbe0 2021-01-27 op for (i = 0; *s && i < n; ++s)
89 3300cbe0 2021-01-27 op if (!utf8_decode(&state, &cp, *s))
90 3300cbe0 2021-01-27 op ++i;
91 3300cbe0 2021-01-27 op
92 3300cbe0 2021-01-27 op if (state != UTF8_ACCEPT)
93 3300cbe0 2021-01-27 op return NULL;
94 3300cbe0 2021-01-27 op if (i == n)
95 3300cbe0 2021-01-27 op return s;
96 3300cbe0 2021-01-27 op return NULL;
97 3300cbe0 2021-01-27 op }