Blame


1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stddef.h>
3 3448adb0 2022-11-02 op #include <stdint.h>
4 3448adb0 2022-11-02 op
5 3448adb0 2022-11-02 op #include "../grapheme.h"
6 3448adb0 2022-11-02 op #include "util.h"
7 3448adb0 2022-11-02 op
/*
 * true if l <= c <= u (both bounds inclusive); note that c is
 * evaluated twice, so do not pass expressions with side effects
 */
#define BETWEEN(c, l, u) ((l) <= (c) && (c) <= (u))
9 3448adb0 2022-11-02 op
/*
 * lookup-table describing the UTF-8 sequence types, selected by the
 * range the first byte of a sequence falls into
 *
 * implicit: the index of an entry is the number of following bytes
 * of the form 10xxxxxx (6 bits capacity each)
 */
static const struct {
	uint_least8_t lower;  /* smallest first byte of this sequence type */
	uint_least8_t upper;  /* largest first byte of this sequence type */
	uint_least32_t mincp; /* smallest non-overlong encoded codepoint */
	uint_least32_t maxcp; /* largest encodable codepoint */
} lut[] = {
	/* 0xxxxxxx: 7 bits capacity */
	[0] = {
		.lower = 0x00,
		.upper = 0x7F,
		.mincp = UINT32_C(0x000000),
		.maxcp = UINT32_C(0x00007F),
	},
	/* 110xxxxx: 5+6=11 bits capacity */
	[1] = {
		.lower = 0xC0,
		.upper = 0xDF,
		.mincp = UINT32_C(0x000080),
		.maxcp = UINT32_C(0x0007FF),
	},
	/* 1110xxxx: 4+6+6=16 bits capacity */
	[2] = {
		.lower = 0xE0,
		.upper = 0xEF,
		.mincp = UINT32_C(0x000800),
		.maxcp = UINT32_C(0x00FFFF),
	},
	/* 11110xxx: 3+6+6+6=21 bits capacity */
	[3] = {
		.lower = 0xF0,
		.upper = 0xF7,
		.mincp = UINT32_C(0x010000),
		.maxcp = UINT32_C(0x1FFFFF),
	},
};
50 3448adb0 2022-11-02 op
51 3448adb0 2022-11-02 op size_t
52 3448adb0 2022-11-02 op grapheme_decode_utf8(const char *str, size_t len, uint_least32_t *cp)
53 3448adb0 2022-11-02 op {
54 3448adb0 2022-11-02 op size_t off, i;
55 3448adb0 2022-11-02 op uint_least32_t tmp;
56 3448adb0 2022-11-02 op
57 3448adb0 2022-11-02 op if (cp == NULL) {
58 3448adb0 2022-11-02 op /*
59 3448adb0 2022-11-02 op * instead of checking every time if cp is NULL within
60 3448adb0 2022-11-02 op * the decoder, simply point it at a dummy variable here.
61 3448adb0 2022-11-02 op */
62 3448adb0 2022-11-02 op cp = &tmp;
63 3448adb0 2022-11-02 op }
64 3448adb0 2022-11-02 op
65 3448adb0 2022-11-02 op if (str == NULL || len == 0) {
66 3448adb0 2022-11-02 op /* a sequence must be at least 1 byte long */
67 3448adb0 2022-11-02 op *cp = GRAPHEME_INVALID_CODEPOINT;
68 3448adb0 2022-11-02 op return 0;
69 3448adb0 2022-11-02 op }
70 3448adb0 2022-11-02 op
71 3448adb0 2022-11-02 op /* identify sequence type with the first byte */
72 3448adb0 2022-11-02 op for (off = 0; off < LEN(lut); off++) {
73 3448adb0 2022-11-02 op if (BETWEEN(((const unsigned char *)str)[0], lut[off].lower,
74 3448adb0 2022-11-02 op lut[off].upper)) {
75 3448adb0 2022-11-02 op /*
76 3448adb0 2022-11-02 op * first byte is within the bounds; fill
77 3448adb0 2022-11-02 op * p with the the first bits contained in
78 3448adb0 2022-11-02 op * the first byte (by subtracting the high bits)
79 3448adb0 2022-11-02 op */
80 3448adb0 2022-11-02 op *cp = ((const unsigned char *)str)[0] - lut[off].lower;
81 3448adb0 2022-11-02 op break;
82 3448adb0 2022-11-02 op }
83 3448adb0 2022-11-02 op }
84 3448adb0 2022-11-02 op if (off == LEN(lut)) {
85 3448adb0 2022-11-02 op /*
86 3448adb0 2022-11-02 op * first byte does not match a sequence type;
87 3448adb0 2022-11-02 op * set cp as invalid and return 1 byte processed
88 3448adb0 2022-11-02 op *
89 3448adb0 2022-11-02 op * this also includes the cases where bits higher than
90 3448adb0 2022-11-02 op * the 8th are set on systems with CHAR_BIT > 8
91 3448adb0 2022-11-02 op */
92 3448adb0 2022-11-02 op *cp = GRAPHEME_INVALID_CODEPOINT;
93 3448adb0 2022-11-02 op return 1;
94 3448adb0 2022-11-02 op }
95 3448adb0 2022-11-02 op if (1 + off > len) {
96 3448adb0 2022-11-02 op /*
97 3448adb0 2022-11-02 op * input is not long enough, set cp as invalid
98 3448adb0 2022-11-02 op */
99 3448adb0 2022-11-02 op *cp = GRAPHEME_INVALID_CODEPOINT;
100 3448adb0 2022-11-02 op
101 3448adb0 2022-11-02 op /*
102 3448adb0 2022-11-02 op * count the following continuation bytes, but nothing
103 3448adb0 2022-11-02 op * else in case we have a "rogue" case where e.g. such a
104 3448adb0 2022-11-02 op * sequence starter occurs right before a NUL-byte.
105 3448adb0 2022-11-02 op */
106 3448adb0 2022-11-02 op for (i = 0; 1 + i < len; i++) {
107 3448adb0 2022-11-02 op if(!BETWEEN(((const unsigned char *)str)[1 + i],
108 3448adb0 2022-11-02 op 0x80, 0xBF)) {
109 3448adb0 2022-11-02 op break;
110 3448adb0 2022-11-02 op }
111 3448adb0 2022-11-02 op }
112 3448adb0 2022-11-02 op
113 3448adb0 2022-11-02 op /*
114 3448adb0 2022-11-02 op * if the continuation bytes do not continue until
115 3448adb0 2022-11-02 op * the end, return the incomplete sequence length.
116 3448adb0 2022-11-02 op * Otherwise return the number of bytes we actually
117 3448adb0 2022-11-02 op * expected, which is larger than n.
118 3448adb0 2022-11-02 op */
119 3448adb0 2022-11-02 op return ((1 + i) < len) ? (1 + i) : (1 + off);
120 3448adb0 2022-11-02 op }
121 3448adb0 2022-11-02 op
122 3448adb0 2022-11-02 op /*
123 3448adb0 2022-11-02 op * process 'off' following bytes, each of the form 10xxxxxx
124 3448adb0 2022-11-02 op * (i.e. between 0x80 (10000000) and 0xBF (10111111))
125 3448adb0 2022-11-02 op */
126 3448adb0 2022-11-02 op for (i = 1; i <= off; i++) {
127 3448adb0 2022-11-02 op if(!BETWEEN(((const unsigned char *)str)[i], 0x80, 0xBF)) {
128 3448adb0 2022-11-02 op /*
129 3448adb0 2022-11-02 op * byte does not match format; return
130 3448adb0 2022-11-02 op * number of bytes processed excluding the
131 3448adb0 2022-11-02 op * unexpected character as recommended since
132 3448adb0 2022-11-02 op * Unicode 6 (chapter 3)
133 3448adb0 2022-11-02 op *
134 3448adb0 2022-11-02 op * this also includes the cases where bits
135 3448adb0 2022-11-02 op * higher than the 8th are set on systems
136 3448adb0 2022-11-02 op * with CHAR_BIT > 8
137 3448adb0 2022-11-02 op */
138 3448adb0 2022-11-02 op *cp = GRAPHEME_INVALID_CODEPOINT;
139 3448adb0 2022-11-02 op return 1 + (i - 1);
140 3448adb0 2022-11-02 op }
141 3448adb0 2022-11-02 op /*
142 3448adb0 2022-11-02 op * shift codepoint by 6 bits and add the 6 stored bits
143 3448adb0 2022-11-02 op * in s[i] to it using the bitmask 0x3F (00111111)
144 3448adb0 2022-11-02 op */
145 3448adb0 2022-11-02 op *cp = (*cp << 6) | (((const unsigned char *)str)[i] & 0x3F);
146 3448adb0 2022-11-02 op }
147 3448adb0 2022-11-02 op
148 3448adb0 2022-11-02 op if (*cp < lut[off].mincp ||
149 3448adb0 2022-11-02 op BETWEEN(*cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
150 3448adb0 2022-11-02 op *cp > UINT32_C(0x10FFFF)) {
151 3448adb0 2022-11-02 op /*
152 3448adb0 2022-11-02 op * codepoint is overlong encoded in the sequence, is a
153 3448adb0 2022-11-02 op * high or low UTF-16 surrogate half (0xD800..0xDFFF) or
154 3448adb0 2022-11-02 op * not representable in UTF-16 (>0x10FFFF) (RFC-3629
155 3448adb0 2022-11-02 op * specifies the latter two conditions)
156 3448adb0 2022-11-02 op */
157 3448adb0 2022-11-02 op *cp = GRAPHEME_INVALID_CODEPOINT;
158 3448adb0 2022-11-02 op }
159 3448adb0 2022-11-02 op
160 3448adb0 2022-11-02 op return 1 + off;
161 3448adb0 2022-11-02 op }
162 3448adb0 2022-11-02 op
163 3448adb0 2022-11-02 op size_t
164 3448adb0 2022-11-02 op grapheme_encode_utf8(uint_least32_t cp, char *str, size_t len)
165 3448adb0 2022-11-02 op {
166 3448adb0 2022-11-02 op size_t off, i;
167 3448adb0 2022-11-02 op
168 3448adb0 2022-11-02 op if (BETWEEN(cp, UINT32_C(0xD800), UINT32_C(0xDFFF)) ||
169 3448adb0 2022-11-02 op cp > UINT32_C(0x10FFFF)) {
170 3448adb0 2022-11-02 op /*
171 3448adb0 2022-11-02 op * codepoint is a high or low UTF-16 surrogate half
172 3448adb0 2022-11-02 op * (0xD800..0xDFFF) or not representable in UTF-16
173 3448adb0 2022-11-02 op * (>0x10FFFF), which RFC-3629 deems invalid for UTF-8.
174 3448adb0 2022-11-02 op */
175 3448adb0 2022-11-02 op cp = GRAPHEME_INVALID_CODEPOINT;
176 3448adb0 2022-11-02 op }
177 3448adb0 2022-11-02 op
178 3448adb0 2022-11-02 op /* determine necessary sequence type */
179 3448adb0 2022-11-02 op for (off = 0; off < LEN(lut); off++) {
180 3448adb0 2022-11-02 op if (cp <= lut[off].maxcp) {
181 3448adb0 2022-11-02 op break;
182 3448adb0 2022-11-02 op }
183 3448adb0 2022-11-02 op }
184 3448adb0 2022-11-02 op if (1 + off > len || str == NULL || len == 0) {
185 3448adb0 2022-11-02 op /*
186 3448adb0 2022-11-02 op * specified buffer is too small to store sequence or
187 3448adb0 2022-11-02 op * the caller just wanted to know how many bytes the
188 3448adb0 2022-11-02 op * codepoint needs by passing a NULL-buffer.
189 3448adb0 2022-11-02 op */
190 3448adb0 2022-11-02 op return 1 + off;
191 3448adb0 2022-11-02 op }
192 3448adb0 2022-11-02 op
193 3448adb0 2022-11-02 op /* build sequence by filling cp-bits into each byte */
194 3448adb0 2022-11-02 op
195 3448adb0 2022-11-02 op /*
196 3448adb0 2022-11-02 op * lut[off].lower is the bit-format for the first byte and
197 3448adb0 2022-11-02 op * the bits to fill into it are determined by shifting the
198 3448adb0 2022-11-02 op * cp 6 times the number of following bytes, as each
199 3448adb0 2022-11-02 op * following byte stores 6 bits, yielding the wanted bits.
200 3448adb0 2022-11-02 op *
201 3448adb0 2022-11-02 op * We do not overwrite the mask because we guaranteed earlier
202 3448adb0 2022-11-02 op * that there are no bits higher than the mask allows.
203 3448adb0 2022-11-02 op */
204 3448adb0 2022-11-02 op ((unsigned char *)str)[0] = lut[off].lower |
205 3448adb0 2022-11-02 op (uint_least8_t)(cp >> (6 * off));
206 3448adb0 2022-11-02 op
207 3448adb0 2022-11-02 op for (i = 1; i <= off; i++) {
208 3448adb0 2022-11-02 op /*
209 3448adb0 2022-11-02 op * the bit-format for following bytes is 10000000 (0x80)
210 3448adb0 2022-11-02 op * and it each stores 6 bits in the 6 low bits that we
211 3448adb0 2022-11-02 op * extract from the properly-shifted value using the
212 3448adb0 2022-11-02 op * mask 00111111 (0x3F)
213 3448adb0 2022-11-02 op */
214 3448adb0 2022-11-02 op ((unsigned char *)str)[i] = 0x80 |
215 3448adb0 2022-11-02 op ((cp >> (6 * (off - i))) & 0x3F);
216 3448adb0 2022-11-02 op }
217 3448adb0 2022-11-02 op
218 3448adb0 2022-11-02 op return 1 + off;
219 3448adb0 2022-11-02 op }