Blame


1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <limits.h>
3 3448adb0 2022-11-02 op #include <stdbool.h>
4 3448adb0 2022-11-02 op #include <stddef.h>
5 3448adb0 2022-11-02 op
6 3448adb0 2022-11-02 op #include "../gen/character.h"
7 3448adb0 2022-11-02 op #include "../grapheme.h"
8 3448adb0 2022-11-02 op #include "util.h"
9 3448adb0 2022-11-02 op
/*
 * Unpacked form of the opaque 16-bit state that callers hand to
 * grapheme_is_character_break(); see state_serialize() and
 * state_deserialize() for the packed bit layout.
 */
struct character_break_state {
	/* break property remembered for the left-hand codepoint */
	uint_least8_t prop;
	/* true once 'prop' holds a remembered value */
	bool prop_set;
	/* running flag consulted for the GB11 tables */
	bool gb11_flag;
	/* running flag consulted for the GB12/GB13 tables */
	bool gb12_13_flag;
};
16 3448adb0 2022-11-02 op
17 3448adb0 2022-11-02 op static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
18 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_OTHER] =
19 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
20 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
21 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
22 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_CR] =
23 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */
24 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTEND] =
25 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
26 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
27 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
28 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
29 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
30 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
31 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
32 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_L] =
33 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
34 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
35 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
36 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
37 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
38 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
39 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
40 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_V] =
41 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
42 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
43 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
44 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
45 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
46 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_T] =
47 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
48 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
49 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
50 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
51 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_LV] =
52 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
53 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
54 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
55 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
56 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
57 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_LVT] =
58 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
59 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
60 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
61 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
62 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_PREPEND] =
63 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
64 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
65 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
66 3448adb0 2022-11-02 op (UINT16_C(0xFFFF) &
67 3448adb0 2022-11-02 op ~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
68 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_LF |
69 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_CONTROL
70 3448adb0 2022-11-02 op )
71 3448adb0 2022-11-02 op ), /* GB9b */
72 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
73 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
74 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
75 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
76 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_SPACINGMARK] =
77 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
78 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
79 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
80 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_ZWJ] =
81 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
82 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
83 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
84 3448adb0 2022-11-02 op };
85 3448adb0 2022-11-02 op static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
86 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
87 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
88 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
89 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
90 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
91 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
92 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
93 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
94 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
95 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
96 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
97 3448adb0 2022-11-02 op };
98 3448adb0 2022-11-02 op static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
99 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
100 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
101 3448adb0 2022-11-02 op };
102 3448adb0 2022-11-02 op static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
103 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
104 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
105 3448adb0 2022-11-02 op };
106 3448adb0 2022-11-02 op static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
107 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
108 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
109 3448adb0 2022-11-02 op };
110 3448adb0 2022-11-02 op
111 3448adb0 2022-11-02 op static inline enum char_break_property
112 3448adb0 2022-11-02 op get_break_prop(uint_least32_t cp)
113 3448adb0 2022-11-02 op {
114 3448adb0 2022-11-02 op if (likely(cp <= UINT32_C(0x10FFFF))) {
115 3448adb0 2022-11-02 op return (enum char_break_property)
116 3448adb0 2022-11-02 op char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)];
117 3448adb0 2022-11-02 op } else {
118 3448adb0 2022-11-02 op return CHAR_BREAK_PROP_OTHER;
119 3448adb0 2022-11-02 op }
120 3448adb0 2022-11-02 op }
121 3448adb0 2022-11-02 op
122 3448adb0 2022-11-02 op static inline void
123 3448adb0 2022-11-02 op state_serialize(const struct character_break_state *in, uint_least16_t *out)
124 3448adb0 2022-11-02 op {
125 3448adb0 2022-11-02 op *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
126 3448adb0 2022-11-02 op (uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | /* 9th bit */
127 3448adb0 2022-11-02 op (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */
128 3448adb0 2022-11-02 op (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); /* 11th bit */
129 3448adb0 2022-11-02 op }
130 3448adb0 2022-11-02 op
131 3448adb0 2022-11-02 op static inline void
132 3448adb0 2022-11-02 op state_deserialize(uint_least16_t in, struct character_break_state *out)
133 3448adb0 2022-11-02 op {
134 3448adb0 2022-11-02 op out->prop = in & UINT8_C(0xFF);
135 3448adb0 2022-11-02 op out->prop_set = in & (UINT16_C(1) << 8);
136 3448adb0 2022-11-02 op out->gb11_flag = in & (UINT16_C(1) << 9);
137 3448adb0 2022-11-02 op out->gb12_13_flag = in & (UINT16_C(1) << 10);
138 3448adb0 2022-11-02 op }
139 3448adb0 2022-11-02 op
140 3448adb0 2022-11-02 op bool
141 3448adb0 2022-11-02 op grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
142 3448adb0 2022-11-02 op {
143 3448adb0 2022-11-02 op struct character_break_state state;
144 3448adb0 2022-11-02 op enum char_break_property cp0_prop, cp1_prop;
145 3448adb0 2022-11-02 op bool notbreak = false;
146 3448adb0 2022-11-02 op
147 3448adb0 2022-11-02 op if (likely(s)) {
148 3448adb0 2022-11-02 op state_deserialize(*s, &state);
149 3448adb0 2022-11-02 op
150 3448adb0 2022-11-02 op if (likely(state.prop_set)) {
151 3448adb0 2022-11-02 op cp0_prop = state.prop;
152 3448adb0 2022-11-02 op } else {
153 3448adb0 2022-11-02 op cp0_prop = get_break_prop(cp0);
154 3448adb0 2022-11-02 op }
155 3448adb0 2022-11-02 op cp1_prop = get_break_prop(cp1);
156 3448adb0 2022-11-02 op
157 3448adb0 2022-11-02 op /* preserve prop of right codepoint for next iteration */
158 3448adb0 2022-11-02 op state.prop = (uint_least8_t)cp1_prop;
159 3448adb0 2022-11-02 op state.prop_set = true;
160 3448adb0 2022-11-02 op
161 3448adb0 2022-11-02 op /* update flags */
162 3448adb0 2022-11-02 op state.gb11_flag =
163 3448adb0 2022-11-02 op flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
164 3448adb0 2022-11-02 op state.gb11_flag] &
165 3448adb0 2022-11-02 op UINT16_C(1) << cp1_prop;
166 3448adb0 2022-11-02 op state.gb12_13_flag =
167 3448adb0 2022-11-02 op flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
168 3448adb0 2022-11-02 op state.gb12_13_flag] &
169 3448adb0 2022-11-02 op UINT16_C(1) << cp1_prop;
170 3448adb0 2022-11-02 op
171 3448adb0 2022-11-02 op /*
172 3448adb0 2022-11-02 op * Apply grapheme cluster breaking algorithm (UAX #29), see
173 3448adb0 2022-11-02 op * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
174 3448adb0 2022-11-02 op */
175 3448adb0 2022-11-02 op notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
176 3448adb0 2022-11-02 op (dont_break_gb11[cp0_prop + state.gb11_flag *
177 3448adb0 2022-11-02 op NUM_CHAR_BREAK_PROPS] &
178 3448adb0 2022-11-02 op (UINT16_C(1) << cp1_prop)) ||
179 3448adb0 2022-11-02 op (dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
180 3448adb0 2022-11-02 op NUM_CHAR_BREAK_PROPS] &
181 3448adb0 2022-11-02 op (UINT16_C(1) << cp1_prop));
182 3448adb0 2022-11-02 op
183 3448adb0 2022-11-02 op /* update or reset flags (when we have a break) */
184 3448adb0 2022-11-02 op if (likely(!notbreak)) {
185 3448adb0 2022-11-02 op state.gb11_flag = state.gb12_13_flag = false;
186 3448adb0 2022-11-02 op }
187 3448adb0 2022-11-02 op
188 3448adb0 2022-11-02 op state_serialize(&state, s);
189 3448adb0 2022-11-02 op } else {
190 3448adb0 2022-11-02 op cp0_prop = get_break_prop(cp0);
191 3448adb0 2022-11-02 op cp1_prop = get_break_prop(cp1);
192 3448adb0 2022-11-02 op
193 3448adb0 2022-11-02 op /*
194 3448adb0 2022-11-02 op * Apply grapheme cluster breaking algorithm (UAX #29), see
195 3448adb0 2022-11-02 op * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
196 3448adb0 2022-11-02 op *
197 3448adb0 2022-11-02 op * Given we have no state, this behaves as if the state-booleans
198 3448adb0 2022-11-02 op * were all set to false
199 3448adb0 2022-11-02 op */
200 3448adb0 2022-11-02 op notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
201 3448adb0 2022-11-02 op (dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
202 3448adb0 2022-11-02 op (dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop));
203 3448adb0 2022-11-02 op }
204 3448adb0 2022-11-02 op
205 3448adb0 2022-11-02 op return !notbreak;
206 3448adb0 2022-11-02 op }
207 3448adb0 2022-11-02 op
208 3448adb0 2022-11-02 op static size_t
209 3448adb0 2022-11-02 op next_character_break(HERODOTUS_READER *r)
210 3448adb0 2022-11-02 op {
211 3448adb0 2022-11-02 op uint_least16_t state = 0;
212 3448adb0 2022-11-02 op uint_least32_t cp0 = 0, cp1 = 0;
213 3448adb0 2022-11-02 op
214 3448adb0 2022-11-02 op for (herodotus_read_codepoint(r, true, &cp0);
215 3448adb0 2022-11-02 op herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
216 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp0)) {
217 3448adb0 2022-11-02 op if (grapheme_is_character_break(cp0, cp1, &state)) {
218 3448adb0 2022-11-02 op break;
219 3448adb0 2022-11-02 op }
220 3448adb0 2022-11-02 op }
221 3448adb0 2022-11-02 op
222 3448adb0 2022-11-02 op return herodotus_reader_number_read(r);
223 3448adb0 2022-11-02 op }
224 3448adb0 2022-11-02 op
225 3448adb0 2022-11-02 op size_t
226 3448adb0 2022-11-02 op grapheme_next_character_break(const uint_least32_t *str, size_t len)
227 3448adb0 2022-11-02 op {
228 3448adb0 2022-11-02 op HERODOTUS_READER r;
229 3448adb0 2022-11-02 op
230 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
231 3448adb0 2022-11-02 op
232 3448adb0 2022-11-02 op return next_character_break(&r);
233 3448adb0 2022-11-02 op }
234 3448adb0 2022-11-02 op
235 3448adb0 2022-11-02 op size_t
236 3448adb0 2022-11-02 op grapheme_next_character_break_utf8(const char *str, size_t len)
237 3448adb0 2022-11-02 op {
238 3448adb0 2022-11-02 op HERODOTUS_READER r;
239 3448adb0 2022-11-02 op
240 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
241 3448adb0 2022-11-02 op
242 3448adb0 2022-11-02 op return next_character_break(&r);
243 3448adb0 2022-11-02 op }