Blame


1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stdbool.h>
3 3448adb0 2022-11-02 op #include <stddef.h>
4 3448adb0 2022-11-02 op
5 3448adb0 2022-11-02 op #include "../gen/word.h"
6 3448adb0 2022-11-02 op #include "../grapheme.h"
7 3448adb0 2022-11-02 op #include "util.h"
8 3448adb0 2022-11-02 op
/*
 * Mutable state carried along while scanning for a word break.  It is
 * handed to proper_init() as an opaque pointer and updated by
 * word_skip_shift_callback().
 */
struct word_break_state {
	/*
	 * true while the number of consecutive regional indicators on
	 * the left side of the candidate breakpoint is even (zero
	 * counts as even); consulted by the WB15/WB16 check.
	 */
	bool ri_even;
};
13 3448adb0 2022-11-02 op
14 3448adb0 2022-11-02 op static inline uint_least8_t
15 3448adb0 2022-11-02 op get_word_break_prop(uint_least32_t cp)
16 3448adb0 2022-11-02 op {
17 3448adb0 2022-11-02 op if (likely(cp <= 0x10FFFF)) {
18 3448adb0 2022-11-02 op return (uint_least8_t)
19 3448adb0 2022-11-02 op word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
20 3448adb0 2022-11-02 op } else {
21 3448adb0 2022-11-02 op return WORD_BREAK_PROP_OTHER;
22 3448adb0 2022-11-02 op }
23 3448adb0 2022-11-02 op }
24 3448adb0 2022-11-02 op
25 3448adb0 2022-11-02 op static bool
26 3448adb0 2022-11-02 op is_skippable_word_prop(uint_least8_t prop)
27 3448adb0 2022-11-02 op {
28 3448adb0 2022-11-02 op return prop == WORD_BREAK_PROP_EXTEND ||
29 3448adb0 2022-11-02 op prop == WORD_BREAK_PROP_FORMAT ||
30 3448adb0 2022-11-02 op prop == WORD_BREAK_PROP_ZWJ;
31 3448adb0 2022-11-02 op }
32 3448adb0 2022-11-02 op
33 3448adb0 2022-11-02 op static void
34 3448adb0 2022-11-02 op word_skip_shift_callback(uint_least8_t prop, void *s)
35 3448adb0 2022-11-02 op {
36 3448adb0 2022-11-02 op struct word_break_state *state = (struct word_break_state *)s;
37 3448adb0 2022-11-02 op
38 3448adb0 2022-11-02 op if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
39 3448adb0 2022-11-02 op /*
40 3448adb0 2022-11-02 op * The property we just shifted in is
41 3448adb0 2022-11-02 op * a regional indicator, increasing the
42 3448adb0 2022-11-02 op * number of consecutive RIs on the left
43 3448adb0 2022-11-02 op * side of the breakpoint by one, changing
44 3448adb0 2022-11-02 op * the oddness.
45 3448adb0 2022-11-02 op *
46 3448adb0 2022-11-02 op */
47 3448adb0 2022-11-02 op state->ri_even = !(state->ri_even);
48 3448adb0 2022-11-02 op } else {
49 3448adb0 2022-11-02 op /*
50 3448adb0 2022-11-02 op * We saw no regional indicator, so the
51 3448adb0 2022-11-02 op * number of consecutive RIs on the left
52 3448adb0 2022-11-02 op * side of the breakpoint is zero, which
53 3448adb0 2022-11-02 op * is an even number.
54 3448adb0 2022-11-02 op *
55 3448adb0 2022-11-02 op */
56 3448adb0 2022-11-02 op state->ri_even = true;
57 3448adb0 2022-11-02 op }
58 3448adb0 2022-11-02 op }
59 3448adb0 2022-11-02 op
60 3448adb0 2022-11-02 op static size_t
61 3448adb0 2022-11-02 op next_word_break(HERODOTUS_READER *r)
62 3448adb0 2022-11-02 op {
63 3448adb0 2022-11-02 op struct proper p;
64 3448adb0 2022-11-02 op struct word_break_state state = { .ri_even = true };
65 3448adb0 2022-11-02 op
66 3448adb0 2022-11-02 op /*
67 3448adb0 2022-11-02 op * Apply word breaking algorithm (UAX #29), see
68 3448adb0 2022-11-02 op * https://unicode.org/reports/tr29/#Word_Boundary_Rules
69 3448adb0 2022-11-02 op */
70 3448adb0 2022-11-02 op proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
71 3448adb0 2022-11-02 op is_skippable_word_prop, word_skip_shift_callback, &p);
72 3448adb0 2022-11-02 op
73 3448adb0 2022-11-02 op while (!proper_advance(&p)) {
74 3448adb0 2022-11-02 op /* WB3 */
75 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
76 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
77 3448adb0 2022-11-02 op continue;
78 3448adb0 2022-11-02 op }
79 3448adb0 2022-11-02 op
80 3448adb0 2022-11-02 op /* WB3a */
81 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
82 3448adb0 2022-11-02 op p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
83 3448adb0 2022-11-02 op p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
84 3448adb0 2022-11-02 op break;
85 3448adb0 2022-11-02 op }
86 3448adb0 2022-11-02 op
87 3448adb0 2022-11-02 op /* WB3b */
88 3448adb0 2022-11-02 op if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
89 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
90 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
91 3448adb0 2022-11-02 op break;
92 3448adb0 2022-11-02 op }
93 3448adb0 2022-11-02 op
94 3448adb0 2022-11-02 op /* WB3c */
95 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
96 3448adb0 2022-11-02 op (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
97 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
98 3448adb0 2022-11-02 op continue;
99 3448adb0 2022-11-02 op }
100 3448adb0 2022-11-02 op
101 3448adb0 2022-11-02 op /* WB3d */
102 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
103 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
104 3448adb0 2022-11-02 op continue;
105 3448adb0 2022-11-02 op }
106 3448adb0 2022-11-02 op
107 3448adb0 2022-11-02 op /* WB4 */
108 3448adb0 2022-11-02 op if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
109 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
110 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
111 3448adb0 2022-11-02 op continue;
112 3448adb0 2022-11-02 op }
113 3448adb0 2022-11-02 op
114 3448adb0 2022-11-02 op /* WB5 */
115 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
116 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
117 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
118 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
119 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
120 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
121 3448adb0 2022-11-02 op continue;
122 3448adb0 2022-11-02 op }
123 3448adb0 2022-11-02 op
124 3448adb0 2022-11-02 op /* WB6 */
125 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
126 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
127 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
128 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
129 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
130 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
131 3448adb0 2022-11-02 op (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
132 3448adb0 2022-11-02 op p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
133 3448adb0 2022-11-02 op p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
134 3448adb0 2022-11-02 op continue;
135 3448adb0 2022-11-02 op }
136 3448adb0 2022-11-02 op
137 3448adb0 2022-11-02 op /* WB7 */
138 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
139 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
140 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
141 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
142 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
143 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
144 3448adb0 2022-11-02 op (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
145 3448adb0 2022-11-02 op p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
146 3448adb0 2022-11-02 op p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
147 3448adb0 2022-11-02 op continue;
148 3448adb0 2022-11-02 op }
149 3448adb0 2022-11-02 op
150 3448adb0 2022-11-02 op /* WB7a */
151 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
152 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
153 3448adb0 2022-11-02 op continue;
154 3448adb0 2022-11-02 op }
155 3448adb0 2022-11-02 op
156 3448adb0 2022-11-02 op /* WB7b */
157 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
158 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
159 3448adb0 2022-11-02 op p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
160 3448adb0 2022-11-02 op continue;
161 3448adb0 2022-11-02 op }
162 3448adb0 2022-11-02 op
163 3448adb0 2022-11-02 op /* WB7c */
164 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
165 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
166 3448adb0 2022-11-02 op p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
167 3448adb0 2022-11-02 op continue;
168 3448adb0 2022-11-02 op }
169 3448adb0 2022-11-02 op
170 3448adb0 2022-11-02 op /* WB8 */
171 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
172 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
173 3448adb0 2022-11-02 op continue;
174 3448adb0 2022-11-02 op }
175 3448adb0 2022-11-02 op
176 3448adb0 2022-11-02 op /* WB9 */
177 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
178 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
179 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
180 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
181 3448adb0 2022-11-02 op continue;
182 3448adb0 2022-11-02 op }
183 3448adb0 2022-11-02 op
184 3448adb0 2022-11-02 op /* WB10 */
185 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
186 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
187 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
188 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
189 3448adb0 2022-11-02 op continue;
190 3448adb0 2022-11-02 op }
191 3448adb0 2022-11-02 op
192 3448adb0 2022-11-02 op /* WB11 */
193 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
194 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
195 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
196 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
197 3448adb0 2022-11-02 op p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
198 3448adb0 2022-11-02 op continue;
199 3448adb0 2022-11-02 op }
200 3448adb0 2022-11-02 op
201 3448adb0 2022-11-02 op /* WB12 */
202 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
203 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
204 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
205 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
206 3448adb0 2022-11-02 op p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
207 3448adb0 2022-11-02 op continue;
208 3448adb0 2022-11-02 op }
209 3448adb0 2022-11-02 op
210 3448adb0 2022-11-02 op /* WB13 */
211 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
212 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
213 3448adb0 2022-11-02 op continue;
214 3448adb0 2022-11-02 op }
215 3448adb0 2022-11-02 op
216 3448adb0 2022-11-02 op /* WB13a */
217 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
218 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
219 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
220 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
221 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
222 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
223 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
224 3448adb0 2022-11-02 op continue;
225 3448adb0 2022-11-02 op }
226 3448adb0 2022-11-02 op
227 3448adb0 2022-11-02 op /* WB13b */
228 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
229 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
230 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
231 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
232 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
233 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
234 3448adb0 2022-11-02 op continue;
235 3448adb0 2022-11-02 op }
236 3448adb0 2022-11-02 op
237 3448adb0 2022-11-02 op /* WB15 and WB16 */
238 3448adb0 2022-11-02 op if (!state.ri_even &&
239 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
240 3448adb0 2022-11-02 op continue;
241 3448adb0 2022-11-02 op }
242 3448adb0 2022-11-02 op
243 3448adb0 2022-11-02 op /* WB999 */
244 3448adb0 2022-11-02 op break;
245 3448adb0 2022-11-02 op }
246 3448adb0 2022-11-02 op
247 3448adb0 2022-11-02 op return herodotus_reader_number_read(&(p.mid_reader));
248 3448adb0 2022-11-02 op }
249 3448adb0 2022-11-02 op
250 3448adb0 2022-11-02 op size_t
251 3448adb0 2022-11-02 op grapheme_next_word_break(const uint_least32_t *str, size_t len)
252 3448adb0 2022-11-02 op {
253 3448adb0 2022-11-02 op HERODOTUS_READER r;
254 3448adb0 2022-11-02 op
255 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
256 3448adb0 2022-11-02 op
257 3448adb0 2022-11-02 op return next_word_break(&r);
258 3448adb0 2022-11-02 op }
259 3448adb0 2022-11-02 op
260 3448adb0 2022-11-02 op size_t
261 3448adb0 2022-11-02 op grapheme_next_word_break_utf8(const char *str, size_t len)
262 3448adb0 2022-11-02 op {
263 3448adb0 2022-11-02 op HERODOTUS_READER r;
264 3448adb0 2022-11-02 op
265 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
266 3448adb0 2022-11-02 op
267 3448adb0 2022-11-02 op return next_word_break(&r);
268 3448adb0 2022-11-02 op }