Blob


/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "../gen/word.h"
#include "../grapheme.h"
#include "util.h"

/*
 * Extra state carried through a word-break scan; it is handed to
 * proper_init() and updated by word_skip_shift_callback().
 */
struct word_break_state
{
	/*
	 * true while the run of consecutive regional indicators that
	 * was shifted in so far has even length (zero counts as even);
	 * consulted by the WB15/WB16 check
	 */
	bool ri_even;
};

14 static inline uint_least8_t
15 get_word_break_prop(uint_least32_t cp)
16 {
17 if (likely(cp <= 0x10FFFF)) {
18 return (uint_least8_t)
19 word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
20 } else {
21 return WORD_BREAK_PROP_OTHER;
22 }
23 }
25 static bool
26 is_skippable_word_prop(uint_least8_t prop)
27 {
28 return prop == WORD_BREAK_PROP_EXTEND ||
29 prop == WORD_BREAK_PROP_FORMAT ||
30 prop == WORD_BREAK_PROP_ZWJ;
31 }
33 static void
34 word_skip_shift_callback(uint_least8_t prop, void *s)
35 {
36 struct word_break_state *state = (struct word_break_state *)s;
38 if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
39 /*
40 * The property we just shifted in is
41 * a regional indicator, increasing the
42 * number of consecutive RIs on the left
43 * side of the breakpoint by one, changing
44 * the oddness.
45 *
46 */
47 state->ri_even = !(state->ri_even);
48 } else {
49 /*
50 * We saw no regional indicator, so the
51 * number of consecutive RIs on the left
52 * side of the breakpoint is zero, which
53 * is an even number.
54 *
55 */
56 state->ri_even = true;
57 }
58 }
60 static size_t
61 next_word_break(HERODOTUS_READER *r)
62 {
63 struct proper p;
64 struct word_break_state state = { .ri_even = true };
66 /*
67 * Apply word breaking algorithm (UAX #29), see
68 * https://unicode.org/reports/tr29/#Word_Boundary_Rules
69 */
70 proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
71 is_skippable_word_prop, word_skip_shift_callback, &p);
73 while (!proper_advance(&p)) {
74 /* WB3 */
75 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
76 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
77 continue;
78 }
80 /* WB3a */
81 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
82 p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
83 p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
84 break;
85 }
87 /* WB3b */
88 if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
89 p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
90 p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
91 break;
92 }
94 /* WB3c */
95 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
96 (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
97 p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
98 continue;
99 }
101 /* WB3d */
102 if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
103 p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
104 continue;
107 /* WB4 */
108 if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
109 p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
110 p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
111 continue;
114 /* WB5 */
115 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
116 p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
117 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
118 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
119 p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
120 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
121 continue;
124 /* WB6 */
125 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
126 p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
127 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
128 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
129 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
130 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
131 (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
132 p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
133 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
134 continue;
137 /* WB7 */
138 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
139 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
140 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
141 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
142 p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
143 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
144 (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
145 p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
146 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
147 continue;
150 /* WB7a */
151 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
152 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
153 continue;
156 /* WB7b */
157 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
158 p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
159 p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
160 continue;
163 /* WB7c */
164 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
165 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
166 p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
167 continue;
170 /* WB8 */
171 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
172 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
173 continue;
176 /* WB9 */
177 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
178 p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
179 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
180 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
181 continue;
184 /* WB10 */
185 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
186 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
187 p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
188 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
189 continue;
192 /* WB11 */
193 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
194 p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
195 p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
196 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
197 p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
198 continue;
201 /* WB12 */
202 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
203 (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
204 p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
205 p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
206 p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
207 continue;
210 /* WB13 */
211 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
212 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
213 continue;
216 /* WB13a */
217 if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
218 p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
219 p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
220 p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
221 p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
222 p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
223 p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
224 continue;
227 /* WB13b */
228 if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
229 (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
230 p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
231 p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
232 p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
233 p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
234 continue;
237 /* WB15 and WB16 */
238 if (!state.ri_even &&
239 p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
240 continue;
243 /* WB999 */
244 break;
247 return herodotus_reader_number_read(&(p.mid_reader));
250 size_t
251 grapheme_next_word_break(const uint_least32_t *str, size_t len)
253 HERODOTUS_READER r;
255 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
257 return next_word_break(&r);
260 size_t
261 grapheme_next_word_break_utf8(const char *str, size_t len)
263 HERODOTUS_READER r;
265 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
267 return next_word_break(&r);