/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "../gen/word.h"
#include "../grapheme.h"
#include "util.h"

/*
 * State threaded through the property reader while searching for a
 * word break.
 *
 * ri_even is true while the number of consecutive regional indicators
 * on the left side of the candidate breakpoint is even (zero included).
 */
struct word_break_state {
	bool ri_even;
};

14 3448adb0 2022-11-02 op static inline uint_least8_t
15 3448adb0 2022-11-02 op get_word_break_prop(uint_least32_t cp)
17 3448adb0 2022-11-02 op if (likely(cp <= 0x10FFFF)) {
18 3448adb0 2022-11-02 op return (uint_least8_t)
19 3448adb0 2022-11-02 op word_break_minor[word_break_major[cp >> 8] + (cp & 0xff)];
21 3448adb0 2022-11-02 op return WORD_BREAK_PROP_OTHER;
26 3448adb0 2022-11-02 op is_skippable_word_prop(uint_least8_t prop)
28 3448adb0 2022-11-02 op return prop == WORD_BREAK_PROP_EXTEND ||
29 3448adb0 2022-11-02 op prop == WORD_BREAK_PROP_FORMAT ||
30 3448adb0 2022-11-02 op prop == WORD_BREAK_PROP_ZWJ;
34 3448adb0 2022-11-02 op word_skip_shift_callback(uint_least8_t prop, void *s)
36 3448adb0 2022-11-02 op struct word_break_state *state = (struct word_break_state *)s;
38 3448adb0 2022-11-02 op if (prop == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
40 3448adb0 2022-11-02 op * The property we just shifted in is
41 3448adb0 2022-11-02 op * a regional indicator, increasing the
42 3448adb0 2022-11-02 op * number of consecutive RIs on the left
43 3448adb0 2022-11-02 op * side of the breakpoint by one, changing
44 3448adb0 2022-11-02 op * the oddness.
47 3448adb0 2022-11-02 op state->ri_even = !(state->ri_even);
50 3448adb0 2022-11-02 op * We saw no regional indicator, so the
51 3448adb0 2022-11-02 op * number of consecutive RIs on the left
52 3448adb0 2022-11-02 op * side of the breakpoint is zero, which
53 3448adb0 2022-11-02 op * is an even number.
56 3448adb0 2022-11-02 op state->ri_even = true;
61 3448adb0 2022-11-02 op next_word_break(HERODOTUS_READER *r)
63 3448adb0 2022-11-02 op struct proper p;
64 3448adb0 2022-11-02 op struct word_break_state state = { .ri_even = true };
67 3448adb0 2022-11-02 op * Apply word breaking algorithm (UAX #29), see
68 3448adb0 2022-11-02 op * https://unicode.org/reports/tr29/#Word_Boundary_Rules
70 3448adb0 2022-11-02 op proper_init(r, &state, NUM_WORD_BREAK_PROPS, get_word_break_prop,
71 3448adb0 2022-11-02 op is_skippable_word_prop, word_skip_shift_callback, &p);
73 3448adb0 2022-11-02 op while (!proper_advance(&p)) {
75 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == WORD_BREAK_PROP_CR &&
76 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
81 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == WORD_BREAK_PROP_NEWLINE ||
82 3448adb0 2022-11-02 op p.raw.prev_prop[0] == WORD_BREAK_PROP_CR ||
83 3448adb0 2022-11-02 op p.raw.prev_prop[0] == WORD_BREAK_PROP_LF) {
88 3448adb0 2022-11-02 op if (p.raw.next_prop[0] == WORD_BREAK_PROP_NEWLINE ||
89 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_CR ||
90 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_LF) {
95 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == WORD_BREAK_PROP_ZWJ &&
96 3448adb0 2022-11-02 op (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTENDED_PICTOGRAPHIC ||
97 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT)) {
102 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == WORD_BREAK_PROP_WSEGSPACE &&
103 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_WSEGSPACE) {
108 3448adb0 2022-11-02 op if (p.raw.next_prop[0] == WORD_BREAK_PROP_EXTEND ||
109 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_FORMAT ||
110 3448adb0 2022-11-02 op p.raw.next_prop[0] == WORD_BREAK_PROP_ZWJ) {
115 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
116 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
117 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
118 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
119 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
120 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
125 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
126 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
127 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
128 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
129 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
130 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
131 3448adb0 2022-11-02 op (p.skip.next_prop[1] == WORD_BREAK_PROP_ALETTER ||
132 3448adb0 2022-11-02 op p.skip.next_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
133 3448adb0 2022-11-02 op p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
138 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDLETTER ||
139 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
140 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
141 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
142 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
143 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
144 3448adb0 2022-11-02 op (p.skip.prev_prop[1] == WORD_BREAK_PROP_ALETTER ||
145 3448adb0 2022-11-02 op p.skip.prev_prop[1] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
146 3448adb0 2022-11-02 op p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER)) {
151 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
152 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) {
157 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
158 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
159 3448adb0 2022-11-02 op p.skip.next_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
164 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_DOUBLE_QUOTE &&
165 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER &&
166 3448adb0 2022-11-02 op p.skip.prev_prop[1] == WORD_BREAK_PROP_HEBREW_LETTER) {
171 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
172 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
177 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
178 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
179 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER) &&
180 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC) {
185 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
186 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
187 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
188 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER)) {
193 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUM ||
194 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
195 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
196 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC &&
197 3448adb0 2022-11-02 op p.skip.prev_prop[1] == WORD_BREAK_PROP_NUMERIC) {
202 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC &&
203 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUM ||
204 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_MIDNUMLET ||
205 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_SINGLE_QUOTE) &&
206 3448adb0 2022-11-02 op p.skip.next_prop[1] == WORD_BREAK_PROP_NUMERIC) {
211 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA &&
212 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA) {
217 3448adb0 2022-11-02 op if ((p.skip.prev_prop[0] == WORD_BREAK_PROP_ALETTER ||
218 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
219 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
220 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_NUMERIC ||
221 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_KATAKANA ||
222 3448adb0 2022-11-02 op p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) &&
223 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET) {
228 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == WORD_BREAK_PROP_EXTENDNUMLET &&
229 3448adb0 2022-11-02 op (p.skip.next_prop[0] == WORD_BREAK_PROP_ALETTER ||
230 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_BOTH_ALETTER_EXTPICT ||
231 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_HEBREW_LETTER ||
232 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_NUMERIC ||
233 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_KATAKANA)) {
237 3448adb0 2022-11-02 op /* WB15 and WB16 */
238 3448adb0 2022-11-02 op if (!state.ri_even &&
239 3448adb0 2022-11-02 op p.skip.next_prop[0] == WORD_BREAK_PROP_REGIONAL_INDICATOR) {
247 3448adb0 2022-11-02 op return herodotus_reader_number_read(&(p.mid_reader));
251 3448adb0 2022-11-02 op grapheme_next_word_break(const uint_least32_t *str, size_t len)
253 3448adb0 2022-11-02 op HERODOTUS_READER r;
255 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
257 3448adb0 2022-11-02 op return next_word_break(&r);
261 3448adb0 2022-11-02 op grapheme_next_word_break_utf8(const char *str, size_t len)
263 3448adb0 2022-11-02 op HERODOTUS_READER r;
265 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
267 3448adb0 2022-11-02 op return next_word_break(&r);