Blame


1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stdbool.h>
3 3448adb0 2022-11-02 op #include <stddef.h>
4 3448adb0 2022-11-02 op
5 3448adb0 2022-11-02 op #include "../gen/sentence.h"
6 3448adb0 2022-11-02 op #include "../grapheme.h"
7 3448adb0 2022-11-02 op #include "util.h"
8 3448adb0 2022-11-02 op
/*
 * Left-hand-side context tracked while scanning for a sentence break.
 * Both members are small "level" counters maintained by
 * sentence_skip_shift_callback(); a value of 0 means that the
 * respective sequence is currently not being matched.
 */
struct sentence_break_state {
	/* progress within "ATerm Close* Sp*" (0 to 3) */
	uint_least8_t aterm_close_sp_level;

	/* progress within "SATerm Close* Sp* ParaSep?" (0 to 4) */
	uint_least8_t saterm_close_sp_parasep_level;
};
14 3448adb0 2022-11-02 op
15 3448adb0 2022-11-02 op static inline uint_least8_t
16 3448adb0 2022-11-02 op get_sentence_break_prop(uint_least32_t cp)
17 3448adb0 2022-11-02 op {
18 3448adb0 2022-11-02 op if (likely(cp <= UINT32_C(0x10FFFF))) {
19 3448adb0 2022-11-02 op return (uint_least8_t)
20 3448adb0 2022-11-02 op sentence_break_minor[sentence_break_major[cp >> 8] +
21 3448adb0 2022-11-02 op (cp & 0xff)];
22 3448adb0 2022-11-02 op } else {
23 3448adb0 2022-11-02 op return SENTENCE_BREAK_PROP_OTHER;
24 3448adb0 2022-11-02 op }
25 3448adb0 2022-11-02 op }
26 3448adb0 2022-11-02 op
27 3448adb0 2022-11-02 op static bool
28 3448adb0 2022-11-02 op is_skippable_sentence_prop(uint_least8_t prop)
29 3448adb0 2022-11-02 op {
30 3448adb0 2022-11-02 op return prop == SENTENCE_BREAK_PROP_EXTEND ||
31 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_FORMAT;
32 3448adb0 2022-11-02 op }
33 3448adb0 2022-11-02 op
34 3448adb0 2022-11-02 op static void
35 3448adb0 2022-11-02 op sentence_skip_shift_callback(uint_least8_t prop, void *s)
36 3448adb0 2022-11-02 op {
37 3448adb0 2022-11-02 op struct sentence_break_state *state = (struct sentence_break_state *)s;
38 3448adb0 2022-11-02 op
39 3448adb0 2022-11-02 op /*
40 3448adb0 2022-11-02 op * Here comes a bit of magic. The rules
41 3448adb0 2022-11-02 op * SB8, SB8a, SB9 and SB10 have very complicated
42 3448adb0 2022-11-02 op * left-hand-side-rules of the form
43 3448adb0 2022-11-02 op *
44 3448adb0 2022-11-02 op * ATerm Close* Sp*
45 3448adb0 2022-11-02 op * SATerm Close*
46 3448adb0 2022-11-02 op * SATerm Close* Sp*
47 3448adb0 2022-11-02 op * SATerm Close* Sp* ParaSep?
48 3448adb0 2022-11-02 op *
49 3448adb0 2022-11-02 op * but instead of backtracking, we keep the
50 3448adb0 2022-11-02 op * state as some kind of "power level" in
51 3448adb0 2022-11-02 op * two state-variables
52 3448adb0 2022-11-02 op *
53 3448adb0 2022-11-02 op * aterm_close_sp_level
54 3448adb0 2022-11-02 op * saterm_close_sp_parasep_level
55 3448adb0 2022-11-02 op *
56 3448adb0 2022-11-02 op * that go from 0 to 3/4:
57 3448adb0 2022-11-02 op *
58 3448adb0 2022-11-02 op * 0: we are not in the sequence
59 3448adb0 2022-11-02 op * 1: we have one ATerm/SATerm to the left of
60 3448adb0 2022-11-02 op * the middle spot
61 3448adb0 2022-11-02 op * 2: we have one ATerm/SATerm and one or more
62 3448adb0 2022-11-02 op * Close to the left of the middle spot
63 3448adb0 2022-11-02 op * 3: we have one ATerm/SATerm, zero or more
64 3448adb0 2022-11-02 op * Close and one or more Sp to the left of
65 3448adb0 2022-11-02 op * the middle spot.
66 3448adb0 2022-11-02 op * 4: we have one SATerm, zero or more Close,
67 3448adb0 2022-11-02 op * zero or more Sp and one ParaSep to the
68 3448adb0 2022-11-02 op * left of the middle spot.
69 3448adb0 2022-11-02 op *
70 3448adb0 2022-11-02 op */
71 3448adb0 2022-11-02 op if ((state->aterm_close_sp_level == 0 ||
72 3448adb0 2022-11-02 op state->aterm_close_sp_level == 1) &&
73 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_ATERM) {
74 3448adb0 2022-11-02 op /* sequence has begun */
75 3448adb0 2022-11-02 op state->aterm_close_sp_level = 1;
76 3448adb0 2022-11-02 op } else if ((state->aterm_close_sp_level == 1 ||
77 3448adb0 2022-11-02 op state->aterm_close_sp_level == 2) &&
78 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_CLOSE) {
79 3448adb0 2022-11-02 op /* close-sequence begins or continued */
80 3448adb0 2022-11-02 op state->aterm_close_sp_level = 2;
81 3448adb0 2022-11-02 op } else if ((state->aterm_close_sp_level == 1 ||
82 3448adb0 2022-11-02 op state->aterm_close_sp_level == 2 ||
83 3448adb0 2022-11-02 op state->aterm_close_sp_level == 3) &&
84 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_SP) {
85 3448adb0 2022-11-02 op /* sp-sequence begins or continued */
86 3448adb0 2022-11-02 op state->aterm_close_sp_level = 3;
87 3448adb0 2022-11-02 op } else {
88 3448adb0 2022-11-02 op /* sequence broke */
89 3448adb0 2022-11-02 op state->aterm_close_sp_level = 0;
90 3448adb0 2022-11-02 op }
91 3448adb0 2022-11-02 op
92 3448adb0 2022-11-02 op if ((state->saterm_close_sp_parasep_level == 0 ||
93 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level == 1) &&
94 3448adb0 2022-11-02 op (prop == SENTENCE_BREAK_PROP_STERM ||
95 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_ATERM)) {
96 3448adb0 2022-11-02 op /* sequence has begun */
97 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level = 1;
98 3448adb0 2022-11-02 op } else if ((state->saterm_close_sp_parasep_level == 1 ||
99 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level == 2) &&
100 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_CLOSE) {
101 3448adb0 2022-11-02 op /* close-sequence begins or continued */
102 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level = 2;
103 3448adb0 2022-11-02 op } else if ((state->saterm_close_sp_parasep_level == 1 ||
104 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level == 2 ||
105 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level == 3) &&
106 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_SP) {
107 3448adb0 2022-11-02 op /* sp-sequence begins or continued */
108 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level = 3;
109 3448adb0 2022-11-02 op } else if ((state->saterm_close_sp_parasep_level == 1 ||
110 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level == 2 ||
111 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level == 3) &&
112 3448adb0 2022-11-02 op (prop == SENTENCE_BREAK_PROP_SEP ||
113 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_CR ||
114 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_LF)) {
115 3448adb0 2022-11-02 op /* ParaSep at the end of the sequence */
116 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level = 4;
117 3448adb0 2022-11-02 op } else {
118 3448adb0 2022-11-02 op /* sequence broke */
119 3448adb0 2022-11-02 op state->saterm_close_sp_parasep_level = 0;
120 3448adb0 2022-11-02 op }
121 3448adb0 2022-11-02 op }
122 3448adb0 2022-11-02 op
123 3448adb0 2022-11-02 op static size_t
124 3448adb0 2022-11-02 op next_sentence_break(HERODOTUS_READER *r)
125 3448adb0 2022-11-02 op {
126 3448adb0 2022-11-02 op HERODOTUS_READER tmp;
127 3448adb0 2022-11-02 op enum sentence_break_property prop;
128 3448adb0 2022-11-02 op struct proper p;
129 3448adb0 2022-11-02 op struct sentence_break_state state = { 0 };
130 3448adb0 2022-11-02 op uint_least32_t cp;
131 3448adb0 2022-11-02 op
132 3448adb0 2022-11-02 op /*
133 3448adb0 2022-11-02 op * Apply sentence breaking algorithm (UAX #29), see
134 3448adb0 2022-11-02 op * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
135 3448adb0 2022-11-02 op */
136 3448adb0 2022-11-02 op proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
137 3448adb0 2022-11-02 op get_sentence_break_prop, is_skippable_sentence_prop,
138 3448adb0 2022-11-02 op sentence_skip_shift_callback, &p);
139 3448adb0 2022-11-02 op
140 3448adb0 2022-11-02 op while (!proper_advance(&p)) {
141 3448adb0 2022-11-02 op /* SB3 */
142 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
143 3448adb0 2022-11-02 op p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
144 3448adb0 2022-11-02 op continue;
145 3448adb0 2022-11-02 op }
146 3448adb0 2022-11-02 op
147 3448adb0 2022-11-02 op /* SB4 */
148 3448adb0 2022-11-02 op if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
149 3448adb0 2022-11-02 op p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
150 3448adb0 2022-11-02 op p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
151 3448adb0 2022-11-02 op break;
152 3448adb0 2022-11-02 op }
153 3448adb0 2022-11-02 op
154 3448adb0 2022-11-02 op /* SB5 */
155 3448adb0 2022-11-02 op if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
156 3448adb0 2022-11-02 op p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
157 3448adb0 2022-11-02 op continue;
158 3448adb0 2022-11-02 op }
159 3448adb0 2022-11-02 op
160 3448adb0 2022-11-02 op /* SB6 */
161 3448adb0 2022-11-02 op if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
162 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
163 3448adb0 2022-11-02 op continue;
164 3448adb0 2022-11-02 op }
165 3448adb0 2022-11-02 op
166 3448adb0 2022-11-02 op /* SB7 */
167 3448adb0 2022-11-02 op if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
168 3448adb0 2022-11-02 op p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
169 3448adb0 2022-11-02 op p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
170 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
171 3448adb0 2022-11-02 op continue;
172 3448adb0 2022-11-02 op }
173 3448adb0 2022-11-02 op
174 3448adb0 2022-11-02 op /* SB8 */
175 3448adb0 2022-11-02 op if (state.aterm_close_sp_level == 1 ||
176 3448adb0 2022-11-02 op state.aterm_close_sp_level == 2 ||
177 3448adb0 2022-11-02 op state.aterm_close_sp_level == 3) {
178 3448adb0 2022-11-02 op /*
179 3448adb0 2022-11-02 op * This is the most complicated rule, requiring
180 3448adb0 2022-11-02 op * the right-hand-side to satisfy the regular expression
181 3448adb0 2022-11-02 op *
182 3448adb0 2022-11-02 op * ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
183 3448adb0 2022-11-02 op *
184 3448adb0 2022-11-02 op * which we simply check "manually" given LUT-lookups
185 3448adb0 2022-11-02 op * are very cheap by starting at the mid_reader.
186 3448adb0 2022-11-02 op *
187 3448adb0 2022-11-02 op */
188 3448adb0 2022-11-02 op herodotus_reader_copy(&(p.mid_reader), &tmp);
189 3448adb0 2022-11-02 op
190 3448adb0 2022-11-02 op prop = NUM_SENTENCE_BREAK_PROPS;
191 3448adb0 2022-11-02 op while (herodotus_read_codepoint(&tmp, true, &cp) ==
192 3448adb0 2022-11-02 op HERODOTUS_STATUS_SUCCESS) {
193 3448adb0 2022-11-02 op prop = get_sentence_break_prop(cp);
194 3448adb0 2022-11-02 op
195 3448adb0 2022-11-02 op /*
196 3448adb0 2022-11-02 op * the skippable properties are ignored
197 3448adb0 2022-11-02 op * automatically here given they do not
198 3448adb0 2022-11-02 op * match the following condition
199 3448adb0 2022-11-02 op */
200 3448adb0 2022-11-02 op if (prop == SENTENCE_BREAK_PROP_OLETTER ||
201 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_UPPER ||
202 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_LOWER ||
203 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_SEP ||
204 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_CR ||
205 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_LF ||
206 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_STERM ||
207 3448adb0 2022-11-02 op prop == SENTENCE_BREAK_PROP_ATERM) {
208 3448adb0 2022-11-02 op break;
209 3448adb0 2022-11-02 op }
210 3448adb0 2022-11-02 op }
211 3448adb0 2022-11-02 op
212 3448adb0 2022-11-02 op if (prop == SENTENCE_BREAK_PROP_LOWER) {
213 3448adb0 2022-11-02 op continue;
214 3448adb0 2022-11-02 op }
215 3448adb0 2022-11-02 op }
216 3448adb0 2022-11-02 op
217 3448adb0 2022-11-02 op /* SB8a */
218 3448adb0 2022-11-02 op if ((state.saterm_close_sp_parasep_level == 1 ||
219 3448adb0 2022-11-02 op state.saterm_close_sp_parasep_level == 2 ||
220 3448adb0 2022-11-02 op state.saterm_close_sp_parasep_level == 3) &&
221 3448adb0 2022-11-02 op (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
222 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
223 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
224 3448adb0 2022-11-02 op continue;
225 3448adb0 2022-11-02 op }
226 3448adb0 2022-11-02 op
227 3448adb0 2022-11-02 op /* SB9 */
228 3448adb0 2022-11-02 op if ((state.saterm_close_sp_parasep_level == 1 ||
229 3448adb0 2022-11-02 op state.saterm_close_sp_parasep_level == 2) &&
230 3448adb0 2022-11-02 op (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
231 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
232 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
233 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
234 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
235 3448adb0 2022-11-02 op continue;
236 3448adb0 2022-11-02 op }
237 3448adb0 2022-11-02 op
238 3448adb0 2022-11-02 op /* SB10 */
239 3448adb0 2022-11-02 op if ((state.saterm_close_sp_parasep_level == 1 ||
240 3448adb0 2022-11-02 op state.saterm_close_sp_parasep_level == 2 ||
241 3448adb0 2022-11-02 op state.saterm_close_sp_parasep_level == 3) &&
242 3448adb0 2022-11-02 op (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
243 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
244 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
245 3448adb0 2022-11-02 op p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
246 3448adb0 2022-11-02 op continue;
247 3448adb0 2022-11-02 op }
248 3448adb0 2022-11-02 op
249 3448adb0 2022-11-02 op /* SB11 */
250 3448adb0 2022-11-02 op if (state.saterm_close_sp_parasep_level == 1 ||
251 3448adb0 2022-11-02 op state.saterm_close_sp_parasep_level == 2 ||
252 3448adb0 2022-11-02 op state.saterm_close_sp_parasep_level == 3 ||
253 3448adb0 2022-11-02 op state.saterm_close_sp_parasep_level == 4) {
254 3448adb0 2022-11-02 op break;
255 3448adb0 2022-11-02 op }
256 3448adb0 2022-11-02 op
257 3448adb0 2022-11-02 op /* SB998 */
258 3448adb0 2022-11-02 op continue;
259 3448adb0 2022-11-02 op }
260 3448adb0 2022-11-02 op
261 3448adb0 2022-11-02 op return herodotus_reader_number_read(&(p.mid_reader));
262 3448adb0 2022-11-02 op }
263 3448adb0 2022-11-02 op
264 3448adb0 2022-11-02 op size_t
265 3448adb0 2022-11-02 op grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
266 3448adb0 2022-11-02 op {
267 3448adb0 2022-11-02 op HERODOTUS_READER r;
268 3448adb0 2022-11-02 op
269 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
270 3448adb0 2022-11-02 op
271 3448adb0 2022-11-02 op return next_sentence_break(&r);
272 3448adb0 2022-11-02 op }
273 3448adb0 2022-11-02 op
274 3448adb0 2022-11-02 op size_t
275 3448adb0 2022-11-02 op grapheme_next_sentence_break_utf8(const char *str, size_t len)
276 3448adb0 2022-11-02 op {
277 3448adb0 2022-11-02 op HERODOTUS_READER r;
278 3448adb0 2022-11-02 op
279 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
280 3448adb0 2022-11-02 op
281 3448adb0 2022-11-02 op return next_sentence_break(&r);
282 3448adb0 2022-11-02 op }