1 /* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "../gen/sentence.h"
#include "../grapheme.h"
/*
 * Progress of the two left-hand-side patterns used by the sentence
 * rules. Both counters start at 0 ("not inside the pattern") and are
 * updated by sentence_skip_shift_callback().
 */
struct sentence_break_state {
	/* progress through "ATerm Close* Sp*", values 0 to 3 */
	uint_least8_t aterm_close_sp_level;
	/* progress through "SATerm Close* Sp* ParaSep?", values 0 to 4 */
	uint_least8_t saterm_close_sp_parasep_level;
};
15 static inline uint_least8_t
16 get_sentence_break_prop(uint_least32_t cp)
18 if (likely(cp <= UINT32_C(0x10FFFF))) {
19 return (uint_least8_t)
20 sentence_break_minor[sentence_break_major[cp >> 8] +
23 return SENTENCE_BREAK_PROP_OTHER;
28 is_skippable_sentence_prop(uint_least8_t prop)
30 return prop == SENTENCE_BREAK_PROP_EXTEND ||
31 prop == SENTENCE_BREAK_PROP_FORMAT;
35 sentence_skip_shift_callback(uint_least8_t prop, void *s)
37 struct sentence_break_state *state = (struct sentence_break_state *)s;
40 * Here comes a bit of magic. The rules
41 * SB8, SB8a, SB9 and SB10 have very complicated
42 * left-hand-side-rules of the form
47 * SATerm Close* Sp* ParaSep?
49 * but instead of backtracking, we keep the
50 * state as some kind of "power level" in
53 * aterm_close_sp_level
54 * saterm_close_sp_parasep_level
56 * that go from 0 to 3/4:
58 * 0: we are not in the sequence
59 * 1: we have one ATerm/SATerm to the left of
61 * 2: we have one ATerm/SATerm and one or more
62 * Close to the left of the middle spot
63 * 3: we have one ATerm/SATerm, zero or more
64 * Close and one or more Sp to the left of
66 * 4: we have one SATerm, zero or more Close,
67 * zero or more Sp and one ParaSep to the
68 * left of the middle spot.
71 if ((state->aterm_close_sp_level == 0 ||
72 state->aterm_close_sp_level == 1) &&
73 prop == SENTENCE_BREAK_PROP_ATERM) {
74 /* sequence has begun */
75 state->aterm_close_sp_level = 1;
76 } else if ((state->aterm_close_sp_level == 1 ||
77 state->aterm_close_sp_level == 2) &&
78 prop == SENTENCE_BREAK_PROP_CLOSE) {
79 /* close-sequence begins or continued */
80 state->aterm_close_sp_level = 2;
81 } else if ((state->aterm_close_sp_level == 1 ||
82 state->aterm_close_sp_level == 2 ||
83 state->aterm_close_sp_level == 3) &&
84 prop == SENTENCE_BREAK_PROP_SP) {
85 /* sp-sequence begins or continued */
86 state->aterm_close_sp_level = 3;
89 state->aterm_close_sp_level = 0;
92 if ((state->saterm_close_sp_parasep_level == 0 ||
93 state->saterm_close_sp_parasep_level == 1) &&
94 (prop == SENTENCE_BREAK_PROP_STERM ||
95 prop == SENTENCE_BREAK_PROP_ATERM)) {
96 /* sequence has begun */
97 state->saterm_close_sp_parasep_level = 1;
98 } else if ((state->saterm_close_sp_parasep_level == 1 ||
99 state->saterm_close_sp_parasep_level == 2) &&
100 prop == SENTENCE_BREAK_PROP_CLOSE) {
101 /* close-sequence begins or continued */
102 state->saterm_close_sp_parasep_level = 2;
103 } else if ((state->saterm_close_sp_parasep_level == 1 ||
104 state->saterm_close_sp_parasep_level == 2 ||
105 state->saterm_close_sp_parasep_level == 3) &&
106 prop == SENTENCE_BREAK_PROP_SP) {
107 /* sp-sequence begins or continued */
108 state->saterm_close_sp_parasep_level = 3;
109 } else if ((state->saterm_close_sp_parasep_level == 1 ||
110 state->saterm_close_sp_parasep_level == 2 ||
111 state->saterm_close_sp_parasep_level == 3) &&
112 (prop == SENTENCE_BREAK_PROP_SEP ||
113 prop == SENTENCE_BREAK_PROP_CR ||
114 prop == SENTENCE_BREAK_PROP_LF)) {
115 /* ParaSep at the end of the sequence */
116 state->saterm_close_sp_parasep_level = 4;
119 state->saterm_close_sp_parasep_level = 0;
124 next_sentence_break(HERODOTUS_READER *r)
126 HERODOTUS_READER tmp;
127 enum sentence_break_property prop;
129 struct sentence_break_state state = { 0 };
133 * Apply sentence breaking algorithm (UAX #29), see
134 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
136 proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
137 get_sentence_break_prop, is_skippable_sentence_prop,
138 sentence_skip_shift_callback, &p);
140 while (!proper_advance(&p)) {
142 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
143 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
148 if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
149 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR ||
150 p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
155 if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
156 p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
161 if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
162 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
167 if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
168 p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
169 p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
170 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
175 if (state.aterm_close_sp_level == 1 ||
176 state.aterm_close_sp_level == 2 ||
177 state.aterm_close_sp_level == 3) {
179 * This is the most complicated rule, requiring
180 * the right-hand-side to satisfy the regular expression
182 * ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
184 * which we simply check "manually" given LUT-lookups
185 * are very cheap by starting at the mid_reader.
188 herodotus_reader_copy(&(p.mid_reader), &tmp);
190 prop = NUM_SENTENCE_BREAK_PROPS;
191 while (herodotus_read_codepoint(&tmp, true, &cp) ==
192 HERODOTUS_STATUS_SUCCESS) {
193 prop = get_sentence_break_prop(cp);
196 * the skippable properties are ignored
197 * automatically here given they do not
198 * match the following condition
200 if (prop == SENTENCE_BREAK_PROP_OLETTER ||
201 prop == SENTENCE_BREAK_PROP_UPPER ||
202 prop == SENTENCE_BREAK_PROP_LOWER ||
203 prop == SENTENCE_BREAK_PROP_SEP ||
204 prop == SENTENCE_BREAK_PROP_CR ||
205 prop == SENTENCE_BREAK_PROP_LF ||
206 prop == SENTENCE_BREAK_PROP_STERM ||
207 prop == SENTENCE_BREAK_PROP_ATERM) {
212 if (prop == SENTENCE_BREAK_PROP_LOWER) {
218 if ((state.saterm_close_sp_parasep_level == 1 ||
219 state.saterm_close_sp_parasep_level == 2 ||
220 state.saterm_close_sp_parasep_level == 3) &&
221 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
222 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM ||
223 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
228 if ((state.saterm_close_sp_parasep_level == 1 ||
229 state.saterm_close_sp_parasep_level == 2) &&
230 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
231 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
232 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
233 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
234 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
239 if ((state.saterm_close_sp_parasep_level == 1 ||
240 state.saterm_close_sp_parasep_level == 2 ||
241 state.saterm_close_sp_parasep_level == 3) &&
242 (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP ||
243 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
244 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR ||
245 p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
250 if (state.saterm_close_sp_parasep_level == 1 ||
251 state.saterm_close_sp_parasep_level == 2 ||
252 state.saterm_close_sp_parasep_level == 3 ||
253 state.saterm_close_sp_parasep_level == 4) {
261 return herodotus_reader_number_read(&(p.mid_reader));
265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
269 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
271 return next_sentence_break(&r);
275 grapheme_next_sentence_break_utf8(const char *str, size_t len)
279 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
281 return next_sentence_break(&r);