op public repos

Blob

Date:: Wed Nov 2 20:01:35 2022 UTC
Message:: bundle libgrapheme 2.0.2 in case it's not available
Actions:: History | Blame | Raw File
1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
4 
5 #include "../gen/sentence.h"
6 #include "../grapheme.h"
7 #include "util.h"
8 
/*
 * Bookkeeping for the left-hand sides of the sentence break rules
 * SB8 to SB11. Each member is a small "level" counter, where 0 means
 * that the corresponding sequence is currently not present to the
 * left of the position under examination.
 */
struct sentence_break_state {
	/* progress within "ATerm Close* Sp*", ranging from 0 to 3 */
	uint_least8_t aterm_close_sp_level;

	/* progress within "SATerm Close* Sp* ParaSep?", ranging from 0 to 4 */
	uint_least8_t saterm_close_sp_parasep_level;
};
14 
15 static inline uint_least8_t
16 get_sentence_break_prop(uint_least32_t cp)
17 {
18 	if (likely(cp <= UINT32_C(0x10FFFF))) {
19 		return (uint_least8_t)
20 		       sentence_break_minor[sentence_break_major[cp >> 8] +
21 		       (cp & 0xff)];
22 	} else {
23 		return SENTENCE_BREAK_PROP_OTHER;
24 	}
25 }
26 
27 static bool
28 is_skippable_sentence_prop(uint_least8_t prop)
29 {
30 	return prop == SENTENCE_BREAK_PROP_EXTEND ||
31 	       prop == SENTENCE_BREAK_PROP_FORMAT;
32 }
33 
34 static void
35 sentence_skip_shift_callback(uint_least8_t prop, void *s)
36 {
37 	struct sentence_break_state *state = (struct sentence_break_state *)s;
38 
39 	/*
40 	 * Here comes a bit of magic. The rules
41 	 * SB8, SB8a, SB9 and SB10 have very complicated
42 	 * left-hand-side-rules of the form
43 	 *
44 	 *  ATerm Close* Sp*
45 	 *  SATerm Close*
46 	 *  SATerm Close* Sp*
47 	 *  SATerm Close* Sp* ParaSep?
48 	 *
49 	 * but instead of backtracking, we keep the
50 	 * state as some kind of "power level" in
51 	 * two state-variables
52 	 *
53 	 *  aterm_close_sp_level
54 	 *  saterm_close_sp_parasep_level
55 	 *
56 	 * that go from 0 to 3/4:
57 	 *
58 	 *  0: we are not in the sequence
59 	 *  1: we have one ATerm/SATerm to the left of
60 	 *     the middle spot
61 	 *  2: we have one ATerm/SATerm and one or more
62 	 *     Close to the left of the middle spot
63 	 *  3: we have one ATerm/SATerm, zero or more
64 	 *     Close and one or more Sp to the left of
65 	 *     the middle spot.
66 	 *  4: we have one SATerm, zero or more Close,
67 	 *     zero or more Sp and one ParaSep to the
68 	 *     left of the middle spot.
69 	 *
70 	 */
71 	if ((state->aterm_close_sp_level == 0 ||
72 	     state->aterm_close_sp_level == 1) &&
73 	    prop == SENTENCE_BREAK_PROP_ATERM) {
74 		/* sequence has begun */
75 		state->aterm_close_sp_level = 1;
76 	} else if ((state->aterm_close_sp_level == 1 ||
77 	            state->aterm_close_sp_level == 2) &&
78 	           prop == SENTENCE_BREAK_PROP_CLOSE) {
79 		/* close-sequence begins or continued */
80 		state->aterm_close_sp_level = 2;
81 	} else if ((state->aterm_close_sp_level == 1 ||
82 	            state->aterm_close_sp_level == 2 ||
83 		    state->aterm_close_sp_level == 3) &&
84 	           prop == SENTENCE_BREAK_PROP_SP) {
85 		/* sp-sequence begins or continued */
86 		state->aterm_close_sp_level = 3;
87 	} else {
88 		/* sequence broke */
89 		state->aterm_close_sp_level = 0;
90 	}
91 
92 	if ((state->saterm_close_sp_parasep_level == 0 ||
93 	     state->saterm_close_sp_parasep_level == 1) &&
94 	    (prop == SENTENCE_BREAK_PROP_STERM ||
95 	     prop == SENTENCE_BREAK_PROP_ATERM)) {
96 		/* sequence has begun */
97 		state->saterm_close_sp_parasep_level = 1;
98 	} else if ((state->saterm_close_sp_parasep_level == 1 ||
99 	            state->saterm_close_sp_parasep_level == 2) &&
100 	           prop == SENTENCE_BREAK_PROP_CLOSE) {
101 		/* close-sequence begins or continued */
102 		state->saterm_close_sp_parasep_level = 2;
103 	} else if ((state->saterm_close_sp_parasep_level == 1 ||
104 	            state->saterm_close_sp_parasep_level == 2 ||
105 		    state->saterm_close_sp_parasep_level == 3) &&
106 	           prop == SENTENCE_BREAK_PROP_SP) {
107 		/* sp-sequence begins or continued */
108 		state->saterm_close_sp_parasep_level = 3;
109 	} else if ((state->saterm_close_sp_parasep_level == 1 ||
110 	            state->saterm_close_sp_parasep_level == 2 ||
111 	            state->saterm_close_sp_parasep_level == 3) &&
112 	           (prop == SENTENCE_BREAK_PROP_SEP ||
113 	            prop == SENTENCE_BREAK_PROP_CR  ||
114 	            prop == SENTENCE_BREAK_PROP_LF)) {
115 		/* ParaSep at the end of the sequence */
116 		state->saterm_close_sp_parasep_level = 4;
117 	} else {
118 		/* sequence broke */
119 		state->saterm_close_sp_parasep_level = 0;
120 	}
121 }
122 
123 static size_t
124 next_sentence_break(HERODOTUS_READER *r)
125 {
126 	HERODOTUS_READER tmp;
127 	enum sentence_break_property prop;
128 	struct proper p;
129 	struct sentence_break_state state = { 0 };
130 	uint_least32_t cp;
131 
132 	/*
133 	 * Apply sentence breaking algorithm (UAX #29), see
134 	 * https://unicode.org/reports/tr29/#Sentence_Boundary_Rules
135 	 */
136 	proper_init(r, &state, NUM_SENTENCE_BREAK_PROPS,
137 	            get_sentence_break_prop, is_skippable_sentence_prop,
138 	            sentence_skip_shift_callback, &p);
139 
140 	while (!proper_advance(&p)) {
141 		/* SB3 */
142 		if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR &&
143 		    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_LF) {
144 			continue;
145 		}
146 
147 		/* SB4 */
148 		if (p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_SEP ||
149 		    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_CR  ||
150 		    p.raw.prev_prop[0] == SENTENCE_BREAK_PROP_LF) {
151 			break;
152 		}
153 
154 		/* SB5 */
155 		if (p.raw.next_prop[0] == SENTENCE_BREAK_PROP_EXTEND ||
156 		    p.raw.next_prop[0] == SENTENCE_BREAK_PROP_FORMAT) {
157 			continue;
158 		}
159 
160 		/* SB6 */
161 		if (p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
162 		    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_NUMERIC) {
163 			continue;
164 		}
165 
166 		/* SB7 */
167 		if ((p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_UPPER ||
168 		     p.skip.prev_prop[1] == SENTENCE_BREAK_PROP_LOWER) &&
169 		    p.skip.prev_prop[0] == SENTENCE_BREAK_PROP_ATERM &&
170 		    p.skip.next_prop[0] == SENTENCE_BREAK_PROP_UPPER) {
171 			continue;
172 		}
173 
174 		/* SB8 */
175 		if (state.aterm_close_sp_level == 1 ||
176 		    state.aterm_close_sp_level == 2 ||
177 		    state.aterm_close_sp_level == 3) {
178 			/*
179 			 * This is the most complicated rule, requiring
180 			 * the right-hand-side to satisfy the regular expression
181 			 *
182 			 *  ( ¬(OLetter | Upper | Lower | ParaSep | SATerm) )* Lower
183 			 *
184 			 * which we simply check "manually" given LUT-lookups
185 			 * are very cheap by starting at the mid_reader.
186 			 *
187 			 */
188 			herodotus_reader_copy(&(p.mid_reader), &tmp);
189 
190 			prop = NUM_SENTENCE_BREAK_PROPS;
191 			while (herodotus_read_codepoint(&tmp, true, &cp) ==
192 			       HERODOTUS_STATUS_SUCCESS) {
193 				prop = get_sentence_break_prop(cp);
194 
195 				/*
196 				 * the skippable properties are ignored
197 				 * automatically here given they do not
198 				 * match the following condition
199 				 */
200 				if (prop == SENTENCE_BREAK_PROP_OLETTER ||
201 				    prop == SENTENCE_BREAK_PROP_UPPER   ||
202 				    prop == SENTENCE_BREAK_PROP_LOWER   ||
203 				    prop == SENTENCE_BREAK_PROP_SEP     ||
204 				    prop == SENTENCE_BREAK_PROP_CR      ||
205 				    prop == SENTENCE_BREAK_PROP_LF      ||
206 				    prop == SENTENCE_BREAK_PROP_STERM   ||
207 				    prop == SENTENCE_BREAK_PROP_ATERM) {
208 					break;
209 				}
210 			}
211 
212 			if (prop == SENTENCE_BREAK_PROP_LOWER) {
213 				continue;
214 			}
215 		}
216 
217 		/* SB8a */
218 		if ((state.saterm_close_sp_parasep_level == 1 ||
219 		     state.saterm_close_sp_parasep_level == 2 ||
220 		     state.saterm_close_sp_parasep_level == 3) &&
221 		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SCONTINUE ||
222 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_STERM     ||
223                      p.skip.next_prop[0] == SENTENCE_BREAK_PROP_ATERM)) {
224 			continue;
225 		}
226 
227 		/* SB9 */
228 		if ((state.saterm_close_sp_parasep_level == 1 ||
229 		     state.saterm_close_sp_parasep_level == 2) &&
230 		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CLOSE ||
231 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP    ||
232 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP   ||
233 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR    ||
234 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
235 			continue;
236 		}
237 
238 		/* SB10 */
239 		if ((state.saterm_close_sp_parasep_level == 1 ||
240 		     state.saterm_close_sp_parasep_level == 2 ||
241 		     state.saterm_close_sp_parasep_level == 3) &&
242 		    (p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SP  ||
243 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_SEP ||
244 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_CR  ||
245 		     p.skip.next_prop[0] == SENTENCE_BREAK_PROP_LF)) {
246 			continue;
247 		}
248 
249 		/* SB11 */
250 		if (state.saterm_close_sp_parasep_level == 1 ||
251 		    state.saterm_close_sp_parasep_level == 2 ||
252 		    state.saterm_close_sp_parasep_level == 3 ||
253 		    state.saterm_close_sp_parasep_level == 4) {
254 			break;
255 		}
256 
257 		/* SB998 */
258 		continue;
259 	}
260 
261 	return herodotus_reader_number_read(&(p.mid_reader));
262 }
263 
264 size_t
265 grapheme_next_sentence_break(const uint_least32_t *str, size_t len)
266 {
267 	HERODOTUS_READER r;
268 
269 	herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
270 
271 	return next_sentence_break(&r);
272 }
273 
274 size_t
275 grapheme_next_sentence_break_utf8(const char *str, size_t len)
276 {
277 	HERODOTUS_READER r;
278 
279 	herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
280 
281 	return next_sentence_break(&r);
282 }