Blame


1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stdbool.h>
3 3448adb0 2022-11-02 op #include <stddef.h>
4 3448adb0 2022-11-02 op
5 3448adb0 2022-11-02 op #include "../gen/line.h"
6 3448adb0 2022-11-02 op #include "../grapheme.h"
7 3448adb0 2022-11-02 op #include "util.h"
8 3448adb0 2022-11-02 op
9 3448adb0 2022-11-02 op static inline enum line_break_property
10 3448adb0 2022-11-02 op get_break_prop(uint_least32_t cp)
11 3448adb0 2022-11-02 op {
12 3448adb0 2022-11-02 op if (likely(cp <= UINT32_C(0x10FFFF))) {
13 3448adb0 2022-11-02 op return (enum line_break_property)
14 3448adb0 2022-11-02 op line_break_minor[line_break_major[cp >> 8] + (cp & 0xff)];
15 3448adb0 2022-11-02 op } else {
16 3448adb0 2022-11-02 op return LINE_BREAK_PROP_AL;
17 3448adb0 2022-11-02 op }
18 3448adb0 2022-11-02 op }
19 3448adb0 2022-11-02 op
20 3448adb0 2022-11-02 op static size_t
21 3448adb0 2022-11-02 op next_line_break(HERODOTUS_READER *r)
22 3448adb0 2022-11-02 op {
23 3448adb0 2022-11-02 op HERODOTUS_READER tmp;
24 3448adb0 2022-11-02 op enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
25 3448adb0 2022-11-02 op last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
26 3448adb0 2022-11-02 op uint_least32_t cp;
27 3448adb0 2022-11-02 op uint_least8_t lb25_level = 0;
28 3448adb0 2022-11-02 op bool lb21a_flag = false, ri_even = true;
29 3448adb0 2022-11-02 op
30 3448adb0 2022-11-02 op /*
31 3448adb0 2022-11-02 op * Apply line breaking algorithm (UAX #14), see
32 3448adb0 2022-11-02 op * https://unicode.org/reports/tr14/#Algorithm and tailoring
33 3448adb0 2022-11-02 op * https://unicode.org/reports/tr14/#Examples (example 7),
34 3448adb0 2022-11-02 op * given the automatic test-cases implement this example for
35 3448adb0 2022-11-02 op * better number handling.
36 3448adb0 2022-11-02 op *
37 3448adb0 2022-11-02 op */
38 3448adb0 2022-11-02 op
39 3448adb0 2022-11-02 op /*
40 3448adb0 2022-11-02 op * Initialize the different properties such that we have
41 3448adb0 2022-11-02 op * a good state after the state-update in the loop
42 3448adb0 2022-11-02 op */
43 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
44 3448adb0 2022-11-02 op last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
45 3448adb0 2022-11-02 op
46 3448adb0 2022-11-02 op for (herodotus_read_codepoint(r, true, &cp), cp0_prop = get_break_prop(cp);
47 3448adb0 2022-11-02 op herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;
48 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
49 3448adb0 2022-11-02 op /* get property of the right codepoint */
50 3448adb0 2022-11-02 op cp1_prop = get_break_prop(cp);
51 3448adb0 2022-11-02 op
52 3448adb0 2022-11-02 op /* update retention-states */
53 3448adb0 2022-11-02 op
54 3448adb0 2022-11-02 op /*
55 3448adb0 2022-11-02 op * store the last observed non-CM-or-ZWJ-property for
56 3448adb0 2022-11-02 op * LB9 and following.
57 3448adb0 2022-11-02 op */
58 3448adb0 2022-11-02 op if (cp0_prop != LINE_BREAK_PROP_CM &&
59 3448adb0 2022-11-02 op cp0_prop != LINE_BREAK_PROP_ZWJ) {
60 3448adb0 2022-11-02 op /*
61 3448adb0 2022-11-02 op * check if the property we are overwriting now is an
62 3448adb0 2022-11-02 op * HL. If so, we set the LB21a-flag which depends on this
63 3448adb0 2022-11-02 op * knowledge.
64 3448adb0 2022-11-02 op */
65 3448adb0 2022-11-02 op lb21a_flag = (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
66 3448adb0 2022-11-02 op
67 3448adb0 2022-11-02 op /* check regional indicator state */
68 3448adb0 2022-11-02 op if (cp0_prop == LINE_BREAK_PROP_RI) {
69 3448adb0 2022-11-02 op /*
70 3448adb0 2022-11-02 op * The property we just shifted in is
71 3448adb0 2022-11-02 op * a regional indicator, increasing the
72 3448adb0 2022-11-02 op * number of consecutive RIs on the left
73 3448adb0 2022-11-02 op * side of the breakpoint by one, changing
74 3448adb0 2022-11-02 op * the oddness.
75 3448adb0 2022-11-02 op *
76 3448adb0 2022-11-02 op */
77 3448adb0 2022-11-02 op ri_even = !ri_even;
78 3448adb0 2022-11-02 op } else {
79 3448adb0 2022-11-02 op /*
80 3448adb0 2022-11-02 op * We saw no regional indicator, so the
81 3448adb0 2022-11-02 op * number of consecutive RIs on the left
82 3448adb0 2022-11-02 op * side of the breakpoint is zero, which
83 3448adb0 2022-11-02 op * is an even number.
84 3448adb0 2022-11-02 op *
85 3448adb0 2022-11-02 op */
86 3448adb0 2022-11-02 op ri_even = true;
87 3448adb0 2022-11-02 op }
88 3448adb0 2022-11-02 op
89 3448adb0 2022-11-02 op /*
90 3448adb0 2022-11-02 op * Here comes a bit of magic. The tailored rule
91 3448adb0 2022-11-02 op * LB25 (using example 7) has a very complicated
92 3448adb0 2022-11-02 op * left-hand-side-rule of the form
93 3448adb0 2022-11-02 op *
94 3448adb0 2022-11-02 op * NU (NU | SY | IS)* (CL | CP)?
95 3448adb0 2022-11-02 op *
96 3448adb0 2022-11-02 op * but instead of backtracking, we keep the state
97 3448adb0 2022-11-02 op * as some kind of "power level" in the variable
98 3448adb0 2022-11-02 op *
99 3448adb0 2022-11-02 op * lb25_level
100 3448adb0 2022-11-02 op *
101 3448adb0 2022-11-02 op * that goes from 0 to 3
102 3448adb0 2022-11-02 op *
103 3448adb0 2022-11-02 op * 0: we are not in the sequence
104 3448adb0 2022-11-02 op * 1: we have one NU to the left of the middle
105 3448adb0 2022-11-02 op * spot
106 3448adb0 2022-11-02 op * 2: we have one NU and one or more (NU | SY | IS)
107 3448adb0 2022-11-02 op * to the left of the middle spot
108 3448adb0 2022-11-02 op * 3: we have one NU, zero or more (NU | SY | IS)
109 3448adb0 2022-11-02 op * and one (CL | CP) to the left of the middle
110 3448adb0 2022-11-02 op * spot
111 3448adb0 2022-11-02 op */
112 3448adb0 2022-11-02 op if ((lb25_level == 0 ||
113 3448adb0 2022-11-02 op lb25_level == 1) &&
114 3448adb0 2022-11-02 op cp0_prop == LINE_BREAK_PROP_NU) {
115 3448adb0 2022-11-02 op /* sequence has begun */
116 3448adb0 2022-11-02 op lb25_level = 1;
117 3448adb0 2022-11-02 op } else if ((lb25_level == 1 || lb25_level == 2) &&
118 3448adb0 2022-11-02 op (cp0_prop == LINE_BREAK_PROP_NU ||
119 3448adb0 2022-11-02 op cp0_prop == LINE_BREAK_PROP_SY ||
120 3448adb0 2022-11-02 op cp0_prop == LINE_BREAK_PROP_IS)) {
121 3448adb0 2022-11-02 op /* (NU | SY | IS) sequence begins or continued */
122 3448adb0 2022-11-02 op lb25_level = 2;
123 3448adb0 2022-11-02 op } else if ((lb25_level == 1 || lb25_level == 2) &&
124 3448adb0 2022-11-02 op (cp0_prop == LINE_BREAK_PROP_CL ||
125 3448adb0 2022-11-02 op cp0_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
126 3448adb0 2022-11-02 op cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
127 3448adb0 2022-11-02 op /* CL or CP at the end of the sequence */
128 3448adb0 2022-11-02 op lb25_level = 3;
129 3448adb0 2022-11-02 op } else {
130 3448adb0 2022-11-02 op /* sequence broke */
131 3448adb0 2022-11-02 op lb25_level = 0;
132 3448adb0 2022-11-02 op }
133 3448adb0 2022-11-02 op
134 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop = cp0_prop;
135 3448adb0 2022-11-02 op }
136 3448adb0 2022-11-02 op
137 3448adb0 2022-11-02 op /*
138 3448adb0 2022-11-02 op * store the last observed non-SP-property for LB8, LB14,
139 3448adb0 2022-11-02 op * LB15, LB16 and LB17. LB8 gets its own unskipped property,
140 3448adb0 2022-11-02 op * whereas the others build on top of the CM-ZWJ-skipped
141 3448adb0 2022-11-02 op * properties as they come after LB9
142 3448adb0 2022-11-02 op */
143 3448adb0 2022-11-02 op if (cp0_prop != LINE_BREAK_PROP_SP) {
144 3448adb0 2022-11-02 op last_non_sp_prop = cp0_prop;
145 3448adb0 2022-11-02 op }
146 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
147 3448adb0 2022-11-02 op last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
148 3448adb0 2022-11-02 op }
149 3448adb0 2022-11-02 op
150 3448adb0 2022-11-02 op /* apply the algorithm */
151 3448adb0 2022-11-02 op
152 3448adb0 2022-11-02 op /* LB4 */
153 3448adb0 2022-11-02 op if (cp0_prop == LINE_BREAK_PROP_BK) {
154 3448adb0 2022-11-02 op break;
155 3448adb0 2022-11-02 op }
156 3448adb0 2022-11-02 op
157 3448adb0 2022-11-02 op /* LB5 */
158 3448adb0 2022-11-02 op if (cp0_prop == LINE_BREAK_PROP_CR &&
159 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_LF) {
160 3448adb0 2022-11-02 op continue;
161 3448adb0 2022-11-02 op }
162 3448adb0 2022-11-02 op if (cp0_prop == LINE_BREAK_PROP_CR ||
163 3448adb0 2022-11-02 op cp0_prop == LINE_BREAK_PROP_LF ||
164 3448adb0 2022-11-02 op cp0_prop == LINE_BREAK_PROP_NL) {
165 3448adb0 2022-11-02 op break;
166 3448adb0 2022-11-02 op }
167 3448adb0 2022-11-02 op
168 3448adb0 2022-11-02 op /* LB6 */
169 3448adb0 2022-11-02 op if (cp1_prop == LINE_BREAK_PROP_BK ||
170 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_CR ||
171 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_LF ||
172 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_NL) {
173 3448adb0 2022-11-02 op continue;
174 3448adb0 2022-11-02 op }
175 3448adb0 2022-11-02 op
176 3448adb0 2022-11-02 op /* LB7 */
177 3448adb0 2022-11-02 op if (cp1_prop == LINE_BREAK_PROP_SP ||
178 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_ZW) {
179 3448adb0 2022-11-02 op continue;
180 3448adb0 2022-11-02 op }
181 3448adb0 2022-11-02 op
182 3448adb0 2022-11-02 op /* LB8 */
183 3448adb0 2022-11-02 op if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
184 3448adb0 2022-11-02 op break;
185 3448adb0 2022-11-02 op }
186 3448adb0 2022-11-02 op
187 3448adb0 2022-11-02 op /* LB8a */
188 3448adb0 2022-11-02 op if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
189 3448adb0 2022-11-02 op continue;
190 3448adb0 2022-11-02 op }
191 3448adb0 2022-11-02 op
192 3448adb0 2022-11-02 op /* LB9 */
193 3448adb0 2022-11-02 op if ((cp0_prop != LINE_BREAK_PROP_BK &&
194 3448adb0 2022-11-02 op cp0_prop != LINE_BREAK_PROP_CR &&
195 3448adb0 2022-11-02 op cp0_prop != LINE_BREAK_PROP_LF &&
196 3448adb0 2022-11-02 op cp0_prop != LINE_BREAK_PROP_NL &&
197 3448adb0 2022-11-02 op cp0_prop != LINE_BREAK_PROP_SP &&
198 3448adb0 2022-11-02 op cp0_prop != LINE_BREAK_PROP_ZW) &&
199 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_CM ||
200 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_ZWJ)) {
201 3448adb0 2022-11-02 op /*
202 3448adb0 2022-11-02 op * given we skip them, we don't break in such
203 3448adb0 2022-11-02 op * a sequence
204 3448adb0 2022-11-02 op */
205 3448adb0 2022-11-02 op continue;
206 3448adb0 2022-11-02 op }
207 3448adb0 2022-11-02 op
208 3448adb0 2022-11-02 op /* LB10 is baked into the following rules */
209 3448adb0 2022-11-02 op
210 3448adb0 2022-11-02 op /* LB11 */
211 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
212 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_WJ) {
213 3448adb0 2022-11-02 op continue;
214 3448adb0 2022-11-02 op }
215 3448adb0 2022-11-02 op
216 3448adb0 2022-11-02 op /* LB12 */
217 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
218 3448adb0 2022-11-02 op continue;
219 3448adb0 2022-11-02 op }
220 3448adb0 2022-11-02 op
221 3448adb0 2022-11-02 op /* LB12a */
222 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
223 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
224 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
225 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_GL) {
226 3448adb0 2022-11-02 op continue;
227 3448adb0 2022-11-02 op }
228 3448adb0 2022-11-02 op
229 3448adb0 2022-11-02 op /* LB13 (affected by tailoring for LB25, see example 7) */
230 3448adb0 2022-11-02 op if (cp1_prop == LINE_BREAK_PROP_EX ||
231 3448adb0 2022-11-02 op (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
232 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_CL ||
233 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
234 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
235 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_IS ||
236 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_SY))) {
237 3448adb0 2022-11-02 op continue;
238 3448adb0 2022-11-02 op }
239 3448adb0 2022-11-02 op
240 3448adb0 2022-11-02 op /* LB14 */
241 3448adb0 2022-11-02 op if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
242 3448adb0 2022-11-02 op last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
243 3448adb0 2022-11-02 op continue;
244 3448adb0 2022-11-02 op }
245 3448adb0 2022-11-02 op
246 3448adb0 2022-11-02 op /* LB15 */
247 3448adb0 2022-11-02 op if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
248 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
249 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
250 3448adb0 2022-11-02 op continue;
251 3448adb0 2022-11-02 op }
252 3448adb0 2022-11-02 op
253 3448adb0 2022-11-02 op /* LB16 */
254 3448adb0 2022-11-02 op if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
255 3448adb0 2022-11-02 op last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
256 3448adb0 2022-11-02 op last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
257 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_NS) {
258 3448adb0 2022-11-02 op continue;
259 3448adb0 2022-11-02 op }
260 3448adb0 2022-11-02 op
261 3448adb0 2022-11-02 op /* LB17 */
262 3448adb0 2022-11-02 op if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
263 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_B2) {
264 3448adb0 2022-11-02 op continue;
265 3448adb0 2022-11-02 op }
266 3448adb0 2022-11-02 op
267 3448adb0 2022-11-02 op /* LB18 */
268 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
269 3448adb0 2022-11-02 op break;
270 3448adb0 2022-11-02 op }
271 3448adb0 2022-11-02 op
272 3448adb0 2022-11-02 op /* LB19 */
273 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
274 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_QU) {
275 3448adb0 2022-11-02 op continue;
276 3448adb0 2022-11-02 op }
277 3448adb0 2022-11-02 op
278 3448adb0 2022-11-02 op /* LB20 */
279 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
280 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_CB) {
281 3448adb0 2022-11-02 op break;
282 3448adb0 2022-11-02 op }
283 3448adb0 2022-11-02 op
284 3448adb0 2022-11-02 op /* LB21 */
285 3448adb0 2022-11-02 op if (cp1_prop == LINE_BREAK_PROP_BA ||
286 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_HY ||
287 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_NS ||
288 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
289 3448adb0 2022-11-02 op continue;
290 3448adb0 2022-11-02 op }
291 3448adb0 2022-11-02 op
292 3448adb0 2022-11-02 op /* LB21a */
293 3448adb0 2022-11-02 op if (lb21a_flag &&
294 3448adb0 2022-11-02 op (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
295 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
296 3448adb0 2022-11-02 op continue;
297 3448adb0 2022-11-02 op }
298 3448adb0 2022-11-02 op
299 3448adb0 2022-11-02 op /* LB21b */
300 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
301 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_HL) {
302 3448adb0 2022-11-02 op continue;
303 3448adb0 2022-11-02 op }
304 3448adb0 2022-11-02 op
305 3448adb0 2022-11-02 op /* LB22 */
306 3448adb0 2022-11-02 op if (cp1_prop == LINE_BREAK_PROP_IN) {
307 3448adb0 2022-11-02 op continue;
308 3448adb0 2022-11-02 op }
309 3448adb0 2022-11-02 op
310 3448adb0 2022-11-02 op /* LB23 */
311 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
312 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
313 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_NU) {
314 3448adb0 2022-11-02 op continue;
315 3448adb0 2022-11-02 op }
316 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
317 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_AL ||
318 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_HL)) {
319 3448adb0 2022-11-02 op continue;
320 3448adb0 2022-11-02 op }
321 3448adb0 2022-11-02 op
322 3448adb0 2022-11-02 op /* LB23a */
323 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
324 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_ID ||
325 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_EB ||
326 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_EM)) {
327 3448adb0 2022-11-02 op continue;
328 3448adb0 2022-11-02 op }
329 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
330 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
331 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
332 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_PO) {
333 3448adb0 2022-11-02 op continue;
334 3448adb0 2022-11-02 op }
335 3448adb0 2022-11-02 op
336 3448adb0 2022-11-02 op /* LB24 */
337 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
338 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
339 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_AL ||
340 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_HL)) {
341 3448adb0 2022-11-02 op continue;
342 3448adb0 2022-11-02 op }
343 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
344 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
345 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_PR ||
346 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_PO)) {
347 3448adb0 2022-11-02 op continue;
348 3448adb0 2022-11-02 op }
349 3448adb0 2022-11-02 op
350 3448adb0 2022-11-02 op /* LB25 (tailored with example 7) */
351 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
352 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
353 3448adb0 2022-11-02 op if (cp1_prop == LINE_BREAK_PROP_NU) {
354 3448adb0 2022-11-02 op continue;
355 3448adb0 2022-11-02 op }
356 3448adb0 2022-11-02 op
357 3448adb0 2022-11-02 op /* this stupid rule is the reason why we cannot
358 3448adb0 2022-11-02 op * simply have a stateful break-detection between
359 3448adb0 2022-11-02 op * two adjacent codepoints as we have it with
360 3448adb0 2022-11-02 op * characters.
361 3448adb0 2022-11-02 op */
362 3448adb0 2022-11-02 op herodotus_reader_copy(r, &tmp);
363 3448adb0 2022-11-02 op herodotus_read_codepoint(&tmp, true, &cp);
364 3448adb0 2022-11-02 op if (herodotus_read_codepoint(&tmp, true, &cp) ==
365 3448adb0 2022-11-02 op HERODOTUS_STATUS_SUCCESS &&
366 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
367 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
368 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_HY)) {
369 3448adb0 2022-11-02 op if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
370 3448adb0 2022-11-02 op continue;
371 3448adb0 2022-11-02 op }
372 3448adb0 2022-11-02 op }
373 3448adb0 2022-11-02 op }
374 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
375 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
376 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
377 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_NU) {
378 3448adb0 2022-11-02 op continue;
379 3448adb0 2022-11-02 op }
380 3448adb0 2022-11-02 op if (lb25_level == 1 &&
381 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_NU ||
382 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_SY ||
383 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_IS)) {
384 3448adb0 2022-11-02 op continue;
385 3448adb0 2022-11-02 op }
386 3448adb0 2022-11-02 op if ((lb25_level == 1 || lb25_level == 2) &&
387 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_NU ||
388 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_SY ||
389 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_IS ||
390 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_CL ||
391 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
392 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
393 3448adb0 2022-11-02 op continue;
394 3448adb0 2022-11-02 op }
395 3448adb0 2022-11-02 op if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
396 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_PO ||
397 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_PR)) {
398 3448adb0 2022-11-02 op continue;
399 3448adb0 2022-11-02 op }
400 3448adb0 2022-11-02 op
401 3448adb0 2022-11-02 op /* LB26 */
402 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
403 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_JL ||
404 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_JV ||
405 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_H2 ||
406 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_H3)) {
407 3448adb0 2022-11-02 op continue;
408 3448adb0 2022-11-02 op }
409 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
410 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
411 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_JV ||
412 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_JT)) {
413 3448adb0 2022-11-02 op continue;
414 3448adb0 2022-11-02 op }
415 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
416 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
417 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_JT) {
418 3448adb0 2022-11-02 op continue;
419 3448adb0 2022-11-02 op }
420 3448adb0 2022-11-02 op
421 3448adb0 2022-11-02 op /* LB27 */
422 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
423 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
424 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
425 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
426 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
427 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_PO) {
428 3448adb0 2022-11-02 op continue;
429 3448adb0 2022-11-02 op }
430 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
431 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_JL ||
432 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_JV ||
433 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_JT ||
434 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_H2 ||
435 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_H3)) {
436 3448adb0 2022-11-02 op continue;
437 3448adb0 2022-11-02 op }
438 3448adb0 2022-11-02 op
439 3448adb0 2022-11-02 op /* LB28 */
440 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
441 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
442 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_AL ||
443 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_HL)) {
444 3448adb0 2022-11-02 op continue;
445 3448adb0 2022-11-02 op }
446 3448adb0 2022-11-02 op
447 3448adb0 2022-11-02 op /* LB29 */
448 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
449 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_AL ||
450 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_HL)) {
451 3448adb0 2022-11-02 op continue;
452 3448adb0 2022-11-02 op }
453 3448adb0 2022-11-02 op
454 3448adb0 2022-11-02 op /* LB30 */
455 3448adb0 2022-11-02 op if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
456 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
457 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
458 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
459 3448adb0 2022-11-02 op continue;
460 3448adb0 2022-11-02 op }
461 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
462 3448adb0 2022-11-02 op (cp1_prop == LINE_BREAK_PROP_AL ||
463 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_HL ||
464 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_NU)) {
465 3448adb0 2022-11-02 op continue;
466 3448adb0 2022-11-02 op }
467 3448adb0 2022-11-02 op
468 3448adb0 2022-11-02 op /* LB30a */
469 3448adb0 2022-11-02 op if (!ri_even &&
470 3448adb0 2022-11-02 op last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
471 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_RI) {
472 3448adb0 2022-11-02 op continue;
473 3448adb0 2022-11-02 op }
474 3448adb0 2022-11-02 op
475 3448adb0 2022-11-02 op /* LB30b */
476 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
477 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_EM) {
478 3448adb0 2022-11-02 op continue;
479 3448adb0 2022-11-02 op }
480 3448adb0 2022-11-02 op if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
481 3448adb0 2022-11-02 op cp1_prop == LINE_BREAK_PROP_EM) {
482 3448adb0 2022-11-02 op continue;
483 3448adb0 2022-11-02 op }
484 3448adb0 2022-11-02 op
485 3448adb0 2022-11-02 op /* LB31 */
486 3448adb0 2022-11-02 op break;
487 3448adb0 2022-11-02 op }
488 3448adb0 2022-11-02 op
489 3448adb0 2022-11-02 op return herodotus_reader_number_read(r);
490 3448adb0 2022-11-02 op }
491 3448adb0 2022-11-02 op
492 3448adb0 2022-11-02 op size_t
493 3448adb0 2022-11-02 op grapheme_next_line_break(const uint_least32_t *str, size_t len)
494 3448adb0 2022-11-02 op {
495 3448adb0 2022-11-02 op HERODOTUS_READER r;
496 3448adb0 2022-11-02 op
497 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
498 3448adb0 2022-11-02 op
499 3448adb0 2022-11-02 op return next_line_break(&r);
500 3448adb0 2022-11-02 op }
501 3448adb0 2022-11-02 op
502 3448adb0 2022-11-02 op size_t
503 3448adb0 2022-11-02 op grapheme_next_line_break_utf8(const char *str, size_t len)
504 3448adb0 2022-11-02 op {
505 3448adb0 2022-11-02 op HERODOTUS_READER r;
506 3448adb0 2022-11-02 op
507 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
508 3448adb0 2022-11-02 op
509 3448adb0 2022-11-02 op return next_line_break(&r);
510 3448adb0 2022-11-02 op }