/* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <stddef.h>
5 #include "../gen/line.h"
6 #include "../grapheme.h"
7 #include "util.h"
9 static inline enum line_break_property
10 get_break_prop(uint_least32_t cp)
11 {
12 if (likely(cp <= UINT32_C(0x10FFFF))) {
13 return (enum line_break_property)
14 line_break_minor[line_break_major[cp >> 8] + (cp & 0xff)];
15 } else {
16 return LINE_BREAK_PROP_AL;
17 }
18 }
20 static size_t
21 next_line_break(HERODOTUS_READER *r)
22 {
23 HERODOTUS_READER tmp;
24 enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
25 last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
26 uint_least32_t cp;
27 uint_least8_t lb25_level = 0;
28 bool lb21a_flag = false, ri_even = true;
30 /*
31 * Apply line breaking algorithm (UAX #14), see
32 * https://unicode.org/reports/tr14/#Algorithm and tailoring
33 * https://unicode.org/reports/tr14/#Examples (example 7),
34 * given the automatic test-cases implement this example for
35 * better number handling.
36 *
37 */
39 /*
40 * Initialize the different properties such that we have
41 * a good state after the state-update in the loop
42 */
43 last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
44 last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
46 for (herodotus_read_codepoint(r, true, &cp), cp0_prop = get_break_prop(cp);
47 herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;
48 herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
49 /* get property of the right codepoint */
50 cp1_prop = get_break_prop(cp);
52 /* update retention-states */
54 /*
55 * store the last observed non-CM-or-ZWJ-property for
56 * LB9 and following.
57 */
58 if (cp0_prop != LINE_BREAK_PROP_CM &&
59 cp0_prop != LINE_BREAK_PROP_ZWJ) {
60 /*
61 * check if the property we are overwriting now is an
62 * HL. If so, we set the LB21a-flag which depends on this
63 * knowledge.
64 */
65 lb21a_flag = (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
67 /* check regional indicator state */
68 if (cp0_prop == LINE_BREAK_PROP_RI) {
69 /*
70 * The property we just shifted in is
71 * a regional indicator, increasing the
72 * number of consecutive RIs on the left
73 * side of the breakpoint by one, changing
74 * the oddness.
75 *
76 */
77 ri_even = !ri_even;
78 } else {
79 /*
80 * We saw no regional indicator, so the
81 * number of consecutive RIs on the left
82 * side of the breakpoint is zero, which
83 * is an even number.
84 *
85 */
86 ri_even = true;
87 }
89 /*
90 * Here comes a bit of magic. The tailored rule
91 * LB25 (using example 7) has a very complicated
92 * left-hand-side-rule of the form
93 *
94 * NU (NU | SY | IS)* (CL | CP)?
95 *
96 * but instead of backtracking, we keep the state
97 * as some kind of "power level" in the variable
98 *
99 * lb25_level
101 * that goes from 0 to 3
103 * 0: we are not in the sequence
104 * 1: we have one NU to the left of the middle
105 * spot
106 * 2: we have one NU and one or more (NU | SY | IS)
107 * to the left of the middle spot
108 * 3: we have one NU, zero or more (NU | SY | IS)
109 * and one (CL | CP) to the left of the middle
110 * spot
111 */
112 if ((lb25_level == 0 ||
113 lb25_level == 1) &&
114 cp0_prop == LINE_BREAK_PROP_NU) {
115 /* sequence has begun */
116 lb25_level = 1;
117 } else if ((lb25_level == 1 || lb25_level == 2) &&
118 (cp0_prop == LINE_BREAK_PROP_NU ||
119 cp0_prop == LINE_BREAK_PROP_SY ||
120 cp0_prop == LINE_BREAK_PROP_IS)) {
121 /* (NU | SY | IS) sequence begins or continued */
122 lb25_level = 2;
123 } else if ((lb25_level == 1 || lb25_level == 2) &&
124 (cp0_prop == LINE_BREAK_PROP_CL ||
125 cp0_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
126 cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
127 /* CL or CP at the end of the sequence */
128 lb25_level = 3;
129 } else {
130 /* sequence broke */
131 lb25_level = 0;
134 last_non_cm_or_zwj_prop = cp0_prop;
137 /*
138 * store the last observed non-SP-property for LB8, LB14,
139 * LB15, LB16 and LB17. LB8 gets its own unskipped property,
140 * whereas the others build on top of the CM-ZWJ-skipped
141 * properties as they come after LB9
142 */
143 if (cp0_prop != LINE_BREAK_PROP_SP) {
144 last_non_sp_prop = cp0_prop;
146 if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
147 last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
150 /* apply the algorithm */
152 /* LB4 */
153 if (cp0_prop == LINE_BREAK_PROP_BK) {
154 break;
157 /* LB5 */
158 if (cp0_prop == LINE_BREAK_PROP_CR &&
159 cp1_prop == LINE_BREAK_PROP_LF) {
160 continue;
162 if (cp0_prop == LINE_BREAK_PROP_CR ||
163 cp0_prop == LINE_BREAK_PROP_LF ||
164 cp0_prop == LINE_BREAK_PROP_NL) {
165 break;
168 /* LB6 */
169 if (cp1_prop == LINE_BREAK_PROP_BK ||
170 cp1_prop == LINE_BREAK_PROP_CR ||
171 cp1_prop == LINE_BREAK_PROP_LF ||
172 cp1_prop == LINE_BREAK_PROP_NL) {
173 continue;
176 /* LB7 */
177 if (cp1_prop == LINE_BREAK_PROP_SP ||
178 cp1_prop == LINE_BREAK_PROP_ZW) {
179 continue;
182 /* LB8 */
183 if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
184 break;
187 /* LB8a */
188 if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
189 continue;
192 /* LB9 */
193 if ((cp0_prop != LINE_BREAK_PROP_BK &&
194 cp0_prop != LINE_BREAK_PROP_CR &&
195 cp0_prop != LINE_BREAK_PROP_LF &&
196 cp0_prop != LINE_BREAK_PROP_NL &&
197 cp0_prop != LINE_BREAK_PROP_SP &&
198 cp0_prop != LINE_BREAK_PROP_ZW) &&
199 (cp1_prop == LINE_BREAK_PROP_CM ||
200 cp1_prop == LINE_BREAK_PROP_ZWJ)) {
201 /*
202 * given we skip them, we don't break in such
203 * a sequence
204 */
205 continue;
208 /* LB10 is baked into the following rules */
210 /* LB11 */
211 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
212 cp1_prop == LINE_BREAK_PROP_WJ) {
213 continue;
216 /* LB12 */
217 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
218 continue;
221 /* LB12a */
222 if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
223 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
224 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
225 cp1_prop == LINE_BREAK_PROP_GL) {
226 continue;
229 /* LB13 (affected by tailoring for LB25, see example 7) */
230 if (cp1_prop == LINE_BREAK_PROP_EX ||
231 (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
232 (cp1_prop == LINE_BREAK_PROP_CL ||
233 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
234 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
235 cp1_prop == LINE_BREAK_PROP_IS ||
236 cp1_prop == LINE_BREAK_PROP_SY))) {
237 continue;
240 /* LB14 */
241 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
242 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
243 continue;
246 /* LB15 */
247 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
248 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
249 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
250 continue;
253 /* LB16 */
254 if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
255 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
256 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
257 cp1_prop == LINE_BREAK_PROP_NS) {
258 continue;
261 /* LB17 */
262 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
263 cp1_prop == LINE_BREAK_PROP_B2) {
264 continue;
267 /* LB18 */
268 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
269 break;
272 /* LB19 */
273 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
274 cp1_prop == LINE_BREAK_PROP_QU) {
275 continue;
278 /* LB20 */
279 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
280 cp1_prop == LINE_BREAK_PROP_CB) {
281 break;
284 /* LB21 */
285 if (cp1_prop == LINE_BREAK_PROP_BA ||
286 cp1_prop == LINE_BREAK_PROP_HY ||
287 cp1_prop == LINE_BREAK_PROP_NS ||
288 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
289 continue;
292 /* LB21a */
293 if (lb21a_flag &&
294 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
295 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
296 continue;
299 /* LB21b */
300 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
301 cp1_prop == LINE_BREAK_PROP_HL) {
302 continue;
305 /* LB22 */
306 if (cp1_prop == LINE_BREAK_PROP_IN) {
307 continue;
310 /* LB23 */
311 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
312 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
313 cp1_prop == LINE_BREAK_PROP_NU) {
314 continue;
316 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
317 (cp1_prop == LINE_BREAK_PROP_AL ||
318 cp1_prop == LINE_BREAK_PROP_HL)) {
319 continue;
322 /* LB23a */
323 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
324 (cp1_prop == LINE_BREAK_PROP_ID ||
325 cp1_prop == LINE_BREAK_PROP_EB ||
326 cp1_prop == LINE_BREAK_PROP_EM)) {
327 continue;
329 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
330 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
331 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
332 cp1_prop == LINE_BREAK_PROP_PO) {
333 continue;
336 /* LB24 */
337 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
338 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
339 (cp1_prop == LINE_BREAK_PROP_AL ||
340 cp1_prop == LINE_BREAK_PROP_HL)) {
341 continue;
343 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
344 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
345 (cp1_prop == LINE_BREAK_PROP_PR ||
346 cp1_prop == LINE_BREAK_PROP_PO)) {
347 continue;
350 /* LB25 (tailored with example 7) */
351 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
352 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
353 if (cp1_prop == LINE_BREAK_PROP_NU) {
354 continue;
357 /* this stupid rule is the reason why we cannot
358 * simply have a stateful break-detection between
359 * two adjacent codepoints as we have it with
360 * characters.
361 */
362 herodotus_reader_copy(r, &tmp);
363 herodotus_read_codepoint(&tmp, true, &cp);
364 if (herodotus_read_codepoint(&tmp, true, &cp) ==
365 HERODOTUS_STATUS_SUCCESS &&
366 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
367 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
368 cp1_prop == LINE_BREAK_PROP_HY)) {
369 if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
370 continue;
374 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
375 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
376 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
377 cp1_prop == LINE_BREAK_PROP_NU) {
378 continue;
380 if (lb25_level == 1 &&
381 (cp1_prop == LINE_BREAK_PROP_NU ||
382 cp1_prop == LINE_BREAK_PROP_SY ||
383 cp1_prop == LINE_BREAK_PROP_IS)) {
384 continue;
386 if ((lb25_level == 1 || lb25_level == 2) &&
387 (cp1_prop == LINE_BREAK_PROP_NU ||
388 cp1_prop == LINE_BREAK_PROP_SY ||
389 cp1_prop == LINE_BREAK_PROP_IS ||
390 cp1_prop == LINE_BREAK_PROP_CL ||
391 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
392 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
393 continue;
395 if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
396 (cp1_prop == LINE_BREAK_PROP_PO ||
397 cp1_prop == LINE_BREAK_PROP_PR)) {
398 continue;
401 /* LB26 */
402 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
403 (cp1_prop == LINE_BREAK_PROP_JL ||
404 cp1_prop == LINE_BREAK_PROP_JV ||
405 cp1_prop == LINE_BREAK_PROP_H2 ||
406 cp1_prop == LINE_BREAK_PROP_H3)) {
407 continue;
409 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
410 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
411 (cp1_prop == LINE_BREAK_PROP_JV ||
412 cp1_prop == LINE_BREAK_PROP_JT)) {
413 continue;
415 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
416 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
417 cp1_prop == LINE_BREAK_PROP_JT) {
418 continue;
421 /* LB27 */
422 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
423 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
424 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
425 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
426 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
427 cp1_prop == LINE_BREAK_PROP_PO) {
428 continue;
430 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
431 (cp1_prop == LINE_BREAK_PROP_JL ||
432 cp1_prop == LINE_BREAK_PROP_JV ||
433 cp1_prop == LINE_BREAK_PROP_JT ||
434 cp1_prop == LINE_BREAK_PROP_H2 ||
435 cp1_prop == LINE_BREAK_PROP_H3)) {
436 continue;
439 /* LB28 */
440 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
441 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
442 (cp1_prop == LINE_BREAK_PROP_AL ||
443 cp1_prop == LINE_BREAK_PROP_HL)) {
444 continue;
447 /* LB29 */
448 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
449 (cp1_prop == LINE_BREAK_PROP_AL ||
450 cp1_prop == LINE_BREAK_PROP_HL)) {
451 continue;
454 /* LB30 */
455 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
456 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
457 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
458 cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
459 continue;
461 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
462 (cp1_prop == LINE_BREAK_PROP_AL ||
463 cp1_prop == LINE_BREAK_PROP_HL ||
464 cp1_prop == LINE_BREAK_PROP_NU)) {
465 continue;
468 /* LB30a */
469 if (!ri_even &&
470 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
471 cp1_prop == LINE_BREAK_PROP_RI) {
472 continue;
475 /* LB30b */
476 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
477 cp1_prop == LINE_BREAK_PROP_EM) {
478 continue;
480 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
481 cp1_prop == LINE_BREAK_PROP_EM) {
482 continue;
485 /* LB31 */
486 break;
489 return herodotus_reader_number_read(r);
492 size_t
493 grapheme_next_line_break(const uint_least32_t *str, size_t len)
495 HERODOTUS_READER r;
497 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
499 return next_line_break(&r);
502 size_t
503 grapheme_next_line_break_utf8(const char *str, size_t len)
505 HERODOTUS_READER r;
507 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
509 return next_line_break(&r);