1 /* See LICENSE file for copyright and license details. */
5 #include "../gen/line.h"
6 #include "../grapheme.h"
/*
 * Map the codepoint cp to its line break property.
 * Codepoints up to 0x10FFFF are resolved through the two tables
 * line_break_major and line_break_minor; any larger value yields
 * LINE_BREAK_PROP_AL.
 * NOTE(review): the braces and the else line of this function are
 * not visible in this listing.
 */
9 static inline enum line_break_property
10 get_break_prop(uint_least32_t cp)
/* NOTE(review): likely() is presumably a branch hint macro defined elsewhere; confirm */
12 if (likely(cp <= UINT32_C(0x10FFFF))) {
13 return (enum line_break_property)
/* two-stage lookup: cp >> 8 selects an offset in the major table, the low 8 bits are added to index the minor table */
14 line_break_minor[line_break_major[cp >> 8] + (cp & 0xff)];
/* fallback for values above 0x10FFFF */
16 return LINE_BREAK_PROP_AL;
/*
 * Core of the line break search: walks the codepoints delivered by
 * the reader r pairwise (cp0 on the left, cp1 on the right of the
 * candidate break position) and tests the UAX #14 rules, in rule
 * order, on each pair. The value handed back to the caller is
 * herodotus_reader_number_read(r).
 *
 * State carried from one pair to the next:
 *  - last_non_cm_or_zwj_prop: most recent left-hand property that is
 *    neither CM nor ZWJ; starts as AL
 *  - last_non_sp_prop: most recent left-hand property that is not SP
 *  - last_non_sp_cm_or_zwj_prop: most recent value of
 *    last_non_cm_or_zwj_prop that is not SP
 *  - lb21a_flag: true when the value replaced in
 *    last_non_cm_or_zwj_prop was HL
 *  - ri_even, lb25_level: regional indicator parity and number
 *    sequence level; their assignments are not visible here
 *
 * NOTE(review): this listing is incomplete. The return type line,
 * the declarations of cp and tmp, all closing braces, the opening
 * and closing markers of the block comments and the statement
 * executed by each rule (presumably break or continue) are not
 * visible. The rule labels added below are derived from the
 * visible conditions and from UAX #14; confirm them against the
 * full file.
 */
21 next_line_break(HERODOTUS_READER *r)
24 enum line_break_property cp0_prop, cp1_prop, last_non_cm_or_zwj_prop,
25 last_non_sp_prop, last_non_sp_cm_or_zwj_prop;
27 uint_least8_t lb25_level = 0;
28 bool lb21a_flag = false, ri_even = true;
31 * Apply line breaking algorithm (UAX #14), see
32 * https://unicode.org/reports/tr14/#Algorithm and tailoring
33 * https://unicode.org/reports/tr14/#Examples (example 7),
34 * given the automatic test-cases implement this example for
35 * better number handling.
40 * Initialize the different properties such that we have
41 * a good state after the state-update in the loop
43 last_non_cm_or_zwj_prop = LINE_BREAK_PROP_AL; /* according to LB10 */
/* NUM_LINE_BREAK_PROPS is used as a start value that equals none of the properties compared against below */
44 last_non_sp_prop = last_non_sp_cm_or_zwj_prop = NUM_LINE_BREAK_PROPS;
/*
 * Loop shape: the init clause reads one codepoint (flag true) and
 * makes it cp0; the condition reads the right-hand codepoint with
 * flag false and ends the loop when that read does not succeed;
 * the step clause reads again with flag true and shifts cp1 into
 * cp0.
 * NOTE(review): presumably flag false reads without advancing the
 * reader; confirm in the reader implementation.
 */
46 for (herodotus_read_codepoint(r, true, &cp), cp0_prop = get_break_prop(cp);
47 herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;
48 herodotus_read_codepoint(r, true, &cp), cp0_prop = cp1_prop) {
49 /* get property of the right codepoint */
50 cp1_prop = get_break_prop(cp);
52 /* update retention-states */
55 * store the last observed non-CM-or-ZWJ-property for
/* everything down to the assignment of last_non_cm_or_zwj_prop only applies when cp0 is neither CM nor ZWJ */
58 if (cp0_prop != LINE_BREAK_PROP_CM &&
59 cp0_prop != LINE_BREAK_PROP_ZWJ) {
61 * check if the property we are overwriting now is an
62 * HL. If so, we set the LB21a-flag which depends on this
/* evaluated before last_non_cm_or_zwj_prop is overwritten further down */
65 lb21a_flag = (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL);
67 /* check regional indicator state */
/* NOTE(review): the statements that update ri_even in both branches are not visible in this listing */
68 if (cp0_prop == LINE_BREAK_PROP_RI) {
70 * The property we just shifted in is
71 * a regional indicator, increasing the
72 * number of consecutive RIs on the left
73 * side of the breakpoint by one, changing
80 * We saw no regional indicator, so the
81 * number of consecutive RIs on the left
82 * side of the breakpoint is zero, which
90 * Here comes a bit of magic. The tailored rule
91 * LB25 (using example 7) has a very complicated
92 * left-hand-side-rule of the form
94 * NU (NU | SY | IS)* (CL | CP)?
96 * but instead of backtracking, we keep the state
97 * as some kind of "power level" in the variable
101 * that goes from 0 to 3
103 * 0: we are not in the sequence
104 * 1: we have one NU to the left of the middle
106 * 2: we have one NU and one or more (NU | SY | IS)
107 * to the left of the middle spot
108 * 3: we have one NU, zero or more (NU | SY | IS)
109 * and one (CL | CP) to the left of the middle
/*
 * lb25_level state machine, driven by cp0.
 * NOTE(review): one line of the first condition (between the two
 * lines that follow) and every assignment to lb25_level are
 * missing from this listing.
 */
112 if ((lb25_level == 0 ||
114 cp0_prop == LINE_BREAK_PROP_NU) {
115 /* sequence has begun */
117 } else if ((lb25_level == 1 || lb25_level == 2) &&
118 (cp0_prop == LINE_BREAK_PROP_NU ||
119 cp0_prop == LINE_BREAK_PROP_SY ||
120 cp0_prop == LINE_BREAK_PROP_IS)) {
121 /* (NU | SY | IS) sequence begins or continued */
123 } else if ((lb25_level == 1 || lb25_level == 2) &&
124 (cp0_prop == LINE_BREAK_PROP_CL ||
125 cp0_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
126 cp0_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
127 /* CL or CP at the end of the sequence */
134 last_non_cm_or_zwj_prop = cp0_prop;
138 * store the last observed non-SP-property for LB8, LB14,
139 * LB15, LB16 and LB17. LB8 gets its own unskipped property,
140 * whereas the others build on top of the CM-ZWJ-skipped
141 * properties as they come after LB9
143 if (cp0_prop != LINE_BREAK_PROP_SP) {
144 last_non_sp_prop = cp0_prop;
146 if (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP) {
147 last_non_sp_cm_or_zwj_prop = last_non_cm_or_zwj_prop;
150 /* apply the algorithm */
/* LB4: BK on the left (spec: mandatory break) */
153 if (cp0_prop == LINE_BREAK_PROP_BK) {
/* LB5, first part: CR followed by LF (spec: no break) */
158 if (cp0_prop == LINE_BREAK_PROP_CR &&
159 cp1_prop == LINE_BREAK_PROP_LF) {
/* LB5, second part: CR, LF or NL on the left (spec: mandatory break) */
162 if (cp0_prop == LINE_BREAK_PROP_CR ||
163 cp0_prop == LINE_BREAK_PROP_LF ||
164 cp0_prop == LINE_BREAK_PROP_NL) {
/* LB6: hard line break character on the right (spec: no break) */
169 if (cp1_prop == LINE_BREAK_PROP_BK ||
170 cp1_prop == LINE_BREAK_PROP_CR ||
171 cp1_prop == LINE_BREAK_PROP_LF ||
172 cp1_prop == LINE_BREAK_PROP_NL) {
/* LB7: SP or ZW on the right (spec: no break) */
177 if (cp1_prop == LINE_BREAK_PROP_SP ||
178 cp1_prop == LINE_BREAK_PROP_ZW) {
/* LB8: ZW, optionally followed by spaces, on the left (spec: break); uses the state that does not skip CM and ZWJ */
183 if (last_non_sp_prop == LINE_BREAK_PROP_ZW) {
/* LB8a: ZWJ on the left (spec: no break) */
188 if (cp0_prop == LINE_BREAK_PROP_ZWJ) {
/* LB9: CM or ZWJ on the right attaches to a left side that is not BK, CR, LF, NL, SP or ZW */
193 if ((cp0_prop != LINE_BREAK_PROP_BK &&
194 cp0_prop != LINE_BREAK_PROP_CR &&
195 cp0_prop != LINE_BREAK_PROP_LF &&
196 cp0_prop != LINE_BREAK_PROP_NL &&
197 cp0_prop != LINE_BREAK_PROP_SP &&
198 cp0_prop != LINE_BREAK_PROP_ZW) &&
199 (cp1_prop == LINE_BREAK_PROP_CM ||
200 cp1_prop == LINE_BREAK_PROP_ZWJ)) {
202 * given we skip them, we don't break in such
208 /* LB10 is baked into the following rules */
/* from here on the left side is taken from the CM and ZWJ skipping states instead of cp0_prop */
/* LB11: WJ on either side (spec: no break) */
211 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_WJ ||
212 cp1_prop == LINE_BREAK_PROP_WJ) {
/* LB12: GL on the left (spec: no break) */
217 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_GL) {
/* LB12a: GL on the right unless the left side is SP, BA or HY (spec: no break) */
222 if ((last_non_cm_or_zwj_prop != LINE_BREAK_PROP_SP &&
223 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_BA &&
224 last_non_cm_or_zwj_prop != LINE_BREAK_PROP_HY) &&
225 cp1_prop == LINE_BREAK_PROP_GL) {
229 /* LB13 (affected by tailoring for LB25, see example 7) */
/* EX on the right always matches; CL, CP, IS and SY on the right only match when the left side is not NU */
230 if (cp1_prop == LINE_BREAK_PROP_EX ||
231 (last_non_cm_or_zwj_prop != LINE_BREAK_PROP_NU &&
232 (cp1_prop == LINE_BREAK_PROP_CL ||
233 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
234 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF ||
235 cp1_prop == LINE_BREAK_PROP_IS ||
236 cp1_prop == LINE_BREAK_PROP_SY))) {
/* LB14: OP, optionally followed by spaces, on the left (spec: no break) */
241 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
242 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF) {
/* LB15: QU, optional spaces, then OP on the right (spec: no break) */
247 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_QU &&
248 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
249 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF)) {
/* LB16: CL or CP, optional spaces, then NS on the right (spec: no break) */
254 if ((last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CL ||
255 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
256 last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF) &&
257 cp1_prop == LINE_BREAK_PROP_NS) {
/* LB17: B2, optional spaces, then B2 on the right (spec: no break) */
262 if (last_non_sp_cm_or_zwj_prop == LINE_BREAK_PROP_B2 &&
263 cp1_prop == LINE_BREAK_PROP_B2) {
/* LB18: SP on the left (spec: break) */
268 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SP) {
/* LB19: QU on either side (spec: no break) */
273 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_QU ||
274 cp1_prop == LINE_BREAK_PROP_QU) {
/* LB20: CB on either side (spec: break) */
279 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CB ||
280 cp1_prop == LINE_BREAK_PROP_CB) {
/* LB21: BA, HY or NS on the right, or BB on the left (spec: no break) */
285 if (cp1_prop == LINE_BREAK_PROP_BA ||
286 cp1_prop == LINE_BREAK_PROP_HY ||
287 cp1_prop == LINE_BREAK_PROP_NS ||
288 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BB) {
/*
 * LB21a: HY or BA on the left, preceded by HL.
 * NOTE(review): the first line of this condition is missing from
 * the listing; presumably it tests lb21a_flag. Confirm.
 */
294 (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY ||
295 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BA)) {
/* LB21b: SY followed by HL (spec: no break) */
300 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_SY &&
301 cp1_prop == LINE_BREAK_PROP_HL) {
/* LB22: IN on the right (spec: no break) */
306 if (cp1_prop == LINE_BREAK_PROP_IN) {
/* LB23: AL or HL next to NU, in either order (spec: no break) */
311 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
312 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
313 cp1_prop == LINE_BREAK_PROP_NU) {
316 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU &&
317 (cp1_prop == LINE_BREAK_PROP_AL ||
318 cp1_prop == LINE_BREAK_PROP_HL)) {
/* LB23a: PR before ID, EB or EM, and ID, EB or EM before PO (spec: no break) */
323 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
324 (cp1_prop == LINE_BREAK_PROP_ID ||
325 cp1_prop == LINE_BREAK_PROP_EB ||
326 cp1_prop == LINE_BREAK_PROP_EM)) {
329 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_ID ||
330 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB ||
331 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EM) &&
332 cp1_prop == LINE_BREAK_PROP_PO) {
/* LB24: PR or PO next to AL or HL, in either order (spec: no break) */
337 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
338 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO) &&
339 (cp1_prop == LINE_BREAK_PROP_AL ||
340 cp1_prop == LINE_BREAK_PROP_HL)) {
343 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
344 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
345 (cp1_prop == LINE_BREAK_PROP_PR ||
346 cp1_prop == LINE_BREAK_PROP_PO)) {
350 /* LB25 (tailored with example 7) */
/* left side PR or PO: either NU follows directly, or (OP | HY) follows and the codepoint after it is NU */
351 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR ||
352 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PO)) {
353 if (cp1_prop == LINE_BREAK_PROP_NU) {
357 /* this stupid rule is the reason why we cannot
358 * simply have a stateful break-detection between
359 * two adjacent codepoints as we have it with
// Lookahead on a copy of the reader so that r itself is not moved:
// the first read result on tmp is overwritten by the second read,
// whose codepoint is the one that get_break_prop() inspects below.
362 herodotus_reader_copy(r, &tmp);
363 herodotus_read_codepoint(&tmp, true, &cp);
364 if (herodotus_read_codepoint(&tmp, true, &cp) ==
365 HERODOTUS_STATUS_SUCCESS &&
366 (cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
367 cp1_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
368 cp1_prop == LINE_BREAK_PROP_HY)) {
369 if (get_break_prop(cp) == LINE_BREAK_PROP_NU) {
// LB25: OP or HY on the left, NU on the right
374 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF ||
375 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_OP_WITH_EAW_HWF ||
376 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HY) &&
377 cp1_prop == LINE_BREAK_PROP_NU) {
// LB25: a single NU on the left (level 1), NU, SY or IS on the right
380 if (lb25_level == 1 &&
381 (cp1_prop == LINE_BREAK_PROP_NU ||
382 cp1_prop == LINE_BREAK_PROP_SY ||
383 cp1_prop == LINE_BREAK_PROP_IS)) {
// LB25: inside the number sequence (level 1 or 2), the sequence continues or is closed by CL or CP on the right
386 if ((lb25_level == 1 || lb25_level == 2) &&
387 (cp1_prop == LINE_BREAK_PROP_NU ||
388 cp1_prop == LINE_BREAK_PROP_SY ||
389 cp1_prop == LINE_BREAK_PROP_IS ||
390 cp1_prop == LINE_BREAK_PROP_CL ||
391 cp1_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF ||
392 cp1_prop == LINE_BREAK_PROP_CP_WITH_EAW_HWF)) {
// LB25: any state of the number sequence (level 1 to 3), PO or PR on the right
395 if ((lb25_level == 1 || lb25_level == 2 || lb25_level == 3) &&
396 (cp1_prop == LINE_BREAK_PROP_PO ||
397 cp1_prop == LINE_BREAK_PROP_PR)) {
// LB26: Korean syllable blocks, JL then (JL | JV | H2 | H3), (JV | H2) then (JV | JT), (JT | H3) then JT
402 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL &&
403 (cp1_prop == LINE_BREAK_PROP_JL ||
404 cp1_prop == LINE_BREAK_PROP_JV ||
405 cp1_prop == LINE_BREAK_PROP_H2 ||
406 cp1_prop == LINE_BREAK_PROP_H3)) {
409 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
410 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2) &&
411 (cp1_prop == LINE_BREAK_PROP_JV ||
412 cp1_prop == LINE_BREAK_PROP_JT)) {
415 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
416 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
417 cp1_prop == LINE_BREAK_PROP_JT) {
// LB27: Korean syllable block before PO, and PR before a Korean syllable block
422 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JL ||
423 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JV ||
424 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_JT ||
425 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H2 ||
426 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_H3) &&
427 cp1_prop == LINE_BREAK_PROP_PO) {
430 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_PR &&
431 (cp1_prop == LINE_BREAK_PROP_JL ||
432 cp1_prop == LINE_BREAK_PROP_JV ||
433 cp1_prop == LINE_BREAK_PROP_JT ||
434 cp1_prop == LINE_BREAK_PROP_H2 ||
435 cp1_prop == LINE_BREAK_PROP_H3)) {
// LB28: AL or HL on both sides
440 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
441 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL) &&
442 (cp1_prop == LINE_BREAK_PROP_AL ||
443 cp1_prop == LINE_BREAK_PROP_HL)) {
// LB29: IS on the left, AL or HL on the right
448 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_IS &&
449 (cp1_prop == LINE_BREAK_PROP_AL ||
450 cp1_prop == LINE_BREAK_PROP_HL)) {
// LB30: AL, HL or NU before OP, and CP before AL, HL or NU; only the WITHOUT_EAW_HWF variants of OP and CP are tested
455 if ((last_non_cm_or_zwj_prop == LINE_BREAK_PROP_AL ||
456 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_HL ||
457 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_NU) &&
458 cp1_prop == LINE_BREAK_PROP_OP_WITHOUT_EAW_HWF) {
461 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_CP_WITHOUT_EAW_HWF &&
462 (cp1_prop == LINE_BREAK_PROP_AL ||
463 cp1_prop == LINE_BREAK_PROP_HL ||
464 cp1_prop == LINE_BREAK_PROP_NU)) {
// LB30a: RI on both sides.
// NOTE(review): the first line of this condition is missing from
// the listing; presumably it tests ri_even. Confirm.
470 last_non_cm_or_zwj_prop == LINE_BREAK_PROP_RI &&
471 cp1_prop == LINE_BREAK_PROP_RI) {
// LB30b: EB on the left or the combined CN and ExtPict class on the left, EM on the right
476 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_EB &&
477 cp1_prop == LINE_BREAK_PROP_EM) {
480 if (last_non_cm_or_zwj_prop == LINE_BREAK_PROP_BOTH_CN_EXTPICT &&
481 cp1_prop == LINE_BREAK_PROP_EM) {
// Result for the caller: whatever the reader reports through herodotus_reader_number_read().
// NOTE(review): presumably the count of units consumed up to the break; confirm in the reader.
489 return herodotus_reader_number_read(r);
/*
 * Entry point for input given as an array of codepoints: sets up
 * a reader over str and len with HERODOTUS_TYPE_CODEPOINT and
 * returns the result of next_line_break() on that reader.
 * NOTE(review): the return type line, the braces and the
 * declaration of r are not visible in this listing.
 */
493 grapheme_next_line_break(const uint_least32_t *str, size_t len)
497 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
499 return next_line_break(&r);
/*
 * Entry point for input given as a char buffer: sets up a reader
 * over str and len with HERODOTUS_TYPE_UTF8 and returns the result
 * of next_line_break() on that reader.
 * NOTE(review): the return type line, the braces and the
 * declaration of r are not visible in this listing.
 */
503 grapheme_next_line_break_utf8(const char *str, size_t len)
507 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
509 return next_line_break(&r);