1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <limits.h>
3 3448adb0 2022-11-02 op #include <stdbool.h>
4 3448adb0 2022-11-02 op #include <stddef.h>
6 3448adb0 2022-11-02 op #include "../gen/character.h"
7 3448adb0 2022-11-02 op #include "../grapheme.h"
8 3448adb0 2022-11-02 op #include "util.h"
/*
 * Unpacked form of the opaque 16-bit state that callers carry between
 * successive grapheme_is_character_break() calls.
 */
struct character_break_state {
	uint_least8_t prop;  /* break property of the previous right-hand code point */
	bool prop_set;       /* true once prop holds a cached value */
	bool gb11_flag;      /* state bit for rule GB11, driven by flag_update_gb11 */
	bool gb12_13_flag;   /* state bit for rules GB12/GB13, driven by flag_update_gb12_13 */
};
17 3448adb0 2022-11-02 op static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
18 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_OTHER] =
19 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
20 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
21 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
22 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_CR] =
23 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */
24 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTEND] =
25 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
26 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
27 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
28 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
29 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
30 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
31 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
32 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_L] =
33 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
34 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
35 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
36 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
37 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
38 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
39 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
40 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_V] =
41 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
42 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
43 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
44 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
45 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
46 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_T] =
47 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
48 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
49 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
50 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
51 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_LV] =
52 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
53 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
54 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
55 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
56 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
57 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_HANGUL_LVT] =
58 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
59 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
60 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
61 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
62 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_PREPEND] =
63 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
64 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
65 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
66 3448adb0 2022-11-02 op (UINT16_C(0xFFFF) &
67 3448adb0 2022-11-02 op ~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
68 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_LF |
69 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_CONTROL
72 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
73 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
74 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
75 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
76 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_SPACINGMARK] =
77 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
78 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
79 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
80 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_ZWJ] =
81 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
82 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
83 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
85 3448adb0 2022-11-02 op static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
86 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
87 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
88 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
89 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
90 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
91 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
92 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
93 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
94 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
95 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
96 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
98 3448adb0 2022-11-02 op static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
99 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
100 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
102 3448adb0 2022-11-02 op static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
103 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
104 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
106 3448adb0 2022-11-02 op static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
107 3448adb0 2022-11-02 op [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
108 3448adb0 2022-11-02 op UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
111 3448adb0 2022-11-02 op static inline enum char_break_property
112 3448adb0 2022-11-02 op get_break_prop(uint_least32_t cp)
114 3448adb0 2022-11-02 op if (likely(cp <= UINT32_C(0x10FFFF))) {
115 3448adb0 2022-11-02 op return (enum char_break_property)
116 3448adb0 2022-11-02 op char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)];
118 3448adb0 2022-11-02 op return CHAR_BREAK_PROP_OTHER;
122 3448adb0 2022-11-02 op static inline void
123 3448adb0 2022-11-02 op state_serialize(const struct character_break_state *in, uint_least16_t *out)
125 3448adb0 2022-11-02 op *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
126 3448adb0 2022-11-02 op (uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | /* 9th bit */
127 3448adb0 2022-11-02 op (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */
128 3448adb0 2022-11-02 op (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); /* 11th bit */
131 3448adb0 2022-11-02 op static inline void
132 3448adb0 2022-11-02 op state_deserialize(uint_least16_t in, struct character_break_state *out)
134 3448adb0 2022-11-02 op out->prop = in & UINT8_C(0xFF);
135 3448adb0 2022-11-02 op out->prop_set = in & (UINT16_C(1) << 8);
136 3448adb0 2022-11-02 op out->gb11_flag = in & (UINT16_C(1) << 9);
137 3448adb0 2022-11-02 op out->gb12_13_flag = in & (UINT16_C(1) << 10);
141 3448adb0 2022-11-02 op grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
143 3448adb0 2022-11-02 op struct character_break_state state;
144 3448adb0 2022-11-02 op enum char_break_property cp0_prop, cp1_prop;
145 3448adb0 2022-11-02 op bool notbreak = false;
147 3448adb0 2022-11-02 op if (likely(s)) {
148 3448adb0 2022-11-02 op state_deserialize(*s, &state);
150 3448adb0 2022-11-02 op if (likely(state.prop_set)) {
151 3448adb0 2022-11-02 op cp0_prop = state.prop;
153 3448adb0 2022-11-02 op cp0_prop = get_break_prop(cp0);
155 3448adb0 2022-11-02 op cp1_prop = get_break_prop(cp1);
157 3448adb0 2022-11-02 op /* preserve prop of right codepoint for next iteration */
158 3448adb0 2022-11-02 op state.prop = (uint_least8_t)cp1_prop;
159 3448adb0 2022-11-02 op state.prop_set = true;
161 3448adb0 2022-11-02 op /* update flags */
162 3448adb0 2022-11-02 op state.gb11_flag =
163 3448adb0 2022-11-02 op flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
164 3448adb0 2022-11-02 op state.gb11_flag] &
165 3448adb0 2022-11-02 op UINT16_C(1) << cp1_prop;
166 3448adb0 2022-11-02 op state.gb12_13_flag =
167 3448adb0 2022-11-02 op flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
168 3448adb0 2022-11-02 op state.gb12_13_flag] &
169 3448adb0 2022-11-02 op UINT16_C(1) << cp1_prop;
172 3448adb0 2022-11-02 op * Apply grapheme cluster breaking algorithm (UAX #29), see
173 3448adb0 2022-11-02 op * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
175 3448adb0 2022-11-02 op notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
176 3448adb0 2022-11-02 op (dont_break_gb11[cp0_prop + state.gb11_flag *
177 3448adb0 2022-11-02 op NUM_CHAR_BREAK_PROPS] &
178 3448adb0 2022-11-02 op (UINT16_C(1) << cp1_prop)) ||
179 3448adb0 2022-11-02 op (dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
180 3448adb0 2022-11-02 op NUM_CHAR_BREAK_PROPS] &
181 3448adb0 2022-11-02 op (UINT16_C(1) << cp1_prop));
183 3448adb0 2022-11-02 op /* update or reset flags (when we have a break) */
184 3448adb0 2022-11-02 op if (likely(!notbreak)) {
185 3448adb0 2022-11-02 op state.gb11_flag = state.gb12_13_flag = false;
188 3448adb0 2022-11-02 op state_serialize(&state, s);
190 3448adb0 2022-11-02 op cp0_prop = get_break_prop(cp0);
191 3448adb0 2022-11-02 op cp1_prop = get_break_prop(cp1);
194 3448adb0 2022-11-02 op * Apply grapheme cluster breaking algorithm (UAX #29), see
195 3448adb0 2022-11-02 op * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
197 3448adb0 2022-11-02 op * Given we have no state, this behaves as if the state-booleans
198 3448adb0 2022-11-02 op * were all set to false
200 3448adb0 2022-11-02 op notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
201 3448adb0 2022-11-02 op (dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
202 3448adb0 2022-11-02 op (dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop));
205 3448adb0 2022-11-02 op return !notbreak;
208 3448adb0 2022-11-02 op static size_t
209 3448adb0 2022-11-02 op next_character_break(HERODOTUS_READER *r)
211 3448adb0 2022-11-02 op uint_least16_t state = 0;
212 3448adb0 2022-11-02 op uint_least32_t cp0 = 0, cp1 = 0;
214 3448adb0 2022-11-02 op for (herodotus_read_codepoint(r, true, &cp0);
215 3448adb0 2022-11-02 op herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
216 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp0)) {
217 3448adb0 2022-11-02 op if (grapheme_is_character_break(cp0, cp1, &state)) {
222 3448adb0 2022-11-02 op return herodotus_reader_number_read(r);
226 3448adb0 2022-11-02 op grapheme_next_character_break(const uint_least32_t *str, size_t len)
228 3448adb0 2022-11-02 op HERODOTUS_READER r;
230 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
232 3448adb0 2022-11-02 op return next_character_break(&r);
236 3448adb0 2022-11-02 op grapheme_next_character_break_utf8(const char *str, size_t len)
238 3448adb0 2022-11-02 op HERODOTUS_READER r;
240 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
242 3448adb0 2022-11-02 op return next_character_break(&r);