Blob


1 /* See LICENSE file for copyright and license details. */
2 #include <limits.h>
3 #include <stdbool.h>
4 #include <stddef.h>
6 #include "../gen/character.h"
7 #include "../grapheme.h"
8 #include "util.h"
10 struct character_break_state {
11 uint_least8_t prop;
12 bool prop_set;
13 bool gb11_flag;
14 bool gb12_13_flag;
15 };
17 static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = {
18 [CHAR_BREAK_PROP_OTHER] =
19 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
20 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
21 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
22 [CHAR_BREAK_PROP_CR] =
23 UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */
24 [CHAR_BREAK_PROP_EXTEND] =
25 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
26 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
27 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
28 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
29 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
30 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
31 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
32 [CHAR_BREAK_PROP_HANGUL_L] =
33 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */
34 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */
35 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */
36 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */
37 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
38 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
39 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
40 [CHAR_BREAK_PROP_HANGUL_V] =
41 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
42 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
43 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
44 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
45 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
46 [CHAR_BREAK_PROP_HANGUL_T] =
47 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
48 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
49 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
50 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
51 [CHAR_BREAK_PROP_HANGUL_LV] =
52 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */
53 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */
54 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
55 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
56 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
57 [CHAR_BREAK_PROP_HANGUL_LVT] =
58 UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */
59 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
60 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
61 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
62 [CHAR_BREAK_PROP_PREPEND] =
63 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
64 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
65 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */
66 (UINT16_C(0xFFFF) &
67 ~(UINT16_C(1) << CHAR_BREAK_PROP_CR |
68 UINT16_C(1) << CHAR_BREAK_PROP_LF |
69 UINT16_C(1) << CHAR_BREAK_PROP_CONTROL
70 )
71 ), /* GB9b */
72 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
73 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
74 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
75 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
76 [CHAR_BREAK_PROP_SPACINGMARK] =
77 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
78 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
79 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
80 [CHAR_BREAK_PROP_ZWJ] =
81 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */
82 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */
83 UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */
84 };
85 static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
86 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] =
87 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
88 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
89 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
90 UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
91 [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] =
92 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND |
93 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ,
94 [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] =
95 UINT16_C(1) << CHAR_BREAK_PROP_ZWJ |
96 UINT16_C(1) << CHAR_BREAK_PROP_EXTEND,
97 };
98 static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = {
99 [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] =
100 UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC,
101 };
102 static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
103 [CHAR_BREAK_PROP_REGIONAL_INDICATOR] =
104 UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
105 };
106 static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = {
107 [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] =
108 UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR,
109 };
111 static inline enum char_break_property
112 get_break_prop(uint_least32_t cp)
114 if (likely(cp <= UINT32_C(0x10FFFF))) {
115 return (enum char_break_property)
116 char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)];
117 } else {
118 return CHAR_BREAK_PROP_OTHER;
122 static inline void
123 state_serialize(const struct character_break_state *in, uint_least16_t *out)
125 *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */
126 (uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | /* 9th bit */
127 (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */
128 (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); /* 11th bit */
131 static inline void
132 state_deserialize(uint_least16_t in, struct character_break_state *out)
134 out->prop = in & UINT8_C(0xFF);
135 out->prop_set = in & (UINT16_C(1) << 8);
136 out->gb11_flag = in & (UINT16_C(1) << 9);
137 out->gb12_13_flag = in & (UINT16_C(1) << 10);
140 bool
141 grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s)
143 struct character_break_state state;
144 enum char_break_property cp0_prop, cp1_prop;
145 bool notbreak = false;
147 if (likely(s)) {
148 state_deserialize(*s, &state);
150 if (likely(state.prop_set)) {
151 cp0_prop = state.prop;
152 } else {
153 cp0_prop = get_break_prop(cp0);
155 cp1_prop = get_break_prop(cp1);
157 /* preserve prop of right codepoint for next iteration */
158 state.prop = (uint_least8_t)cp1_prop;
159 state.prop_set = true;
161 /* update flags */
162 state.gb11_flag =
163 flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS *
164 state.gb11_flag] &
165 UINT16_C(1) << cp1_prop;
166 state.gb12_13_flag =
167 flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS *
168 state.gb12_13_flag] &
169 UINT16_C(1) << cp1_prop;
171 /*
172 * Apply grapheme cluster breaking algorithm (UAX #29), see
173 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
174 */
175 notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
176 (dont_break_gb11[cp0_prop + state.gb11_flag *
177 NUM_CHAR_BREAK_PROPS] &
178 (UINT16_C(1) << cp1_prop)) ||
179 (dont_break_gb12_13[cp0_prop + state.gb12_13_flag *
180 NUM_CHAR_BREAK_PROPS] &
181 (UINT16_C(1) << cp1_prop));
183 /* update or reset flags (when we have a break) */
184 if (likely(!notbreak)) {
185 state.gb11_flag = state.gb12_13_flag = false;
188 state_serialize(&state, s);
189 } else {
190 cp0_prop = get_break_prop(cp0);
191 cp1_prop = get_break_prop(cp1);
193 /*
194 * Apply grapheme cluster breaking algorithm (UAX #29), see
195 * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules
197 * Given we have no state, this behaves as if the state-booleans
198 * were all set to false
199 */
200 notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
201 (dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) ||
202 (dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop));
205 return !notbreak;
208 static size_t
209 next_character_break(HERODOTUS_READER *r)
211 uint_least16_t state = 0;
212 uint_least32_t cp0 = 0, cp1 = 0;
214 for (herodotus_read_codepoint(r, true, &cp0);
215 herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS;
216 herodotus_read_codepoint(r, true, &cp0)) {
217 if (grapheme_is_character_break(cp0, cp1, &state)) {
218 break;
222 return herodotus_reader_number_read(r);
225 size_t
226 grapheme_next_character_break(const uint_least32_t *str, size_t len)
228 HERODOTUS_READER r;
230 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len);
232 return next_character_break(&r);
235 size_t
236 grapheme_next_character_break_utf8(const char *str, size_t len)
238 HERODOTUS_READER r;
240 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len);
242 return next_character_break(&r);