Blame


1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stddef.h>
3 3448adb0 2022-11-02 op #include <stdint.h>
4 3448adb0 2022-11-02 op
5 3448adb0 2022-11-02 op #include "../grapheme.h"
6 3448adb0 2022-11-02 op #include "../gen/case.h"
7 3448adb0 2022-11-02 op #include "util.h"
8 3448adb0 2022-11-02 op
9 3448adb0 2022-11-02 op static inline enum case_property
10 3448adb0 2022-11-02 op get_case_property(uint_least32_t cp)
11 3448adb0 2022-11-02 op {
12 3448adb0 2022-11-02 op if (likely(cp <= UINT32_C(0x10FFFF))) {
13 3448adb0 2022-11-02 op return (enum case_property)
14 3448adb0 2022-11-02 op case_minor[case_major[cp >> 8] + (cp & 0xFF)];
15 3448adb0 2022-11-02 op } else {
16 3448adb0 2022-11-02 op return CASE_PROP_OTHER;
17 3448adb0 2022-11-02 op }
18 3448adb0 2022-11-02 op }
19 3448adb0 2022-11-02 op
/*
 * Return the mapping entry for codepoint cp from the two-stage table
 * given by major/minor (same indexing scheme as get_case_property()).
 * Values above U+10FFFF are never looked up and yield 0.
 *
 * The returned entry may be greater than or equal to 0x110000; callers
 * treat such a value as a reference into a special-case table rather
 * than as an offset and must handle it separately.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	if (unlikely(cp > UINT32_C(0x10FFFF))) {
		return 0;
	}

	return minor[major[cp >> 8] + (cp & 0xFF)];
}
35 3448adb0 2022-11-02 op
36 3448adb0 2022-11-02 op static inline size_t
37 3448adb0 2022-11-02 op to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
38 3448adb0 2022-11-02 op uint_least8_t final_sigma_level, const uint_least16_t *major,
39 3448adb0 2022-11-02 op const int_least32_t *minor, const struct special_case *sc)
40 3448adb0 2022-11-02 op {
41 3448adb0 2022-11-02 op HERODOTUS_READER tmp;
42 3448adb0 2022-11-02 op enum case_property prop;
43 3448adb0 2022-11-02 op enum herodotus_status s;
44 3448adb0 2022-11-02 op size_t off, i;
45 3448adb0 2022-11-02 op uint_least32_t cp, tmp_cp;
46 3448adb0 2022-11-02 op int_least32_t map;
47 3448adb0 2022-11-02 op
48 3448adb0 2022-11-02 op for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
49 3448adb0 2022-11-02 op if (sc == lower_special) {
50 3448adb0 2022-11-02 op /*
51 3448adb0 2022-11-02 op * For the special Final_Sigma-rule (see SpecialCasing.txt),
52 3448adb0 2022-11-02 op * which is the only non-localized case-dependent rule,
53 3448adb0 2022-11-02 op * we apply a different mapping when a sigma is at the
54 3448adb0 2022-11-02 op * end of a word.
55 3448adb0 2022-11-02 op *
56 3448adb0 2022-11-02 op * Before: cased case-ignorable*
57 3448adb0 2022-11-02 op * After: not(case-ignorable* cased)
58 3448adb0 2022-11-02 op *
59 3448adb0 2022-11-02 op * We check the after-condition on demand, but the before-
60 3448adb0 2022-11-02 op * condition is best checked using the "level"-heuristic
61 3448adb0 2022-11-02 op * also used in the sentence and line breaking-implementations.
62 3448adb0 2022-11-02 op */
63 3448adb0 2022-11-02 op if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER SIGMA */
64 3448adb0 2022-11-02 op (final_sigma_level == 1 ||
65 3448adb0 2022-11-02 op final_sigma_level == 2)) {
66 3448adb0 2022-11-02 op /*
67 3448adb0 2022-11-02 op * check succeeding characters by first skipping
68 3448adb0 2022-11-02 op * all case-ignorable characters and then checking
69 3448adb0 2022-11-02 op * if the succeeding character is cased, invalidating
70 3448adb0 2022-11-02 op * the after-condition
71 3448adb0 2022-11-02 op */
72 3448adb0 2022-11-02 op herodotus_reader_copy(r, &tmp);
73 3448adb0 2022-11-02 op for (prop = NUM_CASE_PROPS;
74 3448adb0 2022-11-02 op (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
75 3448adb0 2022-11-02 op HERODOTUS_STATUS_SUCCESS; ) {
76 3448adb0 2022-11-02 op prop = get_case_property(tmp_cp);
77 3448adb0 2022-11-02 op
78 3448adb0 2022-11-02 op if (prop != CASE_PROP_CASE_IGNORABLE &&
79 3448adb0 2022-11-02 op prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
80 3448adb0 2022-11-02 op break;
81 3448adb0 2022-11-02 op }
82 3448adb0 2022-11-02 op }
83 3448adb0 2022-11-02 op
84 3448adb0 2022-11-02 op /*
85 3448adb0 2022-11-02 op * Now prop is something other than case-ignorable or
86 3448adb0 2022-11-02 op * the source-string ended.
87 3448adb0 2022-11-02 op * If it is something other than cased, we know
88 3448adb0 2022-11-02 op * that the after-condition holds
89 3448adb0 2022-11-02 op */
90 3448adb0 2022-11-02 op if (s != HERODOTUS_STATUS_SUCCESS ||
91 3448adb0 2022-11-02 op (prop != CASE_PROP_CASED &&
92 3448adb0 2022-11-02 op prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
93 3448adb0 2022-11-02 op /*
94 3448adb0 2022-11-02 op * write GREEK SMALL LETTER FINAL SIGMA to
95 3448adb0 2022-11-02 op * destination
96 3448adb0 2022-11-02 op */
97 3448adb0 2022-11-02 op herodotus_write_codepoint(w, UINT32_C(0x03C2));
98 3448adb0 2022-11-02 op
99 3448adb0 2022-11-02 op /* reset Final_Sigma-state and continue */
100 3448adb0 2022-11-02 op final_sigma_level = 0;
101 3448adb0 2022-11-02 op continue;
102 3448adb0 2022-11-02 op }
103 3448adb0 2022-11-02 op }
104 3448adb0 2022-11-02 op
105 3448adb0 2022-11-02 op /* update state */
106 3448adb0 2022-11-02 op prop = get_case_property(cp);
107 3448adb0 2022-11-02 op if ((final_sigma_level == 0 ||
108 3448adb0 2022-11-02 op final_sigma_level == 1) &&
109 3448adb0 2022-11-02 op (prop == CASE_PROP_CASED ||
110 3448adb0 2022-11-02 op prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
111 3448adb0 2022-11-02 op /* sequence has begun */
112 3448adb0 2022-11-02 op final_sigma_level = 1;
113 3448adb0 2022-11-02 op } else if ((final_sigma_level == 1 ||
114 3448adb0 2022-11-02 op final_sigma_level == 2) &&
115 3448adb0 2022-11-02 op (prop == CASE_PROP_CASE_IGNORABLE ||
116 3448adb0 2022-11-02 op prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
117 3448adb0 2022-11-02 op /* case-ignorable sequence begins or continued */
118 3448adb0 2022-11-02 op final_sigma_level = 2;
119 3448adb0 2022-11-02 op } else {
120 3448adb0 2022-11-02 op /* sequence broke */
121 3448adb0 2022-11-02 op final_sigma_level = 0;
122 3448adb0 2022-11-02 op }
123 3448adb0 2022-11-02 op }
124 3448adb0 2022-11-02 op
125 3448adb0 2022-11-02 op /* get and handle case mapping */
126 3448adb0 2022-11-02 op if (unlikely((map = get_case_offset(cp, major, minor)) >=
127 3448adb0 2022-11-02 op INT32_C(0x110000))) {
128 3448adb0 2022-11-02 op /* we have a special case and the offset in the sc-array
129 3448adb0 2022-11-02 op * is the difference to 0x110000*/
130 3448adb0 2022-11-02 op off = (uint_least32_t)map - UINT32_C(0x110000);
131 3448adb0 2022-11-02 op
132 3448adb0 2022-11-02 op for (i = 0; i < sc[off].cplen; i++) {
133 3448adb0 2022-11-02 op herodotus_write_codepoint(w, sc[off].cp[i]);
134 3448adb0 2022-11-02 op }
135 3448adb0 2022-11-02 op } else {
136 3448adb0 2022-11-02 op /* we have a simple mapping */
137 3448adb0 2022-11-02 op herodotus_write_codepoint(w, (uint_least32_t)
138 3448adb0 2022-11-02 op ((int_least32_t)cp + map));
139 3448adb0 2022-11-02 op }
140 3448adb0 2022-11-02 op }
141 3448adb0 2022-11-02 op
142 3448adb0 2022-11-02 op herodotus_writer_nul_terminate(w);
143 3448adb0 2022-11-02 op
144 3448adb0 2022-11-02 op return herodotus_writer_number_written(w);
145 3448adb0 2022-11-02 op }
146 3448adb0 2022-11-02 op
147 3448adb0 2022-11-02 op static size_t
148 3448adb0 2022-11-02 op herodotus_next_word_break(const HERODOTUS_READER *r)
149 3448adb0 2022-11-02 op {
150 3448adb0 2022-11-02 op HERODOTUS_READER tmp;
151 3448adb0 2022-11-02 op
152 3448adb0 2022-11-02 op herodotus_reader_copy(r, &tmp);
153 3448adb0 2022-11-02 op
154 3448adb0 2022-11-02 op if (r->type == HERODOTUS_TYPE_CODEPOINT) {
155 3448adb0 2022-11-02 op return grapheme_next_word_break(tmp.src, tmp.srclen);
156 3448adb0 2022-11-02 op } else { /* r->type == HERODOTUS_TYPE_UTF8 */
157 3448adb0 2022-11-02 op return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
158 3448adb0 2022-11-02 op }
159 3448adb0 2022-11-02 op }
160 3448adb0 2022-11-02 op
161 3448adb0 2022-11-02 op static inline size_t
162 3448adb0 2022-11-02 op to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
163 3448adb0 2022-11-02 op {
164 3448adb0 2022-11-02 op enum case_property prop;
165 3448adb0 2022-11-02 op enum herodotus_status s;
166 3448adb0 2022-11-02 op uint_least32_t cp;
167 3448adb0 2022-11-02 op size_t nwb;
168 3448adb0 2022-11-02 op
169 3448adb0 2022-11-02 op for (; (nwb = herodotus_next_word_break(r)) > 0;) {
170 3448adb0 2022-11-02 op herodotus_reader_push_advance_limit(r, nwb);
171 3448adb0 2022-11-02 op for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
172 3448adb0 2022-11-02 op /* check if we have a cased character */
173 3448adb0 2022-11-02 op prop = get_case_property(cp);
174 3448adb0 2022-11-02 op if (prop == CASE_PROP_CASED ||
175 3448adb0 2022-11-02 op prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
176 3448adb0 2022-11-02 op break;
177 3448adb0 2022-11-02 op } else {
178 3448adb0 2022-11-02 op /* write the data to the output verbatim, it if permits */
179 3448adb0 2022-11-02 op herodotus_write_codepoint(w, cp);
180 3448adb0 2022-11-02 op
181 3448adb0 2022-11-02 op /* increment reader */
182 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp);
183 3448adb0 2022-11-02 op }
184 3448adb0 2022-11-02 op }
185 3448adb0 2022-11-02 op
186 3448adb0 2022-11-02 op if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
187 3448adb0 2022-11-02 op /* we are done */
188 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
189 3448adb0 2022-11-02 op break;
190 3448adb0 2022-11-02 op } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
191 3448adb0 2022-11-02 op /*
192 3448adb0 2022-11-02 op * we did not encounter any cased character
193 3448adb0 2022-11-02 op * up to the word break
194 3448adb0 2022-11-02 op */
195 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
196 3448adb0 2022-11-02 op continue;
197 3448adb0 2022-11-02 op } else {
198 3448adb0 2022-11-02 op /*
199 3448adb0 2022-11-02 op * we encountered a cased character before the word
200 3448adb0 2022-11-02 op * break, convert it to titlecase
201 3448adb0 2022-11-02 op */
202 3448adb0 2022-11-02 op herodotus_reader_push_advance_limit(r,
203 3448adb0 2022-11-02 op herodotus_reader_next_codepoint_break(r));
204 3448adb0 2022-11-02 op to_case(r, w, 0, title_major, title_minor, title_special);
205 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
206 3448adb0 2022-11-02 op }
207 3448adb0 2022-11-02 op
208 3448adb0 2022-11-02 op /* cast the rest of the codepoints in the word to lowercase */
209 3448adb0 2022-11-02 op to_case(r, w, 1, lower_major, lower_minor, lower_special);
210 3448adb0 2022-11-02 op
211 3448adb0 2022-11-02 op /* remove the limit on the word before the next iteration */
212 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
213 3448adb0 2022-11-02 op }
214 3448adb0 2022-11-02 op
215 3448adb0 2022-11-02 op herodotus_writer_nul_terminate(w);
216 3448adb0 2022-11-02 op
217 3448adb0 2022-11-02 op return herodotus_writer_number_written(w);
218 3448adb0 2022-11-02 op }
219 3448adb0 2022-11-02 op
220 3448adb0 2022-11-02 op size_t
221 3448adb0 2022-11-02 op grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
222 3448adb0 2022-11-02 op {
223 3448adb0 2022-11-02 op HERODOTUS_READER r;
224 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
225 3448adb0 2022-11-02 op
226 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
227 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
228 3448adb0 2022-11-02 op
229 3448adb0 2022-11-02 op return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
230 3448adb0 2022-11-02 op }
231 3448adb0 2022-11-02 op
232 3448adb0 2022-11-02 op size_t
233 3448adb0 2022-11-02 op grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
234 3448adb0 2022-11-02 op {
235 3448adb0 2022-11-02 op HERODOTUS_READER r;
236 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
237 3448adb0 2022-11-02 op
238 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
239 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
240 3448adb0 2022-11-02 op
241 3448adb0 2022-11-02 op return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
242 3448adb0 2022-11-02 op }
243 3448adb0 2022-11-02 op
244 3448adb0 2022-11-02 op size_t
245 3448adb0 2022-11-02 op grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
246 3448adb0 2022-11-02 op {
247 3448adb0 2022-11-02 op HERODOTUS_READER r;
248 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
249 3448adb0 2022-11-02 op
250 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
251 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
252 3448adb0 2022-11-02 op
253 3448adb0 2022-11-02 op return to_titlecase(&r, &w);
254 3448adb0 2022-11-02 op }
255 3448adb0 2022-11-02 op
256 3448adb0 2022-11-02 op size_t
257 3448adb0 2022-11-02 op grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
258 3448adb0 2022-11-02 op {
259 3448adb0 2022-11-02 op HERODOTUS_READER r;
260 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
261 3448adb0 2022-11-02 op
262 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
263 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
264 3448adb0 2022-11-02 op
265 3448adb0 2022-11-02 op return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
266 3448adb0 2022-11-02 op }
267 3448adb0 2022-11-02 op
268 3448adb0 2022-11-02 op size_t
269 3448adb0 2022-11-02 op grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
270 3448adb0 2022-11-02 op {
271 3448adb0 2022-11-02 op HERODOTUS_READER r;
272 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
273 3448adb0 2022-11-02 op
274 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
275 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
276 3448adb0 2022-11-02 op
277 3448adb0 2022-11-02 op return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
278 3448adb0 2022-11-02 op }
279 3448adb0 2022-11-02 op
280 3448adb0 2022-11-02 op size_t
281 3448adb0 2022-11-02 op grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
282 3448adb0 2022-11-02 op {
283 3448adb0 2022-11-02 op HERODOTUS_READER r;
284 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
285 3448adb0 2022-11-02 op
286 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
287 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
288 3448adb0 2022-11-02 op
289 3448adb0 2022-11-02 op return to_titlecase(&r, &w);
290 3448adb0 2022-11-02 op }
291 3448adb0 2022-11-02 op
292 3448adb0 2022-11-02 op static inline bool
293 3448adb0 2022-11-02 op is_case(HERODOTUS_READER *r, const uint_least16_t *major,
294 3448adb0 2022-11-02 op const int_least32_t *minor, const struct special_case *sc,
295 3448adb0 2022-11-02 op size_t *output)
296 3448adb0 2022-11-02 op {
297 3448adb0 2022-11-02 op size_t off, i;
298 3448adb0 2022-11-02 op bool ret = true;
299 3448adb0 2022-11-02 op uint_least32_t cp;
300 3448adb0 2022-11-02 op int_least32_t map;
301 3448adb0 2022-11-02 op
302 3448adb0 2022-11-02 op for (; herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;) {
303 3448adb0 2022-11-02 op /* get and handle case mapping */
304 3448adb0 2022-11-02 op if (unlikely((map = get_case_offset(cp, major, minor)) >=
305 3448adb0 2022-11-02 op INT32_C(0x110000))) {
306 3448adb0 2022-11-02 op /* we have a special case and the offset in the sc-array
307 3448adb0 2022-11-02 op * is the difference to 0x110000*/
308 3448adb0 2022-11-02 op off = (uint_least32_t)map - UINT32_C(0x110000);
309 3448adb0 2022-11-02 op
310 3448adb0 2022-11-02 op for (i = 0; i < sc[off].cplen; i++) {
311 3448adb0 2022-11-02 op if (herodotus_read_codepoint(r, false, &cp) ==
312 3448adb0 2022-11-02 op HERODOTUS_STATUS_SUCCESS) {
313 3448adb0 2022-11-02 op if (cp != sc[off].cp[i]) {
314 3448adb0 2022-11-02 op ret = false;
315 3448adb0 2022-11-02 op goto done;
316 3448adb0 2022-11-02 op } else {
317 3448adb0 2022-11-02 op /* move forward */
318 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp);
319 3448adb0 2022-11-02 op }
320 3448adb0 2022-11-02 op } else {
321 3448adb0 2022-11-02 op /*
322 3448adb0 2022-11-02 op * input ended and we didn't see
323 3448adb0 2022-11-02 op * any difference so far, so this
324 3448adb0 2022-11-02 op * string is in fact okay
325 3448adb0 2022-11-02 op */
326 3448adb0 2022-11-02 op ret = true;
327 3448adb0 2022-11-02 op goto done;
328 3448adb0 2022-11-02 op }
329 3448adb0 2022-11-02 op }
330 3448adb0 2022-11-02 op } else {
331 3448adb0 2022-11-02 op /* we have a simple mapping */
332 3448adb0 2022-11-02 op if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
333 3448adb0 2022-11-02 op /* we have a difference */
334 3448adb0 2022-11-02 op ret = false;
335 3448adb0 2022-11-02 op goto done;
336 3448adb0 2022-11-02 op } else {
337 3448adb0 2022-11-02 op /* move forward */
338 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp);
339 3448adb0 2022-11-02 op }
340 3448adb0 2022-11-02 op }
341 3448adb0 2022-11-02 op }
342 3448adb0 2022-11-02 op done:
343 3448adb0 2022-11-02 op if (output) {
344 3448adb0 2022-11-02 op *output = herodotus_reader_number_read(r);
345 3448adb0 2022-11-02 op }
346 3448adb0 2022-11-02 op return ret;
347 3448adb0 2022-11-02 op }
348 3448adb0 2022-11-02 op
349 3448adb0 2022-11-02 op static inline bool
350 3448adb0 2022-11-02 op is_titlecase(HERODOTUS_READER *r, size_t *output)
351 3448adb0 2022-11-02 op {
352 3448adb0 2022-11-02 op enum case_property prop;
353 3448adb0 2022-11-02 op enum herodotus_status s;
354 3448adb0 2022-11-02 op bool ret = true;
355 3448adb0 2022-11-02 op uint_least32_t cp;
356 3448adb0 2022-11-02 op size_t nwb;
357 3448adb0 2022-11-02 op
358 3448adb0 2022-11-02 op for (; (nwb = herodotus_next_word_break(r)) > 0;) {
359 3448adb0 2022-11-02 op herodotus_reader_push_advance_limit(r, nwb);
360 3448adb0 2022-11-02 op for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
361 3448adb0 2022-11-02 op /* check if we have a cased character */
362 3448adb0 2022-11-02 op prop = get_case_property(cp);
363 3448adb0 2022-11-02 op if (prop == CASE_PROP_CASED ||
364 3448adb0 2022-11-02 op prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
365 3448adb0 2022-11-02 op break;
366 3448adb0 2022-11-02 op } else {
367 3448adb0 2022-11-02 op /* increment reader */
368 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp);
369 3448adb0 2022-11-02 op }
370 3448adb0 2022-11-02 op }
371 3448adb0 2022-11-02 op
372 3448adb0 2022-11-02 op if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
373 3448adb0 2022-11-02 op /* we are done */
374 3448adb0 2022-11-02 op break;
375 3448adb0 2022-11-02 op } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
376 3448adb0 2022-11-02 op /*
377 3448adb0 2022-11-02 op * we did not encounter any cased character
378 3448adb0 2022-11-02 op * up to the word break
379 3448adb0 2022-11-02 op */
380 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
381 3448adb0 2022-11-02 op continue;
382 3448adb0 2022-11-02 op } else {
383 3448adb0 2022-11-02 op /*
384 3448adb0 2022-11-02 op * we encountered a cased character before the word
385 3448adb0 2022-11-02 op * break, check if it's titlecase
386 3448adb0 2022-11-02 op */
387 3448adb0 2022-11-02 op herodotus_reader_push_advance_limit(r,
388 3448adb0 2022-11-02 op herodotus_reader_next_codepoint_break(r));
389 3448adb0 2022-11-02 op if (!is_case(r, title_major, title_minor, title_special, NULL)) {
390 3448adb0 2022-11-02 op ret = false;
391 3448adb0 2022-11-02 op goto done;
392 3448adb0 2022-11-02 op }
393 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
394 3448adb0 2022-11-02 op }
395 3448adb0 2022-11-02 op
396 3448adb0 2022-11-02 op /* check if the rest of the codepoints in the word are lowercase */
397 3448adb0 2022-11-02 op if (!is_case(r, lower_major, lower_minor, lower_special, NULL)) {
398 3448adb0 2022-11-02 op ret = false;
399 3448adb0 2022-11-02 op goto done;
400 3448adb0 2022-11-02 op }
401 3448adb0 2022-11-02 op
402 3448adb0 2022-11-02 op /* remove the limit on the word before the next iteration */
403 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
404 3448adb0 2022-11-02 op }
405 3448adb0 2022-11-02 op done:
406 3448adb0 2022-11-02 op if (output) {
407 3448adb0 2022-11-02 op *output = herodotus_reader_number_read(r);
408 3448adb0 2022-11-02 op }
409 3448adb0 2022-11-02 op return ret;
410 3448adb0 2022-11-02 op }
411 3448adb0 2022-11-02 op
412 3448adb0 2022-11-02 op bool
413 3448adb0 2022-11-02 op grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
414 3448adb0 2022-11-02 op {
415 3448adb0 2022-11-02 op HERODOTUS_READER r;
416 3448adb0 2022-11-02 op
417 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
418 3448adb0 2022-11-02 op
419 3448adb0 2022-11-02 op return is_case(&r, upper_major, upper_minor, upper_special, caselen);
420 3448adb0 2022-11-02 op }
421 3448adb0 2022-11-02 op
422 3448adb0 2022-11-02 op bool
423 3448adb0 2022-11-02 op grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
424 3448adb0 2022-11-02 op {
425 3448adb0 2022-11-02 op HERODOTUS_READER r;
426 3448adb0 2022-11-02 op
427 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
428 3448adb0 2022-11-02 op
429 3448adb0 2022-11-02 op return is_case(&r, lower_major, lower_minor, lower_special, caselen);
430 3448adb0 2022-11-02 op }
431 3448adb0 2022-11-02 op
432 3448adb0 2022-11-02 op bool
433 3448adb0 2022-11-02 op grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
434 3448adb0 2022-11-02 op {
435 3448adb0 2022-11-02 op HERODOTUS_READER r;
436 3448adb0 2022-11-02 op
437 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
438 3448adb0 2022-11-02 op
439 3448adb0 2022-11-02 op return is_titlecase(&r, caselen);
440 3448adb0 2022-11-02 op }
441 3448adb0 2022-11-02 op
442 3448adb0 2022-11-02 op bool
443 3448adb0 2022-11-02 op grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
444 3448adb0 2022-11-02 op {
445 3448adb0 2022-11-02 op HERODOTUS_READER r;
446 3448adb0 2022-11-02 op
447 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
448 3448adb0 2022-11-02 op
449 3448adb0 2022-11-02 op return is_case(&r, upper_major, upper_minor, upper_special, caselen);
450 3448adb0 2022-11-02 op }
451 3448adb0 2022-11-02 op
452 3448adb0 2022-11-02 op bool
453 3448adb0 2022-11-02 op grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
454 3448adb0 2022-11-02 op {
455 3448adb0 2022-11-02 op HERODOTUS_READER r;
456 3448adb0 2022-11-02 op
457 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
458 3448adb0 2022-11-02 op
459 3448adb0 2022-11-02 op return is_case(&r, lower_major, lower_minor, lower_special, caselen);
460 3448adb0 2022-11-02 op }
461 3448adb0 2022-11-02 op
462 3448adb0 2022-11-02 op bool
463 3448adb0 2022-11-02 op grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
464 3448adb0 2022-11-02 op {
465 3448adb0 2022-11-02 op HERODOTUS_READER r;
466 3448adb0 2022-11-02 op
467 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
468 3448adb0 2022-11-02 op
469 3448adb0 2022-11-02 op return is_titlecase(&r, caselen);
470 3448adb0 2022-11-02 op }