1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stddef.h>
3 3448adb0 2022-11-02 op #include <stdint.h>
5 3448adb0 2022-11-02 op #include "../grapheme.h"
6 3448adb0 2022-11-02 op #include "../gen/case.h"
7 3448adb0 2022-11-02 op #include "util.h"
/*
 * Look up the case property of codepoint cp.
 *
 * For cp <= 0x10FFFF the result comes from a two-stage table:
 * case_major is indexed with cp >> 8 and yields a base offset into
 * case_minor, to which the low byte (cp & 0xFF) is added.
 * CASE_PROP_OTHER is returned on the remaining path.
 *
 * NOTE(review): the braces/else around the final return are not visible
 * in this listing; presumably it is the fallback for cp > 0x10FFFF -
 * confirm.
 */
9 3448adb0 2022-11-02 op static inline enum case_property
10 3448adb0 2022-11-02 op get_case_property(uint_least32_t cp)
12 3448adb0 2022-11-02 op if (likely(cp <= UINT32_C(0x10FFFF))) {
13 3448adb0 2022-11-02 op return (enum case_property)
14 3448adb0 2022-11-02 op case_minor[case_major[cp >> 8] + (cp & 0xFF)];
16 3448adb0 2022-11-02 op return CASE_PROP_OTHER;
/*
 * Fetch the case-mapping value for codepoint cp from the two-stage
 * table (major, minor): major[cp >> 8] gives a base offset into minor,
 * to which the low byte (cp & 0xFF) is added.
 *
 * The callers in this file interpret the result as follows: a value
 * >= 0x110000 denotes an entry in a special-case array (index = value
 * - 0x110000); any other value is a signed offset that is added to cp.
 *
 * NOTE(review): the return for cp > 0x10FFFF is not visible in this
 * listing - confirm what is returned on that path.
 */
20 3448adb0 2022-11-02 op static inline int_least32_t
21 3448adb0 2022-11-02 op get_case_offset(uint_least32_t cp, const uint_least16_t *major,
22 3448adb0 2022-11-02 op const int_least32_t *minor)
24 3448adb0 2022-11-02 op if (likely(cp <= UINT32_C(0x10FFFF))) {
26 3448adb0 2022-11-02 op * this value might be larger than or equal to 0x110000
27 3448adb0 2022-11-02 op * for the special-case-mapping. This needs to be handled
30 3448adb0 2022-11-02 op return minor[major[cp >> 8] + (cp & 0xFF)];
/*
 * Core case-conversion loop shared by the upper-, lower- and titlecase
 * entry points in this file.
 *
 * r:                 reader supplying the input codepoints
 * w:                 writer receiving the converted codepoints
 * final_sigma_level: initial state (0, 1 or 2) of the Final_Sigma
 *                    before-condition tracker; it is only consulted
 *                    when sc == lower_special
 * major, minor:      two-stage mapping table handed to
 *                    get_case_offset()
 * sc:                special-case table, indexed when the looked-up
 *                    value is >= 0x110000
 *
 * For every codepoint read from r, either the cplen codepoints of the
 * matching sc entry or the single codepoint cp + map is written to w.
 * Afterwards herodotus_writer_nul_terminate(w) is called and
 * herodotus_writer_number_written(w) is returned.
 *
 * NOTE(review): several control-flow lines (braces, else, break,
 * continue) are not visible in this listing; the comments added here
 * describe only what the visible statements establish.
 */
36 3448adb0 2022-11-02 op static inline size_t
37 3448adb0 2022-11-02 op to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
38 3448adb0 2022-11-02 op uint_least8_t final_sigma_level, const uint_least16_t *major,
39 3448adb0 2022-11-02 op const int_least32_t *minor, const struct special_case *sc)
41 3448adb0 2022-11-02 op HERODOTUS_READER tmp;
42 3448adb0 2022-11-02 op enum case_property prop;
43 3448adb0 2022-11-02 op enum herodotus_status s;
44 3448adb0 2022-11-02 op size_t off, i;
45 3448adb0 2022-11-02 op uint_least32_t cp, tmp_cp;
46 3448adb0 2022-11-02 op int_least32_t map;
48 3448adb0 2022-11-02 op for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
/* the Final_Sigma handling only applies to the lowercase tables */
49 3448adb0 2022-11-02 op if (sc == lower_special) {
51 3448adb0 2022-11-02 op * For the special Final_Sigma-rule (see SpecialCasing.txt),
52 3448adb0 2022-11-02 op * which is the only non-localized case-dependent rule,
53 3448adb0 2022-11-02 op * we apply a different mapping when a sigma is at the
54 3448adb0 2022-11-02 op * end of a word.
56 3448adb0 2022-11-02 op * Before: cased case-ignorable*
57 3448adb0 2022-11-02 op * After: not(case-ignorable* cased)
59 3448adb0 2022-11-02 op * We check the after-condition on demand, but the before-
60 3448adb0 2022-11-02 op * condition is best checked using the "level"-heuristic
61 3448adb0 2022-11-02 op * also used in the sentence and line breaking-implementations.
/* level 1 or 2 means the before-condition currently holds */
63 3448adb0 2022-11-02 op if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER SIGMA */
64 3448adb0 2022-11-02 op (final_sigma_level == 1 ||
65 3448adb0 2022-11-02 op final_sigma_level == 2)) {
67 3448adb0 2022-11-02 op * check succeeding characters by first skipping
68 3448adb0 2022-11-02 op * all case-ignorable characters and then checking
69 3448adb0 2022-11-02 op * if the succeeding character is cased, invalidating
70 3448adb0 2022-11-02 op * the after-condition
/*
 * the look-ahead reads from tmp rather than from r; presumably
 * herodotus_reader_copy() duplicates the reader state so that r
 * is left untouched - confirm
 */
72 3448adb0 2022-11-02 op herodotus_reader_copy(r, &tmp);
73 3448adb0 2022-11-02 op for (prop = NUM_CASE_PROPS;
74 3448adb0 2022-11-02 op (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
75 3448adb0 2022-11-02 op HERODOTUS_STATUS_SUCCESS; ) {
76 3448adb0 2022-11-02 op prop = get_case_property(tmp_cp);
78 3448adb0 2022-11-02 op if (prop != CASE_PROP_CASE_IGNORABLE &&
79 3448adb0 2022-11-02 op prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
85 3448adb0 2022-11-02 op * Now prop is something other than case-ignorable or
86 3448adb0 2022-11-02 op * the source-string ended.
87 3448adb0 2022-11-02 op * If it is something other than cased, we know
88 3448adb0 2022-11-02 op * that the after-condition holds
90 3448adb0 2022-11-02 op if (s != HERODOTUS_STATUS_SUCCESS ||
91 3448adb0 2022-11-02 op (prop != CASE_PROP_CASED &&
92 3448adb0 2022-11-02 op prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
94 3448adb0 2022-11-02 op * write GREEK SMALL LETTER FINAL SIGMA to
97 3448adb0 2022-11-02 op herodotus_write_codepoint(w, UINT32_C(0x03C2));
99 3448adb0 2022-11-02 op /* reset Final_Sigma-state and continue */
100 3448adb0 2022-11-02 op final_sigma_level = 0;
/*
 * state transitions of the before-condition tracker:
 *   level 0/1 and cp cased           -> 1
 *   level 1/2 and cp case-ignorable  -> 2
 *   the remaining visible assignment -> 0
 */
105 3448adb0 2022-11-02 op /* update state */
106 3448adb0 2022-11-02 op prop = get_case_property(cp);
107 3448adb0 2022-11-02 op if ((final_sigma_level == 0 ||
108 3448adb0 2022-11-02 op final_sigma_level == 1) &&
109 3448adb0 2022-11-02 op (prop == CASE_PROP_CASED ||
110 3448adb0 2022-11-02 op prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
111 3448adb0 2022-11-02 op /* sequence has begun */
112 3448adb0 2022-11-02 op final_sigma_level = 1;
113 3448adb0 2022-11-02 op } else if ((final_sigma_level == 1 ||
114 3448adb0 2022-11-02 op final_sigma_level == 2) &&
115 3448adb0 2022-11-02 op (prop == CASE_PROP_CASE_IGNORABLE ||
116 3448adb0 2022-11-02 op prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
117 3448adb0 2022-11-02 op /* case-ignorable sequence begins or continued */
118 3448adb0 2022-11-02 op final_sigma_level = 2;
120 3448adb0 2022-11-02 op /* sequence broke */
121 3448adb0 2022-11-02 op final_sigma_level = 0;
125 3448adb0 2022-11-02 op /* get and handle case mapping */
126 3448adb0 2022-11-02 op if (unlikely((map = get_case_offset(cp, major, minor)) >=
127 3448adb0 2022-11-02 op INT32_C(0x110000))) {
128 3448adb0 2022-11-02 op /* we have a special case and the offset in the sc-array
129 3448adb0 2022-11-02 op * is the difference to 0x110000*/
130 3448adb0 2022-11-02 op off = (uint_least32_t)map - UINT32_C(0x110000);
/* emit every codepoint of the special-case expansion */
132 3448adb0 2022-11-02 op for (i = 0; i < sc[off].cplen; i++) {
133 3448adb0 2022-11-02 op herodotus_write_codepoint(w, sc[off].cp[i]);
136 3448adb0 2022-11-02 op /* we have a simple mapping */
137 3448adb0 2022-11-02 op herodotus_write_codepoint(w, (uint_least32_t)
138 3448adb0 2022-11-02 op ((int_least32_t)cp + map));
142 3448adb0 2022-11-02 op herodotus_writer_nul_terminate(w);
144 3448adb0 2022-11-02 op return herodotus_writer_number_written(w);
/*
 * Return the value of the word-break function matching the reader's
 * type: grapheme_next_word_break() for HERODOTUS_TYPE_CODEPOINT,
 * grapheme_next_word_break_utf8() otherwise.
 *
 * The call is made on the src/srclen fields of a copy (tmp) obtained
 * through herodotus_reader_copy(), not on r's own fields.
 * NOTE(review): presumably the copy reflects the reader's current
 * position and limits - confirm against herodotus_reader_copy().
 */
147 3448adb0 2022-11-02 op static size_t
148 3448adb0 2022-11-02 op herodotus_next_word_break(const HERODOTUS_READER *r)
150 3448adb0 2022-11-02 op HERODOTUS_READER tmp;
152 3448adb0 2022-11-02 op herodotus_reader_copy(r, &tmp);
154 3448adb0 2022-11-02 op if (r->type == HERODOTUS_TYPE_CODEPOINT) {
155 3448adb0 2022-11-02 op return grapheme_next_word_break(tmp.src, tmp.srclen);
156 3448adb0 2022-11-02 op } else { /* r->type == HERODOTUS_TYPE_UTF8 */
157 3448adb0 2022-11-02 op return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
/*
 * Titlecase conversion, processed one word at a time.
 *
 * r: reader supplying the input
 * w: writer receiving the output
 *
 * For each segment length reported by herodotus_next_word_break(), a
 * limit of that length is pushed onto r. Codepoints are then inspected
 * with a non-advancing read; uncased ones are written to w unchanged
 * and the reader is advanced past them. Once a cased codepoint has been
 * found, it is converted by to_case() with the title_* tables while the
 * reader is limited to herodotus_reader_next_codepoint_break(r), and
 * the remainder of the word is converted by to_case() with the lower_*
 * tables, starting with final_sigma_level 1.
 *
 * Finally herodotus_writer_nul_terminate(w) is called and
 * herodotus_writer_number_written(w) is returned.
 *
 * NOTE(review): the declaration of nwb and the break/continue/else
 * lines are not visible in this listing.
 */
161 3448adb0 2022-11-02 op static inline size_t
162 3448adb0 2022-11-02 op to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
164 3448adb0 2022-11-02 op enum case_property prop;
165 3448adb0 2022-11-02 op enum herodotus_status s;
166 3448adb0 2022-11-02 op uint_least32_t cp;
169 3448adb0 2022-11-02 op for (; (nwb = herodotus_next_word_break(r)) > 0;) {
/* restrict the reader to the current word */
170 3448adb0 2022-11-02 op herodotus_reader_push_advance_limit(r, nwb);
171 3448adb0 2022-11-02 op for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
172 3448adb0 2022-11-02 op /* check if we have a cased character */
173 3448adb0 2022-11-02 op prop = get_case_property(cp);
174 3448adb0 2022-11-02 op if (prop == CASE_PROP_CASED ||
175 3448adb0 2022-11-02 op prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
178 3448adb0 2022-11-02 op /* write the data to the output verbatim, if it permits */
179 3448adb0 2022-11-02 op herodotus_write_codepoint(w, cp);
181 3448adb0 2022-11-02 op /* increment reader */
182 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp);
/* s holds the status that ended the scan above */
186 3448adb0 2022-11-02 op if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
187 3448adb0 2022-11-02 op /* we are done */
188 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
190 3448adb0 2022-11-02 op } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
192 3448adb0 2022-11-02 op * we did not encounter any cased character
193 3448adb0 2022-11-02 op * up to the word break
195 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
199 3448adb0 2022-11-02 op * we encountered a cased character before the word
200 3448adb0 2022-11-02 op * break, convert it to titlecase
/* a nested limit confines the titlecase pass to one codepoint */
202 3448adb0 2022-11-02 op herodotus_reader_push_advance_limit(r,
203 3448adb0 2022-11-02 op herodotus_reader_next_codepoint_break(r));
204 3448adb0 2022-11-02 op to_case(r, w, 0, title_major, title_minor, title_special);
205 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
208 3448adb0 2022-11-02 op /* cast the rest of the codepoints in the word to lowercase */
209 3448adb0 2022-11-02 op to_case(r, w, 1, lower_major, lower_minor, lower_special);
211 3448adb0 2022-11-02 op /* remove the limit on the word before the next iteration */
212 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
215 3448adb0 2022-11-02 op herodotus_writer_nul_terminate(w);
217 3448adb0 2022-11-02 op return herodotus_writer_number_written(w);
/*
 * Uppercase conversion for codepoint arrays: src/srclen are wrapped in
 * a HERODOTUS_TYPE_CODEPOINT reader and dest/destlen in a writer of the
 * same type, then the result of to_case() with the upper_* tables is
 * returned.
 * NOTE(review): the return-type line is not visible in this listing.
 */
221 3448adb0 2022-11-02 op grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
223 3448adb0 2022-11-02 op HERODOTUS_READER r;
224 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
226 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
227 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
229 3448adb0 2022-11-02 op return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
/*
 * Lowercase conversion for codepoint arrays: src/srclen are wrapped in
 * a HERODOTUS_TYPE_CODEPOINT reader and dest/destlen in a writer of the
 * same type, then the result of to_case() with the lower_* tables is
 * returned. The Final_Sigma tracker starts at level 0.
 * NOTE(review): the return-type line is not visible in this listing.
 */
233 3448adb0 2022-11-02 op grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
235 3448adb0 2022-11-02 op HERODOTUS_READER r;
236 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
238 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
239 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
241 3448adb0 2022-11-02 op return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
/*
 * Titlecase conversion for codepoint arrays: src/srclen are wrapped in
 * a HERODOTUS_TYPE_CODEPOINT reader and dest/destlen in a writer of the
 * same type, then the result of to_titlecase() is returned.
 * NOTE(review): the return-type line is not visible in this listing.
 */
245 3448adb0 2022-11-02 op grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
247 3448adb0 2022-11-02 op HERODOTUS_READER r;
248 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
250 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
251 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
253 3448adb0 2022-11-02 op return to_titlecase(&r, &w);
/*
 * Uppercase conversion for char buffers: src/srclen are wrapped in a
 * HERODOTUS_TYPE_UTF8 reader and dest/destlen in a writer of the same
 * type, then the result of to_case() with the upper_* tables is
 * returned.
 * NOTE(review): the return-type line is not visible in this listing.
 */
257 3448adb0 2022-11-02 op grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
259 3448adb0 2022-11-02 op HERODOTUS_READER r;
260 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
262 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
263 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
265 3448adb0 2022-11-02 op return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
/*
 * Lowercase conversion for char buffers: src/srclen are wrapped in a
 * HERODOTUS_TYPE_UTF8 reader and dest/destlen in a writer of the same
 * type, then the result of to_case() with the lower_* tables is
 * returned. The Final_Sigma tracker starts at level 0.
 * NOTE(review): the return-type line is not visible in this listing.
 */
269 3448adb0 2022-11-02 op grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
271 3448adb0 2022-11-02 op HERODOTUS_READER r;
272 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
274 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
275 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
277 3448adb0 2022-11-02 op return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
/*
 * Titlecase conversion for char buffers: src/srclen are wrapped in a
 * HERODOTUS_TYPE_UTF8 reader and dest/destlen in a writer of the same
 * type, then the result of to_titlecase() is returned.
 * NOTE(review): the return-type line is not visible in this listing.
 */
281 3448adb0 2022-11-02 op grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
283 3448adb0 2022-11-02 op HERODOTUS_READER r;
284 3448adb0 2022-11-02 op HERODOTUS_WRITER w;
286 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
287 3448adb0 2022-11-02 op herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
289 3448adb0 2022-11-02 op return to_titlecase(&r, &w);
/*
 * Check whether the input of r is already in the case described by the
 * given mapping tables.
 *
 * r:            reader supplying the input
 * major, minor: two-stage mapping table handed to get_case_offset()
 * sc:           special-case table, indexed when the looked-up value
 *               is >= 0x110000
 * output:       if non-NULL, receives herodotus_reader_number_read(r)
 *
 * Each codepoint is first read without advancing. For a simple mapping
 * the codepoint must equal cp + map to count as matching. For a special
 * mapping the upcoming codepoints are compared one by one against
 * sc[off].cp[0..cplen), advancing after each comparison that found no
 * difference.
 *
 * NOTE(review): ret starts out as true; the lines that set it to false,
 * leave the loops and return it are not visible in this listing.
 */
292 3448adb0 2022-11-02 op static inline bool
293 3448adb0 2022-11-02 op is_case(HERODOTUS_READER *r, const uint_least16_t *major,
294 3448adb0 2022-11-02 op const int_least32_t *minor, const struct special_case *sc,
295 3448adb0 2022-11-02 op size_t *output)
297 3448adb0 2022-11-02 op size_t off, i;
298 3448adb0 2022-11-02 op bool ret = true;
299 3448adb0 2022-11-02 op uint_least32_t cp;
300 3448adb0 2022-11-02 op int_least32_t map;
302 3448adb0 2022-11-02 op for (; herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;) {
303 3448adb0 2022-11-02 op /* get and handle case mapping */
304 3448adb0 2022-11-02 op if (unlikely((map = get_case_offset(cp, major, minor)) >=
305 3448adb0 2022-11-02 op INT32_C(0x110000))) {
306 3448adb0 2022-11-02 op /* we have a special case and the offset in the sc-array
307 3448adb0 2022-11-02 op * is the difference to 0x110000*/
308 3448adb0 2022-11-02 op off = (uint_least32_t)map - UINT32_C(0x110000);
/*
 * the reader has not been advanced yet, so the first comparison
 * is made against the codepoint that selected this entry
 */
310 3448adb0 2022-11-02 op for (i = 0; i < sc[off].cplen; i++) {
311 3448adb0 2022-11-02 op if (herodotus_read_codepoint(r, false, &cp) ==
312 3448adb0 2022-11-02 op HERODOTUS_STATUS_SUCCESS) {
313 3448adb0 2022-11-02 op if (cp != sc[off].cp[i]) {
317 3448adb0 2022-11-02 op /* move forward */
318 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp);
322 3448adb0 2022-11-02 op * input ended and we didn't see
323 3448adb0 2022-11-02 op * any difference so far, so this
324 3448adb0 2022-11-02 op * string is in fact okay
331 3448adb0 2022-11-02 op /* we have a simple mapping */
/* equivalent to asking whether map is non-zero for this cp */
332 3448adb0 2022-11-02 op if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
333 3448adb0 2022-11-02 op /* we have a difference */
337 3448adb0 2022-11-02 op /* move forward */
338 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp);
343 3448adb0 2022-11-02 op if (output) {
344 3448adb0 2022-11-02 op *output = herodotus_reader_number_read(r);
/*
 * Check whether the input of r is in titlecase, one word at a time.
 *
 * r:      reader supplying the input
 * output: if non-NULL, receives herodotus_reader_number_read(r)
 *
 * For each segment length reported by herodotus_next_word_break(), a
 * limit of that length is pushed onto r and uncased codepoints are
 * skipped. The first cased codepoint is checked by is_case() with the
 * title_* tables while the reader is limited to
 * herodotus_reader_next_codepoint_break(r); the rest of the word is
 * checked by is_case() with the lower_* tables.
 *
 * NOTE(review): ret starts out as true; the declaration of nwb and the
 * lines that set ret to false, leave the loops and return it are not
 * visible in this listing. Unlike the SOFT_LIMIT_REACHED branch, no
 * herodotus_reader_pop_limit() call is visible in the END_OF_BUFFER
 * branch - confirm that this is intended.
 */
349 3448adb0 2022-11-02 op static inline bool
350 3448adb0 2022-11-02 op is_titlecase(HERODOTUS_READER *r, size_t *output)
352 3448adb0 2022-11-02 op enum case_property prop;
353 3448adb0 2022-11-02 op enum herodotus_status s;
354 3448adb0 2022-11-02 op bool ret = true;
355 3448adb0 2022-11-02 op uint_least32_t cp;
358 3448adb0 2022-11-02 op for (; (nwb = herodotus_next_word_break(r)) > 0;) {
/* restrict the reader to the current word */
359 3448adb0 2022-11-02 op herodotus_reader_push_advance_limit(r, nwb);
360 3448adb0 2022-11-02 op for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
361 3448adb0 2022-11-02 op /* check if we have a cased character */
362 3448adb0 2022-11-02 op prop = get_case_property(cp);
363 3448adb0 2022-11-02 op if (prop == CASE_PROP_CASED ||
364 3448adb0 2022-11-02 op prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
367 3448adb0 2022-11-02 op /* increment reader */
368 3448adb0 2022-11-02 op herodotus_read_codepoint(r, true, &cp);
/* s holds the status that ended the scan above */
372 3448adb0 2022-11-02 op if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
373 3448adb0 2022-11-02 op /* we are done */
375 3448adb0 2022-11-02 op } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
377 3448adb0 2022-11-02 op * we did not encounter any cased character
378 3448adb0 2022-11-02 op * up to the word break
380 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
384 3448adb0 2022-11-02 op * we encountered a cased character before the word
385 3448adb0 2022-11-02 op * break, check if it's titlecase
/* a nested limit confines the titlecase check to one codepoint */
387 3448adb0 2022-11-02 op herodotus_reader_push_advance_limit(r,
388 3448adb0 2022-11-02 op herodotus_reader_next_codepoint_break(r));
389 3448adb0 2022-11-02 op if (!is_case(r, title_major, title_minor, title_special, NULL)) {
393 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
396 3448adb0 2022-11-02 op /* check if the rest of the codepoints in the word are lowercase */
397 3448adb0 2022-11-02 op if (!is_case(r, lower_major, lower_minor, lower_special, NULL)) {
402 3448adb0 2022-11-02 op /* remove the limit on the word before the next iteration */
403 3448adb0 2022-11-02 op herodotus_reader_pop_limit(r);
406 3448adb0 2022-11-02 op if (output) {
407 3448adb0 2022-11-02 op *output = herodotus_reader_number_read(r);
/*
 * Uppercase check for codepoint arrays: src/srclen are wrapped in a
 * HERODOTUS_TYPE_CODEPOINT reader and the result of is_case() with the
 * upper_* tables is returned; caselen is forwarded as is_case()'s
 * output pointer.
 * NOTE(review): the return-type line is not visible in this listing.
 */
413 3448adb0 2022-11-02 op grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
415 3448adb0 2022-11-02 op HERODOTUS_READER r;
417 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
419 3448adb0 2022-11-02 op return is_case(&r, upper_major, upper_minor, upper_special, caselen);
/*
 * Lowercase check for codepoint arrays: src/srclen are wrapped in a
 * HERODOTUS_TYPE_CODEPOINT reader and the result of is_case() with the
 * lower_* tables is returned; caselen is forwarded as is_case()'s
 * output pointer.
 * NOTE(review): the return-type line is not visible in this listing.
 */
423 3448adb0 2022-11-02 op grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
425 3448adb0 2022-11-02 op HERODOTUS_READER r;
427 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
429 3448adb0 2022-11-02 op return is_case(&r, lower_major, lower_minor, lower_special, caselen);
/*
 * Titlecase check for codepoint arrays: src/srclen are wrapped in a
 * HERODOTUS_TYPE_CODEPOINT reader and the result of is_titlecase() is
 * returned; caselen is forwarded as is_titlecase()'s output pointer.
 * NOTE(review): the return-type line is not visible in this listing.
 */
433 3448adb0 2022-11-02 op grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
435 3448adb0 2022-11-02 op HERODOTUS_READER r;
437 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
439 3448adb0 2022-11-02 op return is_titlecase(&r, caselen);
/*
 * Uppercase check for char buffers: src/srclen are wrapped in a
 * HERODOTUS_TYPE_UTF8 reader and the result of is_case() with the
 * upper_* tables is returned; caselen is forwarded as is_case()'s
 * output pointer.
 * NOTE(review): the return-type line is not visible in this listing.
 */
443 3448adb0 2022-11-02 op grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
445 3448adb0 2022-11-02 op HERODOTUS_READER r;
447 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
449 3448adb0 2022-11-02 op return is_case(&r, upper_major, upper_minor, upper_special, caselen);
/*
 * Lowercase check for char buffers: src/srclen are wrapped in a
 * HERODOTUS_TYPE_UTF8 reader and the result of is_case() with the
 * lower_* tables is returned; caselen is forwarded as is_case()'s
 * output pointer.
 * NOTE(review): the return-type line is not visible in this listing.
 */
453 3448adb0 2022-11-02 op grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
455 3448adb0 2022-11-02 op HERODOTUS_READER r;
457 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
459 3448adb0 2022-11-02 op return is_case(&r, lower_major, lower_minor, lower_special, caselen);
/*
 * Titlecase check for char buffers: src/srclen are wrapped in a
 * HERODOTUS_TYPE_UTF8 reader and the result of is_titlecase() is
 * returned; caselen is forwarded as is_titlecase()'s output pointer.
 * NOTE(review): the return-type line is not visible in this listing.
 */
463 3448adb0 2022-11-02 op grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
465 3448adb0 2022-11-02 op HERODOTUS_READER r;
467 3448adb0 2022-11-02 op herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
469 3448adb0 2022-11-02 op return is_titlecase(&r, caselen);