1 /* See LICENSE file for copyright and license details. */
5 #include "../grapheme.h"
6 #include "../gen/case.h"
9 static inline enum case_property
10 get_case_property(uint_least32_t cp)
12 if (likely(cp <= UINT32_C(0x10FFFF))) {
13 return (enum case_property)
14 case_minor[case_major[cp >> 8] + (cp & 0xFF)];
16 return CASE_PROP_OTHER;
/*
 * Look up the case-mapping entry of a codepoint in a two-stage table.
 *
 * cp:    the codepoint to map.
 * major: first stage, indexed by cp >> 8.
 * minor: second stage, indexed by the major-entry plus the low eight
 *        bits of cp.
 *
 * Returns the table entry. Callers add it to the codepoint as an
 * offset, except when it is larger than or equal to 0x110000: such a
 * value denotes a special-case-mapping and has to be handled
 * separately by the caller. For values above 0x10FFFF the offset 0
 * is returned.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	if (unlikely(cp > UINT32_C(0x10FFFF))) {
		/* outside of the table range, map the value to itself */
		return 0;
	}

	return minor[major[cp >> 8] + (cp & 0xFF)];
}
37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
38 uint_least8_t final_sigma_level, const uint_least16_t *major,
39 const int_least32_t *minor, const struct special_case *sc)
42 enum case_property prop;
43 enum herodotus_status s;
45 uint_least32_t cp, tmp_cp;
48 for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
49 if (sc == lower_special) {
51 * For the special Final_Sigma-rule (see SpecialCasing.txt),
52 * which is the only non-localized case-dependent rule,
53 * we apply a different mapping when a sigma is at the
56 * Before: cased case-ignorable*
57 * After: not(case-ignorable* cased)
59 * We check the after-condition on demand, but the before-
60 * condition is best checked using the "level"-heuristic
61 * also used in the sentence and line breaking-implementations.
63 if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER SIGMA */
64 (final_sigma_level == 1 ||
65 final_sigma_level == 2)) {
67 * check succeeding characters by first skipping
68 * all case-ignorable characters and then checking
69 * if the succeeding character is cased, invalidating
72 herodotus_reader_copy(r, &tmp);
73 for (prop = NUM_CASE_PROPS;
74 (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
75 HERODOTUS_STATUS_SUCCESS; ) {
76 prop = get_case_property(tmp_cp);
78 if (prop != CASE_PROP_CASE_IGNORABLE &&
79 prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
85 * Now prop is something other than case-ignorable or
86 * the source-string ended.
87 * If it is something other than cased, we know
88 * that the after-condition holds
90 if (s != HERODOTUS_STATUS_SUCCESS ||
91 (prop != CASE_PROP_CASED &&
92 prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
94 * write GREEK SMALL LETTER FINAL SIGMA to
97 herodotus_write_codepoint(w, UINT32_C(0x03C2));
99 /* reset Final_Sigma-state and continue */
100 final_sigma_level = 0;
106 prop = get_case_property(cp);
107 if ((final_sigma_level == 0 ||
108 final_sigma_level == 1) &&
109 (prop == CASE_PROP_CASED ||
110 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
111 /* sequence has begun */
112 final_sigma_level = 1;
113 } else if ((final_sigma_level == 1 ||
114 final_sigma_level == 2) &&
115 (prop == CASE_PROP_CASE_IGNORABLE ||
116 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
117 /* case-ignorable sequence begins or continued */
118 final_sigma_level = 2;
121 final_sigma_level = 0;
125 /* get and handle case mapping */
126 if (unlikely((map = get_case_offset(cp, major, minor)) >=
127 INT32_C(0x110000))) {
128 /* we have a special case and the offset in the sc-array
129 * is the difference to 0x110000*/
130 off = (uint_least32_t)map - UINT32_C(0x110000);
132 for (i = 0; i < sc[off].cplen; i++) {
133 herodotus_write_codepoint(w, sc[off].cp[i]);
136 /* we have a simple mapping */
137 herodotus_write_codepoint(w, (uint_least32_t)
138 ((int_least32_t)cp + map));
142 herodotus_writer_nul_terminate(w);
144 return herodotus_writer_number_written(w);
148 herodotus_next_word_break(const HERODOTUS_READER *r)
150 HERODOTUS_READER tmp;
152 herodotus_reader_copy(r, &tmp);
154 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
155 return grapheme_next_word_break(tmp.src, tmp.srclen);
156 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
157 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
162 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
164 enum case_property prop;
165 enum herodotus_status s;
169 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
170 herodotus_reader_push_advance_limit(r, nwb);
171 for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
172 /* check if we have a cased character */
173 prop = get_case_property(cp);
174 if (prop == CASE_PROP_CASED ||
175 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
178 /* write the data to the output verbatim, it if permits */
179 herodotus_write_codepoint(w, cp);
181 /* increment reader */
182 herodotus_read_codepoint(r, true, &cp);
186 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
188 herodotus_reader_pop_limit(r);
190 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
192 * we did not encounter any cased character
193 * up to the word break
195 herodotus_reader_pop_limit(r);
199 * we encountered a cased character before the word
200 * break, convert it to titlecase
202 herodotus_reader_push_advance_limit(r,
203 herodotus_reader_next_codepoint_break(r));
204 to_case(r, w, 0, title_major, title_minor, title_special);
205 herodotus_reader_pop_limit(r);
208 /* cast the rest of the codepoints in the word to lowercase */
209 to_case(r, w, 1, lower_major, lower_minor, lower_special);
211 /* remove the limit on the word before the next iteration */
212 herodotus_reader_pop_limit(r);
215 herodotus_writer_nul_terminate(w);
217 return herodotus_writer_number_written(w);
221 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
226 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
227 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
229 return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
233 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
239 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
241 return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
245 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
250 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
251 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
253 return to_titlecase(&r, &w);
257 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
262 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
263 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
265 return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
269 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
274 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
275 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
277 return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
281 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
286 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
287 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
289 return to_titlecase(&r, &w);
293 is_case(HERODOTUS_READER *r, const uint_least16_t *major,
294 const int_least32_t *minor, const struct special_case *sc,
302 for (; herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;) {
303 /* get and handle case mapping */
304 if (unlikely((map = get_case_offset(cp, major, minor)) >=
305 INT32_C(0x110000))) {
306 /* we have a special case and the offset in the sc-array
307 * is the difference to 0x110000*/
308 off = (uint_least32_t)map - UINT32_C(0x110000);
310 for (i = 0; i < sc[off].cplen; i++) {
311 if (herodotus_read_codepoint(r, false, &cp) ==
312 HERODOTUS_STATUS_SUCCESS) {
313 if (cp != sc[off].cp[i]) {
318 herodotus_read_codepoint(r, true, &cp);
322 * input ended and we didn't see
323 * any difference so far, so this
324 * string is in fact okay
331 /* we have a simple mapping */
332 if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
333 /* we have a difference */
338 herodotus_read_codepoint(r, true, &cp);
344 *output = herodotus_reader_number_read(r);
350 is_titlecase(HERODOTUS_READER *r, size_t *output)
352 enum case_property prop;
353 enum herodotus_status s;
358 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
359 herodotus_reader_push_advance_limit(r, nwb);
360 for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
361 /* check if we have a cased character */
362 prop = get_case_property(cp);
363 if (prop == CASE_PROP_CASED ||
364 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
367 /* increment reader */
368 herodotus_read_codepoint(r, true, &cp);
372 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
375 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
377 * we did not encounter any cased character
378 * up to the word break
380 herodotus_reader_pop_limit(r);
384 * we encountered a cased character before the word
385 * break, check if it's titlecase
387 herodotus_reader_push_advance_limit(r,
388 herodotus_reader_next_codepoint_break(r));
389 if (!is_case(r, title_major, title_minor, title_special, NULL)) {
393 herodotus_reader_pop_limit(r);
396 /* check if the rest of the codepoints in the word are lowercase */
397 if (!is_case(r, lower_major, lower_minor, lower_special, NULL)) {
402 /* remove the limit on the word before the next iteration */
403 herodotus_reader_pop_limit(r);
407 *output = herodotus_reader_number_read(r);
413 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
417 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
419 return is_case(&r, upper_major, upper_minor, upper_special, caselen);
423 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
427 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
429 return is_case(&r, lower_major, lower_minor, lower_special, caselen);
433 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
437 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
439 return is_titlecase(&r, caselen);
443 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
447 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
449 return is_case(&r, upper_major, upper_minor, upper_special, caselen);
453 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
457 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
459 return is_case(&r, lower_major, lower_minor, lower_special, caselen);
463 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
467 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
469 return is_titlecase(&r, caselen);