/* See LICENSE file for copyright and license details. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "../grapheme.h"
#include "../gen/case.h"
#include "util.h"
9 static inline enum case_property
10 get_case_property(uint_least32_t cp)
11 {
12 if (likely(cp <= UINT32_C(0x10FFFF))) {
13 return (enum case_property)
14 case_minor[case_major[cp >> 8] + (cp & 0xFF)];
15 } else {
16 return CASE_PROP_OTHER;
17 }
18 }
/*
 * Return the case-mapping table entry for the codepoint cp.
 *
 * major/minor form a two-stage table: major is indexed by cp >> 8 and
 * its entry, added to the low byte of cp, indexes minor. Values above
 * 0x10FFFF are not looked up and yield 0.
 *
 * The callers add a returned value below 0x110000 to cp as a signed
 * offset; a value of 0x110000 or more denotes a special-case mapping
 * that the caller has to handle separately.
 */
static inline int_least32_t
get_case_offset(uint_least32_t cp, const uint_least16_t *major,
                const int_least32_t *minor)
{
	if (likely(cp <= UINT32_C(0x10FFFF))) {
		/*
		 * this value might be larger than or equal to 0x110000
		 * for the special-case-mapping. This needs to be handled
		 * separately
		 */
		return minor[major[cp >> 8] + (cp & 0xFF)];
	} else {
		return 0;
	}
}
36 static inline size_t
37 to_case(HERODOTUS_READER *r, HERODOTUS_WRITER *w,
38 uint_least8_t final_sigma_level, const uint_least16_t *major,
39 const int_least32_t *minor, const struct special_case *sc)
40 {
41 HERODOTUS_READER tmp;
42 enum case_property prop;
43 enum herodotus_status s;
44 size_t off, i;
45 uint_least32_t cp, tmp_cp;
46 int_least32_t map;
48 for (; herodotus_read_codepoint(r, true, &cp) == HERODOTUS_STATUS_SUCCESS;) {
49 if (sc == lower_special) {
50 /*
51 * For the special Final_Sigma-rule (see SpecialCasing.txt),
52 * which is the only non-localized case-dependent rule,
53 * we apply a different mapping when a sigma is at the
54 * end of a word.
55 *
56 * Before: cased case-ignorable*
57 * After: not(case-ignorable* cased)
58 *
59 * We check the after-condition on demand, but the before-
60 * condition is best checked using the "level"-heuristic
61 * also used in the sentence and line breaking-implementations.
62 */
63 if (cp == UINT32_C(0x03A3) && /* GREEK CAPITAL LETTER SIGMA */
64 (final_sigma_level == 1 ||
65 final_sigma_level == 2)) {
66 /*
67 * check succeeding characters by first skipping
68 * all case-ignorable characters and then checking
69 * if the succeeding character is cased, invalidating
70 * the after-condition
71 */
72 herodotus_reader_copy(r, &tmp);
73 for (prop = NUM_CASE_PROPS;
74 (s = herodotus_read_codepoint(&tmp, true, &tmp_cp)) ==
75 HERODOTUS_STATUS_SUCCESS; ) {
76 prop = get_case_property(tmp_cp);
78 if (prop != CASE_PROP_CASE_IGNORABLE &&
79 prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
80 break;
81 }
82 }
84 /*
85 * Now prop is something other than case-ignorable or
86 * the source-string ended.
87 * If it is something other than cased, we know
88 * that the after-condition holds
89 */
90 if (s != HERODOTUS_STATUS_SUCCESS ||
91 (prop != CASE_PROP_CASED &&
92 prop != CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
93 /*
94 * write GREEK SMALL LETTER FINAL SIGMA to
95 * destination
96 */
97 herodotus_write_codepoint(w, UINT32_C(0x03C2));
99 /* reset Final_Sigma-state and continue */
100 final_sigma_level = 0;
101 continue;
105 /* update state */
106 prop = get_case_property(cp);
107 if ((final_sigma_level == 0 ||
108 final_sigma_level == 1) &&
109 (prop == CASE_PROP_CASED ||
110 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
111 /* sequence has begun */
112 final_sigma_level = 1;
113 } else if ((final_sigma_level == 1 ||
114 final_sigma_level == 2) &&
115 (prop == CASE_PROP_CASE_IGNORABLE ||
116 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE)) {
117 /* case-ignorable sequence begins or continued */
118 final_sigma_level = 2;
119 } else {
120 /* sequence broke */
121 final_sigma_level = 0;
125 /* get and handle case mapping */
126 if (unlikely((map = get_case_offset(cp, major, minor)) >=
127 INT32_C(0x110000))) {
128 /* we have a special case and the offset in the sc-array
129 * is the difference to 0x110000*/
130 off = (uint_least32_t)map - UINT32_C(0x110000);
132 for (i = 0; i < sc[off].cplen; i++) {
133 herodotus_write_codepoint(w, sc[off].cp[i]);
135 } else {
136 /* we have a simple mapping */
137 herodotus_write_codepoint(w, (uint_least32_t)
138 ((int_least32_t)cp + map));
142 herodotus_writer_nul_terminate(w);
144 return herodotus_writer_number_written(w);
147 static size_t
148 herodotus_next_word_break(const HERODOTUS_READER *r)
150 HERODOTUS_READER tmp;
152 herodotus_reader_copy(r, &tmp);
154 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
155 return grapheme_next_word_break(tmp.src, tmp.srclen);
156 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
157 return grapheme_next_word_break_utf8(tmp.src, tmp.srclen);
161 static inline size_t
162 to_titlecase(HERODOTUS_READER *r, HERODOTUS_WRITER *w)
164 enum case_property prop;
165 enum herodotus_status s;
166 uint_least32_t cp;
167 size_t nwb;
169 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
170 herodotus_reader_push_advance_limit(r, nwb);
171 for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
172 /* check if we have a cased character */
173 prop = get_case_property(cp);
174 if (prop == CASE_PROP_CASED ||
175 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
176 break;
177 } else {
178 /* write the data to the output verbatim, it if permits */
179 herodotus_write_codepoint(w, cp);
181 /* increment reader */
182 herodotus_read_codepoint(r, true, &cp);
186 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
187 /* we are done */
188 herodotus_reader_pop_limit(r);
189 break;
190 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
191 /*
192 * we did not encounter any cased character
193 * up to the word break
194 */
195 herodotus_reader_pop_limit(r);
196 continue;
197 } else {
198 /*
199 * we encountered a cased character before the word
200 * break, convert it to titlecase
201 */
202 herodotus_reader_push_advance_limit(r,
203 herodotus_reader_next_codepoint_break(r));
204 to_case(r, w, 0, title_major, title_minor, title_special);
205 herodotus_reader_pop_limit(r);
208 /* cast the rest of the codepoints in the word to lowercase */
209 to_case(r, w, 1, lower_major, lower_minor, lower_special);
211 /* remove the limit on the word before the next iteration */
212 herodotus_reader_pop_limit(r);
215 herodotus_writer_nul_terminate(w);
217 return herodotus_writer_number_written(w);
220 size_t
221 grapheme_to_uppercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
223 HERODOTUS_READER r;
224 HERODOTUS_WRITER w;
226 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
227 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
229 return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
232 size_t
233 grapheme_to_lowercase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
235 HERODOTUS_READER r;
236 HERODOTUS_WRITER w;
238 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
239 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
241 return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
244 size_t
245 grapheme_to_titlecase(const uint_least32_t *src, size_t srclen, uint_least32_t *dest, size_t destlen)
247 HERODOTUS_READER r;
248 HERODOTUS_WRITER w;
250 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
251 herodotus_writer_init(&w, HERODOTUS_TYPE_CODEPOINT, dest, destlen);
253 return to_titlecase(&r, &w);
256 size_t
257 grapheme_to_uppercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
259 HERODOTUS_READER r;
260 HERODOTUS_WRITER w;
262 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
263 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
265 return to_case(&r, &w, 0, upper_major, upper_minor, upper_special);
268 size_t
269 grapheme_to_lowercase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
271 HERODOTUS_READER r;
272 HERODOTUS_WRITER w;
274 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
275 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
277 return to_case(&r, &w, 0, lower_major, lower_minor, lower_special);
280 size_t
281 grapheme_to_titlecase_utf8(const char *src, size_t srclen, char *dest, size_t destlen)
283 HERODOTUS_READER r;
284 HERODOTUS_WRITER w;
286 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
287 herodotus_writer_init(&w, HERODOTUS_TYPE_UTF8, dest, destlen);
289 return to_titlecase(&r, &w);
292 static inline bool
293 is_case(HERODOTUS_READER *r, const uint_least16_t *major,
294 const int_least32_t *minor, const struct special_case *sc,
295 size_t *output)
297 size_t off, i;
298 bool ret = true;
299 uint_least32_t cp;
300 int_least32_t map;
302 for (; herodotus_read_codepoint(r, false, &cp) == HERODOTUS_STATUS_SUCCESS;) {
303 /* get and handle case mapping */
304 if (unlikely((map = get_case_offset(cp, major, minor)) >=
305 INT32_C(0x110000))) {
306 /* we have a special case and the offset in the sc-array
307 * is the difference to 0x110000*/
308 off = (uint_least32_t)map - UINT32_C(0x110000);
310 for (i = 0; i < sc[off].cplen; i++) {
311 if (herodotus_read_codepoint(r, false, &cp) ==
312 HERODOTUS_STATUS_SUCCESS) {
313 if (cp != sc[off].cp[i]) {
314 ret = false;
315 goto done;
316 } else {
317 /* move forward */
318 herodotus_read_codepoint(r, true, &cp);
320 } else {
321 /*
322 * input ended and we didn't see
323 * any difference so far, so this
324 * string is in fact okay
325 */
326 ret = true;
327 goto done;
330 } else {
331 /* we have a simple mapping */
332 if (cp != (uint_least32_t)((int_least32_t)cp + map)) {
333 /* we have a difference */
334 ret = false;
335 goto done;
336 } else {
337 /* move forward */
338 herodotus_read_codepoint(r, true, &cp);
342 done:
343 if (output) {
344 *output = herodotus_reader_number_read(r);
346 return ret;
349 static inline bool
350 is_titlecase(HERODOTUS_READER *r, size_t *output)
352 enum case_property prop;
353 enum herodotus_status s;
354 bool ret = true;
355 uint_least32_t cp;
356 size_t nwb;
358 for (; (nwb = herodotus_next_word_break(r)) > 0;) {
359 herodotus_reader_push_advance_limit(r, nwb);
360 for (; (s = herodotus_read_codepoint(r, false, &cp)) == HERODOTUS_STATUS_SUCCESS;) {
361 /* check if we have a cased character */
362 prop = get_case_property(cp);
363 if (prop == CASE_PROP_CASED ||
364 prop == CASE_PROP_BOTH_CASED_CASE_IGNORABLE) {
365 break;
366 } else {
367 /* increment reader */
368 herodotus_read_codepoint(r, true, &cp);
372 if (s == HERODOTUS_STATUS_END_OF_BUFFER) {
373 /* we are done */
374 break;
375 } else if (s == HERODOTUS_STATUS_SOFT_LIMIT_REACHED) {
376 /*
377 * we did not encounter any cased character
378 * up to the word break
379 */
380 herodotus_reader_pop_limit(r);
381 continue;
382 } else {
383 /*
384 * we encountered a cased character before the word
385 * break, check if it's titlecase
386 */
387 herodotus_reader_push_advance_limit(r,
388 herodotus_reader_next_codepoint_break(r));
389 if (!is_case(r, title_major, title_minor, title_special, NULL)) {
390 ret = false;
391 goto done;
393 herodotus_reader_pop_limit(r);
396 /* check if the rest of the codepoints in the word are lowercase */
397 if (!is_case(r, lower_major, lower_minor, lower_special, NULL)) {
398 ret = false;
399 goto done;
402 /* remove the limit on the word before the next iteration */
403 herodotus_reader_pop_limit(r);
405 done:
406 if (output) {
407 *output = herodotus_reader_number_read(r);
409 return ret;
412 bool
413 grapheme_is_uppercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
415 HERODOTUS_READER r;
417 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
419 return is_case(&r, upper_major, upper_minor, upper_special, caselen);
422 bool
423 grapheme_is_lowercase(const uint_least32_t *src, size_t srclen, size_t *caselen)
425 HERODOTUS_READER r;
427 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
429 return is_case(&r, lower_major, lower_minor, lower_special, caselen);
432 bool
433 grapheme_is_titlecase(const uint_least32_t *src, size_t srclen, size_t *caselen)
435 HERODOTUS_READER r;
437 herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, src, srclen);
439 return is_titlecase(&r, caselen);
442 bool
443 grapheme_is_uppercase_utf8(const char *src, size_t srclen, size_t *caselen)
445 HERODOTUS_READER r;
447 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
449 return is_case(&r, upper_major, upper_minor, upper_special, caselen);
452 bool
453 grapheme_is_lowercase_utf8(const char *src, size_t srclen, size_t *caselen)
455 HERODOTUS_READER r;
457 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
459 return is_case(&r, lower_major, lower_minor, lower_special, caselen);
462 bool
463 grapheme_is_titlecase_utf8(const char *src, size_t srclen, size_t *caselen)
465 HERODOTUS_READER r;
467 herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, src, srclen);
469 return is_titlecase(&r, caselen);