/* See LICENSE file for copyright and license details. */ #include #include #include #include "../gen/character.h" #include "../grapheme.h" #include "util.h" struct character_break_state { uint_least8_t prop; bool prop_set; bool gb11_flag; bool gb12_13_flag; }; static const uint_least16_t dont_break[NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_OTHER] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_CR] = UINT16_C(1) << CHAR_BREAK_PROP_LF, /* GB3 */ [CHAR_BREAK_PROP_EXTEND] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_L] = UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_L | /* GB6 */ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB6 */ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LV | /* GB6 */ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_LVT | /* GB6 */ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_V] = UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_T] = UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_LV] = UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_V | /* GB7 */ UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB7 */ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_HANGUL_LVT] = UINT16_C(1) << CHAR_BREAK_PROP_HANGUL_T | /* GB8 */ UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_PREPEND] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK | /* GB9a */ (UINT16_C(0xFFFF) & ~(UINT16_C(1) << CHAR_BREAK_PROP_CR | UINT16_C(1) << CHAR_BREAK_PROP_LF | UINT16_C(1) << CHAR_BREAK_PROP_CONTROL ) ), /* GB9b */ [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_SPACINGMARK] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ [CHAR_BREAK_PROP_ZWJ] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | /* GB9 */ UINT16_C(1) << CHAR_BREAK_PROP_SPACINGMARK, /* GB9a */ }; static const uint_least16_t flag_update_gb11[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC] = UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, [CHAR_BREAK_PROP_EXTEND + NUM_CHAR_BREAK_PROPS] = UINT16_C(1) << CHAR_BREAK_PROP_EXTEND | UINT16_C(1) << CHAR_BREAK_PROP_ZWJ, [CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC + NUM_CHAR_BREAK_PROPS] = UINT16_C(1) << CHAR_BREAK_PROP_ZWJ | UINT16_C(1) << CHAR_BREAK_PROP_EXTEND, }; static const uint_least16_t dont_break_gb11[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_ZWJ + NUM_CHAR_BREAK_PROPS] = UINT16_C(1) << CHAR_BREAK_PROP_EXTENDED_PICTOGRAPHIC, }; static const uint_least16_t flag_update_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_REGIONAL_INDICATOR] = UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, }; static const uint_least16_t dont_break_gb12_13[2 * NUM_CHAR_BREAK_PROPS] = { [CHAR_BREAK_PROP_REGIONAL_INDICATOR + NUM_CHAR_BREAK_PROPS] = UINT16_C(1) << CHAR_BREAK_PROP_REGIONAL_INDICATOR, }; static inline enum char_break_property get_break_prop(uint_least32_t cp) { if (likely(cp <= UINT32_C(0x10FFFF))) { return (enum char_break_property) char_break_minor[char_break_major[cp >> 8] + (cp & 0xFF)]; } else { return CHAR_BREAK_PROP_OTHER; } } static inline void state_serialize(const struct character_break_state *in, uint_least16_t *out) { *out = (uint_least16_t)(in->prop & UINT8_C(0xFF)) | /* first 8 bits */ (uint_least16_t)(((uint_least16_t)(in->prop_set)) << 8) | /* 9th bit */ (uint_least16_t)(((uint_least16_t)(in->gb11_flag)) << 9) | /* 10th bit */ (uint_least16_t)(((uint_least16_t)(in->gb12_13_flag)) << 10); /* 11th bit */ } static inline void state_deserialize(uint_least16_t in, struct character_break_state *out) { out->prop = in & UINT8_C(0xFF); out->prop_set = in & (UINT16_C(1) << 8); out->gb11_flag = in & (UINT16_C(1) << 9); out->gb12_13_flag = in & (UINT16_C(1) << 10); } bool grapheme_is_character_break(uint_least32_t cp0, uint_least32_t cp1, uint_least16_t *s) { struct character_break_state state; enum char_break_property cp0_prop, cp1_prop; bool notbreak = false; if (likely(s)) { state_deserialize(*s, &state); if (likely(state.prop_set)) { cp0_prop = state.prop; } else { cp0_prop = get_break_prop(cp0); } cp1_prop = get_break_prop(cp1); /* preserve prop of right codepoint for next iteration */ state.prop = (uint_least8_t)cp1_prop; state.prop_set = true; /* update flags */ state.gb11_flag = flag_update_gb11[cp0_prop + NUM_CHAR_BREAK_PROPS * state.gb11_flag] & UINT16_C(1) << cp1_prop; state.gb12_13_flag = flag_update_gb12_13[cp0_prop + NUM_CHAR_BREAK_PROPS * state.gb12_13_flag] & UINT16_C(1) << cp1_prop; /* * Apply grapheme cluster breaking algorithm (UAX #29), see * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules */ notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || (dont_break_gb11[cp0_prop + state.gb11_flag * NUM_CHAR_BREAK_PROPS] & (UINT16_C(1) << cp1_prop)) || (dont_break_gb12_13[cp0_prop + state.gb12_13_flag * NUM_CHAR_BREAK_PROPS] & (UINT16_C(1) << cp1_prop)); /* update or reset flags (when we have a break) */ if (likely(!notbreak)) { state.gb11_flag = state.gb12_13_flag = false; } state_serialize(&state, s); } else { cp0_prop = get_break_prop(cp0); cp1_prop = get_break_prop(cp1); /* * Apply grapheme cluster breaking algorithm (UAX #29), see * http://unicode.org/reports/tr29/#Grapheme_Cluster_Boundary_Rules * * Given we have no state, this behaves as if the state-booleans * were all set to false */ notbreak = (dont_break[cp0_prop] & (UINT16_C(1) << cp1_prop)) || (dont_break_gb11[cp0_prop] & (UINT16_C(1) << cp1_prop)) || (dont_break_gb12_13[cp0_prop] & (UINT16_C(1) << cp1_prop)); } return !notbreak; } static size_t next_character_break(HERODOTUS_READER *r) { uint_least16_t state = 0; uint_least32_t cp0 = 0, cp1 = 0; for (herodotus_read_codepoint(r, true, &cp0); herodotus_read_codepoint(r, false, &cp1) == HERODOTUS_STATUS_SUCCESS; herodotus_read_codepoint(r, true, &cp0)) { if (grapheme_is_character_break(cp0, cp1, &state)) { break; } } return herodotus_reader_number_read(r); } size_t grapheme_next_character_break(const uint_least32_t *str, size_t len) { HERODOTUS_READER r; herodotus_reader_init(&r, HERODOTUS_TYPE_CODEPOINT, str, len); return next_character_break(&r); } size_t grapheme_next_character_break_utf8(const char *str, size_t len) { HERODOTUS_READER r; herodotus_reader_init(&r, HERODOTUS_TYPE_UTF8, str, len); return next_character_break(&r); }