/* See LICENSE file for copyright and license details. */ #include #include #include #include #include "../gen/types.h" #include "../grapheme.h" #include "util.h" void herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type, const void *src, size_t srclen) { size_t i; r->type = type; r->src = src; r->srclen = srclen; r->off = 0; r->terminated_by_null = false; for (i = 0; i < LEN(r->soft_limit); i++) { r->soft_limit[i] = SIZE_MAX; } } void herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest) { size_t i; /* * we copy such that we have a "fresh" start and build on the * fact that src->soft_limit[i] for any i and src->srclen are * always larger or equal to src->off */ dest->type = src->type; if (src->type == HERODOTUS_TYPE_CODEPOINT) { dest->src = (src->src == NULL) ? NULL : ((const uint_least32_t *)(src->src)) + src->off; } else { /* src->type == HERODOTUS_TYPE_UTF8 */ dest->src = (src->src == NULL) ? NULL : ((const char *)(src->src)) + src->off; } if (src->srclen == SIZE_MAX) { dest->srclen = SIZE_MAX; } else { dest->srclen = (src->off < src->srclen) ? src->srclen - src->off : 0; } dest->off = 0; dest->terminated_by_null = src->terminated_by_null; for (i = 0; i < LEN(src->soft_limit); i++) { if (src->soft_limit[i] == SIZE_MAX) { dest->soft_limit[i] = SIZE_MAX; } else { /* * if we have a degenerate case where the offset is * higher than the soft-limit, we simply clamp the * soft-limit to zero given we can't decide here * to release the limit and, instead, we just * prevent any more reads */ dest->soft_limit[i] = (src->off < src->soft_limit[i]) ? src->soft_limit[i] - src->off : 0; } } } void herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count) { size_t i; for (i = LEN(r->soft_limit) - 1; i >= 1; i--) { r->soft_limit[i] = r->soft_limit[i - 1]; } r->soft_limit[0] = r->off + count; } void herodotus_reader_pop_limit(HERODOTUS_READER *r) { size_t i; for (i = 0; i < LEN(r->soft_limit) - 1; i++) { r->soft_limit[i] = r->soft_limit[i + 1]; } r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX; } size_t herodotus_reader_next_word_break(const HERODOTUS_READER *r) { if (r->type == HERODOTUS_TYPE_CODEPOINT) { return grapheme_next_word_break( (const uint_least32_t *)(r->src) + r->off, MIN(r->srclen, r->soft_limit[0]) - r->off); } else { /* r->type == HERODOTUS_TYPE_UTF8 */ return grapheme_next_word_break_utf8( (const char *)(r->src) + r->off, MIN(r->srclen, r->soft_limit[0]) - r->off); } } size_t herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r) { if (r->type == HERODOTUS_TYPE_CODEPOINT) { return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0; } else { /* r->type == HERODOTUS_TYPE_UTF8 */ return grapheme_decode_utf8( (const char *)(r->src) + r->off, MIN(r->srclen, r->soft_limit[0]) - r->off, NULL); } } size_t herodotus_reader_number_read(const HERODOTUS_READER *r) { return r->off; } enum herodotus_status herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp) { size_t ret; if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) { *cp = GRAPHEME_INVALID_CODEPOINT; return HERODOTUS_STATUS_END_OF_BUFFER; } if (r->off >= r->soft_limit[0]) { *cp = GRAPHEME_INVALID_CODEPOINT; return HERODOTUS_STATUS_SOFT_LIMIT_REACHED; } if (r->type == HERODOTUS_TYPE_CODEPOINT) { *cp = ((const uint_least32_t *)(r->src))[r->off]; ret = 1; } else { /* r->type == HERODOTUS_TYPE_UTF8 */ ret = grapheme_decode_utf8((const char *)r->src + r->off, MIN(r->srclen, r->soft_limit[0]) - r->off, cp); } if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) { /* * We encountered a null-codepoint. Don't increment * offset and return as if the buffer had ended here all * along */ r->terminated_by_null = true; return HERODOTUS_STATUS_END_OF_BUFFER; } if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) { /* * we want more than we have; instead of returning * garbage we terminate here. */ return HERODOTUS_STATUS_END_OF_BUFFER; } /* * Increase offset which we now know won't surpass the limits, * unless we got told otherwise */ if (advance) { r->off += ret; } return HERODOTUS_STATUS_SUCCESS; } void herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type, void *dest, size_t destlen) { w->type = type; w->dest = dest; w->destlen = destlen; w->off = 0; w->first_unwritable_offset = SIZE_MAX; } void herodotus_writer_nul_terminate(HERODOTUS_WRITER *w) { if (w->dest == NULL) { return; } if (w->off < w->destlen) { /* We still have space in the buffer. Simply use it */ if (w->type == HERODOTUS_TYPE_CODEPOINT) { ((uint_least32_t *)(w->dest))[w->off] = 0; } else { /* w->type == HERODOTUS_TYPE_UTF8 */ ((char *)(w->dest))[w->off] = '\0'; } } else if (w->first_unwritable_offset < w->destlen) { /* * There is no more space in the buffer. However, * we have noted down the first offset we couldn't * use to write into the buffer and it's smaller than * destlen. Thus we bailed writing into the * destination when a multibyte-codepoint couldn't be * written. So the last "real" byte might be at * destlen-4, destlen-3, destlen-2 or destlen-1 * (the last case meaning truncation). */ if (w->type == HERODOTUS_TYPE_CODEPOINT) { ((uint_least32_t *)(w->dest)) [w->first_unwritable_offset] = 0; } else { /* w->type == HERODOTUS_TYPE_UTF8 */ ((char *)(w->dest))[w->first_unwritable_offset] = '\0'; } } else if (w->destlen > 0) { /* * In this case, there is no more space in the buffer and * the last unwritable offset is larger than * or equal to the destination buffer length. This means * that we are forced to simply write into the last * byte. */ if (w->type == HERODOTUS_TYPE_CODEPOINT) { ((uint_least32_t *)(w->dest)) [w->destlen - 1] = 0; } else { /* w->type == HERODOTUS_TYPE_UTF8 */ ((char *)(w->dest))[w->destlen - 1] = '\0'; } } /* w->off is not incremented in any case */ } size_t herodotus_writer_number_written(const HERODOTUS_WRITER *w) { return w->off; } void herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp) { size_t ret; /* * This function will always faithfully say how many codepoints * were written, even if the buffer ends. This is used to enable * truncation detection. */ if (w->type == HERODOTUS_TYPE_CODEPOINT) { if (w->dest != NULL && w->off < w->destlen) { ((uint_least32_t *)(w->dest))[w->off] = cp; } w->off += 1; } else { /* w->type == HERODOTUS_TYPE_UTF8 */ /* * First determine how many bytes we need to encode the * codepoint */ ret = grapheme_encode_utf8(cp, NULL, 0); if (w->dest != NULL && w->off + ret < w->destlen) { /* we still have enough room in the buffer */ grapheme_encode_utf8(cp, (char *)(w->dest) + w->off, w->destlen - w->off); } else if (w->first_unwritable_offset == SIZE_MAX) { /* * the first unwritable offset has not been * noted down, so this is the first time we can't * write (completely) to an offset */ w->first_unwritable_offset = w->off; } w->off += ret; } } void proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop, uint_least8_t (*get_break_prop)(uint_least32_t), bool (*is_skippable_prop)(uint_least8_t), void (*skip_shift_callback)(uint_least8_t, void *), struct proper *p) { uint_least8_t prop; uint_least32_t cp; size_t i; /* set internal variables */ p->state = state; p->no_prop = no_prop; p->get_break_prop = get_break_prop; p->is_skippable_prop = is_skippable_prop; p->skip_shift_callback = skip_shift_callback; /* * Initialize mid-reader, which is basically just there * to reflect the current position of the viewing-line */ herodotus_reader_copy(r, &(p->mid_reader)); /* * In the initialization, we simply (try to) fill in next_prop. * If we cannot read in more (due to the buffer ending), we * fill in the prop as invalid */ /* * initialize the previous properties to have no property * (given we are at the start of the buffer) */ p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop; p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop; /* * initialize the next properties */ /* initialize the raw reader */ herodotus_reader_copy(r, &(p->raw_reader)); /* fill in the two next raw properties (after no-initialization) */ p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop; for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) == HERODOTUS_STATUS_SUCCESS; ) { p->raw.next_prop[i++] = p->get_break_prop(cp); } /* initialize the skip reader */ herodotus_reader_copy(r, &(p->skip_reader)); /* fill in the two next skip properties (after no-initialization) */ p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop; for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) == HERODOTUS_STATUS_SUCCESS; ) { prop = p->get_break_prop(cp); if (!p->is_skippable_prop(prop)) { p->skip.next_prop[i++] = prop; } } } int proper_advance(struct proper *p) { uint_least8_t prop; uint_least32_t cp; /* read in next "raw" property */ if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) == HERODOTUS_STATUS_SUCCESS) { prop = p->get_break_prop(cp); } else { prop = p->no_prop; } /* * do a shift-in, unless we find that the property that is to * be moved past the "raw-viewing-line" (this property is stored * in p->raw.next_prop[0]) is a no_prop, indicating that * we are at the end of the buffer. */ if (p->raw.next_prop[0] == p->no_prop) { return 1; } /* shift in the properties */ p->raw.prev_prop[1] = p->raw.prev_prop[0]; p->raw.prev_prop[0] = p->raw.next_prop[0]; p->raw.next_prop[0] = p->raw.next_prop[1]; p->raw.next_prop[1] = prop; /* advance the middle reader viewing-line */ (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp); /* check skippability-property */ if (!p->is_skippable_prop(p->raw.prev_prop[0])) { /* * the property that has moved past the "raw-viewing-line" * (this property is now (after the raw-shift) stored in * p->raw.prev_prop[0] and guaranteed not to be a no-prop, * guaranteeing that we won't shift a no-prop past the * "viewing-line" in the skip-properties) is not a skippable * property, thus we need to shift the skip property as well. */ p->skip.prev_prop[1] = p->skip.prev_prop[0]; p->skip.prev_prop[0] = p->skip.next_prop[0]; p->skip.next_prop[0] = p->skip.next_prop[1]; /* * call the skip-shift-callback on the property that * passed the skip-viewing-line (this property is now * stored in p->skip.prev_prop[0]). */ p->skip_shift_callback(p->skip.prev_prop[0], p->state); /* determine the next shift property */ p->skip.next_prop[1] = p->no_prop; while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) == HERODOTUS_STATUS_SUCCESS) { prop = p->get_break_prop(cp); if (!p->is_skippable_prop(prop)) { p->skip.next_prop[1] = prop; break; } } } return 0; }