Blob


1 /* See LICENSE file for copyright and license details. */
2 #include <limits.h>
3 #include <stdbool.h>
4 #include <stddef.h>
5 #include <stdint.h>
7 #include "../gen/types.h"
8 #include "../grapheme.h"
9 #include "util.h"
11 void
12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
13 const void *src, size_t srclen)
14 {
15 size_t i;
17 r->type = type;
18 r->src = src;
19 r->srclen = srclen;
20 r->off = 0;
21 r->terminated_by_null = false;
23 for (i = 0; i < LEN(r->soft_limit); i++) {
24 r->soft_limit[i] = SIZE_MAX;
25 }
26 }
28 void
29 herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
30 {
31 size_t i;
33 /*
34 * we copy such that we have a "fresh" start and build on the
35 * fact that src->soft_limit[i] for any i and src->srclen are
36 * always larger or equal to src->off
37 */
38 dest->type = src->type;
39 if (src->type == HERODOTUS_TYPE_CODEPOINT) {
40 dest->src = (src->src == NULL) ? NULL :
41 ((const uint_least32_t *)(src->src)) + src->off;
42 } else { /* src->type == HERODOTUS_TYPE_UTF8 */
43 dest->src = (src->src == NULL) ? NULL :
44 ((const char *)(src->src)) + src->off;
45 }
46 if (src->srclen == SIZE_MAX) {
47 dest->srclen = SIZE_MAX;
48 } else {
49 dest->srclen = (src->off < src->srclen) ? src->srclen - src->off : 0;
50 }
51 dest->off = 0;
52 dest->terminated_by_null = src->terminated_by_null;
54 for (i = 0; i < LEN(src->soft_limit); i++) {
55 if (src->soft_limit[i] == SIZE_MAX) {
56 dest->soft_limit[i] = SIZE_MAX;
57 } else {
58 /*
59 * if we have a degenerate case where the offset is
60 * higher than the soft-limit, we simply clamp the
61 * soft-limit to zero given we can't decide here
62 * to release the limit and, instead, we just
63 * prevent any more reads
64 */
65 dest->soft_limit[i] = (src->off < src->soft_limit[i]) ?
66 src->soft_limit[i] - src->off : 0;
67 }
68 }
69 }
71 void
72 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
73 {
74 size_t i;
76 for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
77 r->soft_limit[i] = r->soft_limit[i - 1];
78 }
79 r->soft_limit[0] = r->off + count;
80 }
82 void
83 herodotus_reader_pop_limit(HERODOTUS_READER *r)
84 {
85 size_t i;
87 for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
88 r->soft_limit[i] = r->soft_limit[i + 1];
89 }
90 r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
91 }
93 size_t
94 herodotus_reader_next_word_break(const HERODOTUS_READER *r)
95 {
96 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
97 return grapheme_next_word_break(
98 (const uint_least32_t *)(r->src) + r->off,
99 MIN(r->srclen, r->soft_limit[0]) - r->off);
100 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
101 return grapheme_next_word_break_utf8(
102 (const char *)(r->src) + r->off,
103 MIN(r->srclen, r->soft_limit[0]) - r->off);
107 size_t
108 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
110 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
111 return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
112 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
113 return grapheme_decode_utf8(
114 (const char *)(r->src) + r->off,
115 MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
119 size_t
120 herodotus_reader_number_read(const HERODOTUS_READER *r)
122 return r->off;
125 enum herodotus_status
126 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
128 size_t ret;
130 if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
131 *cp = GRAPHEME_INVALID_CODEPOINT;
132 return HERODOTUS_STATUS_END_OF_BUFFER;
135 if (r->off >= r->soft_limit[0]) {
136 *cp = GRAPHEME_INVALID_CODEPOINT;
137 return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
140 if (r->type == HERODOTUS_TYPE_CODEPOINT) {
141 *cp = ((const uint_least32_t *)(r->src))[r->off];
142 ret = 1;
143 } else { /* r->type == HERODOTUS_TYPE_UTF8 */
144 ret = grapheme_decode_utf8((const char *)r->src + r->off,
145 MIN(r->srclen, r->soft_limit[0]) -
146 r->off, cp);
149 if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
150 /*
151 * We encountered a null-codepoint. Don't increment
152 * offset and return as if the buffer had ended here all
153 * along
154 */
155 r->terminated_by_null = true;
156 return HERODOTUS_STATUS_END_OF_BUFFER;
159 if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
160 /*
161 * we want more than we have; instead of returning
162 * garbage we terminate here.
163 */
164 return HERODOTUS_STATUS_END_OF_BUFFER;
167 /*
168 * Increase offset which we now know won't surpass the limits,
169 * unless we got told otherwise
170 */
171 if (advance) {
172 r->off += ret;
175 return HERODOTUS_STATUS_SUCCESS;
178 void
179 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type,
180 void *dest, size_t destlen)
182 w->type = type;
183 w->dest = dest;
184 w->destlen = destlen;
185 w->off = 0;
186 w->first_unwritable_offset = SIZE_MAX;
189 void
190 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
192 if (w->dest == NULL) {
193 return;
196 if (w->off < w->destlen) {
197 /* We still have space in the buffer. Simply use it */
198 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
199 ((uint_least32_t *)(w->dest))[w->off] = 0;
200 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
201 ((char *)(w->dest))[w->off] = '\0';
203 } else if (w->first_unwritable_offset < w->destlen) {
204 /*
205 * There is no more space in the buffer. However,
206 * we have noted down the first offset we couldn't
207 * use to write into the buffer and it's smaller than
208 * destlen. Thus we bailed writing into the
209 * destination when a multibyte-codepoint couldn't be
210 * written. So the last "real" byte might be at
211 * destlen-4, destlen-3, destlen-2 or destlen-1
212 * (the last case meaning truncation).
213 */
214 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
215 ((uint_least32_t *)(w->dest))
216 [w->first_unwritable_offset] = 0;
217 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
218 ((char *)(w->dest))[w->first_unwritable_offset] = '\0';
220 } else if (w->destlen > 0) {
221 /*
222 * In this case, there is no more space in the buffer and
223 * the last unwritable offset is larger than
224 * or equal to the destination buffer length. This means
225 * that we are forced to simply write into the last
226 * byte.
227 */
228 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
229 ((uint_least32_t *)(w->dest))
230 [w->destlen - 1] = 0;
231 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
232 ((char *)(w->dest))[w->destlen - 1] = '\0';
236 /* w->off is not incremented in any case */
239 size_t
240 herodotus_writer_number_written(const HERODOTUS_WRITER *w)
242 return w->off;
245 void
246 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
248 size_t ret;
250 /*
251 * This function will always faithfully say how many codepoints
252 * were written, even if the buffer ends. This is used to enable
253 * truncation detection.
254 */
255 if (w->type == HERODOTUS_TYPE_CODEPOINT) {
256 if (w->dest != NULL && w->off < w->destlen) {
257 ((uint_least32_t *)(w->dest))[w->off] = cp;
260 w->off += 1;
261 } else { /* w->type == HERODOTUS_TYPE_UTF8 */
262 /*
263 * First determine how many bytes we need to encode the
264 * codepoint
265 */
266 ret = grapheme_encode_utf8(cp, NULL, 0);
268 if (w->dest != NULL && w->off + ret < w->destlen) {
269 /* we still have enough room in the buffer */
270 grapheme_encode_utf8(cp, (char *)(w->dest) +
271 w->off, w->destlen - w->off);
272 } else if (w->first_unwritable_offset == SIZE_MAX) {
273 /*
274 * the first unwritable offset has not been
275 * noted down, so this is the first time we can't
276 * write (completely) to an offset
277 */
278 w->first_unwritable_offset = w->off;
281 w->off += ret;
285 void
286 proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
287 uint_least8_t (*get_break_prop)(uint_least32_t),
288 bool (*is_skippable_prop)(uint_least8_t),
289 void (*skip_shift_callback)(uint_least8_t, void *),
290 struct proper *p)
292 uint_least8_t prop;
293 uint_least32_t cp;
294 size_t i;
296 /* set internal variables */
297 p->state = state;
298 p->no_prop = no_prop;
299 p->get_break_prop = get_break_prop;
300 p->is_skippable_prop = is_skippable_prop;
301 p->skip_shift_callback = skip_shift_callback;
303 /*
304 * Initialize mid-reader, which is basically just there
305 * to reflect the current position of the viewing-line
306 */
307 herodotus_reader_copy(r, &(p->mid_reader));
309 /*
310 * In the initialization, we simply (try to) fill in next_prop.
311 * If we cannot read in more (due to the buffer ending), we
312 * fill in the prop as invalid
313 */
315 /*
316 * initialize the previous properties to have no property
317 * (given we are at the start of the buffer)
318 */
319 p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
320 p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
322 /*
323 * initialize the next properties
324 */
326 /* initialize the raw reader */
327 herodotus_reader_copy(r, &(p->raw_reader));
329 /* fill in the two next raw properties (after no-initialization) */
330 p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
331 for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
332 HERODOTUS_STATUS_SUCCESS; ) {
333 p->raw.next_prop[i++] = p->get_break_prop(cp);
336 /* initialize the skip reader */
337 herodotus_reader_copy(r, &(p->skip_reader));
339 /* fill in the two next skip properties (after no-initialization) */
340 p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
341 for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
342 HERODOTUS_STATUS_SUCCESS; ) {
343 prop = p->get_break_prop(cp);
344 if (!p->is_skippable_prop(prop)) {
345 p->skip.next_prop[i++] = prop;
350 int
351 proper_advance(struct proper *p)
353 uint_least8_t prop;
354 uint_least32_t cp;
356 /* read in next "raw" property */
357 if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
358 HERODOTUS_STATUS_SUCCESS) {
359 prop = p->get_break_prop(cp);
360 } else {
361 prop = p->no_prop;
364 /*
365 * do a shift-in, unless we find that the property that is to
366 * be moved past the "raw-viewing-line" (this property is stored
367 * in p->raw.next_prop[0]) is a no_prop, indicating that
368 * we are at the end of the buffer.
369 */
370 if (p->raw.next_prop[0] == p->no_prop) {
371 return 1;
374 /* shift in the properties */
375 p->raw.prev_prop[1] = p->raw.prev_prop[0];
376 p->raw.prev_prop[0] = p->raw.next_prop[0];
377 p->raw.next_prop[0] = p->raw.next_prop[1];
378 p->raw.next_prop[1] = prop;
380 /* advance the middle reader viewing-line */
381 (void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
383 /* check skippability-property */
384 if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
385 /*
386 * the property that has moved past the "raw-viewing-line"
387 * (this property is now (after the raw-shift) stored in
388 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
389 * guaranteeing that we won't shift a no-prop past the
390 * "viewing-line" in the skip-properties) is not a skippable
391 * property, thus we need to shift the skip property as well.
392 */
393 p->skip.prev_prop[1] = p->skip.prev_prop[0];
394 p->skip.prev_prop[0] = p->skip.next_prop[0];
395 p->skip.next_prop[0] = p->skip.next_prop[1];
397 /*
398 * call the skip-shift-callback on the property that
399 * passed the skip-viewing-line (this property is now
400 * stored in p->skip.prev_prop[0]).
401 */
402 p->skip_shift_callback(p->skip.prev_prop[0], p->state);
404 /* determine the next shift property */
405 p->skip.next_prop[1] = p->no_prop;
406 while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
407 HERODOTUS_STATUS_SUCCESS) {
408 prop = p->get_break_prop(cp);
409 if (!p->is_skippable_prop(prop)) {
410 p->skip.next_prop[1] = prop;
411 break;
416 return 0;