op public repos

Blob

Date:: Wed Nov 2 20:01:35 2022 UTC
Message:: bundle libgrapheme 2.0.2 in case it's not available
Actions:: History | Blame | Raw File
1 /* See LICENSE file for copyright and license details. */
2 #include <limits.h>
3 #include <stdbool.h>
4 #include <stddef.h>
5 #include <stdint.h>
6 
7 #include "../gen/types.h"
8 #include "../grapheme.h"
9 #include "util.h"
10 
11 void
12 herodotus_reader_init(HERODOTUS_READER *r, enum herodotus_type type,
13                       const void *src, size_t srclen)
14 {
15 	size_t i;
16 
17 	r->type = type;
18 	r->src = src;
19 	r->srclen = srclen;
20 	r->off = 0;
21 	r->terminated_by_null = false;
22 
23 	for (i = 0; i < LEN(r->soft_limit); i++) {
24 		r->soft_limit[i] = SIZE_MAX;
25 	}
26 }
27 
28 void
29 herodotus_reader_copy(const HERODOTUS_READER *src, HERODOTUS_READER *dest)
30 {
31 	size_t i;
32 
33 	/*
34 	 * we copy such that we have a "fresh" start and build on the
35 	 * fact that src->soft_limit[i] for any i and src->srclen are
36 	 * always larger or equal to src->off
37 	 */
38 	dest->type = src->type;
39 	if (src->type == HERODOTUS_TYPE_CODEPOINT) {
40 		dest->src = (src->src == NULL) ? NULL :
41 		            ((const uint_least32_t *)(src->src)) + src->off;
42 	} else { /* src->type == HERODOTUS_TYPE_UTF8 */
43 		dest->src = (src->src == NULL) ? NULL :
44 		            ((const char *)(src->src)) + src->off;
45 	}
46 	if (src->srclen == SIZE_MAX) {
47 		dest->srclen = SIZE_MAX;
48 	} else {
49 		dest->srclen = (src->off < src->srclen) ? src->srclen - src->off : 0;
50 	}
51 	dest->off = 0;
52 	dest->terminated_by_null = src->terminated_by_null;
53 
54 	for (i = 0; i < LEN(src->soft_limit); i++) {
55 		if (src->soft_limit[i] == SIZE_MAX) {
56 			dest->soft_limit[i] = SIZE_MAX;
57 		} else {
58 			/*
59 			 * if we have a degenerate case where the offset is
60 			 * higher than the soft-limit, we simply clamp the
61 			 * soft-limit to zero given we can't decide here
62 			 * to release the limit and, instead, we just
63 			 * prevent any more reads
64 			 */
65 			dest->soft_limit[i] = (src->off < src->soft_limit[i]) ?
66 				src->soft_limit[i] - src->off : 0;
67 		}
68 	}
69 }
70 
71 void
72 herodotus_reader_push_advance_limit(HERODOTUS_READER *r, size_t count)
73 {
74 	size_t i;
75 
76 	for (i = LEN(r->soft_limit) - 1; i >= 1; i--) {
77 		r->soft_limit[i] = r->soft_limit[i - 1];
78 	}
79 	r->soft_limit[0] = r->off + count;
80 }
81 
82 void
83 herodotus_reader_pop_limit(HERODOTUS_READER *r)
84 {
85 	size_t i;
86 
87 	for (i = 0; i < LEN(r->soft_limit) - 1; i++) {
88 		r->soft_limit[i] = r->soft_limit[i + 1];
89 	}
90 	r->soft_limit[LEN(r->soft_limit) - 1] = SIZE_MAX;
91 }
92 
93 size_t
94 herodotus_reader_next_word_break(const HERODOTUS_READER *r)
95 {
96 	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
97 		return grapheme_next_word_break(
98 			(const uint_least32_t *)(r->src) + r->off,
99 			MIN(r->srclen, r->soft_limit[0]) - r->off);
100 	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
101 		return grapheme_next_word_break_utf8(
102 			(const char *)(r->src) + r->off,
103 			MIN(r->srclen, r->soft_limit[0]) - r->off);
104 	}
105 }
106 
107 size_t
108 herodotus_reader_next_codepoint_break(const HERODOTUS_READER *r)
109 {
110 	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
111 		return (r->off < MIN(r->srclen, r->soft_limit[0])) ? 1 : 0;
112 	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
113 		return grapheme_decode_utf8(
114 			(const char *)(r->src) + r->off,
115 			MIN(r->srclen, r->soft_limit[0]) - r->off, NULL);
116 	}
117 }
118 
119 size_t
120 herodotus_reader_number_read(const HERODOTUS_READER *r)
121 {
122 	return r->off;
123 }
124 
125 enum herodotus_status
126 herodotus_read_codepoint(HERODOTUS_READER *r, bool advance, uint_least32_t *cp)
127 {
128 	size_t ret;
129 
130 	if (r->terminated_by_null || r->off >= r->srclen || r->src == NULL) {
131 		*cp = GRAPHEME_INVALID_CODEPOINT;
132 		return HERODOTUS_STATUS_END_OF_BUFFER;
133 	}
134 
135 	if (r->off >= r->soft_limit[0]) {
136 		*cp = GRAPHEME_INVALID_CODEPOINT;
137 		return HERODOTUS_STATUS_SOFT_LIMIT_REACHED;
138 	}
139 
140 	if (r->type == HERODOTUS_TYPE_CODEPOINT) {
141 		*cp = ((const uint_least32_t *)(r->src))[r->off];
142 		ret = 1;
143 	} else { /* r->type == HERODOTUS_TYPE_UTF8 */
144 		ret = grapheme_decode_utf8((const char *)r->src + r->off,
145 		                           MIN(r->srclen, r->soft_limit[0]) -
146 		                           r->off, cp);
147 	}
148 
149 	if (unlikely(r->srclen == SIZE_MAX && *cp == 0)) {
150 		/*
151 		 * We encountered a null-codepoint. Don't increment
152 		 * offset and return as if the buffer had ended here all
153 		 * along
154 		 */
155 		r->terminated_by_null = true;
156 		return HERODOTUS_STATUS_END_OF_BUFFER;
157 	}
158 
159 	if (r->off + ret > MIN(r->srclen, r->soft_limit[0])) {
160 		/*
161 		 * we want more than we have; instead of returning
162 		 * garbage we terminate here.
163 		 */
164 		return HERODOTUS_STATUS_END_OF_BUFFER;
165 	}
166 
167 	/*
168 	 * Increase offset which we now know won't surpass the limits,
169 	 * unless we got told otherwise
170 	 */
171 	if (advance) {
172 		r->off += ret;
173 	}
174 
175 	return HERODOTUS_STATUS_SUCCESS;
176 }
177 
178 void
179 herodotus_writer_init(HERODOTUS_WRITER *w, enum herodotus_type type,
180                       void *dest, size_t destlen)
181 {
182 	w->type = type;
183 	w->dest = dest;
184 	w->destlen = destlen;
185 	w->off = 0;
186 	w->first_unwritable_offset = SIZE_MAX;
187 }
188 
189 void
190 herodotus_writer_nul_terminate(HERODOTUS_WRITER *w)
191 {
192 	if (w->dest == NULL) {
193 		return;
194 	}
195 
196 	if (w->off < w->destlen) {
197 		/* We still have space in the buffer. Simply use it */
198 		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
199 			((uint_least32_t *)(w->dest))[w->off] = 0;
200 		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
201 			((char *)(w->dest))[w->off] = '\0';
202 		}
203 	} else if (w->first_unwritable_offset < w->destlen) {
204 		/*
205 		 * There is no more space in the buffer. However,
206 		 * we have noted down the first offset we couldn't
207 		 * use to write into the buffer and it's smaller than
208 		 * destlen. Thus we bailed writing into the
209 		 * destination when a multibyte-codepoint couldn't be
210 		 * written. So the last "real" byte might be at
211 		 * destlen-4, destlen-3, destlen-2 or destlen-1
212 		 * (the last case meaning truncation).
213 		 */
214 		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
215 			((uint_least32_t *)(w->dest))
216 				[w->first_unwritable_offset] = 0;
217 		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
218 			((char *)(w->dest))[w->first_unwritable_offset] = '\0';
219 		}
220 	} else if (w->destlen > 0) {
221 		/*
222 		 * In this case, there is no more space in the buffer and
223 		 * the last unwritable offset is larger than
224 		 * or equal to the destination buffer length. This means
225 		 * that we are forced to simply write into the last
226 		 * byte.
227 		 */
228 		if (w->type == HERODOTUS_TYPE_CODEPOINT) {
229 			((uint_least32_t *)(w->dest))
230 				[w->destlen - 1] = 0;
231 		} else { /* w->type == HERODOTUS_TYPE_UTF8 */
232 			((char *)(w->dest))[w->destlen - 1] = '\0';
233 		}
234 	}
235 
236 	/* w->off is not incremented in any case */
237 }
238 
239 size_t
240 herodotus_writer_number_written(const HERODOTUS_WRITER *w)
241 {
242 	return w->off;
243 }
244 
245 void
246 herodotus_write_codepoint(HERODOTUS_WRITER *w, uint_least32_t cp)
247 {
248 	size_t ret;
249 
250 	/*
251 	 * This function will always faithfully say how many codepoints
252 	 * were written, even if the buffer ends. This is used to enable
253 	 * truncation detection.
254 	 */
255 	if (w->type == HERODOTUS_TYPE_CODEPOINT) {
256 		if (w->dest != NULL && w->off < w->destlen) {
257 			((uint_least32_t *)(w->dest))[w->off] = cp;
258 		}
259 
260 		w->off += 1;
261 	} else { /* w->type == HERODOTUS_TYPE_UTF8 */
262 		/*
263 		 * First determine how many bytes we need to encode the
264 		 * codepoint
265 		 */
266 		ret = grapheme_encode_utf8(cp, NULL, 0);
267 
268 		if (w->dest != NULL && w->off + ret < w->destlen) {
269 			/* we still have enough room in the buffer */
270 			grapheme_encode_utf8(cp, (char *)(w->dest) +
271 			                     w->off, w->destlen - w->off);
272 		} else if (w->first_unwritable_offset == SIZE_MAX) {
273 			/*
274 			 * the first unwritable offset has not been
275 			 * noted down, so this is the first time we can't
276 			 * write (completely) to an offset
277 			 */
278 			w->first_unwritable_offset = w->off;
279 		}
280 
281 		w->off += ret;
282 	}
283 }
284 
285 void
286 proper_init(const HERODOTUS_READER *r, void *state, uint_least8_t no_prop,
287             uint_least8_t (*get_break_prop)(uint_least32_t),
288             bool (*is_skippable_prop)(uint_least8_t),
289             void (*skip_shift_callback)(uint_least8_t, void *),
290             struct proper *p)
291 {
292 	uint_least8_t prop;
293 	uint_least32_t cp;
294 	size_t i;
295 
296 	/* set internal variables */
297 	p->state = state;
298 	p->no_prop = no_prop;
299 	p->get_break_prop = get_break_prop;
300 	p->is_skippable_prop = is_skippable_prop;
301 	p->skip_shift_callback = skip_shift_callback;
302 
303 	/*
304 	 * Initialize mid-reader, which is basically just there
305 	 * to reflect the current position of the viewing-line
306 	 */
307 	herodotus_reader_copy(r, &(p->mid_reader));
308 
309 	/*
310 	 * In the initialization, we simply (try to) fill in next_prop.
311 	 * If we cannot read in more (due to the buffer ending), we
312 	 * fill in the prop as invalid
313 	 */
314 
315 	/*
316 	 * initialize the previous properties to have no property
317 	 * (given we are at the start of the buffer)
318 	 */
319 	p->raw.prev_prop[1] = p->raw.prev_prop[0] = p->no_prop;
320 	p->skip.prev_prop[1] = p->skip.prev_prop[0] = p->no_prop;
321 
322 	/*
323 	 * initialize the next properties
324 	 */
325 
326 	/* initialize the raw reader */
327 	herodotus_reader_copy(r, &(p->raw_reader));
328 
329 	/* fill in the two next raw properties (after no-initialization) */
330 	p->raw.next_prop[0] = p->raw.next_prop[1] = p->no_prop;
331 	for (i = 0; i < 2 && herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
332 	     HERODOTUS_STATUS_SUCCESS; ) {
333 		p->raw.next_prop[i++] = p->get_break_prop(cp);
334 	}
335 
336 	/* initialize the skip reader */
337 	herodotus_reader_copy(r, &(p->skip_reader));
338 
339 	/* fill in the two next skip properties (after no-initialization) */
340 	p->skip.next_prop[0] = p->skip.next_prop[1] = p->no_prop;
341 	for (i = 0; i < 2 && herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
342 	     HERODOTUS_STATUS_SUCCESS; ) {
343 		prop = p->get_break_prop(cp);
344 		if (!p->is_skippable_prop(prop)) {
345 			p->skip.next_prop[i++] = prop;
346 		}
347 	}
348 }
349 
350 int
351 proper_advance(struct proper *p)
352 {
353 	uint_least8_t prop;
354 	uint_least32_t cp;
355 
356 	/* read in next "raw" property */
357 	if (herodotus_read_codepoint(&(p->raw_reader), true, &cp) ==
358 	    HERODOTUS_STATUS_SUCCESS) {
359 		prop = p->get_break_prop(cp);
360 	} else {
361 		prop = p->no_prop;
362 	}
363 
364 	/*
365 	 * do a shift-in, unless we find that the property that is to
366 	 * be moved past the "raw-viewing-line" (this property is stored
367 	 * in p->raw.next_prop[0]) is a no_prop, indicating that
368 	 * we are at the end of the buffer.
369 	 */
370 	if (p->raw.next_prop[0] == p->no_prop) {
371 		return 1;
372 	}
373 
374 	/* shift in the properties */
375 	p->raw.prev_prop[1] = p->raw.prev_prop[0];
376 	p->raw.prev_prop[0] = p->raw.next_prop[0];
377 	p->raw.next_prop[0] = p->raw.next_prop[1];
378 	p->raw.next_prop[1] = prop;
379 
380 	/* advance the middle reader viewing-line */
381 	(void)herodotus_read_codepoint(&(p->mid_reader), true, &cp);
382 
383 	/* check skippability-property */
384 	if (!p->is_skippable_prop(p->raw.prev_prop[0])) {
385 		/*
386 		 * the property that has moved past the "raw-viewing-line"
387 		 * (this property is now (after the raw-shift) stored in
388 		 * p->raw.prev_prop[0] and guaranteed not to be a no-prop,
389 		 * guaranteeing that we won't shift a no-prop past the
390 		 * "viewing-line" in the skip-properties) is not a skippable
391 		 * property, thus we need to shift the skip property as well.
392 		 */
393 		p->skip.prev_prop[1] = p->skip.prev_prop[0];
394 		p->skip.prev_prop[0] = p->skip.next_prop[0];
395 		p->skip.next_prop[0] = p->skip.next_prop[1];
396 
397 		/*
398 		 * call the skip-shift-callback on the property that
399 		 * passed the skip-viewing-line (this property is now
400 		 * stored in p->skip.prev_prop[0]).
401 		 */
402 		p->skip_shift_callback(p->skip.prev_prop[0], p->state);
403 
404 		/* determine the next shift property */
405 		p->skip.next_prop[1] = p->no_prop;
406 		while (herodotus_read_codepoint(&(p->skip_reader), true, &cp) ==
407 		       HERODOTUS_STATUS_SUCCESS) {
408 			prop = p->get_break_prop(cp);
409 			if (!p->is_skippable_prop(prop)) {
410 				p->skip.next_prop[1] = prop;
411 				break;
412 			}
413 		}
414 	}
415 
416 	return 0;
417 }