Blob


1 /* See LICENSE file for copyright and license details. */
2 #include <stdbool.h>
3 #include <ctype.h>
4 #include <errno.h>
5 #include <inttypes.h>
6 #include <stdbool.h>
7 #include <stddef.h>
8 #include <stdint.h>
9 #include <stdlib.h>
10 #include <stdio.h>
11 #include <string.h>
13 #include "util.h"
/* inclusive codepoint range [lower, upper] as produced by range_parse() */
struct range {
	uint_least32_t lower, upper;
};
/*
 * state handed to properties_callback() through the untyped payload
 * pointer of parse_file_with_callback()
 */
struct properties_payload {
	/* per-codepoint table the set_value-callback writes into */
	struct properties *prop;

	/* property specifications and their number */
	const struct property_spec *spec;
	uint_least8_t speclen;

	/* stores a value for one codepoint; non-zero return means failure */
	int (*set_value)(struct properties_payload *, uint_least32_t,
	                 int_least64_t);

	/*
	 * optional (may be NULL): given a codepoint, its current value
	 * and the new value, returns the value to keep
	 */
	uint_least8_t (*handle_conflict)(uint_least32_t, uint_least8_t,
	                                 uint_least8_t);
};
/*
 * state handed to break_test_callback(): the growing test-array and
 * its element count, both owned by the caller
 */
struct break_test_payload {
	struct break_test **test;
	size_t *testlen;
};
/*
 * resize p so that it can hold len elements of size bytes each
 *
 * Behaves like realloc(p, len * size), except that a product which
 * would overflow size_t makes the call fail up front: errno is set
 * to ENOMEM, NULL is returned and p is left untouched.
 */
static void *
reallocate_array(void *p, size_t len, size_t size)
{
	if (len == 0 || size <= SIZE_MAX / len) {
		/* the product is representable */
		return realloc(p, len * size);
	}

	errno = ENOMEM;
	return NULL;
}
/*
 * parse the first len bytes of str as a hexadecimal codepoint
 *
 * str does not have to be NUL-terminated. Upper- and lowercase digits
 * are accepted. On success the value is stored in *cp and 0 is
 * returned. An empty string, more than six digits, a non-hexadecimal
 * character or a value above 0x10FFFF is reported on stderr and
 * yields 1; the content of *cp is unspecified in that case.
 */
int
hextocp(const char *str, size_t len, uint_least32_t *cp)
{
	size_t i;
	uint_least32_t digit;

	/* an empty string is no codepoint and must not parse as 0 */
	if (len == 0) {
		fprintf(stderr, "hextocp: '' is empty.\n");
		return 1;
	}

	/* the maximum valid codepoint is 0x10FFFF */
	if (len > 6) {
		fprintf(stderr, "hextocp: '%.*s' is too long.\n",
		        (int)len, str);
		return 1;
	}

	for (i = 0, *cp = 0; i < len; i++) {
		if (str[i] >= '0' && str[i] <= '9') {
			digit = (uint_least32_t)(str[i] - '0');
		} else if (str[i] >= 'a' && str[i] <= 'f') {
			digit = (uint_least32_t)(str[i] - 'a') + 10;
		} else if (str[i] >= 'A' && str[i] <= 'F') {
			digit = (uint_least32_t)(str[i] - 'A') + 10;
		} else {
			fprintf(stderr, "hextocp: '%.*s' is not hexadecimal.\n",
			        (int)len, str);
			return 1;
		}

		/* at most 6 digits, so the shift never exceeds 24 bits */
		*cp = (*cp << 4) | digit;
	}

	if (*cp > UINT32_C(0x10FFFF)) {
		fprintf(stderr, "hextocp: '%.*s' is too large.\n",
		        (int)len, str);
		return 1;
	}

	return 0;
}
88 static int
89 range_parse(const char *str, struct range *range)
90 {
91 char *p;
93 if ((p = strstr(str, "..")) == NULL) {
94 /* input has the form "XXXXXX" */
95 if (hextocp(str, strlen(str), &range->lower)) {
96 return 1;
97 }
98 range->upper = range->lower;
99 } else {
100 /* input has the form "XXXXXX..XXXXXX" */
101 if (hextocp(str, (size_t)(p - str), &range->lower) ||
102 hextocp(p + 2, strlen(p + 2), &range->upper)) {
103 return 1;
107 return 0;
/*
 * read fname line by line, split each line into fields and pass them
 * to callback
 *
 * Empty lines and lines starting with '#' are skipped. In all other
 * lines, fields are separated by ';' and everything after the first
 * '#' is the comment. Spaces around fields and in front of the
 * comment are stripped. The callback receives fname, the field-array,
 * the number of fields, the comment (NULL if the line has none) and
 * payload. All strings point into buffers which are reused for the
 * next line, so the callback has to copy what it wants to keep.
 *
 * Any failure (file cannot be opened or read, out of memory, callback
 * returning non-zero) prints a message to stderr and exits with
 * status 1.
 */
void
parse_file_with_callback(const char *fname, int (*callback)(const char *,
                         char **, size_t, char *, void *), void *payload)
{
	FILE *fp;
	char *line = NULL, **field = NULL, *comment;
	size_t linebufsize = 0, i, start, end, fieldbufsize = 0, nfields;
	ssize_t len;

	/* open file */
	if (!(fp = fopen(fname, "r"))) {
		fprintf(stderr, "parse_file_with_callback: fopen '%s': %s.\n",
		        fname, strerror(errno));
		exit(1);
	}

	while ((len = getline(&line, &linebufsize, fp)) >= 0) {
		/* remove trailing newline */
		if (len > 0 && line[len - 1] == '\n') {
			line[len - 1] = '\0';
			len--;
		}

		/* skip empty lines and comment lines */
		if (len == 0 || line[0] == '#') {
			continue;
		}

		/* tokenize line into fields */
		for (i = 0, nfields = 0, comment = NULL; i < (size_t)len; i++) {
			/* skip leading whitespace */
			while (line[i] == ' ') {
				i++;
			}
			start = i;

			/* check if we crashed into the comment */
			if (line[i] != '#') {
				/* extend field buffer, if necessary */
				if (++nfields > fieldbufsize) {
					if ((field = realloc(field, nfields *
					     sizeof(*field))) == NULL) {
						fprintf(stderr, "parse_file_with_"
						        "callback: realloc: %s.\n",
						        strerror(errno));
						exit(1);
					}
					fieldbufsize = nfields;
				}

				/* set current position as field start */
				field[nfields - 1] = &line[i];

				/* continue until we reach ';' or '#' or end */
				while (line[i] != ';' && line[i] != '#' &&
				       line[i] != '\0') {
					i++;
				}
			}

			if (line[i] == '#') {
				/* set comment-variable for later */
				comment = &line[i + 1];
			}

			/*
			 * strip trailing whitespace and terminate the
			 * field, but never step back past the field's
			 * own start: this keeps us inside the line
			 * buffer and makes sure an empty field really
			 * is an empty string
			 */
			for (end = i; end > start && line[end - 1] == ' ';
			     end--)
				;
			line[end] = '\0';

			/* if comment is set, we are done */
			if (comment != NULL) {
				break;
			}
		}

		/* skip leading whitespace in comment */
		while (comment != NULL && comment[0] == ' ') {
			comment++;
		}

		/* call callback function */
		if (callback(fname, field, nfields, comment, payload)) {
			fprintf(stderr, "parse_file_with_callback: "
			        "Malformed input.\n");
			exit(1);
		}
	}

	/* getline() returns -1 both at end-of-file and on error */
	if (ferror(fp)) {
		fprintf(stderr, "parse_file_with_callback: getline '%s': %s.\n",
		        fname, strerror(errno));
		exit(1);
	}

	fclose(fp);
	free(line);
	free(field);
}
206 static int
207 properties_callback(const char *file, char **field, size_t nfields,
208 char *comment, void *payload)
210 /* prop always has the length 0x110000 */
211 struct properties_payload *p = (struct properties_payload *)payload;
212 struct range r;
213 uint_least8_t i;
214 uint_least32_t cp;
216 (void)comment;
218 if (nfields < 2) {
219 return 1;
222 for (i = 0; i < p->speclen; i++) {
223 /* identify fitting file and identifier */
224 if (p->spec[i].file &&
225 !strcmp(p->spec[i].file, file) &&
226 (!strcmp(p->spec[i].ucdname, field[1]) ||
227 (comment != NULL && !strncmp(p->spec[i].ucdname, comment, strlen(p->spec[i].ucdname)) &&
228 comment[strlen(p->spec[i].ucdname)] == ' '))) {
229 /* parse range in first field */
230 if (range_parse(field[0], &r)) {
231 return 1;
234 /* apply to all codepoints in the range */
235 for (cp = r.lower; cp <= r.upper; cp++) {
236 if (p->set_value(payload, cp, i)) {
237 exit(1);
240 break;
244 return 0;
247 void
248 properties_compress(const struct properties *prop,
249 struct properties_compressed *comp)
251 uint_least32_t cp, i;
253 /* initialization */
254 if (!(comp->offset = malloc((size_t)UINT32_C(0x110000) * sizeof(*(comp->offset))))) {
255 fprintf(stderr, "malloc: %s\n", strerror(errno));
256 exit(1);
258 comp->data = NULL;
259 comp->datalen = 0;
261 for (cp = 0; cp < UINT32_C(0x110000); cp++) {
262 for (i = 0; i < comp->datalen; i++) {
263 if (!memcmp(&(prop[cp]), &(comp->data[i]), sizeof(*prop))) {
264 /* found a match! */
265 comp->offset[cp] = i;
266 break;
269 if (i == comp->datalen) {
270 /*
271 * found no matching properties-struct, so
272 * add current properties to data and add the
273 * offset in the offset-table
274 */
275 if (!(comp->data = reallocate_array(comp->data,
276 ++(comp->datalen),
277 sizeof(*(comp->data))))) {
278 fprintf(stderr, "reallocate_array: %s\n",
279 strerror(errno));
280 exit(1);
282 memcpy(&(comp->data[comp->datalen - 1]), &(prop[cp]),
283 sizeof(*prop));
284 comp->offset[cp] = comp->datalen - 1;
289 double
290 properties_get_major_minor(const struct properties_compressed *comp,
291 struct properties_major_minor *mm)
293 size_t i, j, compression_count = 0;
295 /*
296 * we currently have an array comp->offset which maps the
297 * codepoints 0..0x110000 to offsets into comp->data.
298 * To improve cache-locality instead and allow a bit of
299 * compressing, instead of directly mapping a codepoint
300 * 0xAAAABB with comp->offset, we generate two arrays major
301 * and minor such that
302 * comp->offset(0xAAAABB) == minor[major[0xAAAA] + 0xBB]
303 * This yields a major-array of length 2^16 and a minor array
304 * of variable length depending on how many common subsequences
305 * can be filtered out.
306 */
308 /* initialize */
309 if (!(mm->major = malloc((size_t)0x1100 * sizeof(*(mm->major))))) {
310 fprintf(stderr, "malloc: %s\n", strerror(errno));
311 exit(1);
313 mm->minor = NULL;
314 mm->minorlen = 0;
316 for (i = 0; i < (size_t)0x1100; i++) {
317 /*
318 * we now look at the cp-range (i << 8)..(i << 8 + 0xFF)
319 * and check if its corresponding offset-data already
320 * exists in minor (because then we just point there
321 * and need less storage)
322 */
323 for (j = 0; j + 0xFF < mm->minorlen; j++) {
324 if (!memcmp(&(comp->offset[i << 8]),
325 &(mm->minor[j]),
326 sizeof(*(comp->offset)) * 0x100)) {
327 break;
330 if (j + 0xFF < mm->minorlen) {
331 /* found an index */
332 compression_count++;
333 mm->major[i] = j;
334 } else {
335 /*
336 * add "new" sequence to minor and point to it
337 * in major
338 */
339 mm->minorlen += 0x100;
340 if (!(mm->minor = reallocate_array(mm->minor,
341 mm->minorlen,
342 sizeof(*(mm->minor))))) {
343 fprintf(stderr, "reallocate_array: %s\n",
344 strerror(errno));
345 exit(1);
347 memcpy(&(mm->minor[mm->minorlen - 0x100]),
348 &(comp->offset[i << 8]),
349 sizeof(*(mm->minor)) * 0x100);
350 mm->major[i] = mm->minorlen - 0x100;
354 /* return compression ratio */
355 return (double)compression_count / 0x1100 * 100;
/*
 * print data as the definition of a static const C array called name
 * to stdout
 *
 * The element type is the first of uint_least{8,16,32,64}_t whose
 * maximum is not exceeded by the largest value in data. Eight values
 * are printed per line.
 */
void
properties_print_lookup_table(char *name, size_t *data, size_t datalen)
{
	const char *type, *sep;
	size_t i, maxval = 0;

	/* the largest value determines the element type */
	for (i = 0; i < datalen; i++) {
		if (maxval < data[i]) {
			maxval = data[i];
		}
	}

	if (maxval <= UINT_LEAST8_MAX) {
		type = "uint_least8_t";
	} else if (maxval <= UINT_LEAST16_MAX) {
		type = "uint_least16_t";
	} else if (maxval <= UINT_LEAST32_MAX) {
		type = "uint_least32_t";
	} else {
		type = "uint_least64_t";
	}

	printf("static const %s %s[] = {\n\t", type, name);
	for (i = 0; i < datalen; i++) {
		if (i + 1 == datalen) {
			/* last value */
			sep = "\n";
		} else if ((i + 1) % 8 == 0) {
			/* last value of a full line */
			sep = ",\n\t";
		} else {
			sep = ", ";
		}
		printf("%zu%s", data[i], sep);
	}
	printf("};\n");
}
/*
 * print the definition of a static const C array called name with
 * element type type to stdout
 *
 * The i-th element is get_value(payload, offset[i]), i.e. the array
 * is derived from the offset-table rather than printed verbatim.
 * Eight values are printed per line.
 */
void
properties_print_derived_lookup_table(char *name, char *type, size_t *offset, size_t offsetlen,
                                      int_least64_t (*get_value)(const struct properties *,
                                      size_t), const void *payload)
{
	const char *sep;
	int_least64_t value;
	size_t i;

	printf("static const %s %s[] = {\n\t", type, name);
	for (i = 0; i < offsetlen; i++) {
		value = get_value(payload, offset[i]);

		if (i + 1 == offsetlen) {
			/* last value */
			sep = "\n";
		} else if ((i + 1) % 8 == 0) {
			/* last value of a full line */
			sep = ",\n\t";
		} else {
			sep = ", ";
		}
		printf("%"PRIiLEAST64"%s", value, sep);
	}
	printf("};\n");
}
412 static void
413 properties_print_enum(const struct property_spec *spec, size_t speclen,
414 const char *enumname, const char *enumprefix)
416 size_t i;
418 printf("enum %s {\n", enumname);
419 for (i = 0; i < speclen; i++) {
420 printf("\t%s_%s,\n", enumprefix, spec[i].enumname);
422 printf("\tNUM_%sS,\n};\n\n", enumprefix);
425 static int
426 set_value_bp(struct properties_payload *payload, uint_least32_t cp,
427 int_least64_t value)
429 if (payload->prop[cp].property != 0) {
430 if (payload->handle_conflict == NULL) {
431 fprintf(stderr, "set_value_bp: "
432 "Unhandled character break property "
433 "overwrite for 0x%06X (%s <- %s).\n",
434 cp, payload->spec[payload->prop[cp].
435 property].enumname,
436 payload->spec[value].enumname);
437 return 1;
438 } else {
439 value = payload->handle_conflict(cp,
440 (uint_least8_t)payload->prop[cp].property,
441 (uint_least8_t)value);
444 payload->prop[cp].property = value;
446 return 0;
449 static int_least64_t
450 get_value_bp(const struct properties *prop, size_t offset)
452 return (uint_least8_t)prop[offset].property;
455 void
456 properties_generate_break_property(const struct property_spec *spec,
457 uint_least8_t speclen,
458 uint_least8_t (*handle_conflict)(
459 uint_least32_t, uint_least8_t,
460 uint_least8_t), uint_least8_t
461 (*post_process)(uint_least8_t),
462 const char *prefix, const char *argv0)
464 struct properties_compressed comp;
465 struct properties_major_minor mm;
466 struct properties_payload payload;
467 struct properties *prop;
468 size_t i, j, prefixlen = strlen(prefix);
469 char buf1[64], prefix_uc[64], buf2[64], buf3[64], buf4[64];
471 /* allocate property buffer for all 0x110000 codepoints */
472 if (!(prop = calloc(UINT32_C(0x110000), sizeof(*prop)))) {
473 fprintf(stderr, "calloc: %s\n", strerror(errno));
474 exit(1);
477 /* generate data */
478 payload.prop = prop;
479 payload.spec = spec;
480 payload.speclen = speclen;
481 payload.set_value = set_value_bp;
482 payload.handle_conflict = handle_conflict;
484 /* parse each file exactly once and ignore NULL-fields */
485 for (i = 0; i < speclen; i++) {
486 for (j = 0; j < i; j++) {
487 if (spec[i].file && spec[j].file &&
488 !strcmp(spec[i].file, spec[j].file)) {
489 /* file has already been parsed */
490 break;
493 if (i == j && spec[i].file) {
494 /* file has not been processed yet */
495 parse_file_with_callback(spec[i].file,
496 properties_callback,
497 &payload);
501 /* post-processing */
502 if (post_process != NULL) {
503 for (i = 0; i < UINT32_C(0x110000); i++) {
504 payload.prop[i].property =
505 post_process((uint_least8_t)payload.prop[i].property);
509 /* compress data */
510 printf("/* Automatically generated by %s */\n#include <stdint.h>\n\n", argv0);
511 properties_compress(prop, &comp);
513 fprintf(stderr, "%s: %s-LUT compression-ratio: %.2f%%\n", argv0,
514 prefix, properties_get_major_minor(&comp, &mm));
516 /* prepare names */
517 if ((size_t)snprintf(buf1, LEN(buf1), "%s_property", prefix) >= LEN(buf1)) {
518 fprintf(stderr, "snprintf: String truncated.\n");
519 exit(1);
521 if (LEN(prefix_uc) + 1 < prefixlen) {
522 fprintf(stderr, "snprintf: Buffer too small.\n");
523 exit(1);
525 for (i = 0; i < prefixlen; i++) {
526 prefix_uc[i] = (char)toupper(prefix[i]);
528 prefix_uc[prefixlen] = '\0';
529 if ((size_t)snprintf(buf2, LEN(buf2), "%s_PROP", prefix_uc) >= LEN(buf2) ||
530 (size_t)snprintf(buf3, LEN(buf3), "%s_major", prefix) >= LEN(buf3) ||
531 (size_t)snprintf(buf4, LEN(buf4), "%s_minor", prefix) >= LEN(buf4)) {
532 fprintf(stderr, "snprintf: String truncated.\n");
533 exit(1);
536 /* print data */
537 properties_print_enum(spec, speclen, buf1, buf2);
538 properties_print_lookup_table(buf3, mm.major, 0x1100);
539 printf("\n");
540 properties_print_derived_lookup_table(buf4, "uint_least8_t", mm.minor, mm.minorlen,
541 get_value_bp, comp.data);
543 /* free data */
544 free(prop);
545 free(comp.data);
546 free(comp.offset);
547 free(mm.major);
548 free(mm.minor);
551 static int
552 break_test_callback(const char *fname, char **field, size_t nfields,
553 char *comment, void *payload)
555 struct break_test *t,
556 **test = ((struct break_test_payload *)payload)->test;
557 size_t i, *testlen = ((struct break_test_payload *)payload)->testlen;
558 char *token;
560 (void)fname;
562 if (nfields < 1) {
563 return 1;
566 /* append new testcase and initialize with zeroes */
567 if ((*test = realloc(*test, ++(*testlen) * sizeof(**test))) == NULL) {
568 fprintf(stderr, "break_test_callback: realloc: %s.\n",
569 strerror(errno));
570 return 1;
572 t = &(*test)[*testlen - 1];
573 memset(t, 0, sizeof(*t));
575 /* parse testcase "<÷|×> <cp> <÷|×> ... <cp> <÷|×>" */
576 for (token = strtok(field[0], " "), i = 0; token != NULL; i++,
577 token = strtok(NULL, " ")) {
578 if (i % 2 == 0) {
579 /* delimiter or start of sequence */
580 if (i == 0 || !strncmp(token, "\xC3\xB7", 2)) { /* UTF-8 */
581 /*
582 * '÷' indicates a breakpoint,
583 * the current length is done; allocate
584 * a new length field and set it to 0
585 */
586 if ((t->len = realloc(t->len,
587 ++t->lenlen * sizeof(*t->len))) == NULL) {
588 fprintf(stderr, "break_test_"
589 "callback: realloc: %s.\n",
590 strerror(errno));
591 return 1;
593 t->len[t->lenlen - 1] = 0;
594 } else if (!strncmp(token, "\xC3\x97", 2)) { /* UTF-8 */
595 /*
596 * '×' indicates a non-breakpoint, do nothing
597 */
598 } else {
599 fprintf(stderr, "break_test_callback: "
600 "Malformed delimiter '%s'.\n", token);
601 return 1;
603 } else {
604 /* add codepoint to cp-array */
605 if ((t->cp = realloc(t->cp, ++t->cplen *
606 sizeof(*t->cp))) == NULL) {
607 fprintf(stderr, "break_test_callback: "
608 "realloc: %s.\n", strerror(errno));
609 return 1;
611 if (hextocp(token, strlen(token), &t->cp[t->cplen - 1])) {
612 return 1;
614 if (t->lenlen > 0) {
615 t->len[t->lenlen - 1]++;
619 if (t->len[t->lenlen - 1] == 0) {
620 /*
621 * we allocated one more length than we needed because
622 * the breakpoint was at the end
623 */
624 t->lenlen--;
627 /* store comment */
628 if (((*test)[*testlen - 1].descr = strdup(comment)) == NULL) {
629 fprintf(stderr, "break_test_callback: strdup: %s.\n",
630 strerror(errno));
631 return 1;
634 return 0;
637 void
638 break_test_list_parse(char *fname, struct break_test **test,
639 size_t *testlen)
641 struct break_test_payload pl = {
642 .test = test,
643 .testlen = testlen,
644 };
645 *test = NULL;
646 *testlen = 0;
648 parse_file_with_callback(fname, break_test_callback, &pl);
651 void
652 break_test_list_print(const struct break_test *test, size_t testlen,
653 const char *identifier, const char *progname)
655 size_t i, j;
657 printf("/* Automatically generated by %s */\n"
658 "#include <stdint.h>\n#include <stddef.h>\n\n"
659 "#include \"../gen/types.h\"\n\n", progname);
661 printf("static const struct break_test %s[] = {\n", identifier);
662 for (i = 0; i < testlen; i++) {
663 printf("\t{\n");
665 printf("\t\t.cp = (uint_least32_t[]){");
666 for (j = 0; j < test[i].cplen; j++) {
667 printf(" UINT32_C(0x%06X)", test[i].cp[j]);
668 if (j + 1 < test[i].cplen) {
669 putchar(',');
672 printf(" },\n");
673 printf("\t\t.cplen = %zu,\n", test[i].cplen);
675 printf("\t\t.len = (size_t[]){");
676 for (j = 0; j < test[i].lenlen; j++) {
677 printf(" %zu", test[i].len[j]);
678 if (j + 1 < test[i].lenlen) {
679 putchar(',');
682 printf(" },\n");
683 printf("\t\t.lenlen = %zu,\n", test[i].lenlen);
685 printf("\t\t.descr = \"%s\",\n", test[i].descr);
687 printf("\t},\n");
689 printf("};\n");
692 void
693 break_test_list_free(struct break_test *test, size_t testlen)
695 size_t i;
697 for (i = 0; i < testlen; i++) {
698 free(test[i].cp);
699 free(test[i].len);
700 free(test[i].descr);
703 free(test);