Blob


1 /* See LICENSE file for copyright and license details. */
2 #include <errno.h>
3 #include <stdint.h>
4 #include <stdio.h>
5 #include <stdlib.h>
6 #include <string.h>
8 #include "util.h"
10 #define FILE_DCP "data/DerivedCoreProperties.txt"
12 static const struct property_spec case_property[] = {
13 {
14 .enumname = "OTHER",
15 .file = NULL,
16 .ucdname = NULL,
17 },
18 {
19 .enumname = "BOTH_CASED_CASE_IGNORABLE",
20 .file = NULL,
21 .ucdname = NULL,
22 },
23 {
24 .enumname = "CASED",
25 .file = FILE_DCP,
26 .ucdname = "Cased",
27 },
28 {
29 .enumname = "CASE_IGNORABLE",
30 .file = FILE_DCP,
31 .ucdname = "Case_Ignorable",
32 },
33 {
34 .enumname = "UNCASED",
35 .file = FILE_DCP,
36 .ucdname = "Uncased",
37 },
38 };
40 static uint_least8_t
41 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
42 {
43 uint_least8_t result;
45 (void)cp;
47 if ((!strcmp(case_property[prop1].enumname, "CASED") &&
48 !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
49 (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
50 !strcmp(case_property[prop2].enumname, "CASED"))) {
51 for (result = 0; result < LEN(case_property); result++) {
52 if (!strcmp(case_property[result].enumname,
53 "BOTH_CASED_CASE_IGNORABLE")) {
54 break;
55 }
56 }
57 if (result == LEN(case_property)) {
58 fprintf(stderr, "handle_conflict: Internal error.\n");
59 exit(1);
60 }
61 } else {
62 fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
63 exit(1);
64 }
66 return result;
67 }
/*
 * Per-codepoint tables for the upper-, lower- and titlecase mappings.
 * They are allocated in main() and filled in by the parser callbacks.
 */
static struct properties *prop_upper = NULL;
static struct properties *prop_lower = NULL;
static struct properties *prop_title = NULL;

/*
 * One special-casing rule: for each of the three case conversions a
 * heap-allocated list of codepoints and its length.
 */
struct special_case {
	struct {
		uint_least32_t *cp;
		size_t cplen;
	} upper, lower, title;
};

/* growing array of all collected special-casing rules and its length */
static struct special_case *sc = NULL;
static size_t sclen = 0;
78 static int
79 unicodedata_callback(const char *file, char **field, size_t nfields,
80 char *comment, void *payload)
81 {
82 uint_least32_t cp, upper, lower, title;
84 (void)file;
85 (void)comment;
86 (void)payload;
88 hextocp(field[0], strlen(field[0]), &cp);
90 upper = lower = title = cp;
92 if ((strlen(field[12]) > 0 && hextocp(field[12], strlen(field[12]), &upper)) ||
93 (strlen(field[13]) > 0 && hextocp(field[13], strlen(field[13]), &lower)) ||
94 (nfields >= 15 && strlen(field[14]) > 0 && hextocp(field[14], strlen(field[14]), &title))) {
95 return 1;
96 }
98 prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
99 prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
100 prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;
102 return 0;
/*
 * Parse a list of hexadecimal codepoints separated by single spaces.
 *
 * On return *cp points to a freshly calloc'ed array owned by the
 * caller and *cplen holds its length (number of spaces in str plus
 * one). The program is terminated if the allocation fails.
 *
 * Returns 0 on success and 1 as soon as hextocp() rejects an entry;
 * in that case *cp stays allocated and the remaining entries are zero.
 */
static int
parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
{
	const char *token, *space;
	size_t entries = 1, idx = 0, toklen;

	/* every space separates two entries */
	for (token = str; (space = strchr(token, ' ')) != NULL; token = space + 1) {
		entries++;
	}

	/* allocate the output array */
	*cplen = entries;
	*cp = calloc(entries, sizeof(**cp));
	if (*cp == NULL) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}

	/* second pass: convert one entry per iteration */
	token = str;
	do {
		space = strchr(token, ' ');
		toklen = (space != NULL) ? (size_t)(space - token) : strlen(token);

		if (hextocp(token, toklen, &(*cp)[idx])) {
			return 1;
		}
		idx++;

		if (space != NULL) {
			token = space + 1;
		}
	} while (space != NULL);

	return 0;
}
135 static int
136 specialcasing_callback(const char *file, char **field, size_t nfields,
137 char *comment, void *payload)
139 uint_least32_t cp;
141 (void)file;
142 (void)comment;
143 (void)payload;
145 if (nfields > 4 && strlen(field[4]) > 0) {
146 /*
147 * we have more than 4 fields, i.e. the rule has a
148 * condition (language-sensitive, etc.) and is discarded
149 */
150 return 0;
153 /* parse affected codepoint */
154 hextocp(field[0], strlen(field[0]), &cp);
156 /* extend special case array */
157 if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) {
158 fprintf(stderr, "realloc: %s\n", strerror(errno));
159 exit(1);
162 /* parse field data */
163 parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
164 &(sc[sclen - 1].upper.cplen));
165 parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
166 &(sc[sclen - 1].lower.cplen));
167 parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
168 &(sc[sclen - 1].title.cplen));
170 /*
171 * overwrite value in "single mapping" property table by the
172 * special value 0x110000 + (offset in special case array),
173 * even if the special case has length 1
174 */
175 prop_upper[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
176 prop_lower[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
177 prop_title[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
179 return 0;
182 static int_least64_t
183 get_value(const struct properties *prop, size_t offset)
185 return prop[offset].property;
188 int
189 main(int argc, char *argv[])
191 struct properties_compressed comp_upper, comp_lower, comp_title;
192 struct properties_major_minor mm_upper, mm_lower, mm_title;
193 size_t i, j;
195 (void)argc;
197 /* generate case property table from the specification */
198 properties_generate_break_property(case_property,
199 LEN(case_property),
200 handle_conflict, NULL, "case",
201 argv[0]);
203 /*
204 * allocate property buffers for all 0x110000 codepoints
206 * the buffers contain the offset from the "base" character
207 * to the respective case mapping. By callocing we set all fields
208 * to zero, which is also the Unicode "default" in the sense that
209 * there is no case mapping by default (unless we fill it in)
210 */
211 if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
212 !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
213 !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
214 fprintf(stderr, "calloc: %s\n", strerror(errno));
215 exit(1);
217 parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
218 NULL);
219 parse_file_with_callback("data/SpecialCasing.txt", specialcasing_callback,
220 NULL);
222 /* compress properties */
223 properties_compress(prop_upper, &comp_upper);
224 properties_compress(prop_lower, &comp_lower);
225 properties_compress(prop_title, &comp_title);
227 fprintf(stderr, "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, title=%.2f%%\n",
228 argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
229 properties_get_major_minor(&comp_lower, &mm_lower),
230 properties_get_major_minor(&comp_title, &mm_title));
232 /* print tables */
233 printf("/* Automatically generated by %s */\n#include <stdint.h>\n#include <stddef.h>\n\n", argv[0]);
235 printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t cplen;\n};\n\n");
237 properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
238 printf("\n");
239 properties_print_derived_lookup_table("upper_minor", "int_least32_t", mm_upper.minor,
240 mm_upper.minorlen, get_value, comp_upper.data);
241 printf("\n");
242 properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
243 printf("\n");
244 properties_print_derived_lookup_table("lower_minor", "int_least32_t", mm_lower.minor,
245 mm_lower.minorlen, get_value, comp_lower.data);
246 printf("\n");
247 properties_print_lookup_table("title_major", mm_title.major, 0x1100);
248 printf("\n");
249 properties_print_derived_lookup_table("title_minor", "int_least32_t", mm_title.minor,
250 mm_title.minorlen, get_value, comp_title.data);
251 printf("\n");
253 printf("static const struct special_case upper_special[] = {\n");
254 for (i = 0; i < sclen; i++) {
255 printf("\t{\n");
257 printf("\t\t.cp = (uint_least32_t[]){");
258 for (j = 0; j < sc[i].upper.cplen; j++) {
259 printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]);
260 if (j + 1 < sc[i].upper.cplen) {
261 putchar(',');
264 printf(" },\n");
265 printf("\t\t.cplen = %zu,\n", sc[i].upper.cplen);
266 printf("\t},\n");
268 printf("};\n\n");
270 printf("static const struct special_case lower_special[] = {\n");
271 for (i = 0; i < sclen; i++) {
272 printf("\t{\n");
274 printf("\t\t.cp = (uint_least32_t[]){");
275 for (j = 0; j < sc[i].lower.cplen; j++) {
276 printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]);
277 if (j + 1 < sc[i].lower.cplen) {
278 putchar(',');
281 printf(" },\n");
282 printf("\t\t.cplen = %zu,\n", sc[i].lower.cplen);
283 printf("\t},\n");
285 printf("};\n\n");
287 printf("static const struct special_case title_special[] = {\n");
288 for (i = 0; i < sclen; i++) {
289 printf("\t{\n");
291 printf("\t\t.cp = (uint_least32_t[]){");
292 for (j = 0; j < sc[i].title.cplen; j++) {
293 printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]);
294 if (j + 1 < sc[i].title.cplen) {
295 putchar(',');
298 printf(" },\n");
299 printf("\t\t.cplen = %zu,\n", sc[i].title.cplen);
300 printf("\t},\n");
302 printf("};\n\n");
304 free(comp_lower.data);
305 free(comp_lower.offset);
306 free(comp_title.data);
307 free(comp_title.offset);
308 free(comp_upper.data);
309 free(comp_upper.offset);
310 free(mm_lower.major);
311 free(mm_lower.minor);
312 free(mm_title.major);
313 free(mm_title.minor);
314 free(mm_upper.major);
315 free(mm_upper.minor);
317 return 0;