Blame


1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <errno.h>
3 3448adb0 2022-11-02 op #include <stdint.h>
4 3448adb0 2022-11-02 op #include <stdio.h>
5 3448adb0 2022-11-02 op #include <stdlib.h>
6 3448adb0 2022-11-02 op #include <string.h>
7 3448adb0 2022-11-02 op
8 3448adb0 2022-11-02 op #include "util.h"
9 3448adb0 2022-11-02 op
10 3448adb0 2022-11-02 op #define FILE_DCP "data/DerivedCoreProperties.txt"
11 3448adb0 2022-11-02 op
12 3448adb0 2022-11-02 op static const struct property_spec case_property[] = {
13 3448adb0 2022-11-02 op {
14 3448adb0 2022-11-02 op .enumname = "OTHER",
15 3448adb0 2022-11-02 op .file = NULL,
16 3448adb0 2022-11-02 op .ucdname = NULL,
17 3448adb0 2022-11-02 op },
18 3448adb0 2022-11-02 op {
19 3448adb0 2022-11-02 op .enumname = "BOTH_CASED_CASE_IGNORABLE",
20 3448adb0 2022-11-02 op .file = NULL,
21 3448adb0 2022-11-02 op .ucdname = NULL,
22 3448adb0 2022-11-02 op },
23 3448adb0 2022-11-02 op {
24 3448adb0 2022-11-02 op .enumname = "CASED",
25 3448adb0 2022-11-02 op .file = FILE_DCP,
26 3448adb0 2022-11-02 op .ucdname = "Cased",
27 3448adb0 2022-11-02 op },
28 3448adb0 2022-11-02 op {
29 3448adb0 2022-11-02 op .enumname = "CASE_IGNORABLE",
30 3448adb0 2022-11-02 op .file = FILE_DCP,
31 3448adb0 2022-11-02 op .ucdname = "Case_Ignorable",
32 3448adb0 2022-11-02 op },
33 3448adb0 2022-11-02 op {
34 3448adb0 2022-11-02 op .enumname = "UNCASED",
35 3448adb0 2022-11-02 op .file = FILE_DCP,
36 3448adb0 2022-11-02 op .ucdname = "Uncased",
37 3448adb0 2022-11-02 op },
38 3448adb0 2022-11-02 op };
39 3448adb0 2022-11-02 op
40 3448adb0 2022-11-02 op static uint_least8_t
41 3448adb0 2022-11-02 op handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
42 3448adb0 2022-11-02 op {
43 3448adb0 2022-11-02 op uint_least8_t result;
44 3448adb0 2022-11-02 op
45 3448adb0 2022-11-02 op (void)cp;
46 3448adb0 2022-11-02 op
47 3448adb0 2022-11-02 op if ((!strcmp(case_property[prop1].enumname, "CASED") &&
48 3448adb0 2022-11-02 op !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
49 3448adb0 2022-11-02 op (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
50 3448adb0 2022-11-02 op !strcmp(case_property[prop2].enumname, "CASED"))) {
51 3448adb0 2022-11-02 op for (result = 0; result < LEN(case_property); result++) {
52 3448adb0 2022-11-02 op if (!strcmp(case_property[result].enumname,
53 3448adb0 2022-11-02 op "BOTH_CASED_CASE_IGNORABLE")) {
54 3448adb0 2022-11-02 op break;
55 3448adb0 2022-11-02 op }
56 3448adb0 2022-11-02 op }
57 3448adb0 2022-11-02 op if (result == LEN(case_property)) {
58 3448adb0 2022-11-02 op fprintf(stderr, "handle_conflict: Internal error.\n");
59 3448adb0 2022-11-02 op exit(1);
60 3448adb0 2022-11-02 op }
61 3448adb0 2022-11-02 op } else {
62 3448adb0 2022-11-02 op fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
63 3448adb0 2022-11-02 op exit(1);
64 3448adb0 2022-11-02 op }
65 3448adb0 2022-11-02 op
66 3448adb0 2022-11-02 op return result;
67 3448adb0 2022-11-02 op }
68 3448adb0 2022-11-02 op
/*
 * Per-codepoint case mapping tables, allocated in main() with one entry
 * per codepoint and filled in by the parser callbacks.
 */
static struct properties *prop_upper = NULL;
static struct properties *prop_lower = NULL;
static struct properties *prop_title = NULL;

/*
 * Growable array of special case mappings; each entry holds one
 * codepoint list (and its length) for each of the three case forms.
 * sclen is the number of entries currently stored in sc.
 */
static struct special_case {
	struct {
		uint_least32_t *cp;
		size_t cplen;
	} upper, lower, title;
} *sc = NULL;
static size_t sclen = 0;
77 3448adb0 2022-11-02 op
78 3448adb0 2022-11-02 op static int
79 3448adb0 2022-11-02 op unicodedata_callback(const char *file, char **field, size_t nfields,
80 3448adb0 2022-11-02 op char *comment, void *payload)
81 3448adb0 2022-11-02 op {
82 3448adb0 2022-11-02 op uint_least32_t cp, upper, lower, title;
83 3448adb0 2022-11-02 op
84 3448adb0 2022-11-02 op (void)file;
85 3448adb0 2022-11-02 op (void)comment;
86 3448adb0 2022-11-02 op (void)payload;
87 3448adb0 2022-11-02 op
88 3448adb0 2022-11-02 op hextocp(field[0], strlen(field[0]), &cp);
89 3448adb0 2022-11-02 op
90 3448adb0 2022-11-02 op upper = lower = title = cp;
91 3448adb0 2022-11-02 op
92 3448adb0 2022-11-02 op if ((strlen(field[12]) > 0 && hextocp(field[12], strlen(field[12]), &upper)) ||
93 3448adb0 2022-11-02 op (strlen(field[13]) > 0 && hextocp(field[13], strlen(field[13]), &lower)) ||
94 3448adb0 2022-11-02 op (nfields >= 15 && strlen(field[14]) > 0 && hextocp(field[14], strlen(field[14]), &title))) {
95 3448adb0 2022-11-02 op return 1;
96 3448adb0 2022-11-02 op }
97 3448adb0 2022-11-02 op
98 3448adb0 2022-11-02 op prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
99 3448adb0 2022-11-02 op prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
100 3448adb0 2022-11-02 op prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;
101 3448adb0 2022-11-02 op
102 3448adb0 2022-11-02 op return 0;
103 3448adb0 2022-11-02 op }
104 3448adb0 2022-11-02 op
/*
 * Parse a list of hexadecimal codepoints separated by single spaces.
 *
 * On success, *cp points to a newly allocated array holding the parsed
 * codepoints, *cplen is its length, and 0 is returned; the caller owns
 * the array and must free() it.
 *
 * If any element cannot be parsed, the array is released, *cp is set to
 * NULL, *cplen to 0, and 1 is returned. Allocation failure prints a
 * diagnostic and terminates the program.
 */
static int
parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
{
	size_t count, i;
	const char *start, *sep;

	/* the list has one more element than it has separating spaces */
	for (count = 1, start = str; (sep = strchr(start, ' ')) != NULL; start = sep + 1) {
		count++;
	}

	/* allocate resources */
	if (!(*cp = calloc((*cplen = count), sizeof(**cp)))) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}

	/* go through the string again, parsing the numbers */
	for (i = 0, start = str; start != NULL; i++) {
		sep = strchr(start, ' ');
		if (hextocp(start, sep ? (size_t)(sep - start) : strlen(start), &((*cp)[i]))) {
			/* do not hand a half-filled list back to the caller */
			free(*cp);
			*cp = NULL;
			*cplen = 0;
			return 1;
		}
		start = sep ? sep + 1 : NULL;
	}

	return 0;
}
134 3448adb0 2022-11-02 op
135 3448adb0 2022-11-02 op static int
136 3448adb0 2022-11-02 op specialcasing_callback(const char *file, char **field, size_t nfields,
137 3448adb0 2022-11-02 op char *comment, void *payload)
138 3448adb0 2022-11-02 op {
139 3448adb0 2022-11-02 op uint_least32_t cp;
140 3448adb0 2022-11-02 op
141 3448adb0 2022-11-02 op (void)file;
142 3448adb0 2022-11-02 op (void)comment;
143 3448adb0 2022-11-02 op (void)payload;
144 3448adb0 2022-11-02 op
145 3448adb0 2022-11-02 op if (nfields > 4 && strlen(field[4]) > 0) {
146 3448adb0 2022-11-02 op /*
147 3448adb0 2022-11-02 op * we have more than 4 fields, i.e. the rule has a
148 3448adb0 2022-11-02 op * condition (language-sensitive, etc.) and is discarded
149 3448adb0 2022-11-02 op */
150 3448adb0 2022-11-02 op return 0;
151 3448adb0 2022-11-02 op }
152 3448adb0 2022-11-02 op
153 3448adb0 2022-11-02 op /* parse affected codepoint */
154 3448adb0 2022-11-02 op hextocp(field[0], strlen(field[0]), &cp);
155 3448adb0 2022-11-02 op
156 3448adb0 2022-11-02 op /* extend special case array */
157 3448adb0 2022-11-02 op if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) {
158 3448adb0 2022-11-02 op fprintf(stderr, "realloc: %s\n", strerror(errno));
159 3448adb0 2022-11-02 op exit(1);
160 3448adb0 2022-11-02 op }
161 3448adb0 2022-11-02 op
162 3448adb0 2022-11-02 op /* parse field data */
163 3448adb0 2022-11-02 op parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
164 3448adb0 2022-11-02 op &(sc[sclen - 1].upper.cplen));
165 3448adb0 2022-11-02 op parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
166 3448adb0 2022-11-02 op &(sc[sclen - 1].lower.cplen));
167 3448adb0 2022-11-02 op parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
168 3448adb0 2022-11-02 op &(sc[sclen - 1].title.cplen));
169 3448adb0 2022-11-02 op
170 3448adb0 2022-11-02 op /*
171 3448adb0 2022-11-02 op * overwrite value in "single mapping" property table by the
172 3448adb0 2022-11-02 op * special value 0x110000 + (offset in special case array),
173 3448adb0 2022-11-02 op * even if the special case has length 1
174 3448adb0 2022-11-02 op */
175 3448adb0 2022-11-02 op prop_upper[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
176 3448adb0 2022-11-02 op prop_lower[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
177 3448adb0 2022-11-02 op prop_title[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
178 3448adb0 2022-11-02 op
179 3448adb0 2022-11-02 op return 0;
180 3448adb0 2022-11-02 op }
181 3448adb0 2022-11-02 op
182 3448adb0 2022-11-02 op static int_least64_t
183 3448adb0 2022-11-02 op get_value(const struct properties *prop, size_t offset)
184 3448adb0 2022-11-02 op {
185 3448adb0 2022-11-02 op return prop[offset].property;
186 3448adb0 2022-11-02 op }
187 3448adb0 2022-11-02 op
188 3448adb0 2022-11-02 op int
189 3448adb0 2022-11-02 op main(int argc, char *argv[])
190 3448adb0 2022-11-02 op {
191 3448adb0 2022-11-02 op struct properties_compressed comp_upper, comp_lower, comp_title;
192 3448adb0 2022-11-02 op struct properties_major_minor mm_upper, mm_lower, mm_title;
193 3448adb0 2022-11-02 op size_t i, j;
194 3448adb0 2022-11-02 op
195 3448adb0 2022-11-02 op (void)argc;
196 3448adb0 2022-11-02 op
197 3448adb0 2022-11-02 op /* generate case property table from the specification */
198 3448adb0 2022-11-02 op properties_generate_break_property(case_property,
199 3448adb0 2022-11-02 op LEN(case_property),
200 3448adb0 2022-11-02 op handle_conflict, NULL, "case",
201 3448adb0 2022-11-02 op argv[0]);
202 3448adb0 2022-11-02 op
203 3448adb0 2022-11-02 op /*
204 3448adb0 2022-11-02 op * allocate property buffers for all 0x110000 codepoints
205 3448adb0 2022-11-02 op *
206 3448adb0 2022-11-02 op * the buffers contain the offset from the "base" character
207 3448adb0 2022-11-02 op * to the respective case mapping. By callocing we set all fields
208 3448adb0 2022-11-02 op * to zero, which is also the Unicode "default" in the sense that
209 3448adb0 2022-11-02 op * there is no case mapping by default (unless we fill it in)
210 3448adb0 2022-11-02 op */
211 3448adb0 2022-11-02 op if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
212 3448adb0 2022-11-02 op !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
213 3448adb0 2022-11-02 op !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
214 3448adb0 2022-11-02 op fprintf(stderr, "calloc: %s\n", strerror(errno));
215 3448adb0 2022-11-02 op exit(1);
216 3448adb0 2022-11-02 op }
217 3448adb0 2022-11-02 op parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
218 3448adb0 2022-11-02 op NULL);
219 3448adb0 2022-11-02 op parse_file_with_callback("data/SpecialCasing.txt", specialcasing_callback,
220 3448adb0 2022-11-02 op NULL);
221 3448adb0 2022-11-02 op
222 3448adb0 2022-11-02 op /* compress properties */
223 3448adb0 2022-11-02 op properties_compress(prop_upper, &comp_upper);
224 3448adb0 2022-11-02 op properties_compress(prop_lower, &comp_lower);
225 3448adb0 2022-11-02 op properties_compress(prop_title, &comp_title);
226 3448adb0 2022-11-02 op
227 3448adb0 2022-11-02 op fprintf(stderr, "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, title=%.2f%%\n",
228 3448adb0 2022-11-02 op argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
229 3448adb0 2022-11-02 op properties_get_major_minor(&comp_lower, &mm_lower),
230 3448adb0 2022-11-02 op properties_get_major_minor(&comp_title, &mm_title));
231 3448adb0 2022-11-02 op
232 3448adb0 2022-11-02 op /* print tables */
233 3448adb0 2022-11-02 op printf("/* Automatically generated by %s */\n#include <stdint.h>\n#include <stddef.h>\n\n", argv[0]);
234 3448adb0 2022-11-02 op
235 3448adb0 2022-11-02 op printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t cplen;\n};\n\n");
236 3448adb0 2022-11-02 op
237 3448adb0 2022-11-02 op properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
238 3448adb0 2022-11-02 op printf("\n");
239 3448adb0 2022-11-02 op properties_print_derived_lookup_table("upper_minor", "int_least32_t", mm_upper.minor,
240 3448adb0 2022-11-02 op mm_upper.minorlen, get_value, comp_upper.data);
241 3448adb0 2022-11-02 op printf("\n");
242 3448adb0 2022-11-02 op properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
243 3448adb0 2022-11-02 op printf("\n");
244 3448adb0 2022-11-02 op properties_print_derived_lookup_table("lower_minor", "int_least32_t", mm_lower.minor,
245 3448adb0 2022-11-02 op mm_lower.minorlen, get_value, comp_lower.data);
246 3448adb0 2022-11-02 op printf("\n");
247 3448adb0 2022-11-02 op properties_print_lookup_table("title_major", mm_title.major, 0x1100);
248 3448adb0 2022-11-02 op printf("\n");
249 3448adb0 2022-11-02 op properties_print_derived_lookup_table("title_minor", "int_least32_t", mm_title.minor,
250 3448adb0 2022-11-02 op mm_title.minorlen, get_value, comp_title.data);
251 3448adb0 2022-11-02 op printf("\n");
252 3448adb0 2022-11-02 op
253 3448adb0 2022-11-02 op printf("static const struct special_case upper_special[] = {\n");
254 3448adb0 2022-11-02 op for (i = 0; i < sclen; i++) {
255 3448adb0 2022-11-02 op printf("\t{\n");
256 3448adb0 2022-11-02 op
257 3448adb0 2022-11-02 op printf("\t\t.cp = (uint_least32_t[]){");
258 3448adb0 2022-11-02 op for (j = 0; j < sc[i].upper.cplen; j++) {
259 3448adb0 2022-11-02 op printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]);
260 3448adb0 2022-11-02 op if (j + 1 < sc[i].upper.cplen) {
261 3448adb0 2022-11-02 op putchar(',');
262 3448adb0 2022-11-02 op }
263 3448adb0 2022-11-02 op }
264 3448adb0 2022-11-02 op printf(" },\n");
265 3448adb0 2022-11-02 op printf("\t\t.cplen = %zu,\n", sc[i].upper.cplen);
266 3448adb0 2022-11-02 op printf("\t},\n");
267 3448adb0 2022-11-02 op }
268 3448adb0 2022-11-02 op printf("};\n\n");
269 3448adb0 2022-11-02 op
270 3448adb0 2022-11-02 op printf("static const struct special_case lower_special[] = {\n");
271 3448adb0 2022-11-02 op for (i = 0; i < sclen; i++) {
272 3448adb0 2022-11-02 op printf("\t{\n");
273 3448adb0 2022-11-02 op
274 3448adb0 2022-11-02 op printf("\t\t.cp = (uint_least32_t[]){");
275 3448adb0 2022-11-02 op for (j = 0; j < sc[i].lower.cplen; j++) {
276 3448adb0 2022-11-02 op printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]);
277 3448adb0 2022-11-02 op if (j + 1 < sc[i].lower.cplen) {
278 3448adb0 2022-11-02 op putchar(',');
279 3448adb0 2022-11-02 op }
280 3448adb0 2022-11-02 op }
281 3448adb0 2022-11-02 op printf(" },\n");
282 3448adb0 2022-11-02 op printf("\t\t.cplen = %zu,\n", sc[i].lower.cplen);
283 3448adb0 2022-11-02 op printf("\t},\n");
284 3448adb0 2022-11-02 op }
285 3448adb0 2022-11-02 op printf("};\n\n");
286 3448adb0 2022-11-02 op
287 3448adb0 2022-11-02 op printf("static const struct special_case title_special[] = {\n");
288 3448adb0 2022-11-02 op for (i = 0; i < sclen; i++) {
289 3448adb0 2022-11-02 op printf("\t{\n");
290 3448adb0 2022-11-02 op
291 3448adb0 2022-11-02 op printf("\t\t.cp = (uint_least32_t[]){");
292 3448adb0 2022-11-02 op for (j = 0; j < sc[i].title.cplen; j++) {
293 3448adb0 2022-11-02 op printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]);
294 3448adb0 2022-11-02 op if (j + 1 < sc[i].title.cplen) {
295 3448adb0 2022-11-02 op putchar(',');
296 3448adb0 2022-11-02 op }
297 3448adb0 2022-11-02 op }
298 3448adb0 2022-11-02 op printf(" },\n");
299 3448adb0 2022-11-02 op printf("\t\t.cplen = %zu,\n", sc[i].title.cplen);
300 3448adb0 2022-11-02 op printf("\t},\n");
301 3448adb0 2022-11-02 op }
302 3448adb0 2022-11-02 op printf("};\n\n");
303 3448adb0 2022-11-02 op
304 3448adb0 2022-11-02 op free(comp_lower.data);
305 3448adb0 2022-11-02 op free(comp_lower.offset);
306 3448adb0 2022-11-02 op free(comp_title.data);
307 3448adb0 2022-11-02 op free(comp_title.offset);
308 3448adb0 2022-11-02 op free(comp_upper.data);
309 3448adb0 2022-11-02 op free(comp_upper.offset);
310 3448adb0 2022-11-02 op free(mm_lower.major);
311 3448adb0 2022-11-02 op free(mm_lower.minor);
312 3448adb0 2022-11-02 op free(mm_title.major);
313 3448adb0 2022-11-02 op free(mm_title.minor);
314 3448adb0 2022-11-02 op free(mm_upper.major);
315 3448adb0 2022-11-02 op free(mm_upper.minor);
316 3448adb0 2022-11-02 op
317 3448adb0 2022-11-02 op return 0;
318 3448adb0 2022-11-02 op }