1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <errno.h>
3 3448adb0 2022-11-02 op #include <stdint.h>
4 3448adb0 2022-11-02 op #include <stdio.h>
5 3448adb0 2022-11-02 op #include <stdlib.h>
6 3448adb0 2022-11-02 op #include <string.h>
8 3448adb0 2022-11-02 op #include "util.h"
10 3448adb0 2022-11-02 op #define FILE_DCP "data/DerivedCoreProperties.txt"
12 3448adb0 2022-11-02 op static const struct property_spec case_property[] = {
14 3448adb0 2022-11-02 op .enumname = "OTHER",
16 3448adb0 2022-11-02 op .ucdname = NULL,
19 3448adb0 2022-11-02 op .enumname = "BOTH_CASED_CASE_IGNORABLE",
21 3448adb0 2022-11-02 op .ucdname = NULL,
24 3448adb0 2022-11-02 op .enumname = "CASED",
25 3448adb0 2022-11-02 op .file = FILE_DCP,
26 3448adb0 2022-11-02 op .ucdname = "Cased",
29 3448adb0 2022-11-02 op .enumname = "CASE_IGNORABLE",
30 3448adb0 2022-11-02 op .file = FILE_DCP,
31 3448adb0 2022-11-02 op .ucdname = "Case_Ignorable",
34 3448adb0 2022-11-02 op .enumname = "UNCASED",
35 3448adb0 2022-11-02 op .file = FILE_DCP,
36 3448adb0 2022-11-02 op .ucdname = "Uncased",
40 3448adb0 2022-11-02 op static uint_least8_t
41 3448adb0 2022-11-02 op handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
43 3448adb0 2022-11-02 op uint_least8_t result;
47 3448adb0 2022-11-02 op if ((!strcmp(case_property[prop1].enumname, "CASED") &&
48 3448adb0 2022-11-02 op !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
49 3448adb0 2022-11-02 op (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
50 3448adb0 2022-11-02 op !strcmp(case_property[prop2].enumname, "CASED"))) {
51 3448adb0 2022-11-02 op for (result = 0; result < LEN(case_property); result++) {
52 3448adb0 2022-11-02 op if (!strcmp(case_property[result].enumname,
53 3448adb0 2022-11-02 op "BOTH_CASED_CASE_IGNORABLE")) {
57 3448adb0 2022-11-02 op if (result == LEN(case_property)) {
58 3448adb0 2022-11-02 op fprintf(stderr, "handle_conflict: Internal error.\n");
62 3448adb0 2022-11-02 op fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
66 3448adb0 2022-11-02 op return result;
/*
 * Per-codepoint case-mapping tables (upper, lower, title). They are
 * allocated in main() and written by the two parser callbacks.
 */
static struct properties *prop_upper = NULL;
static struct properties *prop_lower = NULL;
static struct properties *prop_title = NULL;

/*
 * One special-casing rule: for each of the three case forms the list
 * of codepoints the source codepoint maps to.
 */
struct special_case {
	struct {
		uint_least32_t *cp; /* heap-allocated codepoint list */
		size_t cplen;       /* number of entries in cp */
	} upper, lower, title;
};

/* growable array of all accepted special-casing rules and its length */
static struct special_case *sc = NULL;
static size_t sclen = 0;

79 3448adb0 2022-11-02 op unicodedata_callback(const char *file, char **field, size_t nfields,
80 3448adb0 2022-11-02 op char *comment, void *payload)
82 3448adb0 2022-11-02 op uint_least32_t cp, upper, lower, title;
85 3448adb0 2022-11-02 op (void)comment;
86 3448adb0 2022-11-02 op (void)payload;
88 3448adb0 2022-11-02 op hextocp(field[0], strlen(field[0]), &cp);
90 3448adb0 2022-11-02 op upper = lower = title = cp;
92 3448adb0 2022-11-02 op if ((strlen(field[12]) > 0 && hextocp(field[12], strlen(field[12]), &upper)) ||
93 3448adb0 2022-11-02 op (strlen(field[13]) > 0 && hextocp(field[13], strlen(field[13]), &lower)) ||
94 3448adb0 2022-11-02 op (nfields >= 15 && strlen(field[14]) > 0 && hextocp(field[14], strlen(field[14]), &title))) {
98 3448adb0 2022-11-02 op prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
99 3448adb0 2022-11-02 op prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
100 3448adb0 2022-11-02 op prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;
/*
 * Parse a list of hexadecimal codepoints separated by single spaces.
 *
 * The list length is taken to be the number of spaces in str plus one.
 * An array of that many codepoints is allocated and stored in *cp, its
 * length in *cplen; the caller owns the array. Allocation failure is
 * reported on stderr and terminates the program.
 *
 * Returns 0 on success and 1 as soon as one token fails to parse; in
 * that case *cp and *cplen keep the allocated array and its length.
 */
static int
parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
{
	const char *token, *sep, *p;
	size_t ntokens, toklen, idx;

	/* every space separates two tokens */
	ntokens = 1;
	for (p = str; *p != '\0'; p++) {
		if (*p == ' ') {
			ntokens++;
		}
	}

	*cplen = ntokens;
	*cp = calloc(ntokens, sizeof(**cp));
	if (*cp == NULL) {
		fprintf(stderr, "calloc: %s\n", strerror(errno));
		exit(1);
	}

	/* second pass: convert token by token */
	idx = 0;
	token = str;
	do {
		sep = strchr(token, ' ');
		toklen = (sep != NULL) ? (size_t)(sep - token) : strlen(token);

		if (hextocp(token, toklen, &((*cp)[idx]))) {
			return 1;
		}
		idx++;

		if (sep != NULL) {
			token = sep + 1;
		}
	} while (sep != NULL);

	return 0;
}

136 3448adb0 2022-11-02 op specialcasing_callback(const char *file, char **field, size_t nfields,
137 3448adb0 2022-11-02 op char *comment, void *payload)
139 3448adb0 2022-11-02 op uint_least32_t cp;
142 3448adb0 2022-11-02 op (void)comment;
143 3448adb0 2022-11-02 op (void)payload;
145 3448adb0 2022-11-02 op if (nfields > 4 && strlen(field[4]) > 0) {
147 3448adb0 2022-11-02 op * we have more than 4 fields, i.e. the rule has a
148 3448adb0 2022-11-02 op * condition (language-sensitive, etc.) and is discarded
153 3448adb0 2022-11-02 op /* parse affected codepoint */
154 3448adb0 2022-11-02 op hextocp(field[0], strlen(field[0]), &cp);
156 3448adb0 2022-11-02 op /* extend special case array */
157 3448adb0 2022-11-02 op if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) {
158 3448adb0 2022-11-02 op fprintf(stderr, "realloc: %s\n", strerror(errno));
162 3448adb0 2022-11-02 op /* parse field data */
163 3448adb0 2022-11-02 op parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
164 3448adb0 2022-11-02 op &(sc[sclen - 1].upper.cplen));
165 3448adb0 2022-11-02 op parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
166 3448adb0 2022-11-02 op &(sc[sclen - 1].lower.cplen));
167 3448adb0 2022-11-02 op parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
168 3448adb0 2022-11-02 op &(sc[sclen - 1].title.cplen));
171 3448adb0 2022-11-02 op * overwrite value in "single mapping" property table by the
172 3448adb0 2022-11-02 op * special value 0x110000 + (offset in special case array),
173 3448adb0 2022-11-02 op * even if the special case has length 1
175 3448adb0 2022-11-02 op prop_upper[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
176 3448adb0 2022-11-02 op prop_lower[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
177 3448adb0 2022-11-02 op prop_title[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
182 3448adb0 2022-11-02 op static int_least64_t
183 3448adb0 2022-11-02 op get_value(const struct properties *prop, size_t offset)
185 3448adb0 2022-11-02 op return prop[offset].property;
/*
 * Generator entry point (as far as it is visible here).
 *
 * Steps: generate the case property table from case_property, allocate
 * the three per-codepoint mapping tables, fill them from
 * data/UnicodeData.txt and data/SpecialCasing.txt through the two
 * callbacks, compress the tables, and print the resulting lookup
 * tables and special-case arrays as C source on stdout. The
 * compression ratios are reported on stderr.
 *
 * NOTE(review): this dump shows neither the return type nor the end of
 * the function, so the exit status is not documented here.
 */
189 3448adb0 2022-11-02 op main(int argc, char *argv[])
191 3448adb0 2022-11-02 op struct properties_compressed comp_upper, comp_lower, comp_title;
192 3448adb0 2022-11-02 op struct properties_major_minor mm_upper, mm_lower, mm_title;
197 3448adb0 2022-11-02 op /* generate case property table from the specification */
198 3448adb0 2022-11-02 op properties_generate_break_property(case_property,
199 3448adb0 2022-11-02 op LEN(case_property),
200 3448adb0 2022-11-02 op handle_conflict, NULL, "case",
204 3448adb0 2022-11-02 op * allocate property buffers for all 0x110000 codepoints
206 3448adb0 2022-11-02 op * the buffers contain the offset from the "base" character
207 3448adb0 2022-11-02 op * to the respective case mapping. By callocing we set all fields
208 3448adb0 2022-11-02 op * to zero, which is also the Unicode "default" in the sense that
209 3448adb0 2022-11-02 op * there is no case mapping by default (unless we fill it in)
211 3448adb0 2022-11-02 op if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
212 3448adb0 2022-11-02 op !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
213 3448adb0 2022-11-02 op !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
214 3448adb0 2022-11-02 op fprintf(stderr, "calloc: %s\n", strerror(errno));
/*
 * fill the tables: unicodedata_callback writes the simple mappings,
 * specialcasing_callback afterwards overwrites the entries of
 * codepoints that have an unconditional special-casing rule and
 * appends the rule to sc
 */
217 3448adb0 2022-11-02 op parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
219 3448adb0 2022-11-02 op parse_file_with_callback("data/SpecialCasing.txt", specialcasing_callback,
222 3448adb0 2022-11-02 op /* compress properties */
223 3448adb0 2022-11-02 op properties_compress(prop_upper, &comp_upper);
224 3448adb0 2022-11-02 op properties_compress(prop_lower, &comp_lower);
225 3448adb0 2022-11-02 op properties_compress(prop_title, &comp_title);
/*
 * the properties_get_major_minor() calls sit inside the argument list:
 * their return values are printed as the ratios, and the mm_* structs
 * passed to them are the ones whose major/minor members are printed
 * below (presumably filled in by these calls; confirm in util)
 */
227 3448adb0 2022-11-02 op fprintf(stderr, "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, title=%.2f%%\n",
228 3448adb0 2022-11-02 op argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
229 3448adb0 2022-11-02 op properties_get_major_minor(&comp_lower, &mm_lower),
230 3448adb0 2022-11-02 op properties_get_major_minor(&comp_title, &mm_title));
232 3448adb0 2022-11-02 op /* print tables */
233 3448adb0 2022-11-02 op printf("/* Automatically generated by %s */\n#include <stdint.h>\n#include <stddef.h>\n\n", argv[0]);
/*
 * the struct emitted here belongs to the generated file and is flat
 * (cp, cplen); it is not the nested struct special_case that this
 * program uses for sc
 */
235 3448adb0 2022-11-02 op printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t cplen;\n};\n\n");
/*
 * one major and one minor table per case form; the minor values are
 * read through get_value and emitted with element type int_least32_t.
 * NOTE(review): 0x1100 is presumably 0x110000 / 0x100 major entries;
 * confirm against properties_get_major_minor
 */
237 3448adb0 2022-11-02 op properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
238 3448adb0 2022-11-02 op printf("\n");
239 3448adb0 2022-11-02 op properties_print_derived_lookup_table("upper_minor", "int_least32_t", mm_upper.minor,
240 3448adb0 2022-11-02 op mm_upper.minorlen, get_value, comp_upper.data);
241 3448adb0 2022-11-02 op printf("\n");
242 3448adb0 2022-11-02 op properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
243 3448adb0 2022-11-02 op printf("\n");
244 3448adb0 2022-11-02 op properties_print_derived_lookup_table("lower_minor", "int_least32_t", mm_lower.minor,
245 3448adb0 2022-11-02 op mm_lower.minorlen, get_value, comp_lower.data);
246 3448adb0 2022-11-02 op printf("\n");
247 3448adb0 2022-11-02 op properties_print_lookup_table("title_major", mm_title.major, 0x1100);
248 3448adb0 2022-11-02 op printf("\n");
249 3448adb0 2022-11-02 op properties_print_derived_lookup_table("title_minor", "int_least32_t", mm_title.minor,
250 3448adb0 2022-11-02 op mm_title.minorlen, get_value, comp_title.data);
251 3448adb0 2022-11-02 op printf("\n");
/*
 * the three arrays below all have sclen entries in the same order, so
 * a single index (table value minus 0x110000) selects the rule in each
 */
253 3448adb0 2022-11-02 op printf("static const struct special_case upper_special[] = {\n");
254 3448adb0 2022-11-02 op for (i = 0; i < sclen; i++) {
255 3448adb0 2022-11-02 op printf("\t{\n");
257 3448adb0 2022-11-02 op printf("\t\t.cp = (uint_least32_t[]){");
258 3448adb0 2022-11-02 op for (j = 0; j < sc[i].upper.cplen; j++) {
/*
 * NOTE(review): the X conversion expects unsigned int while cp[j] is
 * uint_least32_t; confirm the two match on all targets or switch to
 * PRIXLEAST32. The same applies to the lower and title loops below.
 */
259 3448adb0 2022-11-02 op printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]);
/* separate entries with a comma, none after the last one */
260 3448adb0 2022-11-02 op if (j + 1 < sc[i].upper.cplen) {
261 3448adb0 2022-11-02 op putchar(',');
264 3448adb0 2022-11-02 op printf(" },\n");
265 3448adb0 2022-11-02 op printf("\t\t.cplen = %zu,\n", sc[i].upper.cplen);
266 3448adb0 2022-11-02 op printf("\t},\n");
268 3448adb0 2022-11-02 op printf("};\n\n");
270 3448adb0 2022-11-02 op printf("static const struct special_case lower_special[] = {\n");
271 3448adb0 2022-11-02 op for (i = 0; i < sclen; i++) {
272 3448adb0 2022-11-02 op printf("\t{\n");
274 3448adb0 2022-11-02 op printf("\t\t.cp = (uint_least32_t[]){");
275 3448adb0 2022-11-02 op for (j = 0; j < sc[i].lower.cplen; j++) {
276 3448adb0 2022-11-02 op printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]);
277 3448adb0 2022-11-02 op if (j + 1 < sc[i].lower.cplen) {
278 3448adb0 2022-11-02 op putchar(',');
281 3448adb0 2022-11-02 op printf(" },\n");
282 3448adb0 2022-11-02 op printf("\t\t.cplen = %zu,\n", sc[i].lower.cplen);
283 3448adb0 2022-11-02 op printf("\t},\n");
285 3448adb0 2022-11-02 op printf("};\n\n");
287 3448adb0 2022-11-02 op printf("static const struct special_case title_special[] = {\n");
288 3448adb0 2022-11-02 op for (i = 0; i < sclen; i++) {
289 3448adb0 2022-11-02 op printf("\t{\n");
291 3448adb0 2022-11-02 op printf("\t\t.cp = (uint_least32_t[]){");
292 3448adb0 2022-11-02 op for (j = 0; j < sc[i].title.cplen; j++) {
293 3448adb0 2022-11-02 op printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]);
294 3448adb0 2022-11-02 op if (j + 1 < sc[i].title.cplen) {
295 3448adb0 2022-11-02 op putchar(',');
298 3448adb0 2022-11-02 op printf(" },\n");
299 3448adb0 2022-11-02 op printf("\t\t.cplen = %zu,\n", sc[i].title.cplen);
300 3448adb0 2022-11-02 op printf("\t},\n");
302 3448adb0 2022-11-02 op printf("};\n\n");
/*
 * release the compressed tables and the major/minor tables.
 * NOTE(review): prop_upper, prop_lower, prop_title, sc and the cp lists
 * inside sc are not freed in the part of the function visible here
 */
304 3448adb0 2022-11-02 op free(comp_lower.data);
305 3448adb0 2022-11-02 op free(comp_lower.offset);
306 3448adb0 2022-11-02 op free(comp_title.data);
307 3448adb0 2022-11-02 op free(comp_title.offset);
308 3448adb0 2022-11-02 op free(comp_upper.data);
309 3448adb0 2022-11-02 op free(comp_upper.offset);
310 3448adb0 2022-11-02 op free(mm_lower.major);
311 3448adb0 2022-11-02 op free(mm_lower.minor);
312 3448adb0 2022-11-02 op free(mm_title.major);
313 3448adb0 2022-11-02 op free(mm_title.minor);
314 3448adb0 2022-11-02 op free(mm_upper.major);
315 3448adb0 2022-11-02 op free(mm_upper.minor);