1 /* See LICENSE file for copyright and license details. */
/*
 * Path of the DerivedCoreProperties data file. Its uses are not
 * visible in this excerpt; presumably it is referenced by entries of
 * the table below -- TODO confirm.
 */
10 #define FILE_DCP "data/DerivedCoreProperties.txt"
/*
 * Specification table of the case properties. It is handed to
 * properties_generate_break_property() in main() and is searched by
 * enumname in handle_conflict(). Only some of the initializer lines
 * are visible here; handle_conflict() additionally looks entries up
 * under the name "CASED".
 */
12 static const struct property_spec case_property[] = {
19 .enumname = "BOTH_CASED_CASE_IGNORABLE",
29 .enumname = "CASE_IGNORABLE",
31 .ucdname = "Case_Ignorable",
34 .enumname = "UNCASED",
/*
 * Conflict handler passed to properties_generate_break_property() in
 * main(). prop1 and prop2 are used as indices into case_property.
 *
 * If one of them names "CASED" and the other "CASE_IGNORABLE" (in
 * either order), the table is scanned by enumname for the
 * "BOTH_CASED_CASE_IGNORABLE" entry; ending the scan with
 * result == LEN(case_property) is reported as an internal error.
 * Every other combination is reported as a conflict that cannot be
 * handled.
 *
 * The loop body, the statements following the two error messages and
 * the return statement are not part of this excerpt; presumably the
 * index of the combined entry is returned -- TODO confirm.
 */
41 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
/* the only pair that is handled: CASED together with CASE_IGNORABLE */
47 if ((!strcmp(case_property[prop1].enumname, "CASED") &&
48 !strcmp(case_property[prop2].enumname, "CASE_IGNORABLE")) ||
49 (!strcmp(case_property[prop1].enumname, "CASE_IGNORABLE") &&
50 !strcmp(case_property[prop2].enumname, "CASED"))) {
/* look the combined property up by name rather than by fixed index */
51 for (result = 0; result < LEN(case_property); result++) {
52 if (!strcmp(case_property[result].enumname,
53 "BOTH_CASED_CASE_IGNORABLE")) {
/* scan ran off the end of the table */
57 if (result == LEN(case_property)) {
58 fprintf(stderr, "handle_conflict: Internal error.\n");
62 fprintf(stderr, "handle_conflict: Cannot handle conflict.\n");
/*
 * Per-codepoint property buffers for the upper-, lower- and titlecase
 * mappings. They are allocated in main() with 0x110000 entries each
 * and filled by unicodedata_callback() and specialcasing_callback().
 */
69 static struct properties *prop_upper = NULL, *prop_lower, *prop_title;
/*
 * One special-case record with an upper, lower and title part; each
 * part is accessed elsewhere in this file through the members cp and
 * cplen. The member declarations and the declaration of the array
 * `sc` that holds these records are not visible in this excerpt.
 */
70 static struct special_case {
74 } upper, lower, title;
/* number of records in sc; incremented in specialcasing_callback() */
76 static size_t sclen = 0;
/*
 * Callback handed to parse_file_with_callback() for
 * "data/UnicodeData.txt" (see main()).
 *
 * field[0] is parsed as the codepoint cp. field[12], field[13] and
 * field[14] are parsed, when non-empty, as the upper-, lower- and
 * titlecase mapping; a mapping that is not given stays equal to cp.
 * The signed difference (mapping - cp) is then stored in
 * prop_upper[cp], prop_lower[cp] and prop_title[cp], so "no mapping"
 * is stored as 0.
 *
 * The return statements are not part of this excerpt.
 */
79 unicodedata_callback(const char *file, char **field, size_t nfields,
80 char *comment, void *payload)
82 uint_least32_t cp, upper, lower, title;
/*
 * NOTE(review): the result of this hextocp() call is ignored, whereas
 * the calls below are tested for a non-zero result -- confirm that a
 * malformed field[0] cannot lead to a bad index further down.
 */
88 hextocp(field[0], strlen(field[0]), &cp);
/* default: every mapping is the identity */
90 upper = lower = title = cp;
/*
 * NOTE(review): only field[14] is guarded by an nfields check;
 * field[12] and field[13] are read unconditionally -- verify that the
 * parser always supplies at least 14 fields.
 */
92 if ((strlen(field[12]) > 0 && hextocp(field[12], strlen(field[12]), &upper)) ||
93 (strlen(field[13]) > 0 && hextocp(field[13], strlen(field[13]), &lower)) ||
94 (nfields >= 15 && strlen(field[14]) > 0 && hextocp(field[14], strlen(field[14]), &title))) {
/* store offsets relative to cp instead of absolute codepoints */
98 prop_upper[cp].property = (int_least32_t)upper - (int_least32_t)cp;
99 prop_lower[cp].property = (int_least32_t)lower - (int_least32_t)cp;
100 prop_title[cp].property = (int_least32_t)title - (int_least32_t)cp;
/*
 * Splits str at single space characters and converts each piece with
 * hextocp() into an element of a freshly calloc()-ed array.
 *
 * str:   space-separated list to parse
 * cp:    receives the pointer to the allocated array
 * cplen: receives the number of elements (number of spaces + 1)
 *
 * A string without any space yields a list of length 1, and that
 * includes the empty string, for which hextocp() is called with
 * length 0. No free() of the array is visible in this excerpt. The
 * loop bodies, the error paths and the return statements are not
 * visible either.
 */
106 parse_cp_list(const char *str, uint_least32_t **cp, size_t *cplen)
109 const char *tmp1 = NULL, *tmp2 = NULL;
111 /* count the number of spaces in the string and infer list length */
112 for (count = 1, tmp1 = str; (tmp2 = strchr(tmp1, ' ')) != NULL; count++, tmp1 = tmp2 + 1)
115 /* allocate resources */
116 if (!(*cp = calloc((*cplen = count), sizeof(**cp)))) {
117 fprintf(stderr, "calloc: %s\n", strerror(errno));
121 /* go through the string again, parsing the numbers */
/* the loop ends after the piece that has no space behind it */
122 for (i = 0, tmp1 = tmp2 = str; tmp2 != NULL; i++) {
123 tmp2 = strchr(tmp1, ' ');
/* piece length: up to the next space, or the rest of the string */
124 if (hextocp(tmp1, tmp2 ? (size_t)(tmp2 - tmp1) : strlen(tmp1), &((*cp)[i]))) {
/*
 * Callback handed to parse_file_with_callback() for
 * "data/SpecialCasing.txt" (see main()).
 *
 * Entries with a non-empty field[4] are treated as conditional rules
 * and discarded. For every other entry the array sc is grown by one
 * record, field[3], field[1] and field[2] are parsed with
 * parse_cp_list() into its upper, lower and title part, and the
 * entries for the codepoint in prop_upper, prop_lower and prop_title
 * are overwritten with 0x110000 + (index of the new record).
 *
 * The return statements are not part of this excerpt.
 */
136 specialcasing_callback(const char *file, char **field, size_t nfields,
137 char *comment, void *payload)
145 if (nfields > 4 && strlen(field[4]) > 0) {
147 * we have more than 4 fields, i.e. the rule has a
148 * condition (language-sensitive, etc.) and is discarded
153 /* parse affected codepoint */
/* NOTE(review): the hextocp() result is ignored here -- confirm */
154 hextocp(field[0], strlen(field[0]), &cp);
156 /* extend special case array */
/*
 * NOTE(review): realloc() overwrites sc directly, so the old block
 * would be unreachable on failure; harmless only if the (not visible)
 * failure branch terminates the program -- confirm.
 */
157 if (!(sc = realloc(sc, (++sclen) * sizeof(*sc)))) {
158 fprintf(stderr, "realloc: %s\n", strerror(errno));
162 /* parse field data */
/*
 * field order in the file is lower (1), title (2), upper (3)
 * NOTE(review): the results of parse_cp_list() are not checked
 */
163 parse_cp_list(field[3], &(sc[sclen - 1].upper.cp),
164 &(sc[sclen - 1].upper.cplen));
165 parse_cp_list(field[1], &(sc[sclen - 1].lower.cp),
166 &(sc[sclen - 1].lower.cplen));
167 parse_cp_list(field[2], &(sc[sclen - 1].title.cp),
168 &(sc[sclen - 1].title.cplen));
171 * overwrite value in "single mapping" property table by the
172 * special value 0x110000 + (offset in special case array),
173 * even if the special case has length 1
175 prop_upper[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
176 prop_lower[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
177 prop_title[cp].property = (int_least64_t)(UINT32_C(0x110000) + (sclen - 1));
/*
 * Accessor handed to properties_print_derived_lookup_table() in
 * main(): returns the property member of the entry at index offset
 * in prop.
 */
183 get_value(const struct properties *prop, size_t offset)
185 return prop[offset].property;
/*
 * Generator entry point. Visible steps, in order:
 *
 *  1. generate the case property table from case_property
 *  2. allocate the three per-codepoint mapping buffers
 *  3. fill them from "data/UnicodeData.txt" and
 *     "data/SpecialCasing.txt" via the two callbacks
 *  4. compress each buffer and derive a major/minor lookup split,
 *     reporting the values returned by properties_get_major_minor()
 *     on stderr as compression ratios
 *  5. print the generated header text to stdout: a struct special_case
 *     definition, the major and minor lookup tables and the three
 *     *_special arrays built from sc
 *  6. free the compression and major/minor buffers
 *
 * No free() of prop_upper, prop_lower, prop_title or sc is visible in
 * this excerpt; the return statement is not visible either.
 */
189 main(int argc, char *argv[])
191 struct properties_compressed comp_upper, comp_lower, comp_title;
192 struct properties_major_minor mm_upper, mm_lower, mm_title;
197 /* generate case property table from the specification */
198 properties_generate_break_property(case_property,
200 handle_conflict, NULL, "case",
204 * allocate property buffers for all 0x110000 codepoints
206 * the buffers contain the offset from the "base" character
207 * to the respective case mapping. By callocing we set all fields
208 * to zero, which is also the Unicode "default" in the sense that
209 * there is no case mapping by default (unless we fill it in)
211 if (!(prop_upper = calloc(UINT32_C(0x110000), sizeof(*prop_upper))) ||
212 !(prop_lower = calloc(UINT32_C(0x110000), sizeof(*prop_lower))) ||
213 !(prop_title = calloc(UINT32_C(0x110000), sizeof(*prop_title)))) {
214 fprintf(stderr, "calloc: %s\n", strerror(errno));
/*
 * simple mappings are read first; the special casing pass comes
 * second and overwrites the affected entries
 */
217 parse_file_with_callback("data/UnicodeData.txt", unicodedata_callback,
219 parse_file_with_callback("data/SpecialCasing.txt", specialcasing_callback,
222 /* compress properties */
223 properties_compress(prop_upper, &comp_upper);
224 properties_compress(prop_lower, &comp_lower);
225 properties_compress(prop_title, &comp_title);
/*
 * the properties_get_major_minor() calls fill mm_* as well, so this
 * diagnostic must run before the tables are printed
 */
227 fprintf(stderr, "%s: LUT compression-ratios: upper=%.2f%%, lower=%.2f%%, title=%.2f%%\n",
228 argv[0], properties_get_major_minor(&comp_upper, &mm_upper),
229 properties_get_major_minor(&comp_lower, &mm_lower),
230 properties_get_major_minor(&comp_title, &mm_title));
/* everything printed to stdout from here on is the generated header */
233 printf("/* Automatically generated by %s */\n#include <stdint.h>\n#include <stddef.h>\n\n", argv[0]);
235 printf("struct special_case {\n\tuint_least32_t *cp;\n\tsize_t cplen;\n};\n\n");
/* one major table of 0x1100 entries plus one minor table per case */
237 properties_print_lookup_table("upper_major", mm_upper.major, 0x1100);
239 properties_print_derived_lookup_table("upper_minor", "int_least32_t", mm_upper.minor,
240 mm_upper.minorlen, get_value, comp_upper.data);
242 properties_print_lookup_table("lower_major", mm_lower.major, 0x1100);
244 properties_print_derived_lookup_table("lower_minor", "int_least32_t", mm_lower.minor,
245 mm_lower.minorlen, get_value, comp_lower.data);
247 properties_print_lookup_table("title_major", mm_title.major, 0x1100);
249 properties_print_derived_lookup_table("title_minor", "int_least32_t", mm_title.minor,
250 mm_title.minorlen, get_value, comp_title.data);
/*
 * The three loops below emit the same layout for the upper, lower and
 * title part of every record in sc.
 * NOTE(review): "%06X" expects an unsigned int argument, but cp[j] is
 * a uint_least32_t (parse_cp_list() fills these arrays through a
 * uint_least32_t **); consider PRIXLEAST32 from <inttypes.h>.
 */
253 printf("static const struct special_case upper_special[] = {\n");
254 for (i = 0; i < sclen; i++) {
257 printf("\t\t.cp = (uint_least32_t[]){");
258 for (j = 0; j < sc[i].upper.cplen; j++) {
259 printf(" UINT32_C(0x%06X)", sc[i].upper.cp[j]);
/* true for every element except the last one */
260 if (j + 1 < sc[i].upper.cplen) {
265 printf("\t\t.cplen = %zu,\n", sc[i].upper.cplen);
270 printf("static const struct special_case lower_special[] = {\n");
271 for (i = 0; i < sclen; i++) {
274 printf("\t\t.cp = (uint_least32_t[]){");
275 for (j = 0; j < sc[i].lower.cplen; j++) {
276 printf(" UINT32_C(0x%06X)", sc[i].lower.cp[j]);
277 if (j + 1 < sc[i].lower.cplen) {
282 printf("\t\t.cplen = %zu,\n", sc[i].lower.cplen);
287 printf("static const struct special_case title_special[] = {\n");
288 for (i = 0; i < sclen; i++) {
291 printf("\t\t.cp = (uint_least32_t[]){");
292 for (j = 0; j < sc[i].title.cplen; j++) {
293 printf(" UINT32_C(0x%06X)", sc[i].title.cp[j]);
294 if (j + 1 < sc[i].title.cplen) {
299 printf("\t\t.cplen = %zu,\n", sc[i].title.cplen);
/* release the buffers of the compression and major/minor steps */
304 free(comp_lower.data);
305 free(comp_lower.offset);
306 free(comp_title.data);
307 free(comp_title.offset);
308 free(comp_upper.data);
309 free(comp_upper.offset);
310 free(mm_lower.major);
311 free(mm_lower.minor);
312 free(mm_title.major);
313 free(mm_title.minor);
314 free(mm_upper.major);
315 free(mm_upper.minor);