Blob


1 /* See LICENSE file for copyright and license details. */
2 #include <stdio.h>
3 #include <stdlib.h>
4 #include <string.h>
6 #include "util.h"
8 #define FILE_EAW "data/EastAsianWidth.txt"
9 #define FILE_EMOJI "data/emoji-data.txt"
10 #define FILE_LINE "data/LineBreak.txt"
12 static const struct property_spec line_break_property[] = {
13 {
14 .enumname = "AL",
15 .file = FILE_LINE,
16 .ucdname = "AL",
17 },
18 /*
19 * Both extended pictographic and cn are large classes,
20 * but we are only interested in their intersection for LB30b,
21 * so we have the following two temporary classes. At first
22 * the extpict-class is filled, then the cn-class, which leads
23 * to conflicts (that we handle by putting them in the "proper"
24 * class BOTH_CN_EXTPICT). We make use of the fact that there
25 * is no intersection between AL and Cn.
26 *
27 * Any consecutive conflicts are permitted to overwrite
28 * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need
29 * them, and in the final postprocessing we "reset" all
30 * remaining matches (that then didn't fit any of the other
31 * classes) to the generic class AL.
32 */
33 {
34 .enumname = "TMP_CN",
35 .file = FILE_LINE,
36 .ucdname = "Cn",
37 },
38 {
39 .enumname = "TMP_EXTENDED_PICTOGRAPHIC",
40 .file = FILE_EMOJI,
41 .ucdname = "Extended_Pictographic",
42 },
43 /* end of special block */
44 {
45 .enumname = "B2",
46 .file = FILE_LINE,
47 .ucdname = "B2",
48 },
49 {
50 .enumname = "BA",
51 .file = FILE_LINE,
52 .ucdname = "BA",
53 },
54 {
55 .enumname = "BB",
56 .file = FILE_LINE,
57 .ucdname = "BB",
58 },
59 {
60 .enumname = "BK",
61 .file = FILE_LINE,
62 .ucdname = "BK",
63 },
64 {
65 .enumname = "BOTH_CN_EXTPICT",
66 .file = NULL,
67 .ucdname = NULL,
68 },
69 {
70 .enumname = "CB",
71 .file = FILE_LINE,
72 .ucdname = "CB",
73 },
74 {
75 .enumname = "CL",
76 .file = FILE_LINE,
77 .ucdname = "CL",
78 },
79 {
80 .enumname = "CM",
81 .file = FILE_LINE,
82 .ucdname = "CM",
83 },
84 {
85 .enumname = "CP_WITHOUT_EAW_HWF",
86 .file = FILE_LINE,
87 .ucdname = "CP",
88 },
89 {
90 .enumname = "CP_WITH_EAW_HWF",
91 .file = NULL,
92 .ucdname = NULL,
93 },
94 {
95 .enumname = "CR",
96 .file = FILE_LINE,
97 .ucdname = "CR",
98 },
99 {
100 .enumname = "EB",
101 .file = FILE_LINE,
102 .ucdname = "EB",
103 },
105 .enumname = "EM",
106 .file = FILE_LINE,
107 .ucdname = "EM",
108 },
110 .enumname = "EX",
111 .file = FILE_LINE,
112 .ucdname = "EX",
113 },
115 .enumname = "GL",
116 .file = FILE_LINE,
117 .ucdname = "GL",
118 },
120 .enumname = "H2",
121 .file = FILE_LINE,
122 .ucdname = "H2",
123 },
125 .enumname = "H3",
126 .file = FILE_LINE,
127 .ucdname = "H3",
128 },
130 .enumname = "HL",
131 .file = FILE_LINE,
132 .ucdname = "HL",
133 },
135 .enumname = "HY",
136 .file = FILE_LINE,
137 .ucdname = "HY",
138 },
140 .enumname = "ID",
141 .file = FILE_LINE,
142 .ucdname = "ID",
143 },
145 .enumname = "IN",
146 .file = FILE_LINE,
147 .ucdname = "IN",
148 },
150 .enumname = "IS",
151 .file = FILE_LINE,
152 .ucdname = "IS",
153 },
155 .enumname = "JL",
156 .file = FILE_LINE,
157 .ucdname = "JL",
158 },
160 .enumname = "JT",
161 .file = FILE_LINE,
162 .ucdname = "JT",
163 },
165 .enumname = "JV",
166 .file = FILE_LINE,
167 .ucdname = "JV",
168 },
170 .enumname = "LF",
171 .file = FILE_LINE,
172 .ucdname = "LF",
173 },
175 .enumname = "NL",
176 .file = FILE_LINE,
177 .ucdname = "NL",
178 },
180 .enumname = "NS",
181 .file = FILE_LINE,
182 .ucdname = "NS",
183 },
185 .enumname = "NU",
186 .file = FILE_LINE,
187 .ucdname = "NU",
188 },
190 .enumname = "OP_WITHOUT_EAW_HWF",
191 .file = FILE_LINE,
192 .ucdname = "OP",
193 },
195 .enumname = "OP_WITH_EAW_HWF",
196 .file = NULL,
197 .ucdname = NULL,
198 },
200 .enumname = "PO",
201 .file = FILE_LINE,
202 .ucdname = "PO",
203 },
205 .enumname = "PR",
206 .file = FILE_LINE,
207 .ucdname = "PR",
208 },
210 .enumname = "QU",
211 .file = FILE_LINE,
212 .ucdname = "QU",
213 },
215 .enumname = "RI",
216 .file = FILE_LINE,
217 .ucdname = "RI",
218 },
220 .enumname = "SP",
221 .file = FILE_LINE,
222 .ucdname = "SP",
223 },
225 .enumname = "SY",
226 .file = FILE_LINE,
227 .ucdname = "SY",
228 },
230 .enumname = "WJ",
231 .file = FILE_LINE,
232 .ucdname = "WJ",
233 },
235 .enumname = "ZW",
236 .file = FILE_LINE,
237 .ucdname = "ZW",
238 },
240 .enumname = "ZWJ",
241 .file = FILE_LINE,
242 .ucdname = "ZWJ",
243 },
245 .enumname = "TMP_AI",
246 .file = FILE_LINE,
247 .ucdname = "AI",
248 },
250 .enumname = "TMP_CJ",
251 .file = FILE_LINE,
252 .ucdname = "CJ",
253 },
255 .enumname = "TMP_XX",
256 .file = NULL,
257 .ucdname = NULL,
258 },
260 .enumname = "TMP_MN",
261 .file = FILE_LINE,
262 .ucdname = "Mn",
263 },
265 .enumname = "TMP_MC",
266 .file = FILE_LINE,
267 .ucdname = "Mc",
268 },
270 .enumname = "TMP_SA_WITHOUT_MN_OR_MC",
271 .file = FILE_LINE,
272 .ucdname = "SA",
273 },
275 .enumname = "TMP_SA_WITH_MN_OR_MC",
276 .file = FILE_LINE,
277 .ucdname = "SA",
278 },
280 .enumname = "TMP_SG",
281 .file = FILE_LINE,
282 .ucdname = "SG",
283 },
285 .enumname = "TMP_EAW_H",
286 .file = FILE_EAW,
287 .ucdname = "H",
288 },
290 .enumname = "TMP_EAW_W",
291 .file = FILE_EAW,
292 .ucdname = "W",
293 },
295 .enumname = "TMP_EAW_F",
296 .file = FILE_EAW,
297 .ucdname = "F",
298 },
299 };
301 static uint_least8_t
302 handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
304 uint_least8_t result = prop2;
305 char *target = NULL;
307 (void)cp;
309 if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
310 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
311 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) ||
312 (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") ||
313 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") ||
314 !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) {
315 if (!strcmp(line_break_property[prop1].enumname, "CP_WITHOUT_EAW_HWF") ||
316 !strcmp(line_break_property[prop2].enumname, "CP_WITHOUT_EAW_HWF")) {
317 target = "CP_WITH_EAW_HWF";
318 } else if (!strcmp(line_break_property[prop1].enumname, "OP_WITHOUT_EAW_HWF") ||
319 !strcmp(line_break_property[prop2].enumname, "OP_WITHOUT_EAW_HWF")) {
320 target = "OP_WITH_EAW_HWF";
321 } else {
322 /* ignore EAW for the rest */
323 if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
324 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
325 !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F"))) {
326 result = prop2;
327 } else {
328 result = prop1;
331 } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
332 !strcmp(line_break_property[prop1].enumname, "TMP_MC")) ||
333 (!strcmp(line_break_property[prop2].enumname, "TMP_MN") ||
334 !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) {
335 if (!strcmp(line_break_property[prop1].enumname, "SA_WITHOUT_MN_OR_MC") ||
336 !strcmp(line_break_property[prop2].enumname, "SA_WITHOUT_MN_OR_MC")) {
337 target = "SA_WITH_MN_OR_MC";
338 } else {
339 /* ignore Mn and Mc for the rest */
340 if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
341 !strcmp(line_break_property[prop1].enumname, "TMP_MC"))) {
342 result = prop2;
343 } else {
344 result = prop1;
347 } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
348 !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
349 if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
350 !strcmp(line_break_property[prop2].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
351 target = "BOTH_CN_EXTPICT";
352 } else {
353 /* ignore Cn for all the other properties */
354 if (!strcmp(line_break_property[prop1].enumname, "TMP_CN")) {
355 result = prop2;
356 } else {
357 result = prop1;
360 } else if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
361 !strcmp(line_break_property[prop2].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
362 if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
363 !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
364 target = "BOTH_CN_EXTPICT";
365 } else {
366 /* ignore Extended_Pictographic for all the other properties */
367 if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
368 result = prop2;
369 } else {
370 result = prop1;
373 } else {
374 fprintf(stderr, "handle_conflict: Cannot handle conflict %s <- %s.\n",
375 line_break_property[prop1].enumname, line_break_property[prop2].enumname);
376 exit(1);
379 if (target) {
380 for (result = 0; result < LEN(line_break_property); result++) {
381 if (!strcmp(line_break_property[result].enumname,
382 target)) {
383 break;
386 if (result == LEN(line_break_property)) {
387 fprintf(stderr, "handle_conflict: Internal error.\n");
388 exit(1);
392 return result;
395 static uint_least8_t
396 post_process(uint_least8_t prop)
398 const char *target = NULL;
399 uint_least8_t result;
401 /* LB1 */
402 if (!strcmp(line_break_property[prop].enumname, "TMP_AI") ||
403 !strcmp(line_break_property[prop].enumname, "TMP_SG") ||
404 !strcmp(line_break_property[prop].enumname, "TMP_XX")) {
405 /* map AI, SG and XX to AL */
406 target = "AL";
407 } else if (!strcmp(line_break_property[prop].enumname, "TMP_SA_WITH_MN_OR_MC")) {
408 /* map SA (with General_Category Mn or Mc) to CM */
409 target = "CM";
410 } else if (!strcmp(line_break_property[prop].enumname, "TMP_SA_WITHOUT_MN_OR_MC")) {
411 /* map SA (without General_Category Mn or Mc) to AL */
412 target = "AL";
413 } else if (!strcmp(line_break_property[prop].enumname, "TMP_CJ")) {
414 /* map CJ to NS */
415 target = "NS";
416 } else if (!strcmp(line_break_property[prop].enumname, "TMP_CN") ||
417 !strcmp(line_break_property[prop].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
418 !strcmp(line_break_property[prop].enumname, "TMP_MN") ||
419 !strcmp(line_break_property[prop].enumname, "TMP_MC") ||
420 !strcmp(line_break_property[prop].enumname, "TMP_EAW_H") ||
421 !strcmp(line_break_property[prop].enumname, "TMP_EAW_W") ||
422 !strcmp(line_break_property[prop].enumname, "TMP_EAW_F")) {
423 /* map all the temporary classes "residue" to AL */
424 target = "AL";
427 if (target) {
428 for (result = 0; result < LEN(line_break_property); result++) {
429 if (!strcmp(line_break_property[result].enumname,
430 target)) {
431 break;
434 if (result == LEN(line_break_property)) {
435 fprintf(stderr, "handle_conflict: Internal error.\n");
436 exit(1);
439 return result;
440 } else {
441 return prop;
445 int
446 main(int argc, char *argv[])
448 (void)argc;
450 properties_generate_break_property(line_break_property,
451 LEN(line_break_property),
452 handle_conflict, post_process,
453 "line_break", argv[0]);
455 return 0;