1 3448adb0 2022-11-02 op /* See LICENSE file for copyright and license details. */
2 3448adb0 2022-11-02 op #include <stdio.h>
3 3448adb0 2022-11-02 op #include <stdlib.h>
4 3448adb0 2022-11-02 op #include <string.h>
6 3448adb0 2022-11-02 op #include "util.h"
8 3448adb0 2022-11-02 op #define FILE_EAW "data/EastAsianWidth.txt"
9 3448adb0 2022-11-02 op #define FILE_EMOJI "data/emoji-data.txt"
10 3448adb0 2022-11-02 op #define FILE_LINE "data/LineBreak.txt"
12 3448adb0 2022-11-02 op static const struct property_spec line_break_property[] = {
14 3448adb0 2022-11-02 op .enumname = "AL",
15 3448adb0 2022-11-02 op .file = FILE_LINE,
16 3448adb0 2022-11-02 op .ucdname = "AL",
19 3448adb0 2022-11-02 op * Both extended pictographic and cn are large classes,
20 3448adb0 2022-11-02 op * but we are only interested in their intersection for LB30b,
21 3448adb0 2022-11-02 op * so we have the following two temporary classes. At first
22 3448adb0 2022-11-02 op * the extpict-class is filled, then the cn-class, which leads
23 3448adb0 2022-11-02 op * to conflicts (that we handle by putting them in the "proper"
24 3448adb0 2022-11-02 op * class BOTH_CN_EXTPICT). We make use of the fact that there
25 3448adb0 2022-11-02 op * is no intersection between AL and Cn.
27 3448adb0 2022-11-02 op * Any consecutive conflicts are permitted to overwrite
28 3448adb0 2022-11-02 op * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need
29 3448adb0 2022-11-02 op * them, and in the final postprocessing we "reset" all
30 3448adb0 2022-11-02 op * remaining matches (that then didn't fit any of the other
31 3448adb0 2022-11-02 op * classes) to the generic class AL.
34 3448adb0 2022-11-02 op .enumname = "TMP_CN",
35 3448adb0 2022-11-02 op .file = FILE_LINE,
36 3448adb0 2022-11-02 op .ucdname = "Cn",
39 3448adb0 2022-11-02 op .enumname = "TMP_EXTENDED_PICTOGRAPHIC",
40 3448adb0 2022-11-02 op .file = FILE_EMOJI,
41 3448adb0 2022-11-02 op .ucdname = "Extended_Pictographic",
43 3448adb0 2022-11-02 op /* end of special block */
45 3448adb0 2022-11-02 op .enumname = "B2",
46 3448adb0 2022-11-02 op .file = FILE_LINE,
47 3448adb0 2022-11-02 op .ucdname = "B2",
50 3448adb0 2022-11-02 op .enumname = "BA",
51 3448adb0 2022-11-02 op .file = FILE_LINE,
52 3448adb0 2022-11-02 op .ucdname = "BA",
55 3448adb0 2022-11-02 op .enumname = "BB",
56 3448adb0 2022-11-02 op .file = FILE_LINE,
57 3448adb0 2022-11-02 op .ucdname = "BB",
60 3448adb0 2022-11-02 op .enumname = "BK",
61 3448adb0 2022-11-02 op .file = FILE_LINE,
62 3448adb0 2022-11-02 op .ucdname = "BK",
65 3448adb0 2022-11-02 op .enumname = "BOTH_CN_EXTPICT",
67 3448adb0 2022-11-02 op .ucdname = NULL,
70 3448adb0 2022-11-02 op .enumname = "CB",
71 3448adb0 2022-11-02 op .file = FILE_LINE,
72 3448adb0 2022-11-02 op .ucdname = "CB",
75 3448adb0 2022-11-02 op .enumname = "CL",
76 3448adb0 2022-11-02 op .file = FILE_LINE,
77 3448adb0 2022-11-02 op .ucdname = "CL",
80 3448adb0 2022-11-02 op .enumname = "CM",
81 3448adb0 2022-11-02 op .file = FILE_LINE,
82 3448adb0 2022-11-02 op .ucdname = "CM",
85 3448adb0 2022-11-02 op .enumname = "CP_WITHOUT_EAW_HWF",
86 3448adb0 2022-11-02 op .file = FILE_LINE,
87 3448adb0 2022-11-02 op .ucdname = "CP",
90 3448adb0 2022-11-02 op .enumname = "CP_WITH_EAW_HWF",
92 3448adb0 2022-11-02 op .ucdname = NULL,
95 3448adb0 2022-11-02 op .enumname = "CR",
96 3448adb0 2022-11-02 op .file = FILE_LINE,
97 3448adb0 2022-11-02 op .ucdname = "CR",
100 3448adb0 2022-11-02 op .enumname = "EB",
101 3448adb0 2022-11-02 op .file = FILE_LINE,
102 3448adb0 2022-11-02 op .ucdname = "EB",
105 3448adb0 2022-11-02 op .enumname = "EM",
106 3448adb0 2022-11-02 op .file = FILE_LINE,
107 3448adb0 2022-11-02 op .ucdname = "EM",
110 3448adb0 2022-11-02 op .enumname = "EX",
111 3448adb0 2022-11-02 op .file = FILE_LINE,
112 3448adb0 2022-11-02 op .ucdname = "EX",
115 3448adb0 2022-11-02 op .enumname = "GL",
116 3448adb0 2022-11-02 op .file = FILE_LINE,
117 3448adb0 2022-11-02 op .ucdname = "GL",
120 3448adb0 2022-11-02 op .enumname = "H2",
121 3448adb0 2022-11-02 op .file = FILE_LINE,
122 3448adb0 2022-11-02 op .ucdname = "H2",
125 3448adb0 2022-11-02 op .enumname = "H3",
126 3448adb0 2022-11-02 op .file = FILE_LINE,
127 3448adb0 2022-11-02 op .ucdname = "H3",
130 3448adb0 2022-11-02 op .enumname = "HL",
131 3448adb0 2022-11-02 op .file = FILE_LINE,
132 3448adb0 2022-11-02 op .ucdname = "HL",
135 3448adb0 2022-11-02 op .enumname = "HY",
136 3448adb0 2022-11-02 op .file = FILE_LINE,
137 3448adb0 2022-11-02 op .ucdname = "HY",
140 3448adb0 2022-11-02 op .enumname = "ID",
141 3448adb0 2022-11-02 op .file = FILE_LINE,
142 3448adb0 2022-11-02 op .ucdname = "ID",
145 3448adb0 2022-11-02 op .enumname = "IN",
146 3448adb0 2022-11-02 op .file = FILE_LINE,
147 3448adb0 2022-11-02 op .ucdname = "IN",
150 3448adb0 2022-11-02 op .enumname = "IS",
151 3448adb0 2022-11-02 op .file = FILE_LINE,
152 3448adb0 2022-11-02 op .ucdname = "IS",
155 3448adb0 2022-11-02 op .enumname = "JL",
156 3448adb0 2022-11-02 op .file = FILE_LINE,
157 3448adb0 2022-11-02 op .ucdname = "JL",
160 3448adb0 2022-11-02 op .enumname = "JT",
161 3448adb0 2022-11-02 op .file = FILE_LINE,
162 3448adb0 2022-11-02 op .ucdname = "JT",
165 3448adb0 2022-11-02 op .enumname = "JV",
166 3448adb0 2022-11-02 op .file = FILE_LINE,
167 3448adb0 2022-11-02 op .ucdname = "JV",
170 3448adb0 2022-11-02 op .enumname = "LF",
171 3448adb0 2022-11-02 op .file = FILE_LINE,
172 3448adb0 2022-11-02 op .ucdname = "LF",
175 3448adb0 2022-11-02 op .enumname = "NL",
176 3448adb0 2022-11-02 op .file = FILE_LINE,
177 3448adb0 2022-11-02 op .ucdname = "NL",
180 3448adb0 2022-11-02 op .enumname = "NS",
181 3448adb0 2022-11-02 op .file = FILE_LINE,
182 3448adb0 2022-11-02 op .ucdname = "NS",
185 3448adb0 2022-11-02 op .enumname = "NU",
186 3448adb0 2022-11-02 op .file = FILE_LINE,
187 3448adb0 2022-11-02 op .ucdname = "NU",
190 3448adb0 2022-11-02 op .enumname = "OP_WITHOUT_EAW_HWF",
191 3448adb0 2022-11-02 op .file = FILE_LINE,
192 3448adb0 2022-11-02 op .ucdname = "OP",
195 3448adb0 2022-11-02 op .enumname = "OP_WITH_EAW_HWF",
196 3448adb0 2022-11-02 op .file = NULL,
197 3448adb0 2022-11-02 op .ucdname = NULL,
200 3448adb0 2022-11-02 op .enumname = "PO",
201 3448adb0 2022-11-02 op .file = FILE_LINE,
202 3448adb0 2022-11-02 op .ucdname = "PO",
205 3448adb0 2022-11-02 op .enumname = "PR",
206 3448adb0 2022-11-02 op .file = FILE_LINE,
207 3448adb0 2022-11-02 op .ucdname = "PR",
210 3448adb0 2022-11-02 op .enumname = "QU",
211 3448adb0 2022-11-02 op .file = FILE_LINE,
212 3448adb0 2022-11-02 op .ucdname = "QU",
215 3448adb0 2022-11-02 op .enumname = "RI",
216 3448adb0 2022-11-02 op .file = FILE_LINE,
217 3448adb0 2022-11-02 op .ucdname = "RI",
220 3448adb0 2022-11-02 op .enumname = "SP",
221 3448adb0 2022-11-02 op .file = FILE_LINE,
222 3448adb0 2022-11-02 op .ucdname = "SP",
225 3448adb0 2022-11-02 op .enumname = "SY",
226 3448adb0 2022-11-02 op .file = FILE_LINE,
227 3448adb0 2022-11-02 op .ucdname = "SY",
230 3448adb0 2022-11-02 op .enumname = "WJ",
231 3448adb0 2022-11-02 op .file = FILE_LINE,
232 3448adb0 2022-11-02 op .ucdname = "WJ",
235 3448adb0 2022-11-02 op .enumname = "ZW",
236 3448adb0 2022-11-02 op .file = FILE_LINE,
237 3448adb0 2022-11-02 op .ucdname = "ZW",
240 3448adb0 2022-11-02 op .enumname = "ZWJ",
241 3448adb0 2022-11-02 op .file = FILE_LINE,
242 3448adb0 2022-11-02 op .ucdname = "ZWJ",
245 3448adb0 2022-11-02 op .enumname = "TMP_AI",
246 3448adb0 2022-11-02 op .file = FILE_LINE,
247 3448adb0 2022-11-02 op .ucdname = "AI",
250 3448adb0 2022-11-02 op .enumname = "TMP_CJ",
251 3448adb0 2022-11-02 op .file = FILE_LINE,
252 3448adb0 2022-11-02 op .ucdname = "CJ",
255 3448adb0 2022-11-02 op .enumname = "TMP_XX",
256 3448adb0 2022-11-02 op .file = NULL,
257 3448adb0 2022-11-02 op .ucdname = NULL,
260 3448adb0 2022-11-02 op .enumname = "TMP_MN",
261 3448adb0 2022-11-02 op .file = FILE_LINE,
262 3448adb0 2022-11-02 op .ucdname = "Mn",
265 3448adb0 2022-11-02 op .enumname = "TMP_MC",
266 3448adb0 2022-11-02 op .file = FILE_LINE,
267 3448adb0 2022-11-02 op .ucdname = "Mc",
270 3448adb0 2022-11-02 op .enumname = "TMP_SA_WITHOUT_MN_OR_MC",
271 3448adb0 2022-11-02 op .file = FILE_LINE,
272 3448adb0 2022-11-02 op .ucdname = "SA",
275 3448adb0 2022-11-02 op .enumname = "TMP_SA_WITH_MN_OR_MC",
276 3448adb0 2022-11-02 op .file = FILE_LINE,
277 3448adb0 2022-11-02 op .ucdname = "SA",
280 3448adb0 2022-11-02 op .enumname = "TMP_SG",
281 3448adb0 2022-11-02 op .file = FILE_LINE,
282 3448adb0 2022-11-02 op .ucdname = "SG",
285 3448adb0 2022-11-02 op .enumname = "TMP_EAW_H",
286 3448adb0 2022-11-02 op .file = FILE_EAW,
287 3448adb0 2022-11-02 op .ucdname = "H",
290 3448adb0 2022-11-02 op .enumname = "TMP_EAW_W",
291 3448adb0 2022-11-02 op .file = FILE_EAW,
292 3448adb0 2022-11-02 op .ucdname = "W",
295 3448adb0 2022-11-02 op .enumname = "TMP_EAW_F",
296 3448adb0 2022-11-02 op .file = FILE_EAW,
297 3448adb0 2022-11-02 op .ucdname = "F",
301 3448adb0 2022-11-02 op static uint_least8_t
302 3448adb0 2022-11-02 op handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2)
304 3448adb0 2022-11-02 op uint_least8_t result = prop2;
305 3448adb0 2022-11-02 op char *target = NULL;
309 3448adb0 2022-11-02 op if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
310 3448adb0 2022-11-02 op !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
311 3448adb0 2022-11-02 op !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) ||
312 3448adb0 2022-11-02 op (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") ||
313 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") ||
314 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) {
315 3448adb0 2022-11-02 op if (!strcmp(line_break_property[prop1].enumname, "CP_WITHOUT_EAW_HWF") ||
316 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "CP_WITHOUT_EAW_HWF")) {
317 3448adb0 2022-11-02 op target = "CP_WITH_EAW_HWF";
318 3448adb0 2022-11-02 op } else if (!strcmp(line_break_property[prop1].enumname, "OP_WITHOUT_EAW_HWF") ||
319 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "OP_WITHOUT_EAW_HWF")) {
320 3448adb0 2022-11-02 op target = "OP_WITH_EAW_HWF";
322 3448adb0 2022-11-02 op /* ignore EAW for the rest */
323 3448adb0 2022-11-02 op if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") ||
324 3448adb0 2022-11-02 op !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") ||
325 3448adb0 2022-11-02 op !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F"))) {
326 3448adb0 2022-11-02 op result = prop2;
328 3448adb0 2022-11-02 op result = prop1;
331 3448adb0 2022-11-02 op } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
332 3448adb0 2022-11-02 op !strcmp(line_break_property[prop1].enumname, "TMP_MC")) ||
333 3448adb0 2022-11-02 op (!strcmp(line_break_property[prop2].enumname, "TMP_MN") ||
334 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) {
335 3448adb0 2022-11-02 op if (!strcmp(line_break_property[prop1].enumname, "SA_WITHOUT_MN_OR_MC") ||
336 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "SA_WITHOUT_MN_OR_MC")) {
337 3448adb0 2022-11-02 op target = "SA_WITH_MN_OR_MC";
339 3448adb0 2022-11-02 op /* ignore Mn and Mc for the rest */
340 3448adb0 2022-11-02 op if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") ||
341 3448adb0 2022-11-02 op !strcmp(line_break_property[prop1].enumname, "TMP_MC"))) {
342 3448adb0 2022-11-02 op result = prop2;
344 3448adb0 2022-11-02 op result = prop1;
347 3448adb0 2022-11-02 op } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
348 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
349 3448adb0 2022-11-02 op if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
350 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
351 3448adb0 2022-11-02 op target = "BOTH_CN_EXTPICT";
353 3448adb0 2022-11-02 op /* ignore Cn for all the other properties */
354 3448adb0 2022-11-02 op if (!strcmp(line_break_property[prop1].enumname, "TMP_CN")) {
355 3448adb0 2022-11-02 op result = prop2;
357 3448adb0 2022-11-02 op result = prop1;
360 3448adb0 2022-11-02 op } else if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
361 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
362 3448adb0 2022-11-02 op if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") ||
363 3448adb0 2022-11-02 op !strcmp(line_break_property[prop2].enumname, "TMP_CN")) {
364 3448adb0 2022-11-02 op target = "BOTH_CN_EXTPICT";
366 3448adb0 2022-11-02 op /* ignore Extended_Pictographic for all the other properties */
367 3448adb0 2022-11-02 op if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) {
368 3448adb0 2022-11-02 op result = prop2;
370 3448adb0 2022-11-02 op result = prop1;
374 3448adb0 2022-11-02 op fprintf(stderr, "handle_conflict: Cannot handle conflict %s <- %s.\n",
375 3448adb0 2022-11-02 op line_break_property[prop1].enumname, line_break_property[prop2].enumname);
379 3448adb0 2022-11-02 op if (target) {
380 3448adb0 2022-11-02 op for (result = 0; result < LEN(line_break_property); result++) {
381 3448adb0 2022-11-02 op if (!strcmp(line_break_property[result].enumname,
386 3448adb0 2022-11-02 op if (result == LEN(line_break_property)) {
387 3448adb0 2022-11-02 op fprintf(stderr, "handle_conflict: Internal error.\n");
392 3448adb0 2022-11-02 op return result;
395 3448adb0 2022-11-02 op static uint_least8_t
396 3448adb0 2022-11-02 op post_process(uint_least8_t prop)
398 3448adb0 2022-11-02 op const char *target = NULL;
399 3448adb0 2022-11-02 op uint_least8_t result;
402 3448adb0 2022-11-02 op if (!strcmp(line_break_property[prop].enumname, "TMP_AI") ||
403 3448adb0 2022-11-02 op !strcmp(line_break_property[prop].enumname, "TMP_SG") ||
404 3448adb0 2022-11-02 op !strcmp(line_break_property[prop].enumname, "TMP_XX")) {
405 3448adb0 2022-11-02 op /* map AI, SG and XX to AL */
406 3448adb0 2022-11-02 op target = "AL";
407 3448adb0 2022-11-02 op } else if (!strcmp(line_break_property[prop].enumname, "TMP_SA_WITH_MN_OR_MC")) {
408 3448adb0 2022-11-02 op /* map SA (with General_Category Mn or Mc) to CM */
409 3448adb0 2022-11-02 op target = "CM";
410 3448adb0 2022-11-02 op } else if (!strcmp(line_break_property[prop].enumname, "TMP_SA_WITHOUT_MN_OR_MC")) {
411 3448adb0 2022-11-02 op /* map SA (without General_Category Mn or Mc) to AL */
412 3448adb0 2022-11-02 op target = "AL";
413 3448adb0 2022-11-02 op } else if (!strcmp(line_break_property[prop].enumname, "TMP_CJ")) {
414 3448adb0 2022-11-02 op /* map CJ to NS */
415 3448adb0 2022-11-02 op target = "NS";
416 3448adb0 2022-11-02 op } else if (!strcmp(line_break_property[prop].enumname, "TMP_CN") ||
417 3448adb0 2022-11-02 op !strcmp(line_break_property[prop].enumname, "TMP_EXTENDED_PICTOGRAPHIC") ||
418 3448adb0 2022-11-02 op !strcmp(line_break_property[prop].enumname, "TMP_MN") ||
419 3448adb0 2022-11-02 op !strcmp(line_break_property[prop].enumname, "TMP_MC") ||
420 3448adb0 2022-11-02 op !strcmp(line_break_property[prop].enumname, "TMP_EAW_H") ||
421 3448adb0 2022-11-02 op !strcmp(line_break_property[prop].enumname, "TMP_EAW_W") ||
422 3448adb0 2022-11-02 op !strcmp(line_break_property[prop].enumname, "TMP_EAW_F")) {
423 3448adb0 2022-11-02 op /* map all the temporary classes "residue" to AL */
424 3448adb0 2022-11-02 op target = "AL";
427 3448adb0 2022-11-02 op if (target) {
428 3448adb0 2022-11-02 op for (result = 0; result < LEN(line_break_property); result++) {
429 3448adb0 2022-11-02 op if (!strcmp(line_break_property[result].enumname,
434 3448adb0 2022-11-02 op if (result == LEN(line_break_property)) {
435 3448adb0 2022-11-02 op fprintf(stderr, "handle_conflict: Internal error.\n");
439 3448adb0 2022-11-02 op return result;
446 3448adb0 2022-11-02 op main(int argc, char *argv[])
450 3448adb0 2022-11-02 op properties_generate_break_property(line_break_property,
451 3448adb0 2022-11-02 op LEN(line_break_property),
452 3448adb0 2022-11-02 op handle_conflict, post_process,
453 3448adb0 2022-11-02 op "line_break", argv[0]);