/* See LICENSE file for copyright and license details. */ #include #include #include #include "util.h" #define FILE_EAW "data/EastAsianWidth.txt" #define FILE_EMOJI "data/emoji-data.txt" #define FILE_LINE "data/LineBreak.txt" static const struct property_spec line_break_property[] = { { .enumname = "AL", .file = FILE_LINE, .ucdname = "AL", }, /* * Both extended pictographic and cn are large classes, * but we are only interested in their intersection for LB30b, * so we have the following two temporary classes. At first * the extpict-class is filled, then the cn-class, which leads * to conflicts (that we handle by putting them in the "proper" * class BOTH_CN_EXTPICT). We make use of the fact that there * is no intersection between AL and Cn. * * Any consecutive conflicts are permitted to overwrite * TMP_EXTENDED_PICTOGRAPHIC and TMP_CN, because we don't need * them, and in the final postprocessing we "reset" all * remaining matches (that then didn't fit any of the other * classes) to the generic class AL. */ { .enumname = "TMP_CN", .file = FILE_LINE, .ucdname = "Cn", }, { .enumname = "TMP_EXTENDED_PICTOGRAPHIC", .file = FILE_EMOJI, .ucdname = "Extended_Pictographic", }, /* end of special block */ { .enumname = "B2", .file = FILE_LINE, .ucdname = "B2", }, { .enumname = "BA", .file = FILE_LINE, .ucdname = "BA", }, { .enumname = "BB", .file = FILE_LINE, .ucdname = "BB", }, { .enumname = "BK", .file = FILE_LINE, .ucdname = "BK", }, { .enumname = "BOTH_CN_EXTPICT", .file = NULL, .ucdname = NULL, }, { .enumname = "CB", .file = FILE_LINE, .ucdname = "CB", }, { .enumname = "CL", .file = FILE_LINE, .ucdname = "CL", }, { .enumname = "CM", .file = FILE_LINE, .ucdname = "CM", }, { .enumname = "CP_WITHOUT_EAW_HWF", .file = FILE_LINE, .ucdname = "CP", }, { .enumname = "CP_WITH_EAW_HWF", .file = NULL, .ucdname = NULL, }, { .enumname = "CR", .file = FILE_LINE, .ucdname = "CR", }, { .enumname = "EB", .file = FILE_LINE, .ucdname = "EB", }, { .enumname = "EM", .file = FILE_LINE, .ucdname = "EM", }, { .enumname = "EX", .file = FILE_LINE, .ucdname = "EX", }, { .enumname = "GL", .file = FILE_LINE, .ucdname = "GL", }, { .enumname = "H2", .file = FILE_LINE, .ucdname = "H2", }, { .enumname = "H3", .file = FILE_LINE, .ucdname = "H3", }, { .enumname = "HL", .file = FILE_LINE, .ucdname = "HL", }, { .enumname = "HY", .file = FILE_LINE, .ucdname = "HY", }, { .enumname = "ID", .file = FILE_LINE, .ucdname = "ID", }, { .enumname = "IN", .file = FILE_LINE, .ucdname = "IN", }, { .enumname = "IS", .file = FILE_LINE, .ucdname = "IS", }, { .enumname = "JL", .file = FILE_LINE, .ucdname = "JL", }, { .enumname = "JT", .file = FILE_LINE, .ucdname = "JT", }, { .enumname = "JV", .file = FILE_LINE, .ucdname = "JV", }, { .enumname = "LF", .file = FILE_LINE, .ucdname = "LF", }, { .enumname = "NL", .file = FILE_LINE, .ucdname = "NL", }, { .enumname = "NS", .file = FILE_LINE, .ucdname = "NS", }, { .enumname = "NU", .file = FILE_LINE, .ucdname = "NU", }, { .enumname = "OP_WITHOUT_EAW_HWF", .file = FILE_LINE, .ucdname = "OP", }, { .enumname = "OP_WITH_EAW_HWF", .file = NULL, .ucdname = NULL, }, { .enumname = "PO", .file = FILE_LINE, .ucdname = "PO", }, { .enumname = "PR", .file = FILE_LINE, .ucdname = "PR", }, { .enumname = "QU", .file = FILE_LINE, .ucdname = "QU", }, { .enumname = "RI", .file = FILE_LINE, .ucdname = "RI", }, { .enumname = "SP", .file = FILE_LINE, .ucdname = "SP", }, { .enumname = "SY", .file = FILE_LINE, .ucdname = "SY", }, { .enumname = "WJ", .file = FILE_LINE, .ucdname = "WJ", }, { .enumname = "ZW", .file = FILE_LINE, .ucdname = "ZW", }, { .enumname = "ZWJ", .file = FILE_LINE, .ucdname = "ZWJ", }, { .enumname = "TMP_AI", .file = FILE_LINE, .ucdname = "AI", }, { .enumname = "TMP_CJ", .file = FILE_LINE, .ucdname = "CJ", }, { .enumname = "TMP_XX", .file = NULL, .ucdname = NULL, }, { .enumname = "TMP_MN", .file = FILE_LINE, .ucdname = "Mn", }, { .enumname = "TMP_MC", .file = FILE_LINE, .ucdname = "Mc", }, { .enumname = "TMP_SA_WITHOUT_MN_OR_MC", .file = FILE_LINE, .ucdname = "SA", }, { .enumname = "TMP_SA_WITH_MN_OR_MC", .file = FILE_LINE, .ucdname = "SA", }, { .enumname = "TMP_SG", .file = FILE_LINE, .ucdname = "SG", }, { .enumname = "TMP_EAW_H", .file = FILE_EAW, .ucdname = "H", }, { .enumname = "TMP_EAW_W", .file = FILE_EAW, .ucdname = "W", }, { .enumname = "TMP_EAW_F", .file = FILE_EAW, .ucdname = "F", }, }; static uint_least8_t handle_conflict(uint_least32_t cp, uint_least8_t prop1, uint_least8_t prop2) { uint_least8_t result = prop2; char *target = NULL; (void)cp; if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") || !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") || !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F")) || (!strcmp(line_break_property[prop2].enumname, "TMP_EAW_H") || !strcmp(line_break_property[prop2].enumname, "TMP_EAW_W") || !strcmp(line_break_property[prop2].enumname, "TMP_EAW_F"))) { if (!strcmp(line_break_property[prop1].enumname, "CP_WITHOUT_EAW_HWF") || !strcmp(line_break_property[prop2].enumname, "CP_WITHOUT_EAW_HWF")) { target = "CP_WITH_EAW_HWF"; } else if (!strcmp(line_break_property[prop1].enumname, "OP_WITHOUT_EAW_HWF") || !strcmp(line_break_property[prop2].enumname, "OP_WITHOUT_EAW_HWF")) { target = "OP_WITH_EAW_HWF"; } else { /* ignore EAW for the rest */ if ((!strcmp(line_break_property[prop1].enumname, "TMP_EAW_H") || !strcmp(line_break_property[prop1].enumname, "TMP_EAW_W") || !strcmp(line_break_property[prop1].enumname, "TMP_EAW_F"))) { result = prop2; } else { result = prop1; } } } else if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") || !strcmp(line_break_property[prop1].enumname, "TMP_MC")) || (!strcmp(line_break_property[prop2].enumname, "TMP_MN") || !strcmp(line_break_property[prop2].enumname, "TMP_MC"))) { if (!strcmp(line_break_property[prop1].enumname, "SA_WITHOUT_MN_OR_MC") || !strcmp(line_break_property[prop2].enumname, "SA_WITHOUT_MN_OR_MC")) { target = "SA_WITH_MN_OR_MC"; } else { /* ignore Mn and Mc for the rest */ if ((!strcmp(line_break_property[prop1].enumname, "TMP_MN") || !strcmp(line_break_property[prop1].enumname, "TMP_MC"))) { result = prop2; } else { result = prop1; } } } else if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") || !strcmp(line_break_property[prop2].enumname, "TMP_CN")) { if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC") || !strcmp(line_break_property[prop2].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) { target = "BOTH_CN_EXTPICT"; } else { /* ignore Cn for all the other properties */ if (!strcmp(line_break_property[prop1].enumname, "TMP_CN")) { result = prop2; } else { result = prop1; } } } else if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC") || !strcmp(line_break_property[prop2].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) { if (!strcmp(line_break_property[prop1].enumname, "TMP_CN") || !strcmp(line_break_property[prop2].enumname, "TMP_CN")) { target = "BOTH_CN_EXTPICT"; } else { /* ignore Extended_Pictographic for all the other properties */ if (!strcmp(line_break_property[prop1].enumname, "TMP_EXTENDED_PICTOGRAPHIC")) { result = prop2; } else { result = prop1; } } } else { fprintf(stderr, "handle_conflict: Cannot handle conflict %s <- %s.\n", line_break_property[prop1].enumname, line_break_property[prop2].enumname); exit(1); } if (target) { for (result = 0; result < LEN(line_break_property); result++) { if (!strcmp(line_break_property[result].enumname, target)) { break; } } if (result == LEN(line_break_property)) { fprintf(stderr, "handle_conflict: Internal error.\n"); exit(1); } } return result; } static uint_least8_t post_process(uint_least8_t prop) { const char *target = NULL; uint_least8_t result; /* LB1 */ if (!strcmp(line_break_property[prop].enumname, "TMP_AI") || !strcmp(line_break_property[prop].enumname, "TMP_SG") || !strcmp(line_break_property[prop].enumname, "TMP_XX")) { /* map AI, SG and XX to AL */ target = "AL"; } else if (!strcmp(line_break_property[prop].enumname, "TMP_SA_WITH_MN_OR_MC")) { /* map SA (with General_Category Mn or Mc) to CM */ target = "CM"; } else if (!strcmp(line_break_property[prop].enumname, "TMP_SA_WITHOUT_MN_OR_MC")) { /* map SA (without General_Category Mn or Mc) to AL */ target = "AL"; } else if (!strcmp(line_break_property[prop].enumname, "TMP_CJ")) { /* map CJ to NS */ target = "NS"; } else if (!strcmp(line_break_property[prop].enumname, "TMP_CN") || !strcmp(line_break_property[prop].enumname, "TMP_EXTENDED_PICTOGRAPHIC") || !strcmp(line_break_property[prop].enumname, "TMP_MN") || !strcmp(line_break_property[prop].enumname, "TMP_MC") || !strcmp(line_break_property[prop].enumname, "TMP_EAW_H") || !strcmp(line_break_property[prop].enumname, "TMP_EAW_W") || !strcmp(line_break_property[prop].enumname, "TMP_EAW_F")) { /* map all the temporary classes "residue" to AL */ target = "AL"; } if (target) { for (result = 0; result < LEN(line_break_property); result++) { if (!strcmp(line_break_property[result].enumname, target)) { break; } } if (result == LEN(line_break_property)) { fprintf(stderr, "handle_conflict: Internal error.\n"); exit(1); } return result; } else { return prop; } } int main(int argc, char *argv[]) { (void)argc; properties_generate_break_property(line_break_property, LEN(line_break_property), handle_conflict, post_process, "line_break", argv[0]); return 0; }