commit 4cd67caa74b004977098c2d1927cf8e28dd4c9ed from: Omar Polo date: Wed Feb 09 21:28:43 2022 UTC move some unicode-related code in u/ commit - d911d4082332774a3c323fc1858663831a7d4225 commit + 4cd67caa74b004977098c2d1927cf8e28dd4c9ed blob - f77aac3e4cec256eec023a46825e5dabd55b5d9f blob + 125e4827906544f46998f7057267fce0e01215aa --- .gitignore +++ .gitignore @@ -26,4 +26,4 @@ telescope pagebundler compile_flags.txt telescope-*.tar.gz -emoji-matcher.c +u/emoji-matcher.c blob - 7a339054b7fe65fb41b222274b2b6c11198c7658 blob + 8885cec3b098d222561b983bdffe75d8776f4b46 --- Makefile.am +++ Makefile.am @@ -13,10 +13,8 @@ telescope_SOURCES = cmd.c \ defaults.c \ defaults.h \ downloads.c \ - emoji-matcher.c \ fs.c \ fs.h \ - genemoji.sh \ gencmd.awk \ help.c \ hist.c \ @@ -43,13 +41,15 @@ telescope_SOURCES = cmd.c \ telescope.c \ telescope.h \ tofu.c \ + u/emoji-matcher.c \ + u/genemoji.sh \ + u/utf8.c \ + u/wrap.c \ ui.c \ ui.h \ - utf8.c \ utf8.h \ utils.c \ - utils.h \ - wrap.c + utils.h # phos bundled files telescope_SOURCES += phos/phos.h \ @@ -61,9 +61,9 @@ pagebundler_SOURCES = pagebundler.c pagebundler$(EXEEXT): pagebundler.c $(HOSTCC) $(HOSTCFLAGS) -o $@ $(srcdir)/pagebundler.c -BUILT_SOURCES = cmd.gen.c compile_flags.txt emoji-matcher.c pages.c +BUILT_SOURCES = cmd.gen.c compile_flags.txt u/emoji-matcher.c pages.c -CLEANFILES = cmd.gen.c compile_flags.txt emoji-matcher.c pages.c \ +CLEANFILES = cmd.gen.c compile_flags.txt u/emoji-matcher.c pages.c \ parse.c LDADD = $(LIBOBJS) @@ -75,8 +75,8 @@ dist_man1_MANS = telescope.1 cmd.gen.c: $(srcdir)/cmd.h $(srcdir)/gencmd.awk ${AWK} -f $(srcdir)/gencmd.awk < $(srcdir)/cmd.h > $@ -emoji-matcher.c: $(srcdir)/data/emoji.txt $(srcdir)/genemoji.sh - $(srcdir)/genemoji.sh $(srcdir)/data/emoji.txt > $@ +u/emoji-matcher.c: $(srcdir)/data/emoji.txt $(srcdir)/u/genemoji.sh + $(srcdir)/u/genemoji.sh $(srcdir)/data/emoji.txt > $@ compile_flags.txt: printf "%s\n" ${CFLAGS} > compile_flags.txt blob - ca50652b0a1fd5cc7aea6f54e7adc93b40f2e786 (mode 755) blob + /dev/null --- genemoji.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/sh - -file="${1:?missing input file}" - -sed -e '/^$/d' \ - -e '/^#/d' \ - -e 's/;.*//' \ - -e 's/[ \t]*$//' \ - -e 's/\.\./ /' \ - "$file" \ - | awk ' -BEGIN { - print "#include \"utf8.h\"" - print "int is_emoji(uint32_t cp) {" - - e="" -} - -{ - if (NF == 1) { - printf("%sif (cp == 0x%s)", e, $1); - } else { - printf("%sif (cp >= 0x%s && cp <= 0x%s)", e, $1, $2); - } - - print " return 1;" - - e="else " -} - -END { - print "return 0; }" -} -' blob - /dev/null blob + ca50652b0a1fd5cc7aea6f54e7adc93b40f2e786 (mode 755) --- /dev/null +++ u/genemoji.sh @@ -0,0 +1,34 @@ +#!/bin/sh + +file="${1:?missing input file}" + +sed -e '/^$/d' \ + -e '/^#/d' \ + -e 's/;.*//' \ + -e 's/[ \t]*$//' \ + -e 's/\.\./ /' \ + "$file" \ + | awk ' +BEGIN { + print "#include \"utf8.h\"" + print "int is_emoji(uint32_t cp) {" + + e="" +} + +{ + if (NF == 1) { + printf("%sif (cp == 0x%s)", e, $1); + } else { + printf("%sif (cp >= 0x%s && cp <= 0x%s)", e, $1, $2); + } + + print " return 1;" + + e="else " +} + +END { + print "return 0; }" +} +' blob - /dev/null blob + 5389e8cc921278dc8e97c46a1a42547b0564f70c (mode 644) --- /dev/null +++ u/utf8.c @@ -0,0 +1,262 @@ +/* Copyright (c) 2008-2009 Bjoern Hoehrmann + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "compat.h" + +#include +#include +#include +#include + +#include "telescope.h" +#include "utf8.h" + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +static inline uint32_t +decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte) +{ + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} + + +/* end of the converter, utility functions ahead */ + +#define ZERO_WIDTH_SPACE 0x200B + +/* public version of decode */ +uint32_t +utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte) +{ + return decode(state, codep, byte); +} + +/* encode cp in s. s must be at least 4 bytes wide */ +size_t +utf8_encode(uint32_t cp, char *s) +{ + if (cp <= 0x7F) { + *s = (uint8_t)cp; + return 1; + } else if (cp <= 0x7FF) { + s[1] = (uint8_t)(( cp & 0x3F ) + 0x80); + s[0] = (uint8_t)(((cp >> 6) & 0x1F) + 0xC0); + return 2; + } else if (cp <= 0xFFFF) { + s[2] = (uint8_t)(( cp & 0x3F) + 0x80); + s[1] = (uint8_t)(((cp >> 6) & 0x3F) + 0x80); + s[0] = (uint8_t)(((cp >> 12) & 0x0F) + 0xE0); + return 3; + } else if (cp <= 0x10FFFF) { + s[3] = (uint8_t)(( cp & 0x3F) + 0x80); + s[2] = (uint8_t)(((cp >> 6) & 0x3F) + 0x80); + s[1] = (uint8_t)(((cp >> 12) & 0x3F) + 0x80); + s[0] = (uint8_t)(((cp >> 18) & 0x07) + 0xF0); + return 4; + } else { + s[0] = '\0'; + return 0; + } +} + +char * +utf8_nth(char *s, size_t n) +{ + size_t i; + uint32_t cp = 0, state = 0; + + for (i = 0; *s && i < n; ++s) + if (!decode(&state, &cp, *s)) + ++i; + + if (state != UTF8_ACCEPT) + return NULL; + if (i == n) + return s; + return NULL; +} + +size_t +utf8_cplen(char *s) +{ + uint32_t cp = 0, state = 0; + size_t len; + + len = 0; + for (; *s; ++s) + if (!decode(&state, &cp, *s)) + len++; + return len; +} + +/* returns only 0, 1, 2 or 8. assumes sizeof(wchar_t) is 4 */ +size_t +utf8_chwidth(uint32_t cp) +{ + /* XXX: if we're running on a platform where sizeof(wchar_t) + * == 2 what to do? The manpage for wcwidth and wcs isn't + * clear about the encoding, but if it's 16 bit wide I assume + * it must use UTF-16... right? */ + assert(sizeof(wchar_t) == 4); + + /* + * quick and dirty fix for the tabs. In the future we may + * want to expand tabs into N spaces, but for the time being + * this seems to be good enough (tm). + */ + if (cp == '\t') + return 8; + + return wcwidth((wchar_t)cp); +} + +/* NOTE: n is the number of codepoints, NOT the byte length. In + * other words, s MUST be NUL-terminated. */ +size_t +utf8_snwidth(const char *s, size_t n) +{ + size_t i, tot; + uint32_t cp = 0, state = 0; + + tot = 0; + for (i = 0; *s && i < n; ++s) + if (!decode(&state, &cp, *s)) { + i++; + tot += utf8_chwidth(cp); + } + + return tot; +} + +size_t +utf8_swidth(const char *s) +{ + size_t tot; + uint32_t cp = 0, state = 0; + + tot = 0; + for (; *s; ++s) + if (!decode(&state, &cp, *s)) + tot += utf8_chwidth(cp); + + return tot; +} + +size_t +utf8_swidth_between(const char *str, const char *end) +{ + size_t tot; + uint32_t cp = 0, state = 0; + + tot = 0; + for (; *str && str < end; ++str) + if (!decode(&state, &cp, *str)) + tot += utf8_chwidth(cp); + return tot; +} + +char * +utf8_next_cp(const char *s) +{ + uint32_t cp = 0, state = 0; + + for (; *s; ++s) + if (!decode(&state, &cp, *s)) + break; + return (char*)s+1; +} + +char * +utf8_prev_cp(const char *start, const char *base) +{ + uint8_t c; + + for (; start > base; start--) { + c = *start; + if ((c & 0xC0) != 0x80) + return (char*)start; + } + + return (char*)base; +} + +/* + * XXX: This is not correct. There are codepoints classified as + * "emoji", but these can be joined toghether to form more complex + * emoji. There is an ufficial list of what these valid combinations + * are, but it would require a costly lookup (a trie can be used to + * reduce the times, but...). The following approach is conceptually + * simpler: if there is a sequence of "emoji codepoints" (or ZWS) and + * then a space, consider everything before the space a single emoji. + * It needs a special check for numbers (yes, 0..9 and # are + * technically speaking emojis) but otherwise seems to work well in + * practice. + */ +int +emojied_line(const char *s, const char **space_ret) +{ + uint32_t cp = 0, state = 0; + int only_numbers = 1; + + for (; *s; ++s) { + if (!decode(&state, &cp, *s)) { + if (cp == ZERO_WIDTH_SPACE) + continue; + if (cp == ' ') { + *space_ret = s; + return !only_numbers; + } + if (!is_emoji(cp)) + return 0; + if (cp < '0' || cp > '9') + only_numbers = 0; + } + } + + return 0; +} blob - /dev/null blob + 338068315003a7f604337fc0f6c91a4b98a7739e (mode 644) --- /dev/null +++ u/wrap.c @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2021 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include "compat.h" + +#include +#include +#include + +#include "defaults.h" +#include "telescope.h" +#include "utf8.h" + +/* + * Text wrapping + * ============= + * + * There's a simple text wrapping algorithm. + * + * 1. if it's a line in a pre-formatted block: + * a. hard wrap. + * b. repeat + * 2. otherwise advance the line char by char. + * 3. when ending the space, split the line at the last occurrence of + * a "word separator" (i.e. " \t-") or at point if none. + * 4. repeat + * + */ + +void +erase_buffer(struct buffer *buffer) +{ + empty_vlist(buffer); + empty_linelist(buffer); +} + +void +empty_linelist(struct buffer *buffer) +{ + struct line *l, *lt; + + TAILQ_FOREACH_SAFE(l, &buffer->page.head, lines, lt) { + TAILQ_REMOVE(&buffer->page.head, l, lines); + free(l->line); + + if (l->type != LINE_COMPL && + l->type != LINE_COMPL_CURRENT && + l->type != LINE_HELP) + free(l->alt); + + free(l); + } +} + +void +empty_vlist(struct buffer *buffer) +{ + struct vline *vl, *t; + + buffer->top_line = NULL; + buffer->line_off = 0; + buffer->current_line = NULL; + buffer->line_max = 0; + + TAILQ_FOREACH_SAFE(vl, &buffer->head, vlines, t) { + TAILQ_REMOVE(&buffer->head, vl, vlines); + free(vl->line); + free(vl); + } +} + +static int +push_line(struct buffer *buffer, struct line *l, const char *buf, size_t len, int flags) +{ + struct vline *vl; + const char *end; + + /* omit trailing spaces */ + if (len != 0) { + for (end = buf + len - 1; + end > buf && isspace(*end); + end--, len--) + ; /* nop */ + } + + if (!(l->flags & L_HIDDEN)) + buffer->line_max++; + + if ((vl = calloc(1, sizeof(*vl))) == NULL) + return 0; + + if (len != 0 && (vl->line = calloc(1, len+1)) == NULL) { + free(vl); + return 0; + } + + vl->parent = l; + if (len != 0) + memcpy(vl->line, buf, len); + vl->flags = flags; + + TAILQ_INSERT_TAIL(&buffer->head, vl, vlines); + return 1; +} + +/* + * Similar to wrap_text, but emit only o vline. + */ +int +wrap_one(struct buffer *buffer, const char *prfx, struct line *l, size_t width) +{ + struct vline *vl, *t; + + /* + * be lazy: call wrap_text and then discard the continuations. + */ + + if (!wrap_text(buffer, prfx, l, width)) + return 0; + + TAILQ_FOREACH_SAFE(vl, &buffer->head, vlines, t) { + if (vl->flags & L_CONTINUATION) { + TAILQ_REMOVE(&buffer->head, vl, vlines); + free(vl->line); + free(vl); + buffer->line_max--; + } + } + + return 1; +} + +/* + * Build a list of visual line by wrapping the given line, assuming + * that when printed will have a leading prefix prfx. + */ +int +wrap_text(struct buffer *buffer, const char *prfx, struct line *l, size_t width) +{ + const char *separators = " \t-"; + const char *start, *end, *line, *lastsep, *lastchar, *space; + uint32_t cp = 0, state = 0; + size_t cur, prfxwidth, w; + int flags; + + if ((line = l->line) == NULL) + return push_line(buffer, l, NULL, 0, 0); + + prfxwidth = utf8_swidth(prfx); + cur = prfxwidth; + start = line; + lastsep = NULL; + lastchar = line; + flags = 0; + + if (l->type == LINE_LINK && emojify_link && + emojied_line(l->line, &space)) { + prfxwidth = utf8_swidth_between(l->line, space); + cur = prfxwidth; + line = space + 1; + } + + for (; *line; line++) { + if (utf8_decode(&state, &cp, *line)) + continue; + w = utf8_chwidth(cp); + if (cur + w > width) { + end = lastsep == NULL + ? utf8_next_cp((char*)lastchar) + : utf8_next_cp((char*)lastsep); + if (!push_line(buffer, l, start, end - start, flags)) + return 0; + flags = L_CONTINUATION; + start = end; + cur = prfxwidth + utf8_swidth_between(start, lastchar); + } else if (strchr(separators, *line) != NULL) { + lastsep = line; + } + + lastchar = utf8_prev_cp(line, l->line); + cur += w; + } + + return push_line(buffer, l, start, line - start, flags); +} + +int +hardwrap_text(struct buffer *buffer, struct line *l, size_t width) +{ + const char *line, *start, *lastchar; + int cont; + uint32_t state = 0, cp = 0; + size_t cur, w; + + if ((line = l->line) == NULL) + return push_line(buffer, l, NULL, 0, 0); + + start = line; + lastchar = line; + cont = 0; + cur = 0; + for (; *line; line++) { + if (utf8_decode(&state, &cp, *line)) + continue; + w = utf8_chwidth(cp); + if (cur + w > width) { + if (!push_line(buffer, l, start, lastchar-start, cont)) + return 0; + cont = L_CONTINUATION; + if (dont_wrap_pre) + return 1; + cur = 0; + start = lastchar; + } + + lastchar = utf8_prev_cp(line, l->line); + cur += w; + } + + return push_line(buffer, l, start, line - start, cont); +} + +int +wrap_page(struct buffer *buffer, int width) +{ + struct line *l; + const struct line *top_orig, *orig; + struct vline *vl; + const char *prfx; + + top_orig = buffer->top_line == NULL ? NULL : buffer->top_line->parent; + orig = buffer->current_line == NULL ? NULL : buffer->current_line->parent; + + buffer->top_line = NULL; + buffer->current_line = NULL; + + buffer->force_redraw = 1; + buffer->curs_y = 0; + buffer->line_off = 0; + + empty_vlist(buffer); + + TAILQ_FOREACH(l, &buffer->page.head, lines) { + prfx = line_prefixes[l->type].prfx1; + switch (l->type) { + case LINE_TEXT: + case LINE_LINK: + case LINE_TITLE_1: + case LINE_TITLE_2: + case LINE_TITLE_3: + case LINE_ITEM: + case LINE_QUOTE: + case LINE_PRE_START: + case LINE_PRE_END: + wrap_text(buffer, prfx, l, MIN(fill_column, width)); + break; + case LINE_PRE_CONTENT: + case LINE_PATCH: + case LINE_PATCH_HDR: + case LINE_PATCH_HUNK_HDR: + case LINE_PATCH_ADD: + case LINE_PATCH_DEL: + hardwrap_text(buffer, l, MIN(fill_column, width)); + break; + case LINE_COMPL: + case LINE_COMPL_CURRENT: + case LINE_HELP: + case LINE_DOWNLOAD: + case LINE_DOWNLOAD_DONE: + case LINE_DOWNLOAD_INFO: + wrap_one(buffer, prfx, l, width); + break; + case LINE_FRINGE: + /* never, ever wrapped */ + break; + } + + if (top_orig == l && buffer->top_line == NULL) { + buffer->line_off = buffer->line_max-1; + buffer->top_line = TAILQ_LAST(&buffer->head, vhead); + + while (1) { + vl = TAILQ_PREV(buffer->top_line, vhead, vlines); + if (vl == NULL || vl->parent != orig) + break; + buffer->top_line = vl; + buffer->line_off--; + } + } + + if (orig == l && buffer->current_line == NULL) { + buffer->current_line = TAILQ_LAST(&buffer->head, vhead); + + while (1) { + vl = TAILQ_PREV(buffer->current_line, vhead, vlines); + if (vl == NULL || vl->parent != orig) + break; + buffer->current_line = vl; + } + } + } + + if (buffer->current_line == NULL) + buffer->current_line = TAILQ_FIRST(&buffer->head); + + if (buffer->top_line == NULL) + buffer->top_line = buffer->current_line; + + return 1; +} blob - 5389e8cc921278dc8e97c46a1a42547b0564f70c (mode 644) blob + /dev/null --- utf8.c +++ /dev/null @@ -1,262 +0,0 @@ -/* Copyright (c) 2008-2009 Bjoern Hoehrmann - * - * Permission is hereby granted, free of charge, to any person - * obtaining a copy of this software and associated documentation - * files (the "Software"), to deal in the Software without - * restriction, including without limitation the rights to use, copy, - * modify, merge, publish, distribute, sublicense, and/or sell copies - * of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be - * included in all copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND - * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS - * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN - * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN - * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#include "compat.h" - -#include -#include -#include -#include - -#include "telescope.h" -#include "utf8.h" - -#define UTF8_ACCEPT 0 -#define UTF8_REJECT 1 - -static const uint8_t utf8d[] = { - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f - 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f - 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf - 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df - 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef - 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff - 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 - 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 - 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 - 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 - 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 -}; - -static inline uint32_t -decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte) -{ - uint32_t type = utf8d[byte]; - - *codep = (*state != UTF8_ACCEPT) ? - (byte & 0x3fu) | (*codep << 6) : - (0xff >> type) & (byte); - - *state = utf8d[256 + *state*16 + type]; - return *state; -} - - -/* end of the converter, utility functions ahead */ - -#define ZERO_WIDTH_SPACE 0x200B - -/* public version of decode */ -uint32_t -utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte) -{ - return decode(state, codep, byte); -} - -/* encode cp in s. s must be at least 4 bytes wide */ -size_t -utf8_encode(uint32_t cp, char *s) -{ - if (cp <= 0x7F) { - *s = (uint8_t)cp; - return 1; - } else if (cp <= 0x7FF) { - s[1] = (uint8_t)(( cp & 0x3F ) + 0x80); - s[0] = (uint8_t)(((cp >> 6) & 0x1F) + 0xC0); - return 2; - } else if (cp <= 0xFFFF) { - s[2] = (uint8_t)(( cp & 0x3F) + 0x80); - s[1] = (uint8_t)(((cp >> 6) & 0x3F) + 0x80); - s[0] = (uint8_t)(((cp >> 12) & 0x0F) + 0xE0); - return 3; - } else if (cp <= 0x10FFFF) { - s[3] = (uint8_t)(( cp & 0x3F) + 0x80); - s[2] = (uint8_t)(((cp >> 6) & 0x3F) + 0x80); - s[1] = (uint8_t)(((cp >> 12) & 0x3F) + 0x80); - s[0] = (uint8_t)(((cp >> 18) & 0x07) + 0xF0); - return 4; - } else { - s[0] = '\0'; - return 0; - } -} - -char * -utf8_nth(char *s, size_t n) -{ - size_t i; - uint32_t cp = 0, state = 0; - - for (i = 0; *s && i < n; ++s) - if (!decode(&state, &cp, *s)) - ++i; - - if (state != UTF8_ACCEPT) - return NULL; - if (i == n) - return s; - return NULL; -} - -size_t -utf8_cplen(char *s) -{ - uint32_t cp = 0, state = 0; - size_t len; - - len = 0; - for (; *s; ++s) - if (!decode(&state, &cp, *s)) - len++; - return len; -} - -/* returns only 0, 1, 2 or 8. assumes sizeof(wchar_t) is 4 */ -size_t -utf8_chwidth(uint32_t cp) -{ - /* XXX: if we're running on a platform where sizeof(wchar_t) - * == 2 what to do? The manpage for wcwidth and wcs isn't - * clear about the encoding, but if it's 16 bit wide I assume - * it must use UTF-16... right? */ - assert(sizeof(wchar_t) == 4); - - /* - * quick and dirty fix for the tabs. In the future we may - * want to expand tabs into N spaces, but for the time being - * this seems to be good enough (tm). - */ - if (cp == '\t') - return 8; - - return wcwidth((wchar_t)cp); -} - -/* NOTE: n is the number of codepoints, NOT the byte length. In - * other words, s MUST be NUL-terminated. */ -size_t -utf8_snwidth(const char *s, size_t n) -{ - size_t i, tot; - uint32_t cp = 0, state = 0; - - tot = 0; - for (i = 0; *s && i < n; ++s) - if (!decode(&state, &cp, *s)) { - i++; - tot += utf8_chwidth(cp); - } - - return tot; -} - -size_t -utf8_swidth(const char *s) -{ - size_t tot; - uint32_t cp = 0, state = 0; - - tot = 0; - for (; *s; ++s) - if (!decode(&state, &cp, *s)) - tot += utf8_chwidth(cp); - - return tot; -} - -size_t -utf8_swidth_between(const char *str, const char *end) -{ - size_t tot; - uint32_t cp = 0, state = 0; - - tot = 0; - for (; *str && str < end; ++str) - if (!decode(&state, &cp, *str)) - tot += utf8_chwidth(cp); - return tot; -} - -char * -utf8_next_cp(const char *s) -{ - uint32_t cp = 0, state = 0; - - for (; *s; ++s) - if (!decode(&state, &cp, *s)) - break; - return (char*)s+1; -} - -char * -utf8_prev_cp(const char *start, const char *base) -{ - uint8_t c; - - for (; start > base; start--) { - c = *start; - if ((c & 0xC0) != 0x80) - return (char*)start; - } - - return (char*)base; -} - -/* - * XXX: This is not correct. There are codepoints classified as - * "emoji", but these can be joined toghether to form more complex - * emoji. There is an ufficial list of what these valid combinations - * are, but it would require a costly lookup (a trie can be used to - * reduce the times, but...). The following approach is conceptually - * simpler: if there is a sequence of "emoji codepoints" (or ZWS) and - * then a space, consider everything before the space a single emoji. - * It needs a special check for numbers (yes, 0..9 and # are - * technically speaking emojis) but otherwise seems to work well in - * practice. - */ -int -emojied_line(const char *s, const char **space_ret) -{ - uint32_t cp = 0, state = 0; - int only_numbers = 1; - - for (; *s; ++s) { - if (!decode(&state, &cp, *s)) { - if (cp == ZERO_WIDTH_SPACE) - continue; - if (cp == ' ') { - *space_ret = s; - return !only_numbers; - } - if (!is_emoji(cp)) - return 0; - if (cp < '0' || cp > '9') - only_numbers = 0; - } - } - - return 0; -} blob - 338068315003a7f604337fc0f6c91a4b98a7739e (mode 644) blob + /dev/null --- wrap.c +++ /dev/null @@ -1,323 +0,0 @@ -/* - * Copyright (c) 2021 Omar Polo - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include "compat.h" - -#include -#include -#include - -#include "defaults.h" -#include "telescope.h" -#include "utf8.h" - -/* - * Text wrapping - * ============= - * - * There's a simple text wrapping algorithm. - * - * 1. if it's a line in a pre-formatted block: - * a. hard wrap. - * b. repeat - * 2. otherwise advance the line char by char. - * 3. when ending the space, split the line at the last occurrence of - * a "word separator" (i.e. " \t-") or at point if none. - * 4. repeat - * - */ - -void -erase_buffer(struct buffer *buffer) -{ - empty_vlist(buffer); - empty_linelist(buffer); -} - -void -empty_linelist(struct buffer *buffer) -{ - struct line *l, *lt; - - TAILQ_FOREACH_SAFE(l, &buffer->page.head, lines, lt) { - TAILQ_REMOVE(&buffer->page.head, l, lines); - free(l->line); - - if (l->type != LINE_COMPL && - l->type != LINE_COMPL_CURRENT && - l->type != LINE_HELP) - free(l->alt); - - free(l); - } -} - -void -empty_vlist(struct buffer *buffer) -{ - struct vline *vl, *t; - - buffer->top_line = NULL; - buffer->line_off = 0; - buffer->current_line = NULL; - buffer->line_max = 0; - - TAILQ_FOREACH_SAFE(vl, &buffer->head, vlines, t) { - TAILQ_REMOVE(&buffer->head, vl, vlines); - free(vl->line); - free(vl); - } -} - -static int -push_line(struct buffer *buffer, struct line *l, const char *buf, size_t len, int flags) -{ - struct vline *vl; - const char *end; - - /* omit trailing spaces */ - if (len != 0) { - for (end = buf + len - 1; - end > buf && isspace(*end); - end--, len--) - ; /* nop */ - } - - if (!(l->flags & L_HIDDEN)) - buffer->line_max++; - - if ((vl = calloc(1, sizeof(*vl))) == NULL) - return 0; - - if (len != 0 && (vl->line = calloc(1, len+1)) == NULL) { - free(vl); - return 0; - } - - vl->parent = l; - if (len != 0) - memcpy(vl->line, buf, len); - vl->flags = flags; - - TAILQ_INSERT_TAIL(&buffer->head, vl, vlines); - return 1; -} - -/* - * Similar to wrap_text, but emit only o vline. - */ -int -wrap_one(struct buffer *buffer, const char *prfx, struct line *l, size_t width) -{ - struct vline *vl, *t; - - /* - * be lazy: call wrap_text and then discard the continuations. - */ - - if (!wrap_text(buffer, prfx, l, width)) - return 0; - - TAILQ_FOREACH_SAFE(vl, &buffer->head, vlines, t) { - if (vl->flags & L_CONTINUATION) { - TAILQ_REMOVE(&buffer->head, vl, vlines); - free(vl->line); - free(vl); - buffer->line_max--; - } - } - - return 1; -} - -/* - * Build a list of visual line by wrapping the given line, assuming - * that when printed will have a leading prefix prfx. - */ -int -wrap_text(struct buffer *buffer, const char *prfx, struct line *l, size_t width) -{ - const char *separators = " \t-"; - const char *start, *end, *line, *lastsep, *lastchar, *space; - uint32_t cp = 0, state = 0; - size_t cur, prfxwidth, w; - int flags; - - if ((line = l->line) == NULL) - return push_line(buffer, l, NULL, 0, 0); - - prfxwidth = utf8_swidth(prfx); - cur = prfxwidth; - start = line; - lastsep = NULL; - lastchar = line; - flags = 0; - - if (l->type == LINE_LINK && emojify_link && - emojied_line(l->line, &space)) { - prfxwidth = utf8_swidth_between(l->line, space); - cur = prfxwidth; - line = space + 1; - } - - for (; *line; line++) { - if (utf8_decode(&state, &cp, *line)) - continue; - w = utf8_chwidth(cp); - if (cur + w > width) { - end = lastsep == NULL - ? utf8_next_cp((char*)lastchar) - : utf8_next_cp((char*)lastsep); - if (!push_line(buffer, l, start, end - start, flags)) - return 0; - flags = L_CONTINUATION; - start = end; - cur = prfxwidth + utf8_swidth_between(start, lastchar); - } else if (strchr(separators, *line) != NULL) { - lastsep = line; - } - - lastchar = utf8_prev_cp(line, l->line); - cur += w; - } - - return push_line(buffer, l, start, line - start, flags); -} - -int -hardwrap_text(struct buffer *buffer, struct line *l, size_t width) -{ - const char *line, *start, *lastchar; - int cont; - uint32_t state = 0, cp = 0; - size_t cur, w; - - if ((line = l->line) == NULL) - return push_line(buffer, l, NULL, 0, 0); - - start = line; - lastchar = line; - cont = 0; - cur = 0; - for (; *line; line++) { - if (utf8_decode(&state, &cp, *line)) - continue; - w = utf8_chwidth(cp); - if (cur + w > width) { - if (!push_line(buffer, l, start, lastchar-start, cont)) - return 0; - cont = L_CONTINUATION; - if (dont_wrap_pre) - return 1; - cur = 0; - start = lastchar; - } - - lastchar = utf8_prev_cp(line, l->line); - cur += w; - } - - return push_line(buffer, l, start, line - start, cont); -} - -int -wrap_page(struct buffer *buffer, int width) -{ - struct line *l; - const struct line *top_orig, *orig; - struct vline *vl; - const char *prfx; - - top_orig = buffer->top_line == NULL ? NULL : buffer->top_line->parent; - orig = buffer->current_line == NULL ? NULL : buffer->current_line->parent; - - buffer->top_line = NULL; - buffer->current_line = NULL; - - buffer->force_redraw = 1; - buffer->curs_y = 0; - buffer->line_off = 0; - - empty_vlist(buffer); - - TAILQ_FOREACH(l, &buffer->page.head, lines) { - prfx = line_prefixes[l->type].prfx1; - switch (l->type) { - case LINE_TEXT: - case LINE_LINK: - case LINE_TITLE_1: - case LINE_TITLE_2: - case LINE_TITLE_3: - case LINE_ITEM: - case LINE_QUOTE: - case LINE_PRE_START: - case LINE_PRE_END: - wrap_text(buffer, prfx, l, MIN(fill_column, width)); - break; - case LINE_PRE_CONTENT: - case LINE_PATCH: - case LINE_PATCH_HDR: - case LINE_PATCH_HUNK_HDR: - case LINE_PATCH_ADD: - case LINE_PATCH_DEL: - hardwrap_text(buffer, l, MIN(fill_column, width)); - break; - case LINE_COMPL: - case LINE_COMPL_CURRENT: - case LINE_HELP: - case LINE_DOWNLOAD: - case LINE_DOWNLOAD_DONE: - case LINE_DOWNLOAD_INFO: - wrap_one(buffer, prfx, l, width); - break; - case LINE_FRINGE: - /* never, ever wrapped */ - break; - } - - if (top_orig == l && buffer->top_line == NULL) { - buffer->line_off = buffer->line_max-1; - buffer->top_line = TAILQ_LAST(&buffer->head, vhead); - - while (1) { - vl = TAILQ_PREV(buffer->top_line, vhead, vlines); - if (vl == NULL || vl->parent != orig) - break; - buffer->top_line = vl; - buffer->line_off--; - } - } - - if (orig == l && buffer->current_line == NULL) { - buffer->current_line = TAILQ_LAST(&buffer->head, vhead); - - while (1) { - vl = TAILQ_PREV(buffer->current_line, vhead, vlines); - if (vl == NULL || vl->parent != orig) - break; - buffer->current_line = vl; - } - } - } - - if (buffer->current_line == NULL) - buffer->current_line = TAILQ_FIRST(&buffer->head); - - if (buffer->top_line == NULL) - buffer->top_line = buffer->current_line; - - return 1; -}