op public repos

Blob

Date:: Mon Nov 7 18:18:53 2022 UTC
Message:: start to refactor the rendering. Previously each vline (visual line) had a full copy of its associated string; this changes it so that it only slices a part of the parent line. This significantly reduces the memory usage. This actually worsens the emojify-link glitch reported by Freezr after some recent refactoring in the wrapping code. Not a big deal since I'm about to restructure the whole rendering bit by bit (hopefully!)
Actions:: History | Blame | Raw File
1 /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "compat.h"
25 
26 #include <assert.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <wchar.h>
30 
31 #include "telescope.h"
32 #include "utf8.h"
33 
/* decoder states: a complete code point was decoded / input is invalid */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table driving the decoder.  The first 256 entries map every
 * possible byte to a character class; the entries that follow are the
 * transition table, indexed by (state * 16 + class), that yields the
 * next state.
 */
static const uint8_t utf8d[] = {
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
};

/*
 * Feed one byte to the decoder.  *state must be UTF8_ACCEPT before
 * the first byte of a sequence; *codep accumulates the code point
 * and holds its final value once the returned state is UTF8_ACCEPT
 * again.  Returns the new state, which is also stored in *state.
 * Once UTF8_REJECT is reached the state never leaves it, so the
 * caller has to reset *state to resume decoding.
 */
static inline uint32_t
decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
{
	const uint32_t cls = utf8d[byte];

	if (*state == UTF8_ACCEPT) {
		/* leading byte: mask away the length marker bits */
		*codep = (0xff >> cls) & (byte);
	} else {
		/* continuation byte: append its six payload bits */
		*codep = (byte & 0x3fu) | (*codep << 6);
	}

	*state = utf8d[256 + *state * 16 + cls];
	return *state;
}
66 
67 
68 /* end of the converter, utility functions ahead */
69 
/* U+200B, skipped when looking for a leading run of emoji */
#define ZERO_WIDTH_SPACE 0x200B

/*
 * Exported entry point for the decoder: forwards its arguments to
 * the file-local decode() and returns the resulting state.
 */
uint32_t
utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte)
{
	uint32_t next;

	next = decode(state, codep, byte);
	return next;
}
78 
/*
 * Encode the code point cp as UTF-8 into s and return the number of
 * bytes written (1 to 4).  s must be at least 4 bytes wide; no NUL
 * terminator is appended after the encoded bytes.
 *
 * Values that are not Unicode scalar values, i.e. the UTF-16
 * surrogates U+D800..U+DFFF and everything above U+10FFFF, have no
 * well-formed UTF-8 representation (the decoder in this file rejects
 * such sequences too).  For those s[0] is set to NUL and 0 is
 * returned.
 */
size_t
utf8_encode(uint32_t cp, char *s)
{
	if (cp <= 0x7F) {
		*s = (uint8_t)cp;
		return 1;
	} else if (cp <= 0x7FF) {
		s[1] = (uint8_t)(( cp        & 0x3F) + 0x80);
		s[0] = (uint8_t)(((cp >>  6) & 0x1F) + 0xC0);
		return 2;
	} else if (cp >= 0xD800 && cp <= 0xDFFF) {
		/* surrogates must never be encoded as UTF-8 */
		s[0] = '\0';
		return 0;
	} else if (cp <= 0xFFFF) {
		s[2] = (uint8_t)(( cp        & 0x3F) + 0x80);
		s[1] = (uint8_t)(((cp >>  6) & 0x3F) + 0x80);
		s[0] = (uint8_t)(((cp >> 12) & 0x0F) + 0xE0);
		return 3;
	} else if (cp <= 0x10FFFF) {
		s[3] = (uint8_t)(( cp        & 0x3F) + 0x80);
		s[2] = (uint8_t)(((cp >>  6) & 0x3F) + 0x80);
		s[1] = (uint8_t)(((cp >> 12) & 0x3F) + 0x80);
		s[0] = (uint8_t)(((cp >> 18) & 0x07) + 0xF0);
		return 4;
	} else {
		/* beyond the last Unicode code point */
		s[0] = '\0';
		return 0;
	}
}
106 
107 char *
108 utf8_nth(char *s, size_t n)
109 {
110 	size_t i;
111 	uint32_t cp = 0, state = 0;
112 
113 	for (i = 0; *s && i < n; ++s)
114 		if (!decode(&state, &cp, *s))
115 			++i;
116 
117 	if (state != UTF8_ACCEPT)
118 		return NULL;
119 	if (i == n)
120 		return s;
121 	return NULL;
122 }
123 
124 size_t
125 utf8_cplen(char *s)
126 {
127 	uint32_t cp = 0, state = 0;
128 	size_t len;
129 
130 	len = 0;
131 	for (; *s; ++s)
132 		if (!decode(&state, &cp, *s))
133 			len++;
134 	return len;
135 }
136 
137 size_t
138 utf8_ncplen(const char *s, size_t slen)
139 {
140 	uint32_t cp = 0, state = 0;
141 	size_t len = 0;
142 
143 	for (; slen > 0 && *s; ++s, --slen)
144 		if (!decode(&state, &cp, *s))
145 			len++;
146 	return len;
147 }
148 
/*
 * Number of terminal columns taken by the code point cp.  Returns
 * only 0, 1, 2 or 8.  Assumes sizeof(wchar_t) is 4.
 *
 * wcwidth() reports non-printable code points with -1; those are
 * counted as zero columns here, otherwise the negative value would
 * be converted to a huge size_t and wreck the sums computed by the
 * callers.
 */
size_t
utf8_chwidth(uint32_t cp)
{
	int width;

	/* XXX: if we're running on a platform where sizeof(wchar_t)
	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
	 * clear about the encoding, but if it's 16 bit wide I assume
	 * it must use UTF-16... right? */
	assert(sizeof(wchar_t) == 4);

	/*
	 * quick and dirty fix for the tabs.  In the future we may
	 * want to expand tabs into N spaces, but for the time being
	 * this seems to be good enough (tm).
	 */
	if (cp == '\t')
		return 8;

	width = wcwidth((wchar_t)cp);
	if (width < 0)
		return 0;
	return (size_t)width;
}
169 
170 /* NOTE: n is the number of codepoints, NOT the byte length.  In
171  * other words, s MUST be NUL-terminated. */
172 size_t
173 utf8_snwidth(const char *s, size_t n)
174 {
175 	size_t i, tot;
176 	uint32_t cp = 0, state = 0;
177 
178 	tot = 0;
179 	for (i = 0; *s && i < n; ++s)
180 		if (!decode(&state, &cp, *s)) {
181 			i++;
182 			tot += utf8_chwidth(cp);
183 		}
184 
185 	return tot;
186 }
187 
188 size_t
189 utf8_swidth(const char *s)
190 {
191 	size_t tot;
192 	uint32_t cp = 0, state = 0;
193 
194 	tot = 0;
195 	for (; *s; ++s)
196 		if (!decode(&state, &cp, *s))
197 			tot += utf8_chwidth(cp);
198 
199 	return tot;
200 }
201 
/*
 * Sum the widths (as given by utf8_chwidth) of the code points found
 * in the half-open byte range [str, end).  The scan also stops at a
 * NUL byte met before end.
 */
size_t
utf8_swidth_between(const char *str, const char *end)
{
	size_t tot;
	uint32_t cp = 0, state = 0;

	tot = 0;
	/*
	 * Test the bound before dereferencing, so that the byte at
	 * end itself is never read.
	 */
	for (; str < end && *str; ++str)
		if (!decode(&state, &cp, *str))
			tot += utf8_chwidth(cp);
	return tot;
}
214 
/*
 * Return a pointer to the byte that follows the first code point of
 * the NUL-terminated string s.
 *
 * If the terminator is reached before a code point is completed (s
 * is empty, or starts with a truncated or invalid sequence) the
 * pointer to the terminator itself is returned, so the result never
 * points past the end of the string.
 */
char *
utf8_next_cp(const char *s)
{
	uint32_t cp = 0, state = 0;

	for (; *s; ++s)
		if (!decode(&state, &cp, *s))
			return (char *)s + 1;

	/* s points at the NUL here: don't step over it */
	return (char *)s;
}
225 
/*
 * Walk backwards from start, never going below base, until a byte
 * that is not a UTF-8 continuation byte (10xxxxxx) is found, and
 * return a pointer to it.  The byte at start is examined too, so
 * start itself is returned when it is not a continuation byte.
 * base is returned when it is reached, or when start is not past it.
 */
char *
utf8_prev_cp(const char *start, const char *base)
{
	const char *p = start;

	while (p > base && ((uint8_t)*p & 0xC0) == 0x80)
		p--;

	if (p > base)
		return (char *)p;
	return (char *)base;
}
239 
240 /*
241  * XXX: This is not correct.  There are codepoints classified as
242  * "emoji", but these can be joined toghether to form more complex
243  * emoji.  There is an official list of what these valid combinations
244  * are, but it would require a costly lookup (a trie can be used to
245  * reduce the times, but...).  The following approach is conceptually
246  * simpler: if there is a sequence of "emoji codepoints" (or ZWS) and
247  * then a space, consider everything before the space a single emoji.
248  * It needs a special check for numbers (yes, 0..9 and # are
249  * technically speaking emojis) but otherwise seems to work well in
250  * practice.
251  */
252 int
253 emojied_line(const char *s, const char **space_ret)
254 {
255 	uint32_t cp = 0, state = 0;
256 	int only_numbers = 1;
257 
258 	for (; *s; ++s) {
259 		if (!decode(&state, &cp, *s)) {
260 			if (cp == ZERO_WIDTH_SPACE)
261 				continue;
262 			if (cp == ' ') {
263 				*space_ret = s;
264 				return !only_numbers;
265 			}
266 			if (!is_emoji(cp))
267 				return 0;
268 			if (cp < '0' || cp > '9')
269 				only_numbers = 0;
270 		}
271 	}
272 
273 	return 0;
274 }