op public repos

Blob

Date:: Sun Mar 21 09:42:41 2021 UTC
Message:: cursor handling with utf8 support: the code that handles cursor movement now respects the width of the characters (zero, one or two cells).
Actions:: History | Blame | Raw File
1 /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
2  *
3  * Permission is hereby granted, free of charge, to any person
4  * obtaining a copy of this software and associated documentation
5  * files (the "Software"), to deal in the Software without
6  * restriction, including without limitation the rights to use, copy,
7  * modify, merge, publish, distribute, sublicense, and/or sell copies
8  * of the Software, and to permit persons to whom the Software is
9  * furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be
12  * included in all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21  * SOFTWARE.
22  */
23 
24 #include "telescope.h"
25 
26 #include <assert.h>
27 #include <stddef.h>
28 #include <stdint.h>
29 #include <wchar.h>
30 
31 #define UTF8_ACCEPT 0
32 #define UTF8_REJECT 1
33 
/*
 * Lookup table driving utf8_decode().
 *
 * The first 256 entries are indexed by the input byte (utf8_decode
 * reads utf8d[byte]) and give the "type" of that byte; the per-row
 * comments show the byte range each row covers.
 *
 * The entries from index 256 onwards are read as
 * utf8d[256 + state*16 + type] and give the next decoder state for
 * the current state and the type of the byte just read.
 */
34 static const uint8_t utf8d[] = {
35 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
36 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
37 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
38 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
39 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
40 	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
41 	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
42 	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
43 	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
44 	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
45 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
46 	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
47 	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
48 	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
49 };
50 
51 static inline uint32_t
52 utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte) {
53 	uint32_t type = utf8d[byte];
54 
55 	*codep = (*state != UTF8_ACCEPT) ?
56 		(byte & 0x3fu) | (*codep << 6) :
57 		(0xff >> type) & (byte);
58 
59 	*state = utf8d[256 + *state*16 + type];
60 	return *state;
61 }
62 
63 
64 /* end of the converter, utility functions ahead */
65 
66 char *
67 utf8_nth(char *s, size_t n)
68 {
69 	size_t i;
70 	uint32_t cp = 0, state = 0;
71 
72 	for (i = 0; *s && i < n; ++s)
73 		if (!utf8_decode(&state, &cp, *s))
74 			++i;
75 
76 	if (state != UTF8_ACCEPT)
77 		return NULL;
78 	if (i == n)
79 		return s;
80 	return NULL;
81 }
82 
/*
 * Count the code points in the NUL-terminated string s.  Only the
 * bytes on which the decoder reports a completed code point (return
 * value 0) are counted.
 */
size_t
utf8_cplen(char *s)
{
	uint32_t state = 0, codepoint = 0;
	size_t count = 0;

	while (*s) {
		if (utf8_decode(&state, &codepoint, *s) == 0)
			count++;
		s++;
	}

	return count;
}
95 
/*
 * Return the number of terminal cells needed to display the code
 * point cp, as reported by wcwidth(3) (in practice 0, 1 or 2).
 *
 * wcwidth returns -1 for code points that are not printable in the
 * current locale (control characters such as '\t', for instance).
 * Converted to size_t that would turn into SIZE_MAX and wreck any
 * sum of widths, so those code points are reported as zero cells
 * wide instead.
 *
 * Assumes sizeof(wchar_t) is 4.
 */
size_t
utf8_chwidth(uint32_t cp)
{
	int w;

	/* XXX: if we're running on a platform where sizeof(wchar_t)
	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
	 * clear about the encoding, but if it's 16 bit wide I assume
	 * it must use UTF-16... right? */
	assert(sizeof(wchar_t) == 4);

	w = wcwidth((wchar_t)cp);
	if (w < 0)
		return 0;
	return (size_t)w;
}
108 
/*
 * Sum utf8_chwidth() over at most the first n code points of s.
 *
 * NOTE: n counts code points, NOT bytes, and the walk also stops at
 * the first NUL byte: s MUST be NUL-terminated.
 */
size_t
utf8_snwidth(const char *s, size_t n)
{
	uint32_t state = 0, codepoint = 0;
	size_t seen = 0, width = 0;

	while (*s && seen < n) {
		/* a return value of 0 means codepoint is complete */
		if (utf8_decode(&state, &codepoint, *s) == 0) {
			width += utf8_chwidth(codepoint);
			seen++;
		}
		s++;
	}

	return width;
}
126 
/*
 * Sum utf8_chwidth() over every code point of the NUL-terminated
 * string s.
 */
size_t
utf8_swidth(const char *s)
{
	uint32_t state = 0, codepoint = 0;
	size_t width = 0;
	const char *p;

	for (p = s; *p; p++) {
		/* skip the bytes that don't complete a code point */
		if (utf8_decode(&state, &codepoint, *p) != 0)
			continue;
		width += utf8_chwidth(codepoint);
	}

	return width;
}