Blob


1 /* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
2 *
3 * Permission is hereby granted, free of charge, to any person
4 * obtaining a copy of this software and associated documentation
5 * files (the "Software"), to deal in the Software without
6 * restriction, including without limitation the rights to use, copy,
7 * modify, merge, publish, distribute, sublicense, and/or sell copies
8 * of the Software, and to permit persons to whom the Software is
9 * furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice shall be
12 * included in all copies or substantial portions of the Software.
13 *
14 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
15 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
16 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
17 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
18 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
19 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
20 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 * SOFTWARE.
22 */
24 #include <assert.h>
25 #include <stddef.h>
26 #include <stdint.h>
27 #include <wchar.h>
29 #include "telescope.h"
30 #include "utf8.h"
/* Decoder states that callers compare against. */
#define UTF8_ACCEPT 0
#define UTF8_REJECT 1

/*
 * Lookup table used by utf8_decode().
 *
 * The first 256 entries give the character class (0..11) of every
 * possible input byte.  The entries after that are the transition
 * table, 16 entries per state: the state reached from `state` on a
 * byte of class `type` is stored at index 256 + state*16 + type.
 */
static const uint8_t utf8d[] = {
	/* byte -> character class */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 00..1f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 20..3f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40..5f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 60..7f */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* 80..9f */
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* a0..bf */
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* c0..df */
	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3,                                /* e0..ef */
	11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,                                /* f0..ff */

	/* (state, class) -> next state, one row per state */
	0,1,2,3,5,8,7,1,1,1,4,6,1,1,1,1, /* s0 */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s1 */
	1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, /* s2 */
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1, /* s3 */
	1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, /* s4 */
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1, /* s5 */
	1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s6 */
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s7 */
	1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s8 */
};
52 uint32_t
53 utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte) {
54 uint32_t type = utf8d[byte];
56 *codep = (*state != UTF8_ACCEPT) ?
57 (byte & 0x3fu) | (*codep << 6) :
58 (0xff >> type) & (byte);
60 *state = utf8d[256 + *state*16 + type];
61 return *state;
62 }
65 /* end of the converter, utility functions ahead */
/*
 * Encode the code point cp as UTF-8 into s.  s must be at least 4
 * bytes wide; no NUL terminator is appended after the sequence.
 *
 * Returns the number of bytes written (1 to 4).  If cp cannot be
 * encoded, because it is above U+10FFFF or is a UTF-16 surrogate
 * (U+D800..U+DFFF), s[0] is set to '\0' and 0 is returned.
 */
size_t
utf8_encode(uint32_t cp, char *s)
{
	/*
	 * Surrogates are not valid code points for UTF-8, and the
	 * decoder in this file rejects their three-byte form, so never
	 * produce it.
	 */
	if (cp > 0x10FFFF || (cp >= 0xD800 && cp <= 0xDFFF)) {
		s[0] = '\0';
		return 0;
	}

	if (cp <= 0x7F) {
		s[0] = (uint8_t)cp;
		return 1;
	}

	if (cp <= 0x7FF) {
		s[0] = (uint8_t)(((cp >>  6) & 0x1F) + 0xC0);
		s[1] = (uint8_t)(( cp        & 0x3F) + 0x80);
		return 2;
	}

	if (cp <= 0xFFFF) {
		s[0] = (uint8_t)(((cp >> 12) & 0x0F) + 0xE0);
		s[1] = (uint8_t)(((cp >>  6) & 0x3F) + 0x80);
		s[2] = (uint8_t)(( cp        & 0x3F) + 0x80);
		return 3;
	}

	s[0] = (uint8_t)(((cp >> 18) & 0x07) + 0xF0);
	s[1] = (uint8_t)(((cp >> 12) & 0x3F) + 0x80);
	s[2] = (uint8_t)(((cp >>  6) & 0x3F) + 0x80);
	s[3] = (uint8_t)(( cp        & 0x3F) + 0x80);
	return 4;
}
95 char *
96 utf8_nth(char *s, size_t n)
97 {
98 size_t i;
99 uint32_t cp = 0, state = 0;
101 for (i = 0; *s && i < n; ++s)
102 if (!utf8_decode(&state, &cp, *s))
103 ++i;
105 if (state != UTF8_ACCEPT)
106 return NULL;
107 if (i == n)
108 return s;
109 return NULL;
/*
 * Count the code points in the NUL-terminated string s.  A code point
 * is counted each time the decoder gets back to its accepting state,
 * so the bytes of an unfinished or rejected sequence add nothing.
 */
size_t
utf8_cplen(char *s)
{
	uint32_t codepoint = 0, dfa = 0;
	size_t count = 0;

	while (*s) {
		if (utf8_decode(&dfa, &codepoint, *s) == 0)
			count++;
		s++;
	}

	return count;
}
/*
 * Return the number of columns needed to display cp: 8 for a tab,
 * otherwise whatever wcwidth() reports, with non-printable characters
 * counted as 0.  Assumes sizeof(wchar_t) is 4.
 */
size_t
utf8_chwidth(uint32_t cp)
{
	int width;

	/* XXX: if we're running on a platform where sizeof(wchar_t)
	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
	 * clear about the encoding, but if it's 16 bit wide I assume
	 * it must use UTF-16... right? */
	assert(sizeof(wchar_t) == 4);

	/*
	 * quick and dirty fix for the tabs.  In the future we may
	 * want to expand tabs into N spaces, but for the time being
	 * this seems to be good enough (tm).
	 */
	if (cp == '\t')
		return 8;

	/*
	 * wcwidth() returns -1 for non-printable characters.  Returned
	 * as a size_t that would turn into SIZE_MAX and wreck every
	 * width sum built on this function, so count them as 0 columns.
	 */
	width = wcwidth((wchar_t)cp);
	if (width < 0)
		return 0;
	return (size_t)width;
}
/*
 * Sum the display width of at most the first n code points of s.
 *
 * NOTE: n is a number of code points, NOT a byte length, so s MUST
 * be NUL-terminated.
 */
size_t
utf8_snwidth(const char *s, size_t n)
{
	uint32_t codepoint = 0, dfa = 0;
	size_t seen = 0, width = 0;

	while (*s && seen < n) {
		if (utf8_decode(&dfa, &codepoint, *s) == 0) {
			seen++;
			width += utf8_chwidth(codepoint);
		}
		s++;
	}

	return width;
}
/*
 * Sum the display width of every code point decoded from the
 * NUL-terminated string s.
 */
size_t
utf8_swidth(const char *s)
{
	uint32_t codepoint = 0, dfa = 0;
	size_t width = 0;

	while (*s) {
		if (utf8_decode(&dfa, &codepoint, *s) == 0)
			width += utf8_chwidth(codepoint);
		s++;
	}

	return width;
}
/*
 * Sum the display width of the code points completed between str
 * (inclusive) and end (exclusive).  The walk also stops at a NUL byte
 * found before end.
 */
size_t
utf8_swidth_between(const char *str, const char *end)
{
	size_t tot;
	uint32_t cp = 0, state = 0;

	tot = 0;
	/*
	 * Test the bound before dereferencing: the byte at `end` is
	 * outside the requested range and must not be read.
	 */
	for (; str < end && *str; ++str)
		if (!utf8_decode(&state, &cp, *str))
			tot += utf8_chwidth(cp);
	return tot;
}
/*
 * Return a pointer to the byte following the code point that starts
 * at s.
 *
 * If the NUL terminator is reached before a code point is completed
 * (empty string, truncated or rejected sequence), a pointer to the
 * terminator itself is returned, so the result never points past the
 * end of the string.
 */
char *
utf8_next_cp(const char *s)
{
	uint32_t cp = 0, state = 0;

	for (; *s; ++s)
		if (!utf8_decode(&state, &cp, *s))
			return (char*)s+1;

	/* ran into the terminator: stay on it instead of stepping over */
	return (char*)s;
}
/*
 * Scan backwards from start, never going below base, and return the
 * first position whose byte is not a UTF-8 continuation byte
 * (10xxxxxx).  If every byte down to base + 1 is a continuation
 * byte, base is returned.
 */
char *
utf8_prev_cp(const char *start, const char *base)
{
	const char *p = start;

	while (p > base) {
		const uint8_t byte = *p;

		if ((byte & 0xC0) != 0x80)
			return (char*)p;
		p--;
	}

	return (char*)base;
}