Blame
Date:
Mon Oct 18 10:05:55 2021 UTC
Message:
fmt
01
2021-07-07
op
/*
02
2021-07-07
op
* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
03
2021-01-09
op
*
04
2021-01-09
op
* Permission is hereby granted, free of charge, to any person
05
2021-01-09
op
* obtaining a copy of this software and associated documentation
06
2021-01-09
op
* files (the "Software"), to deal in the Software without
07
2021-01-09
op
* restriction, including without limitation the rights to use, copy,
08
2021-01-09
op
* modify, merge, publish, distribute, sublicense, and/or sell copies
09
2021-01-09
op
* of the Software, and to permit persons to whom the Software is
10
2021-01-09
op
* furnished to do so, subject to the following conditions:
11
2021-01-09
op
*
12
2021-01-09
op
* The above copyright notice and this permission notice shall be
13
2021-01-09
op
* included in all copies or substantial portions of the Software.
14
2021-01-09
op
*
15
2021-01-09
op
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16
2021-01-09
op
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17
2021-01-09
op
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18
2021-01-09
op
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19
2021-01-09
op
* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20
2021-01-09
op
* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21
2021-01-09
op
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
2021-01-09
op
* SOFTWARE.
23
2021-01-09
op
*/
24
2021-01-09
op
25
2021-02-12
op
#include "gmid.h"
26
2021-02-12
op
27
2021-01-09
op
#include <stddef.h>
28
2021-01-09
op
#include <stdint.h>
29
2021-01-09
op
30
2021-01-09
op
/*
 * Distinguished decoder states, as stored in and returned through the
 * `state' argument of utf8_decode().
 */
#define UTF8_ACCEPT	0	/* initial state; callers treat 0 as "code point complete" */
#define UTF8_REJECT	1	/* every transition out of this state leads back to it */
32
2021-01-09
op
33
2021-01-09
op
/*
 * Lookup table driving utf8_decode().
 *
 * The first 256 entries are indexed by the input byte and yield the
 * class of that byte.  The entries from offset 256 onwards form the
 * transition table: the entry at 256 + state*16 + class is the state
 * reached after reading a byte of that class while in `state'.
 */
static const uint8_t utf8d[] = {
	/* byte -> class */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 00..1f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 20..3f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 40..5f */
	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, /* 60..7f */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, /* 80..9f */
	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, /* a0..bf */
	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, /* c0..df */
	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, /* e0..ef */
	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, /* f0..ff */

	/* (state, class) -> next state; 16 entries per state */
	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, /* s0 */
	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, /* s1..s2 */
	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, /* s3..s4 */
	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, /* s5..s6 */
	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, /* s7..s8 */
};
49
2021-01-09
op
50
2021-01-10
op
static inline uint32_t
51
2021-01-09
op
utf8_decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
52
2021-01-09
op
uint32_t type = utf8d[byte];
53
2021-01-09
op
54
2021-01-09
op
*codep = (*state != UTF8_ACCEPT) ?
55
2021-01-09
op
(byte & 0x3fu) | (*codep << 6) :
56
2021-01-09
op
(0xff >> type) & (byte);
57
2021-01-09
op
58
2021-01-09
op
*state = utf8d[256 + *state*16 + type];
59
2021-01-09
op
return *state;
60
2021-01-09
op
}
61
2021-01-09
op
62
2021-01-09
op
/* for the iri parser. Modelled after printCodePoints */
63
2021-01-09
op
int
64
2021-01-09
op
valid_multibyte_utf8(struct parser *p)
65
2021-01-09
op
{
66
2021-01-09
op
uint32_t cp = 0, state = 0;
67
2021-01-09
op
68
2021-10-18
op
for (; *p->iri; p->iri++)
69
2021-01-11
op
if (!utf8_decode(&state, &cp, *p->iri))
70
2021-01-09
op
break;
71
2021-01-09
op
72
2021-01-09
op
/* reject the ASCII range */
73
2021-01-09
op
if (state || cp <= 0x7F) {
74
2021-01-09
op
/* XXX: do some error recovery? */
75
2021-01-09
op
if (state)
76
2021-01-09
op
p->err = "invalid UTF-8 character";
77
2021-01-09
op
return 0;
78
2021-01-09
op
}
79
2021-01-09
op
return 1;
80
2021-01-09
op
}
81
2021-01-27
op
82
2021-01-27
op
char *
83
2021-01-27
op
utf8_nth(char *s, size_t n)
84
2021-01-27
op
{
85
2021-01-27
op
size_t i;
86
2021-01-27
op
uint32_t cp = 0, state = 0;
87
2021-01-27
op
88
2021-01-27
op
for (i = 0; *s && i < n; ++s)
89
2021-01-27
op
if (!utf8_decode(&state, &cp, *s))
90
2021-01-27
op
++i;
91
2021-01-27
op
92
2021-01-27
op
if (state != UTF8_ACCEPT)
93
2021-01-27
op
return NULL;
94
2021-01-27
op
if (i == n)
95
2021-01-27
op
return s;
96
2021-01-27
op
return NULL;
97
2021-01-27
op
}
Omar Polo