Commit Diff
Diff:
578ba2d81b2fa5f839314190f42bb9116069db7a
ef04b55160759b22db67f14c703a4343c4741e8b
Commit:
ef04b55160759b22db67f14c703a4343c4741e8b
Tree:
fff26b0d56a0506b2ef0ebb84ffa50f2d655ebe6
Author:
Omar Polo <op@omarpolo.com>
Date:
Sat Jan 9 20:32:23 2021 UTC
Message:
switch to Bjoern Hoehrmann UTF-8 decoder

The new decoder is correct, while my hacked valid_multibyte_utf8 would
accept byte sequences that aren't valid UTF-8 (such as some overlong
encodings and surrogate code points).
commit - 578ba2d81b2fa5f839314190f42bb9116069db7a
commit + ef04b55160759b22db67f14c703a4343c4741e8b
blob - 1d355e3ba2d5b1d5fd244ed77062bc9b6952b0c1
blob + 13cfae04d8424a84283849e46e75d981bb271bf1
--- Makefile
+++ Makefile
@@ -6,11 +6,11 @@ gmid: gmid.o uri.o
all: gmid TAGS README.md
-gmid: gmid.o uri.o
- ${CC} gmid.o uri.o -o gmid ${LDFLAGS}
+gmid: gmid.o uri.o utf8.o
+ ${CC} gmid.o uri.o utf8.o -o gmid ${LDFLAGS}
-TAGS: gmid.c uri.c
- -etags gmid.c uri.c || true
+TAGS: gmid.c uri.c utf8.c
+ -etags gmid.c uri.c utf8.c || true
README.md: gmid.1
mandoc -Tmarkdown gmid.1 | sed -e '1d' -e '$$d' > README.md
@@ -18,8 +18,8 @@ uri_test: uri_test.o uri.o
clean:
rm -f *.o gmid
-uri_test: uri_test.o uri.o
- ${CC} uri_test.o uri.o -o uri_test ${LDFLAGS}
+uri_test: uri_test.o uri.o utf8.o
+ ${CC} uri_test.o uri.o utf8.o -o uri_test ${LDFLAGS}
test: uri_test
./uri_test
blob - 16a1b3919227497d33a592eabc9e009fafc75a32
blob + 3e9e46128e3aec385c67204bdd22394472211019
--- README.md
+++ README.md
@@ -212,6 +212,12 @@ since it's relative to the document root.
*docs/cgi-bin*,
since it's relative to the document root.
+# ACKNOWLEDGEMENTS
+
+**gmid**
+uses the "Flexible and Economical" UTF-8 decoder written by
+Bjoern Hoehrmann.
+
# CAVEATS
* it doesn't support virtual hosts: the host part of the request URL is
blob - 5c9aeb6613600df71d049f8f1cd51af53d7c976f
blob + 12fa78247ffdb69d151e04621c442b41315fb271
--- gmid.1
+++ gmid.1
@@ -184,6 +184,10 @@ since it's relative to the document root.
and not
.Pa docs/cgi-bin ,
since it's relative to the document root.
+.Sh ACKNOWLEDGEMENTS
+.Nm
+uses the "Flexible and Economical" UTF-8 decoder written by
+.An Bjoern Hoehrmann .
.Sh CAVEATS
.Bl -bullet
.It
blob - c8f451487ccf97c59f4e955bb1c1b35b0bd92d86
blob + 3ee5fe4dd4a929a973d8217610e0011d3eb02086
--- gmid.h
+++ gmid.h
@@ -117,6 +117,12 @@ enum {
char *fragment;
};
+struct parser { /* parsing state handed to the uri/utf8 routines */
+ char *uri; /* read cursor into the NUL-terminated input; valid_multibyte_utf8() advances it */
+ struct uri *parsed; /* destination for the parsed components -- presumably filled by uri.c, confirm there */
+ const char *err; /* error message; valid_multibyte_utf8() stores a string literal here on invalid input */
+};
+
enum {
FILE_EXISTS,
FILE_EXECUTABLE,
@@ -151,6 +157,9 @@ void usage(const char*);
void usage(const char*);
+/* utf8.c */
+int valid_multibyte_utf8(struct parser*);
+
/* uri.c */
int parse_uri(char*, struct uri*, const char**);
int trim_req_uri(char*);
blob - 2bf56eb67d1a37f8e17f79f4051139d0f25532f7
blob + 5af32b78c6b1a896154993dab383ded90d3f6ee4
--- uri.c
+++ uri.c
@@ -87,14 +87,6 @@ struct parser {
*
*/
-struct parser { /* parsing state; this commit moves the definition to gmid.h */
- char *uri; /* read cursor into the input string */
- struct uri *parsed; /* destination for the parsed components */
- const char *err; /* error message slot */
-};
-
-#define CONT_BYTE(b) ((b & 0xC0) == 0x80) /* true when b looks like 10xxxxxx; NOTE(review): b is not parenthesized */
-
/* XXX: these macros will expand multiple times their argument */
#define UNRESERVED(p) \
@@ -116,49 +108,7 @@ struct parser {
|| p == ',' \
|| p == ';' \
|| p == '=')
-
-/* NOTE: the increments are one less than what they should be, because
- * the caller will add one byte after we return.
- *
- * Hand-rolled check of the 2-, 3- or 4-byte sequence starting at
- * p->uri.  Returns 1 when every trailing byte is a continuation byte
- * and the assembled value lies in 0x80..0x10FFFF, 0 otherwise.
- *
- * NOTE(review): the final range test does not relate the value to the
- * length of the sequence and does not exclude 0xD800..0xDFFF, so some
- * overlong forms (e.g. E0 82 80, which assembles to 0x80) and
- * surrogates (e.g. ED A0 80) are accepted. */
-static int
-valid_multibyte_utf8(struct parser *p)
-{
- uint32_t c;
- uint8_t s;
-
- c = 0;
- s = *p->uri;
-
- if ((s & 0xE0) == 0xC0) {
- if (!CONT_BYTE(*(p->uri+1)))
- return 0;
- c = ((s & 0x1F) << 6) | (*(p->uri+1) & 0x3F);
- p->uri += 1;
- } else if ((s & 0xF0) == 0xE0) {
- if (!CONT_BYTE(*(p->uri+1)) ||
- !CONT_BYTE(*(p->uri+2)))
- return 0;
- c = (s & 0x0F) << 12
- | ((*(p->uri+1) & 0x3F) << 6)
- | ((*(p->uri+2) & 0x3F));
- p->uri += 2;
- } else if ((s & 0xF8) == 0xF0) {
- if (!CONT_BYTE(*(p->uri+1)) ||
- !CONT_BYTE(*(p->uri+2)) ||
- !CONT_BYTE(*(p->uri+3)))
- return 0;
- c = (s & 0x07) << 18
- | ((*(p->uri+1) & 0x3F) << 12)
- | ((*(p->uri+2) & 0x3F) << 6)
- | ((*(p->uri+3) & 0x3F));
- p->uri += 3;
- } else
- return 0;
- return (((0x080 <= c) && (c <= 0x7FF))
- || (((0x800 <= c) && (c <= 0xFFFF)))
- || (((0x10000 <= c) && (c <= 0x10FFFF))));
-}
-
static int
parse_pct_encoded(struct parser *p)
{
blob - /dev/null
blob + 81a75845afc65a0a25611da39d428a407f077662 (mode 644)
--- /dev/null
+++ utf8.c
@@ -0,0 +1,79 @@
+/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "gmid.h"
+
+#define UTF8_ACCEPT 0 /* decoder state: initial state, and reached again after a complete code point */
+#define UTF8_REJECT 1 /* decoder state: the bytes seen so far are not valid UTF-8 */
+/*
+ * Lookup table used by utf8_decode().  The first 256 entries map every
+ * possible byte value to a character class (the trailing line comments
+ * give the byte range of each row).  The entries after those form the
+ * state-transition table, which utf8_decode() reads as
+ * utf8d[256 + state*16 + class].
+ */
+static const uint8_t utf8d[] = {
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+ 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+ 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+ 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+ 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+ 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+ 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+ 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+ 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+ 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+ 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+/*
+ * Feed one byte to the table-driven decoder.
+ *
+ * state: in/out decoder state; must be UTF8_ACCEPT (0) before the
+ *        first byte of a sequence.
+ * codep: in/out code point being assembled.
+ * byte:  the next input byte.
+ *
+ * Returns the new state, which is also stored in *state: UTF8_ACCEPT
+ * once *codep holds a complete code point, UTF8_REJECT on invalid
+ * input; any other value is an intermediate state.
+ */
+static uint32_t inline
+utf8_decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
+ uint32_t type = utf8d[byte]; /* character class of this byte */
+
+ *codep = (*state != UTF8_ACCEPT) ?
+ (byte & 0x3fu) | (*codep << 6) : /* inside a sequence: append the low six bits */
+ (0xff >> type) & (byte); /* first byte: keep only the bits selected by its class */
+
+ *state = utf8d[256 + *state*16 + type]; /* row = current state, column = class */
+ return *state;
+}
+
+/* for the iri parser. Modelled after printCodePoints
+ *
+ * Decodes one code point starting at p->uri.
+ *
+ * Returns 1 when a complete code point above 0x7F was decoded; p->uri
+ * is then left on the LAST byte of that sequence (the break skips the
+ * loop increment), so the caller is presumably expected to step past
+ * it -- confirm in uri.c.
+ *
+ * Returns 0 otherwise.  p->err is set only when the decoder did not
+ * end in the accept state (invalid or truncated sequence); for an
+ * ASCII byte or an empty string 0 is returned and p->err is left
+ * untouched.
+ *
+ * NOTE(review): the loop stops only on the accept state or on the NUL
+ * terminator.  After a reject the decoder stays in the reject state
+ * (its table row is all 1s), so p->uri is advanced to the end of the
+ * string before the error is reported.
+ */
+int
+valid_multibyte_utf8(struct parser *p)
+{
+ uint32_t cp = 0, state = 0; /* state 0 is UTF8_ACCEPT, the decoder's start state */
+
+ for (; *p->uri; p->uri++)
+ if (!utf8_decode(&state, &cp, *p->uri)) /* 0 == UTF8_ACCEPT: a full code point is in cp */
+ break;
+
+ /* reject the ASCII range, and anything that did not end in the
+ * accept state */
+ if (state || cp <= 0x7F) {
+ /* XXX: do some error recovery? */
+ if (state)
+ p->err = "invalid UTF-8 character";
+ return 0;
+ }
+ return 1;
+}
Omar Polo