commit ef04b55160759b22db67f14c703a4343c4741e8b
from: Omar Polo <op@omarpolo.com>
date: Sat Jan 09 20:32:23 2021 UTC

switch to Bjoern Hoehrmann UTF-8 decoder

It's correct, while my hacked valid_multibyte_utf8 would accept byte
sequences that aren't valid UTF-8.

commit - 578ba2d81b2fa5f839314190f42bb9116069db7a
commit + ef04b55160759b22db67f14c703a4343c4741e8b
blob - 1d355e3ba2d5b1d5fd244ed77062bc9b6952b0c1
blob + 13cfae04d8424a84283849e46e75d981bb271bf1
--- Makefile
+++ Makefile
@@ -6,11 +6,11 @@ LDFLAGS =	-ltls
 
 all: gmid TAGS README.md
 
-gmid: gmid.o uri.o
-	${CC} gmid.o uri.o -o gmid ${LDFLAGS}
+gmid: gmid.o uri.o utf8.o
+	${CC} gmid.o uri.o utf8.o -o gmid ${LDFLAGS}
 
-TAGS: gmid.c uri.c
-	-etags gmid.c uri.c || true
+TAGS: gmid.c uri.c utf8.c
+	-etags gmid.c uri.c utf8.c || true
 
 README.md: gmid.1
 	mandoc -Tmarkdown gmid.1 | sed -e '1d' -e '$$d' > README.md
@@ -18,8 +18,8 @@ README.md: gmid.1
 clean:
 	rm -f *.o gmid
 
-uri_test: uri_test.o uri.o
-	${CC} uri_test.o uri.o -o uri_test ${LDFLAGS}
+uri_test: uri_test.o uri.o utf8.o
+	${CC} uri_test.o uri.o utf8.o -o uri_test ${LDFLAGS}
 
 test: uri_test
 	./uri_test
blob - 16a1b3919227497d33a592eabc9e009fafc75a32
blob + 3e9e46128e3aec385c67204bdd22394472211019
--- README.md
+++ README.md
@@ -212,6 +212,12 @@ and not
 *docs/cgi-bin*,
 since it's relative to the document root.
 
+# ACKNOWLEDGEMENTS
+
+**gmid**
+uses the "Flexible and Economical" UTF-8 decoder written by
+Bjoern Hoehrmann.
+
 # CAVEATS
 
 *	it doesn't support virtual hosts: the host part of the request URL is
blob - 5c9aeb6613600df71d049f8f1cd51af53d7c976f
blob + 12fa78247ffdb69d151e04621c442b41315fb271
--- gmid.1
+++ gmid.1
@@ -184,6 +184,10 @@ option is
 and not
 .Pa docs/cgi-bin ,
 since it's relative to the document root.
+.Sh ACKNOWLEDGEMENTS
+.Nm
+uses the "Flexible and Economical" UTF-8 decoder written by
+.An Bjoern Hoehrmann .
 .Sh CAVEATS
 .Bl -bullet
 .It
blob - c8f451487ccf97c59f4e955bb1c1b35b0bd92d86
blob + 3ee5fe4dd4a929a973d8217610e0011d3eb02086
--- gmid.h
+++ gmid.h
@@ -117,6 +117,12 @@ struct uri {
 	char		*fragment;
 };
 
+struct parser {
+	char		*uri;	/* read cursor into the NUL-terminated input */
+	struct uri	*parsed;	/* NOTE(review): presumably the result being filled in; confirm in uri.c */
+	const char	*err;	/* error description, e.g. set on invalid UTF-8 */
+};
+
 enum {
 	FILE_EXISTS,
 	FILE_EXECUTABLE,
@@ -151,6 +157,9 @@ void		 loop(struct tls*, int);
 
 void		 usage(const char*);
 
+/* utf8.c */
+int		 valid_multibyte_utf8(struct parser*);
+
 /* uri.c */
 int		 parse_uri(char*, struct uri*, const char**);
 int		 trim_req_uri(char*);
blob - 2bf56eb67d1a37f8e17f79f4051139d0f25532f7
blob + 5af32b78c6b1a896154993dab383ded90d3f6ee4
--- uri.c
+++ uri.c
@@ -87,14 +87,6 @@
  *
  */
 
-struct parser {
-	char		*uri;
-	struct uri	*parsed;
-	const char	*err;
-};
-
-#define CONT_BYTE(b) ((b & 0xC0) == 0x80)
-
 /* XXX: these macros will expand multiple times their argument */
 
 #define UNRESERVED(p)				\
@@ -116,49 +108,7 @@ struct parser {
 	    || p == ','				\
 	    || p == ';'				\
 	    || p == '=')
-
-/* NOTE: the increments are one less what they should be, because the
- * caller will add one byte after we return. */
-static int
-valid_multibyte_utf8(struct parser *p)
-{
-	uint32_t c;
-	uint8_t s;
-
-	c = 0;
-	s = *p->uri;
-
-	if ((s & 0xE0) == 0xC0) {
-		if (!CONT_BYTE(*(p->uri+1)))
-			return 0;
-		c = ((s & 0x1F) << 6) | (*(p->uri+1) & 0x3F);
-		p->uri += 1;
-	} else if ((s & 0xF0) == 0xE0) {
-		if (!CONT_BYTE(*(p->uri+1)) ||
-		    !CONT_BYTE(*(p->uri+2)))
-			return 0;
-		c = (s & 0x0F) << 12
-			| ((*(p->uri+1) & 0x3F) << 6)
-			| ((*(p->uri+2) & 0x3F));
-		p->uri += 2;
-	} else if ((s & 0xF8) == 0xF0) {
-		if (!CONT_BYTE(*(p->uri+1)) ||
-		    !CONT_BYTE(*(p->uri+2)) ||
-		    !CONT_BYTE(*(p->uri+3)))
-			return 0;
-		c = (s & 0x07) << 18
-			| ((*(p->uri+1) & 0x3F) << 12)
-			| ((*(p->uri+2) & 0x3F) << 6)
-			| ((*(p->uri+3) & 0x3F));
-		p->uri += 3;
-	} else
-		return 0;
 
-	return (((0x080 <= c) && (c <= 0x7FF))
-	    || (((0x800 <= c) && (c <= 0xFFFF)))
-	    || (((0x10000 <= c) && (c <= 0x10FFFF))));
-}
-
 static int
 parse_pct_encoded(struct parser *p)
 {
blob - /dev/null
blob + 81a75845afc65a0a25611da39d428a407f077662 (mode 644)
--- /dev/null
+++ utf8.c
@@ -0,0 +1,79 @@
+/* Copyright (c) 2008-2009 Bjoern Hoehrmann <bjoern@hoehrmann.de>
+ *
+ * Permission is hereby granted, free of charge, to any person
+ * obtaining a copy of this software and associated documentation
+ * files (the "Software"), to deal in the Software without
+ * restriction, including without limitation the rights to use, copy,
+ * modify, merge, publish, distribute, sublicense, and/or sell copies
+ * of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include <stddef.h>
+#include <stdint.h>
+
+#include "gmid.h"
+
+#define UTF8_ACCEPT 0	/* decoder state: a whole code point has been read */
+#define UTF8_REJECT 1	/* decoder state: malformed input; never left again */
+
+static const uint8_t utf8d[] = {	/* [0,256): byte -> class; after that 16 entries per state: (state, class) -> next state */
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
+	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
+	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
+	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
+	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
+	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
+	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
+	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
+	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
+	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
+	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
+};
+
+/* Advance the DFA by one byte: updates *codep and *state, returns *state. */
+static inline uint32_t
+utf8_decode(uint32_t *state, uint32_t *codep, uint8_t byte)
+{
+	uint32_t type = utf8d[byte];
+
+	*codep = (*state != UTF8_ACCEPT) ?
+	    (byte & 0x3fu) | (*codep << 6) : (0xff >> type) & byte;
+	*state = utf8d[256 + *state*16 + type];
+	return *state;
+}
+
+/* For the IRI parser: decode one non-ASCII code point starting at p->uri.
+ * On success p->uri is left on the last byte of the sequence, not past it. */
+int
+valid_multibyte_utf8(struct parser *p)
+{
+	uint32_t state = UTF8_ACCEPT, cp = 0;
+
+	for (; *p->uri != '\0'; p->uri++)
+		if (utf8_decode(&state, &cp, *p->uri) == UTF8_ACCEPT)
+			break;
+
+	/* XXX: do some error recovery?  We only get here at the final NUL. */
+	if (state != UTF8_ACCEPT) {
+		p->err = "invalid UTF-8 character";
+		return 0;
+	}
+	/* reject the ASCII range (and empty input), without setting p->err */
+	return cp > 0x7F;
+}