commit df6ca41da36c3f617cbbf3302ab120721ebfcfd2
from: Omar Polo <op@omarpolo.com>
date: Fri Dec 25 23:33:11 2020 UTC

IRI support

This extends the URI parser so that it supports full IRIs (Internationalized
Resource Identifiers, RFC 3987).  Some areas can still be improved,
but it's a start.

Note: we assume UTF-8 encoded IRI.

commit - 043acc97b16be18d85bb1914da50f7ce2aa2623e
commit + df6ca41da36c3f617cbbf3302ab120721ebfcfd2
blob - 597391a9c578320b2e748a05ffc019358eac8a39
blob + 1c9b75f2c471e4d3b9e67dba136bfb081e1a8d7e
--- README.md
+++ README.md
@@ -20,11 +20,8 @@ is a very simple and minimal gemini server that can se
 and execute CGI scripts.
 
 **gmid**
-will strip any sequence of
-*../*
-or trailing
-*..*
-in the requests made by clients and will refuse to follow symlinks.
+won't serve files outside the given directory and won't follow
+symlinks.
 Furthermore, on
 OpenBSD,
 pledge(2)
@@ -35,6 +32,10 @@ are used to ensure that
 dosen't do anything else than read files from the given directory,
 accept network connections and, optionally, execute CGI scripts.
 
+**gmid**
+fully supports IRIs (Internationalized Resource Identifiers, see
+RFC3987).
+
 It should be noted that
 **gmid**
 is very simple in its implementation, and so it may not be appropriate
blob - 77ef87db7e835803323eb5c04d6794dcfec67099
blob + edf67d5281ed871d651552902d513166ee9d2d2c
--- gmid.1
+++ gmid.1
@@ -33,11 +33,8 @@ is a very simple and minimal gemini server that can se
 and execute CGI scripts.
 .Pp
 .Nm
-will strip any sequence of
-.Pa ../
-or trailing
-.Pa ..
-in the requests made by clients and will refuse to follow symlinks.
+won't serve files outside the given directory and won't follow
+symlinks.
 Furthermore, on
 .Ox ,
 .Xr pledge 2
@@ -48,6 +45,10 @@ are used to ensure that
 dosen't do anything else than read files from the given directory,
 accept network connections and, optionally, execute CGI scripts.
 .Pp
+.Nm
+fully supports IRIs (Internationalized Resource Identifiers, see
+RFC3987).
+.Pp
 It should be noted that
 .Nm
 is very simple in its implementation, and so it may not be appropriate
blob - 245928ac2a36f5594f0eda298dc56e4c1c8b7900
blob + 3f81b762a5c8152496864bf438e9f8c16be84c63
--- uri.c
+++ uri.c
@@ -93,6 +93,8 @@ struct parser {
 	const char	*err;
 };
 
+#define CONT_BYTE(b) (((b) & 0xC0) == 0x80)	/* 10xxxxxx: UTF-8 continuation byte */
+
 /* XXX: these macros will expand multiple times their argument */
 
 #define UNRESERVED(p)				\
@@ -114,7 +116,49 @@ struct parser {
 	    || p == ','				\
 	    || p == ';'				\
 	    || p == '=')
+
+/*
+ * Validate the multibyte UTF-8 sequence at p->uri (RFC 3629): truncated,
+ * overlong, surrogate and > U+10FFFF sequences are rejected.  On success
+ * p->uri is left on the LAST byte (the caller adds one more), else unmoved.
+ */
+static int
+valid_multibyte_utf8(struct parser *p)
+{
+	const uint8_t *s = (const uint8_t *)p->uri;
+	uint32_t c, min;
+	int i, n;
+
+	/* n: continuation bytes to read; min: lowest non-overlong value */
+	if ((s[0] & 0xE0) == 0xC0) {
+		n = 1;
+		min = 0x80;
+		c = s[0] & 0x1F;
+	} else if ((s[0] & 0xF0) == 0xE0) {
+		n = 2;
+		min = 0x800;
+		c = s[0] & 0x0F;
+	} else if ((s[0] & 0xF8) == 0xF0) {
+		n = 3;
+		min = 0x10000;
+		c = s[0] & 0x07;
+	} else
+		return 0;
 
+	/* bail out at the first bad byte so we never read past the NUL */
+	for (i = 1; i <= n; ++i) {
+		if (!CONT_BYTE(s[i]))
+			return 0;
+		c = (c << 6) | (s[i] & 0x3F);
+	}
+
+	if (c < min || c > 0x10FFFF || (0xD800 <= c && c <= 0xDFFF))
+		return 0;
+
+	p->uri += n;
+	return 1;
+}
+
 static int
 parse_pct_encoded(struct parser *p)
 {
@@ -308,7 +352,8 @@ parse_query(struct parser *p)
 	    || SUB_DELIMITERS(*p->uri)
 	    || *p->uri == '/'
 	    || *p->uri == '?'
-	    || parse_pct_encoded(p))
+	    || parse_pct_encoded(p)
+	    || valid_multibyte_utf8(p))
 		p->uri++;
 
 	if (*p->uri != '\0' && *p->uri != '#') {
@@ -348,7 +393,8 @@ parse_path(struct parser *p)
 	while (UNRESERVED(*p->uri)
 	    || SUB_DELIMITERS(*p->uri)
 	    || *p->uri == '/'
-	    || parse_pct_encoded(p))
+	    || parse_pct_encoded(p)
+	    || valid_multibyte_utf8(p))
 		p->uri++;
 
 	if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') {
blob - c6521f668c8263cd6b9162966dd9b212dc529948
blob + f322c1e1f6e7a85c1b328cd3825dbb091521f9d5
--- uri_test.c
+++ uri_test.c
@@ -87,6 +87,12 @@ main(void)
 {
 	struct uri empty = {"", "", "", PASS, "", "", ""};
 
+	/* %00 decodes to a NUL byte: the parser has to refuse it */
+	TEST("foo://bar.com/foo%00?baz",
+	    FAIL,
+	    empty,
+	    "rejects %00");
+
 	TEST("http://omarpolo.com",
 	    PASS,
 	    URI("http", "omarpolo.com", "", "", "", ""),
@@ -153,6 +159,10 @@ main(void)
 	    FAIL,
             empty,
 	    "reject paths that would escape the root");
+	TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/",
+	    PASS,
+            URI("gemini", "omarpolo.com", "", "", "", ""),
+	    "parse path with lots of cleaning available");
 
 	/* query */
 	TEST("foo://example.com/foo/?gne",
@@ -179,6 +189,44 @@ main(void)
 	    PASS,
 	    URI("foo", "bar.com", "", "cafè.gmi", "", ""),
 	    "can decode");
+	TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""),
+	    "can decode");
+	TEST("foo://bar.com/caff%C3%A8+macchiato.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""),
+	    "can decode");
+	TEST("foo://bar.com/foo%2F..%2F..",
+	    FAIL,
+	    empty,
+	    "conversion and checking are done in the correct order");
+	TEST("foo://bar.com/foo%00?baz",
+	    FAIL,
+	    empty,
+	    "rejects %00");
 
+	/* IRI */
+	TEST("foo://bar.com/cafè.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "cafè.gmi", "" , ""),
+	    "decode IRI (with a 2-byte utf8 seq)");
+	TEST("foo://bar.com/世界.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "世界.gmi", "" , ""),
+	    "decode IRI");
+	TEST("foo://bar.com/😼.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "😼.gmi", "" , ""),
+	    "decode IRI (with a 4-byte utf8 seq)");
+	TEST("foo://bar.com/😼/𤭢.gmi",
+	    PASS,
+	    URI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""),
+	    "decode IRI (with two 4-byte utf8 seqs)");
+	TEST("foo://bar.com/世界/\xC0\x80",
+	    FAIL,
+	    empty,
+	    "reject invalid sequence (overlong NUL)");
+
 	return 0;
 }