commit df6ca41da36c3f617cbbf3302ab120721ebfcfd2 from: Omar Polo date: Fri Dec 25 23:33:11 2020 UTC IRI support This extends the URI parser so it supports full IRI (Internationalized Resource Identifiers, RFC3987). Some areas of it can/may be improved, but here's a start. Note: we assume UTF-8 encoded IRI. commit - 043acc97b16be18d85bb1914da50f7ce2aa2623e commit + df6ca41da36c3f617cbbf3302ab120721ebfcfd2 blob - 597391a9c578320b2e748a05ffc019358eac8a39 blob + 1c9b75f2c471e4d3b9e67dba136bfb081e1a8d7e --- README.md +++ README.md @@ -20,11 +20,8 @@ is a very simple and minimal gemini server that can se and execute CGI scripts. **gmid** -will strip any sequence of -*../* -or trailing -*..* -in the requests made by clients and will refuse to follow symlinks. +won't serve files outside the given directory and won't follow +symlinks. Furthermore, on OpenBSD, pledge(2) @@ -35,6 +32,10 @@ are used to ensure that dosen't do anything else than read files from the given directory, accept network connections and, optionally, execute CGI scripts. +**gmid** +fully supports IRIs (Internationalized Resource Identifiers, see +RFC3987). + It should be noted that **gmid** is very simple in its implementation, and so it may not be appropriate blob - 77ef87db7e835803323eb5c04d6794dcfec67099 blob + edf67d5281ed871d651552902d513166ee9d2d2c --- gmid.1 +++ gmid.1 @@ -33,11 +33,8 @@ is a very simple and minimal gemini server that can se and execute CGI scripts. .Pp .Nm -will strip any sequence of -.Pa ../ -or trailing -.Pa .. -in the requests made by clients and will refuse to follow symlinks. +won't serve files outside the given directory and won't follow +symlinks. Furthermore, on .Ox , .Xr pledge 2 @@ -48,6 +45,10 @@ are used to ensure that dosen't do anything else than read files from the given directory, accept network connections and, optionally, execute CGI scripts. .Pp +.Nm +fully supports IRIs (Internationalized Resource Identifiers, see +RFC3987). 
+.Pp
 It should be noted that
 .Nm
 is very simple in its implementation, and so it may not be appropriate
blob - 245928ac2a36f5594f0eda298dc56e4c1c8b7900
blob + 3f81b762a5c8152496864bf438e9f8c16be84c63
--- uri.c
+++ uri.c
@@ -93,6 +93,8 @@ struct parser {
 	const char	*err;
 };
 
+#define CONT_BYTE(b) (((b) & 0xC0) == 0x80)
+
 /* XXX: these macros will expand multiple times their argument */
 
 #define UNRESERVED(p) \
@@ -114,7 +116,49 @@ struct parser {
 	|| p == ','		\
 	|| p == ';'		\
 	|| p == '=')
+
+/*
+ * Validate the multibyte UTF-8 sequence starting at p->uri (RFC 3629):
+ * overlong forms, UTF-16 surrogates, code points above U+10FFFF and
+ * truncated sequences are rejected.  Returns 1 and leaves p->uri on
+ * the LAST byte of the sequence (the caller steps over it), or
+ * returns 0 and leaves p->uri untouched.
+ */
+static int
+valid_multibyte_utf8(struct parser *p)
+{
+	const uint8_t	*s = (const uint8_t *)p->uri;
+	uint32_t	 c, min;	/* min: lowest non-overlong value */
+	int		 i, n;		/* n: number of continuation bytes */
+
+	if ((s[0] & 0xE0) == 0xC0) {
+		n = 1;
+		min = 0x80;
+		c = s[0] & 0x1F;
+	} else if ((s[0] & 0xF0) == 0xE0) {
+		n = 2;
+		min = 0x800;
+		c = s[0] & 0x0F;
+	} else if ((s[0] & 0xF8) == 0xF0) {
+		n = 3;
+		min = 0x10000;
+		c = s[0] & 0x07;
+	} else
+		return 0;
+
+	for (i = 1; i <= n; ++i) {
+		if (!CONT_BYTE(s[i]))	/* NUL fails too: no overread */
+			return 0;
+		c = (c << 6) | (s[i] & 0x3F);
+	}
+
+	if (c < min || c > 0x10FFFF || (c >= 0xD800 && c <= 0xDFFF))
+		return 0;
+	p->uri += n;	/* one short of the full length, see above */
+	return 1;
+}
+
 
 static int
 parse_pct_encoded(struct parser *p)
 {
@@ -308,7 +352,8 @@ parse_query(struct parser *p)
 	    || SUB_DELIMITERS(*p->uri)
 	    || *p->uri == '/'
 	    || *p->uri == '?'
- || parse_pct_encoded(p)) + || parse_pct_encoded(p) + || valid_multibyte_utf8(p)) p->uri++; if (*p->uri != '\0' && *p->uri != '#') { @@ -348,7 +393,8 @@ parse_path(struct parser *p) while (UNRESERVED(*p->uri) || SUB_DELIMITERS(*p->uri) || *p->uri == '/' - || parse_pct_encoded(p)) + || parse_pct_encoded(p) + || valid_multibyte_utf8(p)) p->uri++; if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') { blob - c6521f668c8263cd6b9162966dd9b212dc529948 blob + f322c1e1f6e7a85c1b328cd3825dbb091521f9d5 --- uri_test.c +++ uri_test.c @@ -87,6 +87,12 @@ main(void) { struct uri empty = {"", "", "", PASS, "", "", ""}; + TEST("foo://bar.com/foo%00?baz", + FAIL, + empty, + "rejects %00"); + return 0; + TEST("http://omarpolo.com", PASS, URI("http", "omarpolo.com", "", "", "", ""), @@ -153,6 +159,10 @@ main(void) FAIL, empty, "reject paths that would escape the root"); + TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/", + PASS, + URI("gemini", "omarpolo.com", "", "", "", ""), + "parse path with lots of cleaning available"); /* query */ TEST("foo://example.com/foo/?gne", @@ -179,6 +189,44 @@ main(void) PASS, URI("foo", "bar.com", "", "cafè.gmi", "", ""), "can decode"); + TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi", + PASS, + URI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""), + "can decode"); + TEST("foo://bar.com/caff%C3%A8+macchiato.gmi", + PASS, + URI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""), + "can decode"); + TEST("foo://bar.com/foo%2F..%2F..", + FAIL, + empty, + "conversion and checking are done in the correct order"); + TEST("foo://bar.com/foo%00?baz", + FAIL, + empty, + "rejects %00"); + /* IRI */ + TEST("foo://bar.com/cafè.gmi", + PASS, + URI("foo", "bar.com", "", "cafè.gmi", "" , ""), + "decode IRI (with a 2-byte utf8 seq)"); + TEST("foo://bar.com/世界.gmi", + PASS, + URI("foo", "bar.com", "", "世界.gmi", "" , ""), + "decode IRI"); + TEST("foo://bar.com/😼.gmi", + PASS, + URI("foo", "bar.com", "", "😼.gmi", "" , ""), + "decode IRI 
(with a 3-byte utf8 seq)"); + TEST("foo://bar.com/😼/𤭢.gmi", + PASS, + URI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""), + "decode IRI (with a 3-byte and a 4-byte utf8 seq)"); + TEST("foo://bar.com/世界/\xC0\x80", + FAIL, + empty, + "reject invalid sequence (overlong NUL)"); + return 0; }