commit ef04b55160759b22db67f14c703a4343c4741e8b from: Omar Polo date: Sat Jan 09 20:32:23 2021 UTC switch to Bjoern Hoehrmann UTF-8 decoder It's correct, while my hacked valid_multibyte_utf8 would allow things that aren't technically UTF8. commit - 578ba2d81b2fa5f839314190f42bb9116069db7a commit + ef04b55160759b22db67f14c703a4343c4741e8b blob - 1d355e3ba2d5b1d5fd244ed77062bc9b6952b0c1 blob + 13cfae04d8424a84283849e46e75d981bb271bf1 --- Makefile +++ Makefile @@ -6,11 +6,11 @@ LDFLAGS = -ltls all: gmid TAGS README.md -gmid: gmid.o uri.o - ${CC} gmid.o uri.o -o gmid ${LDFLAGS} +gmid: gmid.o uri.o utf8.o + ${CC} gmid.o uri.o utf8.o -o gmid ${LDFLAGS} -TAGS: gmid.c uri.c - -etags gmid.c uri.c || true +TAGS: gmid.c uri.c utf8.c + -etags gmid.c uri.c utf8.c || true README.md: gmid.1 mandoc -Tmarkdown gmid.1 | sed -e '1d' -e '$$d' > README.md @@ -18,8 +18,8 @@ README.md: gmid.1 clean: rm -f *.o gmid -uri_test: uri_test.o uri.o - ${CC} uri_test.o uri.o -o uri_test ${LDFLAGS} +uri_test: uri_test.o uri.o utf8.o + ${CC} uri_test.o uri.o utf8.o -o uri_test ${LDFLAGS} test: uri_test ./uri_test blob - 16a1b3919227497d33a592eabc9e009fafc75a32 blob + 3e9e46128e3aec385c67204bdd22394472211019 --- README.md +++ README.md @@ -212,6 +212,12 @@ and not *docs/cgi-bin*, since it's relative to the document root. +# ACKNOWLEDGEMENTS + +**gmid** +uses the "Flexible and Economical" UTF-8 decoder written by +Bjoern Hoehrmann. + # CAVEATS * it doesn't support virtual hosts: the host part of the request URL is blob - 5c9aeb6613600df71d049f8f1cd51af53d7c976f blob + 12fa78247ffdb69d151e04621c442b41315fb271 --- gmid.1 +++ gmid.1 @@ -184,6 +184,10 @@ option is and not .Pa docs/cgi-bin , since it's relative to the document root. +.Sh ACKNOWLEDGEMENTS +.Nm +uses the "Flexible and Economical" UTF-8 decoder written by +.An Bjoern Hoehrmann . 
.Sh CAVEATS .Bl -bullet .It blob - c8f451487ccf97c59f4e955bb1c1b35b0bd92d86 blob + 3ee5fe4dd4a929a973d8217610e0011d3eb02086 --- gmid.h +++ gmid.h @@ -117,6 +117,12 @@ struct uri { char *fragment; }; +struct parser { + char *uri; + struct uri *parsed; + const char *err; +}; + enum { FILE_EXISTS, FILE_EXECUTABLE, @@ -151,6 +157,9 @@ void loop(struct tls*, int); void usage(const char*); +/* utf8.c */ +int valid_multibyte_utf8(struct parser*); + /* uri.c */ int parse_uri(char*, struct uri*, const char**); int trim_req_uri(char*); blob - 2bf56eb67d1a37f8e17f79f4051139d0f25532f7 blob + 5af32b78c6b1a896154993dab383ded90d3f6ee4 --- uri.c +++ uri.c @@ -87,14 +87,6 @@ * */ -struct parser { - char *uri; - struct uri *parsed; - const char *err; -}; - -#define CONT_BYTE(b) ((b & 0xC0) == 0x80) - /* XXX: these macros will expand multiple times their argument */ #define UNRESERVED(p) \ @@ -116,49 +108,7 @@ struct parser { || p == ',' \ || p == ';' \ || p == '=') - -/* NOTE: the increments are one less what they should be, because the - * caller will add one byte after we return. 
*/ -static int -valid_multibyte_utf8(struct parser *p) -{ - uint32_t c; - uint8_t s; - - c = 0; - s = *p->uri; - - if ((s & 0xE0) == 0xC0) { - if (!CONT_BYTE(*(p->uri+1))) - return 0; - c = ((s & 0x1F) << 6) | (*(p->uri+1) & 0x3F); - p->uri += 1; - } else if ((s & 0xF0) == 0xE0) { - if (!CONT_BYTE(*(p->uri+1)) || - !CONT_BYTE(*(p->uri+2))) - return 0; - c = (s & 0x0F) << 12 - | ((*(p->uri+1) & 0x3F) << 6) - | ((*(p->uri+2) & 0x3F)); - p->uri += 2; - } else if ((s & 0xF8) == 0xF0) { - if (!CONT_BYTE(*(p->uri+1)) || - !CONT_BYTE(*(p->uri+2)) || - !CONT_BYTE(*(p->uri+3))) - return 0; - c = (s & 0x07) << 18 - | ((*(p->uri+1) & 0x3F) << 12) - | ((*(p->uri+2) & 0x3F) << 6) - | ((*(p->uri+3) & 0x3F)); - p->uri += 3; - } else - return 0; - return (((0x080 <= c) && (c <= 0x7FF)) - || (((0x800 <= c) && (c <= 0xFFFF))) - || (((0x10000 <= c) && (c <= 0x10FFFF)))); -} - static int parse_pct_encoded(struct parser *p) { blob - /dev/null blob + 81a75845afc65a0a25611da39d428a407f077662 (mode 644) --- /dev/null +++ utf8.c @@ -0,0 +1,79 @@ +/* Copyright (c) 2008-2009 Bjoern Hoehrmann + * + * Permission is hereby granted, free of charge, to any person + * obtaining a copy of this software and associated documentation + * files (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, + * modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <stdint.h> +#include <stdio.h> + +#include "gmid.h" + +#define UTF8_ACCEPT 0 +#define UTF8_REJECT 1 + +static const uint8_t utf8d[] = { + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf + 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df + 0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef + 0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff + 0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0 + 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2 + 1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4 + 1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6 + 1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8 +}; + +static uint32_t inline +utf8_decode(uint32_t* state, uint32_t* codep, uint8_t byte) { + uint32_t type = utf8d[byte]; + + *codep = (*state != UTF8_ACCEPT) ? + (byte & 0x3fu) | (*codep << 6) : + (0xff >> type) & (byte); + + *state = utf8d[256 + *state*16 + type]; + return *state; +} + +/* for the iri parser. 
Modelled after printCodePoints */ +int +valid_multibyte_utf8(struct parser *p) +{ + uint32_t cp = 0, state = 0; + + for (; *p->uri; p->uri++) + if (!utf8_decode(&state, &cp, *p->uri)) + break; + + /* reject the ASCII range */ + if (state || cp <= 0x7F) { + /* XXX: do some error recovery? */ + if (state) + p->err = "invalid UTF-8 character"; + return 0; + } + return 1; +}