commit 3c1cf9d07cb679ba444566159538b510902f2de9 from: Omar Polo date: Mon Jan 11 13:08:00 2021 UTC s/uri/iri since we accept IRIs commit - 28778244d67be7024868a5095e5eedda22a3ed98 commit + 3c1cf9d07cb679ba444566159538b510902f2de9 blob - a7794ec77895de2aa815675b27a8a09f22d1f956 blob + 5660e44f37b2ae2d269500e367612556ee6b2e80 --- .gitignore +++ .gitignore @@ -2,6 +2,6 @@ cert.pem key.pem TAGS gmid -uri_test +iri_test *.o docs blob - 76f05bd5b384c8a23073a5d6f29ba5a0312c16a1 blob + 3e0b72e31bb3c1c16b34e6062dbc4b93db793e29 --- Makefile +++ Makefile @@ -6,17 +6,17 @@ LDFLAGS = -ltls all: gmid TAGS README.md -gmid: gmid.o uri.o utf8.o - ${CC} gmid.o uri.o utf8.o -o gmid ${LDFLAGS} +gmid: gmid.o iri.o utf8.o + ${CC} gmid.o iri.o utf8.o -o gmid ${LDFLAGS} -TAGS: gmid.c uri.c utf8.c - -etags gmid.c uri.c utf8.c || true +TAGS: gmid.c iri.c utf8.c + -etags gmid.c iri.c utf8.c || true clean: - rm -f *.o gmid + rm -f *.o gmid iri_test -uri_test: uri_test.o uri.o utf8.o - ${CC} uri_test.o uri.o utf8.o -o uri_test ${LDFLAGS} +iri_test: iri_test.o iri.o utf8.o + ${CC} iri_test.o iri.o utf8.o -o iri_test ${LDFLAGS} -test: uri_test - ./uri_test +test: iri_test + ./iri_test blob - ef12066eac747e32dae1c6136b4167366076242a blob + 0c1bed23a05e30fef4ebcf534d1a0cc2e4bf455d --- gmid.c +++ gmid.c @@ -572,7 +572,7 @@ handle(struct pollfd *fds, struct client *client) { char buf[GEMINI_URL_LEN]; const char *parse_err; - struct uri uri; + struct iri iri; switch (client->state) { case S_OPEN: @@ -593,7 +593,7 @@ handle(struct pollfd *fds, struct client *client) } parse_err = "invalid request"; - if (!trim_req_uri(buf) || !parse_uri(buf, &uri, &parse_err)) { + if (!trim_req_iri(buf) || !parse_iri(buf, &iri, &parse_err)) { if (!start_reply(fds, client, BAD_REQUEST, parse_err)) return; goodbye(fds, client); @@ -601,11 +601,11 @@ handle(struct pollfd *fds, struct client *client) } LOGI(client, "GET %s%s%s", - *uri.path ? uri.path : "/", - *uri.query ? "?" : "", - *uri.query ? uri.query : ""); + *iri.path ? iri.path : "/", + *iri.query ? "?" : "", + *iri.query ? iri.query : ""); - send_file(uri.path, uri.query, fds, client); + send_file(iri.path, iri.query, fds, client); break; case S_INITIALIZING: blob - 64effdeb7170cd4219199d787810443f618e1c52 blob + ecca57ffd095eb684a593403e0f0f25964f70438 --- gmid.h +++ gmid.h @@ -70,7 +70,7 @@ struct client { struct sockaddr_storage addr; }; -struct uri { +struct iri { char *schema; char *host; char *port; @@ -81,8 +81,8 @@ struct uri { }; struct parser { - char *uri; - struct uri *parsed; + char *iri; + struct iri *parsed; const char *err; }; @@ -123,8 +123,8 @@ void usage(const char*); /* utf8.c */ int valid_multibyte_utf8(struct parser*); -/* uri.c */ -int parse_uri(char*, struct uri*, const char**); -int trim_req_uri(char*); +/* iri.c */ +int parse_iri(char*, struct iri*, const char**); +int trim_req_iri(char*); #endif blob - 1258abbf91161ba655231f964d5e501b4c7c4e67 (mode 644) blob + /dev/null --- uri.c +++ /dev/null @@ -1,426 +0,0 @@ -/* - * Copyright (c) 2020 Omar Polo - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include - -#include "gmid.h" - -/* - * Notes from RFC3986 - * - * => gemini://tanso.net/rfc/rfc3986.txt - * - * - * ABNF - * ==== - * - * pct-encoded "%" HEXDIG HEXDIG - * - * reserved = gen-delims / sub-delimis - * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@" - * sub-delims = "!" / "$" / "&" / "'" / "(" / ")" - * / "*" / "+" / "," / ";" / "=" - * - * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~" - * - * URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ] - * - * hier-part = "//" authority path-abempty - * / path-absolute - * / path-rootless - * / path-empty - * - * scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) - * - * authority = [ userinfo "@" ] host [ ":" port ] - * - * (note that userinfo isn't used for Gemini URL) - * - * host = IP-literal / IPv4address / reg-name - * reg-name = *( unreserved / pct-encoded / sub-delims ) - * - * port = *DIGIT - * - * path = path-abemty ; begins with "/" or is empty - * / path-absolute ; begins with "/" but not "//" - * / path-noscheme ; begins with a non-colon segment - * / path-rootless ; begins with a segment - * / path-empty ; zero characters - * - * path-abemty = *( "/" segment ) - * path-absolute = "/" [ segment-nz *( "/" segment ) ] - * path-noscheme = ; not used - * path-rootless = ; not used - * path-empty = ; not used - * - * segment = *pchar - * segment-nz = 1*pchar - * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" ) - * pchar = unreserved / pct-encoded / sub-delims / ":" / "@" - * - * query = *( pchar / "/" / "?" ) - * - * fragment = *( pchar / "/" / "?" ) - * - * - * EXAMPLE - * ======= - * - * foo://example.com:8042/over/there?name=ferret#nose - * \_/ \______________/\_________/ \_________/ \__/ - * | | | | | - * scheme authority path query fragment - * - */ - -static inline int -unreserved(int p) -{ - return isalnum(p) - || p == '-' - || p == '.' - || p == '_' - || p == '~'; -} - -static inline int -sub_delimiters(int p) -{ - return p == '!' - || p == '$' - || p == '&' - || p == '\'' - || p == '(' - || p == ')' - || p == '*' - || p == '+' - || p == ',' - || p == ';' - || p == '='; -} - -static int -parse_pct_encoded(struct parser *p) -{ - if (*p->uri != '%') - return 0; - - if (!isxdigit(*(p->uri+1)) || !isxdigit(*(p->uri+2))) { - p->err = "illegal percent-encoding"; - return 0; - } - - sscanf(p->uri+1, "%2hhx", p->uri); - memmove(p->uri+1, p->uri+3, strlen(p->uri+3)+1); - if (*p->uri == '\0') { - p->err = "illegal percent-encoding"; - return 0; - } - - return 1; -} - -/* ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) "://" */ -static int -parse_scheme(struct parser *p) -{ - p->parsed->schema = p->uri; - - if (!isalpha(*p->uri)) { - p->err = "illegal character in scheme"; - return 0; - } - - p->uri++; - while (isalnum(*p->uri) - || *p->uri == '+' - || *p->uri == '-' - || *p->uri == '.') - p->uri++; - - if (*p->uri != ':') { - p->err = "illegal character in scheme"; - return 0; - } - - *p->uri = '\0'; - if (*(++p->uri) != '/' || *(++p->uri) != '/') { - p->err = "invalid marker after scheme"; - return 0; - } - - p->uri++; - return 1; -} - -/* *DIGIT */ -static int -parse_port(struct parser *p) -{ - uint32_t i = 0; - - p->parsed->port = p->uri; - - for (; isdigit(*p->uri); p->uri++) { - i = i * 10 + *p->uri - '0'; - if (i > UINT16_MAX) { - p->err = "port number too large"; - return 0; - } - } - - if (*p->uri != '/' && *p->uri != '\0') { - p->err = "illegal character in port number"; - return 0; - } - - p->parsed->port_no = i; - - if (*p->uri != '\0') { - *p->uri = '\0'; - p->uri++; - } - - return 1; -} - -/* TODO: add support for ip-literal and ipv4addr ? */ -/* *( unreserved / sub-delims / pct-encoded ) */ -static int -parse_authority(struct parser *p) -{ - p->parsed->host = p->uri; - - while (unreserved(*p->uri) - || sub_delimiters(*p->uri) - || parse_pct_encoded(p)) - p->uri++; - - if (p->err != NULL) - return 0; - - if (*p->uri == ':') { - *p->uri = '\0'; - p->uri++; - return parse_port(p); - } - - if (*p->uri == '/') { - *p->uri = '\0'; - p->uri++; - return 1; - } - - if (*p->uri == '\0') - return 1; - - p->err = "illegal character in authority section"; - return 0; -} - -/* Routine for path_clean. Elide the pointed .. with the preceding - * element. Return 0 if it's not possible. incr is the length of - * the increment, 3 for ../ and 2 for .. */ -static int -path_elide_dotdot(char *path, char *i, int incr) -{ - char *j; - - if (i == path) - return 0; - for (j = i-2; j != path && *j != '/'; j--) - /* noop */ ; - if (*j == '/') - j++; - i += incr; - memmove(j, i, strlen(i)+1); - return 1; -} - -/* - * Use an algorithm similar to the one implemented in go' path.Clean: - * - * 1. Replace multiple slashes with a single slash - * 2. Eliminate each . path name element - * 3. Eliminate each inner .. along with the non-.. element that precedes it - * 4. Eliminate trailing .. if possible or error (go would only discard) - * - * Unlike path.Clean, this function return the empty string if the - * original path is equivalent to "/". - */ -static int -path_clean(char *path) -{ - char *i; - - /* 1. replace multiple slashes with a single one */ - for (i = path; *i; ++i) { - if (*i == '/' && *(i+1) == '/') { - memmove(i, i+1, strlen(i)); /* move also the \0 */ - i--; - } - } - - /* 2. eliminate each . path name element */ - for (i = path; *i; ++i) { - if ((i == path || *i == '/') && *(i+1) == '.' && - *(i+2) == '/') { - /* move also the \0 */ - memmove(i, i+2, strlen(i)-1); - i--; - } - } - if (!strcmp(path, ".") || !strcmp(path, "/.")) { - *path = '\0'; - return 1; - } - - /* 3. eliminate each inner .. along with the preceding non-.. */ - for (i = strstr(path, "../"); i != NULL; i = strstr(path, "..")) - if (!path_elide_dotdot(path, i, 3)) - return 0; - - /* 4. eliminate trailing ..*/ - if ((i = strstr(path, "..")) != NULL) - if (!path_elide_dotdot(path, i, 2)) - return 0; - - return 1; -} - -static int -parse_query(struct parser *p) -{ - p->parsed->query = p->uri; - if (*p->uri == '\0') - return 1; - - while (unreserved(*p->uri) - || sub_delimiters(*p->uri) - || *p->uri == '/' - || *p->uri == '?' - || parse_pct_encoded(p) - || valid_multibyte_utf8(p)) - p->uri++; - - if (p->err != NULL) - return 0; - - if (*p->uri != '\0' && *p->uri != '#') { - p->err = "illegal character in query"; - return 0; - } - - if (*p->uri != '\0') { - *p->uri = '\0'; - p->uri++; - } - - return 1; -} - -/* don't even bother */ -static int -parse_fragment(struct parser *p) -{ - p->parsed->fragment = p->uri; - return 1; -} - -/* XXX: is it too broad? */ -/* *(pchar / "/") */ -static int -parse_path(struct parser *p) -{ - char c; - - p->parsed->path = p->uri; - if (*p->uri == '\0') { - p->parsed->query = p->parsed->fragment = p->uri; - return 1; - } - - while (unreserved(*p->uri) - || sub_delimiters(*p->uri) - || *p->uri == '/' - || parse_pct_encoded(p) - || valid_multibyte_utf8(p)) - p->uri++; - - if (p->err != NULL) - return 0; - - if (*p->uri != '\0' && *p->uri != '?' && *p->uri != '#') { - p->err = "illegal character in path"; - return 0; - } - - if (*p->uri != '\0') { - c = *p->uri; - *p->uri = '\0'; - p->uri++; - - if (c == '#') { - if (!parse_fragment(p)) - return 0; - } else - if (!parse_query(p) || !parse_fragment(p)) - return 0; - } - - if (!path_clean(p->parsed->path)) { - p->err = "illegal path"; - return 0; - } - - return 1; -} - -int -parse_uri(char *uri, struct uri *ret, const char **err_ret) -{ - char *end; - struct parser p = {uri, ret, NULL}; - - bzero(ret, sizeof(*ret)); - - /* initialize optional stuff to the empty string */ - end = uri + strlen(uri); - p.parsed->port = end; - p.parsed->path = end; - p.parsed->query = end; - p.parsed->fragment = end; - - if (!parse_scheme(&p) || !parse_authority(&p) || !parse_path(&p)) { - *err_ret = p.err; - return 0; - } - - *err_ret = NULL; - return 1; -} - -int -trim_req_uri(char *uri) -{ - char *i; - - if ((i = strstr(uri, "\r\n")) == NULL) - return 0; - *i = '\0'; - return 1; -} blob - /dev/null blob + 8c020392806387e3f84ca44d08d68c89abba42e9 (mode 644) --- /dev/null +++ iri.c @@ -0,0 +1,358 @@ +/* + * Copyright (c) 2020 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include + +#include "gmid.h" + +static inline int +unreserved(int p) +{ + return isalnum(p) + || p == '-' + || p == '.' + || p == '_' + || p == '~'; +} + +static inline int +sub_delimiters(int p) +{ + return p == '!' + || p == '$' + || p == '&' + || p == '\'' + || p == '(' + || p == ')' + || p == '*' + || p == '+' + || p == ',' + || p == ';' + || p == '='; +} + +static int +parse_pct_encoded(struct parser *p) +{ + if (*p->iri != '%') + return 0; + + if (!isxdigit(*(p->iri+1)) || !isxdigit(*(p->iri+2))) { + p->err = "illegal percent-encoding"; + return 0; + } + + sscanf(p->iri+1, "%2hhx", p->iri); + memmove(p->iri+1, p->iri+3, strlen(p->iri+3)+1); + if (*p->iri == '\0') { + p->err = "illegal percent-encoding"; + return 0; + } + + return 1; +} + +/* ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) "://" */ +static int +parse_scheme(struct parser *p) +{ + p->parsed->schema = p->iri; + + if (!isalpha(*p->iri)) { + p->err = "illegal character in scheme"; + return 0; + } + + p->iri++; + while (isalnum(*p->iri) + || *p->iri == '+' + || *p->iri == '-' + || *p->iri == '.') + p->iri++; + + if (*p->iri != ':') { + p->err = "illegal character in scheme"; + return 0; + } + + *p->iri = '\0'; + if (*(++p->iri) != '/' || *(++p->iri) != '/') { + p->err = "invalid marker after scheme"; + return 0; + } + + p->iri++; + return 1; +} + +/* *DIGIT */ +static int +parse_port(struct parser *p) +{ + uint32_t i = 0; + + p->parsed->port = p->iri; + + for (; isdigit(*p->iri); p->iri++) { + i = i * 10 + *p->iri - '0'; + if (i > UINT16_MAX) { + p->err = "port number too large"; + return 0; + } + } + + if (*p->iri != '/' && *p->iri != '\0') { + p->err = "illegal character in port number"; + return 0; + } + + p->parsed->port_no = i; + + if (*p->iri != '\0') { + *p->iri = '\0'; + p->iri++; + } + + return 1; +} + +/* TODO: add support for ip-literal and ipv4addr ? */ +/* *( unreserved / sub-delims / pct-encoded ) */ +static int +parse_authority(struct parser *p) +{ + p->parsed->host = p->iri; + + while (unreserved(*p->iri) + || sub_delimiters(*p->iri) + || parse_pct_encoded(p)) + p->iri++; + + if (p->err != NULL) + return 0; + + if (*p->iri == ':') { + *p->iri = '\0'; + p->iri++; + return parse_port(p); + } + + if (*p->iri == '/') { + *p->iri = '\0'; + p->iri++; + return 1; + } + + if (*p->iri == '\0') + return 1; + + p->err = "illegal character in authority section"; + return 0; +} + +/* Routine for path_clean. Elide the pointed .. with the preceding + * element. Return 0 if it's not possible. incr is the length of + * the increment, 3 for ../ and 2 for .. */ +static int +path_elide_dotdot(char *path, char *i, int incr) +{ + char *j; + + if (i == path) + return 0; + for (j = i-2; j != path && *j != '/'; j--) + /* noop */ ; + if (*j == '/') + j++; + i += incr; + memmove(j, i, strlen(i)+1); + return 1; +} + +/* + * Use an algorithm similar to the one implemented in go' path.Clean: + * + * 1. Replace multiple slashes with a single slash + * 2. Eliminate each . path name element + * 3. Eliminate each inner .. along with the non-.. element that precedes it + * 4. Eliminate trailing .. if possible or error (go would only discard) + * + * Unlike path.Clean, this function return the empty string if the + * original path is equivalent to "/". + */ +static int +path_clean(char *path) +{ + char *i; + + /* 1. replace multiple slashes with a single one */ + for (i = path; *i; ++i) { + if (*i == '/' && *(i+1) == '/') { + memmove(i, i+1, strlen(i)); /* move also the \0 */ + i--; + } + } + + /* 2. eliminate each . path name element */ + for (i = path; *i; ++i) { + if ((i == path || *i == '/') && *(i+1) == '.' && + *(i+2) == '/') { + /* move also the \0 */ + memmove(i, i+2, strlen(i)-1); + i--; + } + } + if (!strcmp(path, ".") || !strcmp(path, "/.")) { + *path = '\0'; + return 1; + } + + /* 3. eliminate each inner .. along with the preceding non-.. */ + for (i = strstr(path, "../"); i != NULL; i = strstr(path, "..")) + if (!path_elide_dotdot(path, i, 3)) + return 0; + + /* 4. eliminate trailing ..*/ + if ((i = strstr(path, "..")) != NULL) + if (!path_elide_dotdot(path, i, 2)) + return 0; + + return 1; +} + +static int +parse_query(struct parser *p) +{ + p->parsed->query = p->iri; + if (*p->iri == '\0') + return 1; + + while (unreserved(*p->iri) + || sub_delimiters(*p->iri) + || *p->iri == '/' + || *p->iri == '?' + || parse_pct_encoded(p) + || valid_multibyte_utf8(p)) + p->iri++; + + if (p->err != NULL) + return 0; + + if (*p->iri != '\0' && *p->iri != '#') { + p->err = "illegal character in query"; + return 0; + } + + if (*p->iri != '\0') { + *p->iri = '\0'; + p->iri++; + } + + return 1; +} + +/* don't even bother */ +static int +parse_fragment(struct parser *p) +{ + p->parsed->fragment = p->iri; + return 1; +} + +/* XXX: is it too broad? */ +/* *(pchar / "/") */ +static int +parse_path(struct parser *p) +{ + char c; + + p->parsed->path = p->iri; + if (*p->iri == '\0') { + p->parsed->query = p->parsed->fragment = p->iri; + return 1; + } + + while (unreserved(*p->iri) + || sub_delimiters(*p->iri) + || *p->iri == '/' + || parse_pct_encoded(p) + || valid_multibyte_utf8(p)) + p->iri++; + + if (p->err != NULL) + return 0; + + if (*p->iri != '\0' && *p->iri != '?' && *p->iri != '#') { + p->err = "illegal character in path"; + return 0; + } + + if (*p->iri != '\0') { + c = *p->iri; + *p->iri = '\0'; + p->iri++; + + if (c == '#') { + if (!parse_fragment(p)) + return 0; + } else + if (!parse_query(p) || !parse_fragment(p)) + return 0; + } + + if (!path_clean(p->parsed->path)) { + p->err = "illegal path"; + return 0; + } + + return 1; +} + +int +parse_iri(char *iri, struct iri *ret, const char **err_ret) +{ + char *end; + struct parser p = {iri, ret, NULL}; + + bzero(ret, sizeof(*ret)); + + /* initialize optional stuff to the empty string */ + end = iri + strlen(iri); + p.parsed->port = end; + p.parsed->path = end; + p.parsed->query = end; + p.parsed->fragment = end; + + if (!parse_scheme(&p) || !parse_authority(&p) || !parse_path(&p)) { + *err_ret = p.err; + return 0; + } + + *err_ret = NULL; + return 1; +} + +int +trim_req_iri(char *iri) +{ + char *i; + + if ((i = strstr(iri, "\r\n")) == NULL) + return 0; + *i = '\0'; + return 1; +} blob - 3e4d823343de421e402c437962283292e8742b29 (mode 644) blob + /dev/null --- uri_test.c +++ /dev/null @@ -1,226 +0,0 @@ -/* - * Copyright (c) 2020 Omar Polo - * - * Permission to use, copy, modify, and distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR - * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN - * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF - * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - */ - -#include -#include -#include - -#include "gmid.h" - -#define TEST(uri, fail, exp, descr) \ - if (!run_test(uri, fail, exp)) { \ - fprintf(stderr, "%s:%d: error: %s\n", \ - __FILE__, __LINE__, descr); \ - exit(1); \ - } - -#define URI(schema, host, port, path, query, frag) \ - ((struct uri){schema, host, port, 0, path, query, frag}) - -#define DIFF(wanted, got, field) \ - if (wanted->field == NULL || got->field == NULL || \ - strcmp(wanted->field, got->field)) { \ - fprintf(stderr, #field ":\n\tgot: %s\n\twanted: %s\n", \ - got->field, wanted->field); \ - return 0; \ - } - -#define PASS 0 -#define FAIL 1 - -int -diff_uri(struct uri *p, struct uri *exp) -{ - DIFF(p, exp, schema); - DIFF(p, exp, host); - DIFF(p, exp, port); - DIFF(p, exp, path); - DIFF(p, exp, query); - DIFF(p, exp, fragment); - return 1; -} - -int -run_test(const char *uri, int should_fail, struct uri expected) -{ - int failed, ok = 1; - char *uri_copy; - struct uri parsed; - const char *error; - - if ((uri_copy = strdup(uri)) == NULL) - err(1, "strdup"); - - fprintf(stderr, "=> %s\n", uri); - failed = !parse_uri(uri_copy, &parsed, &error); - - if (failed && should_fail) - goto done; - - if (error != NULL) - fprintf(stderr, "> %s\n", error); - - ok = !failed && !should_fail; - if (ok) - ok = diff_uri(&expected, &parsed); - -done: - free(uri_copy); - return ok; -} - -int -main(void) -{ - struct uri empty = {"", "", "", PASS, "", "", ""}; - - TEST("http://omarpolo.com", - PASS, - URI("http", "omarpolo.com", "", "", "", ""), - "can parse uri with empty path"); - - /* schema */ - TEST("omarpolo.com", FAIL, empty, "FAIL when the schema is missing"); - TEST("gemini:/omarpolo.com", FAIL, empty, "FAIL with invalid marker"); - TEST("gemini//omarpolo.com", FAIL, empty, "FAIL with invalid marker"); - TEST("h!!p://omarpolo.com", FAIL, empty, "FAIL with invalid schema"); - - /* authority */ - TEST("gemini://omarpolo.com", - PASS, - URI("gemini", "omarpolo.com", "", "", "", ""), - "can parse authority with empty path"); - TEST("gemini://omarpolo.com/", - PASS, - URI("gemini", "omarpolo.com", "", "", "", ""), - "can parse authority with empty path (alt)") - TEST("gemini://omarpolo.com:1965", - PASS, - URI("gemini", "omarpolo.com", "1965", "", "", ""), - "can parse with port and empty path"); - TEST("gemini://omarpolo.com:1965/", - PASS, - URI("gemini", "omarpolo.com", "1965", "", "", ""), - "can parse with port and empty path") - TEST("gemini://omarpolo.com:196s", - FAIL, - empty, - "FAIL with invalid port number"); - - /* path */ - TEST("gemini://omarpolo.com/foo/bar/baz", - PASS, - URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), - "parse simple paths"); - TEST("gemini://omarpolo.com/foo//bar///baz", - PASS, - URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), - "parse paths with multiple slashes"); - TEST("gemini://omarpolo.com/foo/./bar/./././baz", - PASS, - URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), - "parse paths with . elements"); - TEST("gemini://omarpolo.com/foo/bar/../bar/baz", - PASS, - URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), - "parse paths with .. elements"); - TEST("gemini://omarpolo.com/foo/../foo/bar/../bar/baz/../baz", - PASS, - URI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), - "parse paths with multiple .. elements"); - TEST("gemini://omarpolo.com/foo/..", - PASS, - URI("gemini", "omarpolo.com", "", "", "", ""), - "parse paths with a trailing .."); - TEST("gemini://omarpolo.com/foo/../", - PASS, - URI("gemini", "omarpolo.com", "", "", "", ""), - "parse paths with a trailing .."); - TEST("gemini://omarpolo.com/foo/../..", - FAIL, - empty, - "reject paths that would escape the root"); - TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/", - PASS, - URI("gemini", "omarpolo.com", "", "", "", ""), - "parse path with lots of cleaning available"); - - /* query */ - TEST("foo://example.com/foo/?gne", - PASS, - URI("foo", "example.com", "", "foo/", "gne", ""), - "parse query strings"); - TEST("foo://example.com/foo/?gne&foo", - PASS, - URI("foo", "example.com", "", "foo/", "gne&foo", ""), - "parse query strings"); - TEST("foo://example.com/foo/?gne%2F", - PASS, - URI("foo", "example.com", "", "foo/", "gne/", ""), - "parse query strings"); - - /* fragment */ - TEST("foo://bar.co/#foo", - PASS, - URI("foo", "bar.co", "", "", "", "foo"), - "can recognize fragments"); - - /* percent encoding */ - TEST("foo://bar.com/caf%C3%A8.gmi", - PASS, - URI("foo", "bar.com", "", "cafè.gmi", "", ""), - "can decode"); - TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi", - PASS, - URI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""), - "can decode"); - TEST("foo://bar.com/caff%C3%A8+macchiato.gmi", - PASS, - URI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""), - "can decode"); - TEST("foo://bar.com/foo%2F..%2F..", - FAIL, - empty, - "conversion and checking are done in the correct order"); - TEST("foo://bar.com/foo%00?baz", - FAIL, - empty, - "rejects %00"); - - /* IRI */ - TEST("foo://bar.com/cafè.gmi", - PASS, - URI("foo", "bar.com", "", "cafè.gmi", "" , ""), - "decode IRI (with a 2-byte utf8 seq)"); - TEST("foo://bar.com/世界.gmi", - PASS, - URI("foo", "bar.com", "", "世界.gmi", "" , ""), - "decode IRI"); - TEST("foo://bar.com/😼.gmi", - PASS, - URI("foo", "bar.com", "", "😼.gmi", "" , ""), - "decode IRI (with a 3-byte utf8 seq)"); - TEST("foo://bar.com/😼/𤭢.gmi", - PASS, - URI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""), - "decode IRI (with a 3-byte and a 4-byte utf8 seq)"); - TEST("foo://bar.com/世界/\xC0\x80", - FAIL, - empty, - "reject invalid sequence (overlong NUL)"); - - return 0; -} blob - /dev/null blob + 6200cb7422181a3280a9e6b9e1ab5cd96ebd0f93 (mode 644) --- /dev/null +++ iri_test.c @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2020 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include + +#include "gmid.h" + +#define TEST(iri, fail, exp, descr) \ + if (!run_test(iri, fail, exp)) { \ + fprintf(stderr, "%s:%d: error: %s\n", \ + __FILE__, __LINE__, descr); \ + exit(1); \ + } + +#define IRI(schema, host, port, path, query, frag) \ + ((struct iri){schema, host, port, 0, path, query, frag}) + +#define DIFF(wanted, got, field) \ + if (wanted->field == NULL || got->field == NULL || \ + strcmp(wanted->field, got->field)) { \ + fprintf(stderr, #field ":\n\tgot: %s\n\twanted: %s\n", \ + got->field, wanted->field); \ + return 0; \ + } + +#define PASS 0 +#define FAIL 1 + +int +diff_iri(struct iri *p, struct iri *exp) +{ + DIFF(p, exp, schema); + DIFF(p, exp, host); + DIFF(p, exp, port); + DIFF(p, exp, path); + DIFF(p, exp, query); + DIFF(p, exp, fragment); + return 1; +} + +int +run_test(const char *iri, int should_fail, struct iri expected) +{ + int failed, ok = 1; + char *iri_copy; + struct iri parsed; + const char *error; + + if ((iri_copy = strdup(iri)) == NULL) + err(1, "strdup"); + + fprintf(stderr, "=> %s\n", iri); + failed = !parse_iri(iri_copy, &parsed, &error); + + if (failed && should_fail) + goto done; + + if (error != NULL) + fprintf(stderr, "> %s\n", error); + + ok = !failed && !should_fail; + if (ok) + ok = diff_iri(&expected, &parsed); + +done: + free(iri_copy); + return ok; +} + +int +main(void) +{ + struct iri empty = {"", "", "", PASS, "", "", ""}; + + TEST("http://omarpolo.com", + PASS, + IRI("http", "omarpolo.com", "", "", "", ""), + "can parse iri with empty path"); + + /* schema */ + TEST("omarpolo.com", FAIL, empty, "FAIL when the schema is missing"); + TEST("gemini:/omarpolo.com", FAIL, empty, "FAIL with invalid marker"); + TEST("gemini//omarpolo.com", FAIL, empty, "FAIL with invalid marker"); + TEST("h!!p://omarpolo.com", FAIL, empty, "FAIL with invalid schema"); + + /* authority */ + TEST("gemini://omarpolo.com", + PASS, + IRI("gemini", "omarpolo.com", "", "", "", ""), + "can parse authority with empty path"); + TEST("gemini://omarpolo.com/", + PASS, + IRI("gemini", "omarpolo.com", "", "", "", ""), + "can parse authority with empty path (alt)") + TEST("gemini://omarpolo.com:1965", + PASS, + IRI("gemini", "omarpolo.com", "1965", "", "", ""), + "can parse with port and empty path"); + TEST("gemini://omarpolo.com:1965/", + PASS, + IRI("gemini", "omarpolo.com", "1965", "", "", ""), + "can parse with port and empty path") + TEST("gemini://omarpolo.com:196s", + FAIL, + empty, + "FAIL with invalid port number"); + + /* path */ + TEST("gemini://omarpolo.com/foo/bar/baz", + PASS, + IRI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), + "parse simple paths"); + TEST("gemini://omarpolo.com/foo//bar///baz", + PASS, + IRI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), + "parse paths with multiple slashes"); + TEST("gemini://omarpolo.com/foo/./bar/./././baz", + PASS, + IRI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), + "parse paths with . elements"); + TEST("gemini://omarpolo.com/foo/bar/../bar/baz", + PASS, + IRI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), + "parse paths with .. elements"); + TEST("gemini://omarpolo.com/foo/../foo/bar/../bar/baz/../baz", + PASS, + IRI("gemini", "omarpolo.com", "", "foo/bar/baz", "", ""), + "parse paths with multiple .. elements"); + TEST("gemini://omarpolo.com/foo/..", + PASS, + IRI("gemini", "omarpolo.com", "", "", "", ""), + "parse paths with a trailing .."); + TEST("gemini://omarpolo.com/foo/../", + PASS, + IRI("gemini", "omarpolo.com", "", "", "", ""), + "parse paths with a trailing .."); + TEST("gemini://omarpolo.com/foo/../..", + FAIL, + empty, + "reject paths that would escape the root"); + TEST("gemini://omarpolo.com/foo/../foo/../././/bar/baz/.././.././/", + PASS, + IRI("gemini", "omarpolo.com", "", "", "", ""), + "parse path with lots of cleaning available"); + + /* query */ + TEST("foo://example.com/foo/?gne", + PASS, + IRI("foo", "example.com", "", "foo/", "gne", ""), + "parse query strings"); + TEST("foo://example.com/foo/?gne&foo", + PASS, + IRI("foo", "example.com", "", "foo/", "gne&foo", ""), + "parse query strings"); + TEST("foo://example.com/foo/?gne%2F", + PASS, + IRI("foo", "example.com", "", "foo/", "gne/", ""), + "parse query strings"); + + /* fragment */ + TEST("foo://bar.co/#foo", + PASS, + IRI("foo", "bar.co", "", "", "", "foo"), + "can recognize fragments"); + + /* percent encoding */ + TEST("foo://bar.com/caf%C3%A8.gmi", + PASS, + IRI("foo", "bar.com", "", "cafè.gmi", "", ""), + "can decode"); + TEST("foo://bar.com/caff%C3%A8%20macchiato.gmi", + PASS, + IRI("foo", "bar.com", "", "caffè macchiato.gmi", "", ""), + "can decode"); + TEST("foo://bar.com/caff%C3%A8+macchiato.gmi", + PASS, + IRI("foo", "bar.com", "", "caffè+macchiato.gmi", "", ""), + "can decode"); + TEST("foo://bar.com/foo%2F..%2F..", + FAIL, + empty, + "conversion and checking are done in the correct order"); + TEST("foo://bar.com/foo%00?baz", + FAIL, + empty, + "rejects %00"); + + /* IRI */ + TEST("foo://bar.com/cafè.gmi", + PASS, + IRI("foo", "bar.com", "", "cafè.gmi", "" , ""), + "decode IRI (with a 2-byte utf8 seq)"); + TEST("foo://bar.com/世界.gmi", + PASS, + IRI("foo", "bar.com", "", "世界.gmi", "" , ""), + "decode IRI"); + TEST("foo://bar.com/😼.gmi", + PASS, + IRI("foo", "bar.com", "", "😼.gmi", "" , ""), + "decode IRI (with a 3-byte utf8 seq)"); + TEST("foo://bar.com/😼/𤭢.gmi", + PASS, + IRI("foo", "bar.com", "", "😼/𤭢.gmi", "" , ""), + "decode IRI (with a 3-byte and a 4-byte utf8 seq)"); + TEST("foo://bar.com/世界/\xC0\x80", + FAIL, + empty, + "reject invalid sequence (overlong NUL)"); + + return 0; +} blob - 09aad1cd4b4aed0af095e949d47fb4056764eded blob + 8f530b0203310cdb0c880adf60ad6a610ae7823b --- utf8.c +++ utf8.c @@ -64,8 +64,8 @@ valid_multibyte_utf8(struct parser *p) { uint32_t cp = 0, state = 0; - for (; *p->uri; p->uri++) - if (!utf8_decode(&state, &cp, *p->uri)) + for (; *p->iri; p->iri++) + if (!utf8_decode(&state, &cp, *p->iri)) break; /* reject the ASCII range */