commit 5546fdd511cd0207944cd7e0436a21ba60ab5718 from: Omar Polo date: Fri Dec 23 22:08:06 2022 UTC draft of the new iri parser for the moment is just a URI (RFC3986) parser, and not even a complete one, but I like it much more than phos_uri. It's more gentle in the memory consumption and has a cleaner API. It's not complete, some parts are still stubbed or completely to implement, but it's the start. It compiles. Step-by-step integration of it in telescope and testing will come later. commit - cf8fcf6437e729c6de0cd278af49a70535cd81be commit + 5546fdd511cd0207944cd7e0436a21ba60ab5718 blob - /dev/null blob + ac838023e0eb83c222129878c8b2b0eb79b3da8b (mode 644) --- /dev/null +++ iri.c @@ -0,0 +1,555 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include + +#include "iri.h" + +/* TODO: URI -> IRI. accept IRI but emit always URI */ + +static inline int +cpstr(const char *start, const char *till, char *buf, size_t len) +{ + size_t slen = till - start; + + if (slen + 1 >= len) + return (-1); + memcpy(buf, start, slen); + buf[slen] = '\0'; + return (0); +} + +static inline int +unreserved(int c) +{ + return (isalnum((unsigned char)c) || + c == '-' || + c == '.' || + c == '_' || + c == '~'); +} + +static inline int +pctenc(const char *s) +{ + const char *t = s; + + return (t[0] == '%' && + isxdigit((unsigned char)t[1]) && + isxdigit((unsigned char)t[2])); +} + +static inline int +sub_delims(int c) +{ + return (c == '!' || c == '$' || c == '&' || c == '\'' || + c == '(' || c == ')' || c == '*' || c == '+' || c == ',' || + c == ';' || c == '='); +} + +static inline const char * +advance_pchar(const char *s) +{ + if (unreserved(*s) || sub_delims(*s) || *s == ':' || *s == '@') + return (s + 1); + if (pctenc(s)) + return (s + 3); + return (NULL); +} + +static inline const char * +advance_segment(const char *s) +{ + const char *t = s; + + while ((t = advance_pchar(s)) != NULL) + s = t; + return (s); +} + +static inline const char * +advance_segment_nz(const char *s) +{ + const char *t; + + if ((t = advance_pchar(s)) == NULL) + return (NULL); + return (advance_segment(t)); +} + +static inline const char * +advance_segment_nz_nc(const char *s) +{ + const char *t = s; + + for (;;) { + if (unreserved(*t) || sub_delims(*t) || *t == '@') + t++; + else if (pctenc(t)) + t += 3; + else + break; + } + + return (t != s ? t : NULL); +} + +static const char * +parse_scheme(const char *s, struct iri *iri) +{ + const char *t = s; + + if (!isalpha((unsigned char)*t)) + return (NULL); + + while (isalnum((unsigned char)*t) || + *t == '+' || + *t == '-' || + *t == '.') + t++; + + if (cpstr(s, t, iri->iri_scheme, sizeof(iri->iri_scheme)) == -1) + return (NULL); + + iri->iri_flags |= IH_SCHEME; + return (t); +} + +/* userinfo is always optional */ +static const char * +parse_uinfo(const char *s, struct iri *iri) +{ + const char *t = s; + + for (;;) { + if (unreserved(*t) || sub_delims(*t) || *t == ':') + t++; + else if (pctenc(t)) + t += 3; + else + break; + } + + if (*t != '@') + return (s); + + if (cpstr(s, t, iri->iri_uinfo, sizeof(iri->iri_uinfo)) == -1) + return (NULL); + iri->iri_flags |= IH_UINFO; + return (t + 1); +} + +static const char * +parse_host(const char *s, struct iri *iri) +{ + const char *t = s; + + /* + * cheating a bit by relaxing and merging the rule for + * IPv6address and IPvFuture and by merging IPv4address and + * reg-name. + */ + + if (*t == '[') { + while (*t && *t != ']') + ++t; + if (*t == '\0') + return (NULL); + t++; + if (cpstr(s, t, iri->iri_host, sizeof(iri->iri_host)) == -1) + return (NULL); + iri->iri_flags |= IH_HOST; + return (t); + } + + for (;;) { + if (unreserved(*t) || sub_delims(*t)) + t++; + else if (pctenc(t)) + t += 3; + else + break; + } + + if (cpstr(s, t, iri->iri_host, sizeof(iri->iri_host)) == -1) + return (NULL); + iri->iri_flags |= IH_HOST; + return (t); +} + +static const char * +parse_port(const char *s, struct iri *iri) +{ + const char *t = s; + const char *errstr; + + while (isdigit((unsigned char)*t)) + t++; + if (cpstr(s, t, iri->iri_portstr, sizeof(iri->iri_portstr)) == -1) + return (NULL); + iri->iri_port = strtonum(iri->iri_portstr, 1, UINT16_MAX, &errstr); + if (errstr) + return (NULL); + iri->iri_flags |= IH_PORT; + return (t); +} + +static const char * +parse_authority(const char *s, struct iri *iri) +{ + const char *t; + + if ((t = parse_uinfo(s, iri)) == NULL) + return (NULL); + + if ((t = parse_host(t, iri)) == NULL) + return (NULL); + + if (*t == ':') + return (parse_port(t, iri)); + + return (t); +} + +static const char * +parse_path_abempty(const char *s, struct iri *iri) +{ + const char *t = s; + + while (*t == '/') + t = advance_segment(t + 1); + + if (cpstr(s, t, iri->iri_path, sizeof(iri->iri_path)) == -1) + return (NULL); + iri->iri_flags |= IH_PATH; + return (t); +} + +static const char * +parse_path_absolute(const char *s, struct iri *iri) +{ + const char *t; + + if (*s != '/') + return (NULL); + + if ((t = advance_segment_nz(s + 1)) == NULL) + return (s + 1); + + while (*t == '/') + t = advance_segment(t + 1); + + if (cpstr(s, t, iri->iri_path, sizeof(iri->iri_path)) == -1) + return (NULL); + iri->iri_flags |= IH_PATH; + return (t); +} + +static const char * +parse_path_rootless(const char *s, struct iri *iri) +{ + const char *t; + + if ((t = advance_segment_nz(s)) == NULL) + return (NULL); + + while (*t == '/') + t = advance_segment(t + 1); + + if (cpstr(s, t, iri->iri_path, sizeof(iri->iri_path)) == -1) + return (NULL); + iri->iri_flags |= IH_PATH; + return (t); +} + +static const char * +parse_path_noscheme(const char *s, struct iri *iri) +{ + const char *t; + + if ((t = advance_segment_nz_nc(s)) == NULL) + return (NULL); + + while (*t == '/') + t = advance_segment(t + 1); + + if (cpstr(s, t, iri->iri_path, sizeof(iri->iri_path)) == -1) + return (NULL); + iri->iri_flags |= IH_PATH; + return (t); +} + +static const char * +parse_path_empty(const char *s, struct iri *iri) +{ + iri->iri_path[0] = '\0'; + iri->iri_flags |= IH_PATH; + return (s); +} + +static const char * +parse_hier(const char *s, struct iri *iri) +{ + const char *t; + + if (!strncmp(s, "//", 2)) { + if ((t = parse_authority(s + 2, iri)) == NULL) + return (NULL); + return (parse_path_abempty(t, iri)); + } + + if ((t = parse_path_absolute(s, iri)) != NULL) + return (t); + + if ((t = parse_path_rootless(s, iri)) != NULL) + return (t); + + return (parse_path_empty(s, iri)); +} + +static const char * +parse_relative(const char *s, struct iri *iri) +{ + const char *t = s; + + if (!strncmp(s, "//", 2)) { + if ((t = parse_authority(s + 2, iri)) == NULL) + return (NULL); + return (parse_path_abempty(t, iri)); + } + + if ((t = parse_path_absolute(s, iri)) != NULL) + return (t); + + if ((t = parse_path_noscheme(s, iri)) != NULL) + return (t); + + return (parse_path_empty(s, iri)); +} + +static const char * +parse_query(const char *s, struct iri *iri) +{ + const char *n, *t = s; + + for (;;) { + if ((n = advance_pchar(t)) != NULL) + t = n; + else if (*t == '/' || *t == '?') + t++; + else + break; + } + + if (cpstr(s, t, iri->iri_query, sizeof(iri->iri_query)) == -1) + return (NULL); + iri->iri_flags |= IH_QUERY; + return (t); +} + +static int +parse_uri(const char *s, struct iri *iri) +{ + if ((s = parse_scheme(s, iri)) == NULL) + return (-1); + + if (*s != ':') + return (-1); + + if ((s = parse_hier(s + 1, iri)) == NULL) + return (-1); + + if (*s == '?' && (s = parse_query(s + 1, iri)) == NULL) + return (-1); + + /* skip fragments */ + if (*s == '#' || *s == '\0') + return (0); + + return (-1); +} + +static int +parse_relative_ref(const char *s, struct iri *iri) +{ + if ((s = parse_relative(s, iri)) == NULL) + return (-1); + + if (*s == '?' && (s = parse_query(s + 1, iri)) == NULL) + return (-1); + + /* skip fragments */ + if (*s == '#' || *s == '\0') + return (0); + + return (-1); +} + +static int +parse(const char *s, struct iri *iri) +{ + iri->iri_flags = 0; + + if (s == NULL) + return (0); + + if (parse_uri(s, iri) == -1) { + iri->iri_flags = 0; + if (parse_relative_ref(s, iri) == -1) + return (-1); + } + + return (0); +} + +static void +cpfields(struct iri *dest, const struct iri *src, int flags) +{ + if (flags & IH_SCHEME) { + dest->iri_flags |= IH_SCHEME; + if (src->iri_flags & IH_SCHEME) + memcpy(dest->iri_scheme, src->iri_scheme, + sizeof(dest->iri_scheme)); + } + if (flags & IH_UINFO) { + dest->iri_flags |= IH_UINFO; + if (src->iri_flags & IH_UINFO) + memcpy(dest->iri_uinfo, src->iri_uinfo, + sizeof(dest->iri_uinfo)); + } + if (flags & IH_HOST) { + dest->iri_flags |= IH_HOST; + if (src->iri_flags & IH_HOST) + memcpy(dest->iri_host, src->iri_host, + sizeof(dest->iri_host)); + } + if (flags & IH_PORT) { + dest->iri_flags |= IH_PORT; + if (src->iri_flags & IH_PORT) + dest->iri_port = src->iri_port; + } + if (flags & IH_PATH) { + dest->iri_flags |= IH_PATH; + if (src->iri_flags & IH_PATH) + memcpy(dest->iri_path, src->iri_path, + sizeof(dest->iri_path)); + } + if (flags & IH_QUERY) { + dest->iri_flags |= IH_QUERY; + if (src->iri_flags & IH_QUERY) + memcpy(dest->iri_query, src->iri_query, + sizeof(dest->iri_query)); + } +} + +static inline void +remove_dot_segments(struct iri *iri) +{ + /* TODO: fixup iri->iri_path */ + return; +} + +static inline void +mergepath(char *out, size_t len, const char *a, const char *b) +{ + /* TODO: compute into out path `b' resolved from `a' */ + memset(out, 0, len); + return; +} + +int +iri_parse(const char *base, const char *str, struct iri *iri) +{ + static struct iri ibase, iparsed; + + memset(iri, 0, sizeof(*iri)); + + if (base == NULL) { + ibase.iri_flags = 0; + if (parse_uri(str, &iparsed) == -1) + return (-1); + } else { + if (parse_uri(base, &ibase) == -1) + return (-1); + if (parse(str, &iparsed) == -1) + return (-1); + } + + if (iparsed.iri_flags & IH_SCHEME) { + cpfields(iri, &iparsed, iparsed.iri_flags); + remove_dot_segments(iri); + return (0); + } + + /* if fragments are supported, copy iparsed fragment to iri */ + + cpfields(iri, &ibase, IH_SCHEME); + + if (iparsed.iri_flags & IH_HOST) { + cpfields(iri, &iparsed, IH_AUTHORITY|IH_PATH|IH_QUERY); + remove_dot_segments(iri); + return (0); + } + + cpfields(iri, &ibase, IH_AUTHORITY); + + if ((iparsed.iri_flags & IH_PATH) && *iparsed.iri_path == '\0') { + cpfields(iri, &ibase, IH_PATH); + if (iparsed.iri_flags & IH_QUERY) + cpfields(iri, &iparsed, IH_QUERY); + else + cpfields(iri, &ibase, IH_QUERY); + return (0); + } + + cpfields(iri, &iparsed, IH_QUERY); + if ((iparsed.iri_flags & IH_PATH) && !strcmp(iparsed.iri_path, "/")) + cpfields(iri, &iparsed, IH_PATH); + else { + if (!(ibase.iri_flags & IH_PATH)) + ibase.iri_path[0] = '\0'; + if (!(iparsed.iri_flags & IH_PATH)) + iparsed.iri_path[0] = '\0'; + mergepath(iri->iri_path, sizeof(iri->iri_path), + ibase.iri_path, iparsed.iri_path); + } + remove_dot_segments(iri); + cpfields(iri, &ibase, IH_QUERY); + return (0); +} + +int +iri_unparse(const struct iri *iri, char *buf, size_t buflen) +{ + memset(buf, 0, buflen); + return (-1); +} + +int +iri_human(const struct iri *iri, char *buf, size_t buflen) +{ + memset(buf, 0, buflen); + return (-1); +} + +int +iri_setquery(struct iri *iri, const char *text) +{ + return (-1); +} blob - /dev/null blob + 385c9423629d58a2d8807928a7bda8e6446c0418 (mode 644) --- /dev/null +++ iri.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2022 Omar Polo + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#ifndef IRI_H +#define IRI_H + +struct iri { + char iri_scheme[32]; + char iri_uinfo[254]; + char iri_host[1024]; + char iri_portstr[6]; + uint16_t iri_port; + char iri_path[1024]; + char iri_query[1024]; + +#define IH_SCHEME 0x01 +#define IH_UINFO 0x02 +#define IH_HOST 0x04 +#define IH_PORT 0x08 +#define IH_AUTHORITY (IH_UINFO|IH_HOST|IH_PORT) +#define IH_PATH 0x10 +#define IH_QUERY 0x20 + int iri_flags; +}; + +int iri_parse(const char *, const char *, struct iri *); +int iri_unparse(const struct iri *, char *, size_t); +int iri_human(const struct iri *, char *, size_t); +int iri_setquery(struct iri *, const char *); + +#endif /* IRI_H */