2 * Copyright (c) 2021 Omar Polo <op@omarpolo.com>
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
19 * - distinguish between an empty component and an undefined one
35 static const char *sub_ip_literal(const char*);
36 static const char *sub_host_dummy(const char*);
37 static const char *sub_pchar(const char*);
39 static const char *sub_segment(const char*);
40 static const char *sub_segment_nz(const char*);
41 static const char *sub_segment_nz_nc(const char*);
42 static const char *sub_path_common(const char*);
44 static const char *parse_scheme(const char*, struct phos_uri*);
45 static const char *parse_host(const char*, struct phos_uri*);
46 static const char *parse_port(const char*, struct phos_uri*);
47 static const char *parse_authority(const char*, struct phos_uri*);
48 static const char *parse_path_abempty(const char*, struct phos_uri*);
49 static const char *parse_path_absolute(const char*, struct phos_uri*);
50 static const char *parse_path_noscheme(const char*, struct phos_uri*);
51 static const char *parse_path_rootless(const char*, struct phos_uri*);
52 static const char *parse_path_empty(const char*, struct phos_uri*);
53 static const char *parse_hier_part(const char*, struct phos_uri*);
54 static const char *parse_query(const char*, struct phos_uri*);
55 static const char *parse_fragment(const char*, struct phos_uri*);
56 static const char *parse_uri(const char*, struct phos_uri*);
57 static const char *parse_relative_part(const char*, struct phos_uri*);
58 static const char *parse_relative_ref(const char*, struct phos_uri*);
59 static const char *parse_uri_reference(const char*, struct phos_uri*);
61 static int hasprefix(const char*, const char*);
62 static char *dotdot(char*, char*);
63 static void path_clean(struct phos_uri*);
64 static int merge_path(struct phos_uri*, const struct phos_uri*, const struct phos_uri*);
66 static int phos_resolve_uri_from(const struct phos_uri*, const struct phos_uri*, struct phos_uri*);
105 return gen_delims(c) || sub_delims(c);
124 * IP-literal = "[" ( IPv6address / IPvFuture ) "]"
126 * in reality, we parse [.*]
129 sub_ip_literal(const char *s)
134 while (*s != '\0' && *s != ']')
143 * parse everything until : or / (or \0).
144 * NB: empty hosts are technically valid!
147 sub_host_dummy(const char *s)
149 while (*s != '\0' && *s != ':' && *s != '/')
155 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
158 sub_pchar(const char *s)
167 if (isxdigit((unsigned char)s[1]) && isxdigit((unsigned char)s[2])) /* <ctype.h> needs an unsigned char value: plain char may be negative for bytes >= 0x80 */
174 if (*s == ':' || *s == '@')
184 sub_segment(const char *s)
188 while ((t = sub_pchar(s)) != NULL)
193 /* segment-nz = 1*pchar */
195 sub_segment_nz(const char *s)
197 if ((s = sub_pchar(s)) == NULL)
199 return sub_segment(s);
203 * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
205 * so, 1*pchar excluding ":"
208 sub_segment_nz_nc(const char *s)
215 while (*s != ':' && (t = sub_pchar(s)) != NULL)
220 /* *( "/" segment ) */
222 sub_path_common(const char *s)
236 * scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
239 parse_scheme(const char *s, struct phos_uri *parsed)
241 const char *start = s;
262 if (len >= sizeof(parsed->scheme))
265 memcpy(parsed->scheme, start, len);
270 * host = IP-literal / IPv4address / reg-name
272 * rules IPv4address and reg-name are relaxed into sub_host_dummy.
275 parse_host(const char *s, struct phos_uri *parsed)
280 if ((t = sub_ip_literal(s)) != NULL ||
281 (t = sub_host_dummy(s)) != NULL) {
283 if (len >= sizeof(parsed->host)) /* bound by the buffer the memcpy below writes to (host, not scheme) */
285 memcpy(parsed->host, s, len);
296 parse_port(const char *s, struct phos_uri *parsed)
298 const char *errstr, *start = s;
308 if (len >= sizeof(parsed->port))
311 memcpy(parsed->port, start, len);
313 parsed->dec_port = strtonum(parsed->port, 0, 65535, &errstr);
321 * authority = host [ ":" port ]
322 * (yep, blatantly ignore the userinfo stuff -- not relevant for Gemini)
325 parse_authority(const char *s, struct phos_uri *parsed)
327 if ((s = parse_host(s, parsed)) == NULL)
332 return parse_port(s, parsed);
338 static inline const char *
339 set_path(const char *start, const char *end, struct phos_uri *parsed)
347 if (len >= sizeof(parsed->path))
349 memcpy(parsed->path, start, len);
354 * path-abempty = *( "/" segment )
357 parse_path_abempty(const char *s, struct phos_uri *parsed)
361 t = sub_path_common(s);
362 return set_path(s, t, parsed);
366 * path-absolute = "/" [ segment-nz *( "/" segment ) ]
369 parse_path_absolute(const char *s, struct phos_uri *parsed)
371 const char *t, *start = s;
377 if ((t = sub_segment_nz(s)) == NULL)
378 return set_path(start, s, parsed);
380 s = sub_path_common(t);
381 return set_path(start, s, parsed);
385 * path-noscheme = segment-nz-nc *( "/" segment )
388 parse_path_noscheme(const char *s, struct phos_uri *parsed)
390 const char *start = s;
392 if ((s = sub_segment_nz_nc(s)) == NULL)
394 s = sub_path_common(s);
395 return set_path(start, s, parsed);
399 * path-rootless = segment-nz *( "/" segment )
402 parse_path_rootless(const char *s, struct phos_uri *parsed)
404 const char *start = s;
406 if ((s = sub_segment_nz(s)) == NULL)
408 s = sub_path_common(s);
409 return set_path(start, s, parsed);
413 * path-empty = 0<pchar>
416 parse_path_empty(const char *s, struct phos_uri *parsed)
422 * hier-part = "//" authority path-abempty
428 parse_hier_part(const char *s, struct phos_uri *parsed)
432 if (s[0] == '/' && s[1] == '/') {
434 if ((s = parse_authority(s, parsed)) == NULL)
436 return parse_path_abempty(s, parsed);
439 if ((t = parse_path_absolute(s, parsed)) != NULL)
442 if ((t = parse_path_rootless(s, parsed)) != NULL)
445 return parse_path_empty(s, parsed);
449 * query = *( pchar / "/" / "?" )
452 parse_query(const char *s, struct phos_uri *parsed)
454 const char *t, *start = s;
458 if (*s == '/' || *s == '?') {
463 if ((t = sub_pchar(s)) == NULL)
469 if (len >= sizeof(parsed->query))
472 memcpy(parsed->query, start, len);
477 * fragment = *( pchar / "/" / "?" )
480 parse_fragment(const char *s, struct phos_uri *parsed)
482 const char *start = s;
489 if (*s == '/' || *s == '?') {
494 if ((s = sub_pchar(s)) == NULL)
499 if (len >= sizeof(parsed->fragment))
502 memcpy(parsed->fragment, start, len);
507 * URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
510 parse_uri(const char *s, struct phos_uri *parsed)
512 if ((s = parse_scheme(s, parsed)) == NULL)
519 if ((s = parse_hier_part(s, parsed)) == NULL)
524 if ((s = parse_query(s, parsed)) == NULL)
530 if ((s = parse_fragment(s, parsed)) == NULL)
538 * relative-part = "//" authority path-abempty
544 parse_relative_part(const char *s, struct phos_uri *parsed)
548 if (s[0] == '/' && s[1] == '/') {
550 if ((s = parse_authority(s, parsed)) == NULL)
552 return parse_path_abempty(s, parsed);
555 if ((t = parse_path_absolute(s, parsed)) != NULL)
558 if ((t = parse_path_noscheme(s, parsed)) != NULL)
561 return parse_path_empty(s, parsed);
565 * relative-ref = relative-part [ "?" query ] [ "#" fragment ]
568 parse_relative_ref(const char *s, struct phos_uri *parsed)
570 if ((s = parse_relative_part(s, parsed)) == NULL)
575 if ((s = parse_query(s, parsed)) == NULL)
581 if ((s = parse_fragment(s, parsed)) == NULL)
589 * URI-reference = URI / relative-ref
592 parse_uri_reference(const char *s, struct phos_uri *parsed)
596 if ((t = parse_uri(s, parsed)) != NULL)
598 memset(parsed, 0, sizeof(*parsed));
599 return parse_relative_ref(s, parsed);
604 * absolute-URI = scheme ":" hier-part [ "?" query ]
607 parse_absolute_uri(const char *s, struct phos_uri *parsed)
609 if ((s = parse_scheme(s, parsed)) == NULL)
616 if ((s = parse_hier_part(s, parsed)) == NULL)
621 if ((s = parse_query(s, parsed)) == NULL)
629 /* normalizing fns */
632 hasprefix(const char *str, const char *prfx)
634 for (; *str == *prfx && *prfx != '\0'; str++, prfx++)
637 return *prfx == '\0';
641 dotdot(char *point, char *start)
645 for (t = point-1; t > start; --t) {
652 memmove(t, point, strlen(point)+1);
657 * This is the "Remove Dot Segments" straight outta RFC3986, section
661 path_clean(struct phos_uri *uri)
663 char *in = uri->path;
665 while (in != NULL && *in != '\0') {
666 assert(in >= uri->path);
668 /* A) drop leading ../ or ./ */
669 if (hasprefix(in, "../"))
670 memmove(in, &in[3], strlen(&in[3])+1);
671 else if (hasprefix(in, "./"))
672 memmove(in, &in[2], strlen(&in[2])+1);
674 /* B) replace /./ or /. with / */
675 else if (hasprefix(in, "/./"))
676 memmove(&in[1], &in[3], strlen(&in[3])+1);
677 else if (!strcmp(in, "/."))
680 /* C) resolve dot-dot */
681 else if (hasprefix(in, "/../")) {
682 in = dotdot(in, uri->path);
683 memmove(&in[1], &in[4], strlen(&in[4])+1);
684 } else if (!strcmp(in, "/..")) {
685 in = dotdot(in, uri->path);
691 else if (!strcmp(in, "."))
693 else if (!strcmp(in, ".."))
698 in = strchr(in+1, '/');
703 * see RFC3986 5.3.3 "Merge Paths".
706 merge_path(struct phos_uri *ret, const struct phos_uri *base,
707 const struct phos_uri *ref)
712 len = sizeof(ret->path);
714 s = strrchr(base->path, '/');
715 if ((*base->host != '\0' && *base->path == '\0') || s == NULL) {
716 strlcpy(ret->path, "/", len);
719 memcpy(ret->path, base->path, s - base->path + 1);
722 return strlcat(ret->path, ref->path, len) < len;
726 /* public interface */
729 phos_parse_absolute_uri(const char *s, struct phos_uri *uri)
731 memset(uri, 0, sizeof(*uri));
733 if ((s = parse_absolute_uri(s, uri)) == NULL)
742 phos_parse_uri_reference(const char *s, struct phos_uri *uri)
744 memset(uri, 0, sizeof(*uri));
746 if ((s = parse_uri_reference(s, uri)) == NULL)
755 * Implementation of the "transform references" algorithm from
756 * RFC3986, see 5.2.2.
758 * We expect base and ref to be URIs constructed by this library
759 * (because we emit only normalized URIs).
761 * ATM this is marked as private because:
762 * - let's say the URI is "."
763 * - one calls phos_parse_uri_reference
764 * - it exits with success, but the path becomes ""
765 * - this routine does the right thing, but the outcome is not what expected.
767 * so users for now have to use phos_resolve_uri_from_str, which parses
768 * the URI but does not normalize it, and then calls into us.
771 phos_resolve_uri_from(const struct phos_uri *base, const struct phos_uri *ref,
772 struct phos_uri *ret)
774 memset(ret, 0, sizeof(*ret));
776 if (*ref->scheme != '\0') {
777 strlcpy(ret->scheme, ref->scheme, sizeof(ret->scheme));
778 strlcpy(ret->host, ref->host, sizeof(ret->host));
779 strlcpy(ret->port, ref->port, sizeof(ret->port));
780 ret->dec_port = ref->dec_port; /* ret was zeroed above: the numeric port must come from ref, like host and port */
781 strlcpy(ret->path, ref->path, sizeof(ret->path));
782 strlcpy(ret->query, ref->query, sizeof(ret->query));
784 if (*ref->host != '\0') {
785 strlcpy(ret->host, ref->host, sizeof(ret->host));
786 strlcpy(ret->port, ref->port, sizeof(ret->port));
787 ret->dec_port = ref->dec_port; /* same as above: the authority comes from ref, so does its numeric port */
788 strlcpy(ret->path, ref->path, sizeof(ret->path));
789 strlcpy(ret->query, ref->query, sizeof(ret->query));
791 if (*ref->path == '\0') {
792 strlcpy(ret->path, base->path, sizeof(ret->path));
793 if (*ref->query != '\0')
794 strlcpy(ret->query, ref->query, sizeof(ret->query));
796 strlcpy(ret->query, base->query, sizeof(ret->query));
798 if (*ref->path == '/')
799 strlcpy(ret->path, ref->path, sizeof(ret->path));
801 if (!merge_path(ret, base, ref))
806 strlcpy(ret->query, ref->query, sizeof(ret->query));
809 strlcpy(ret->host, base->host, sizeof(ret->host));
810 strlcpy(ret->port, base->port, sizeof(ret->port));
811 ret->dec_port = base->dec_port;
814 strlcpy(ret->scheme, base->scheme, sizeof(ret->scheme));
817 strlcpy(ret->fragment, ref->fragment, sizeof(ret->fragment));
823 phos_resolve_uri_from_str(const struct phos_uri *base, const char *refstr,
824 struct phos_uri *ret)
828 memset(&ref, 0, sizeof(ref));
830 if ((refstr = parse_uri_reference(refstr, &ref)) == NULL)
836 return phos_resolve_uri_from(base, &ref, ret);
840 phos_uri_drop_empty_segments(struct phos_uri *uri)
844 for (i = uri->path; *i; ++i) {
845 if (*i == '/' && *(i+1) == '/') {
846 memmove(i, i+1, strlen(i)); /* move also the \0 */
853 phos_uri_set_query(struct phos_uri *uri, const char *query)
859 len = sizeof(uri->query);
861 memset(uri->query, 0, len);
863 for (; *query != '\0' && len > 0; ++query) {
868 unreserved(*query) ||
869 sub_delims(*query)) {
878 sprintf(out, "%02X", t);
883 return *query == '\0';
887 phos_serialize_uri(const struct phos_uri *uri, char *buf, size_t len)
890 if (strlcat(buf, s, len) >= len) \
893 strlcpy(buf, "", len);
895 if (*uri->scheme != '\0') {
900 if (*uri->host != '\0' || strcmp(uri->scheme, "file") == 0) {
902 * The file URI scheme has a quirk that even if a
903 * hostname is not present, we still have to append
904 * the two slashes. This is why we have
905 * file:///etc/hosts and not file:/etc/hosts
911 if (*uri->port != '\0' && strcmp(uri->port, "1965")) {
918 if (*uri->query != '\0') {
923 if (*uri->fragment) {