/*
 * Copyright (c) 2021 Omar Polo <op@omarpolo.com>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
/*
 * TODOs:
 * - distinguish between an empty component and an undefined one
 * - ...
 */
23 #include <assert.h>
25 #include "compat.h"
27 #include "phos.h"
29 #include <ctype.h>
30 #include <stdint.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
/* forward declarations: grammar sub-rules */
static const char	*sub_ip_literal(const char *s);
static const char	*sub_host_dummy(const char *s);
static const char	*sub_pchar(const char *s);

static const char	*sub_segment(const char *s);
static const char	*sub_segment_nz(const char *s);
static const char	*sub_segment_nz_nc(const char *s);
static const char	*sub_path_common(const char *s);

/* forward declarations: component parsers that fill a struct phos_uri */
static const char	*parse_scheme(const char *s, struct phos_uri *parsed);
static const char	*parse_host(const char *s, struct phos_uri *parsed);
static const char	*parse_port(const char *s, struct phos_uri *parsed);
static const char	*parse_authority(const char *s, struct phos_uri *parsed);
static const char	*parse_path_abempty(const char *s, struct phos_uri *parsed);
static const char	*parse_path_absolute(const char *s, struct phos_uri *parsed);
static const char	*parse_path_noscheme(const char *s, struct phos_uri *parsed);
static const char	*parse_path_rootless(const char *s, struct phos_uri *parsed);
static const char	*parse_path_empty(const char *s, struct phos_uri *parsed);
static const char	*parse_hier_part(const char *s, struct phos_uri *parsed);
static const char	*parse_query(const char *s, struct phos_uri *parsed);
static const char	*parse_fragment(const char *s, struct phos_uri *parsed);
static const char	*parse_uri(const char *s, struct phos_uri *parsed);
static const char	*parse_relative_part(const char *s, struct phos_uri *parsed);
static const char	*parse_relative_ref(const char *s, struct phos_uri *parsed);
static const char	*parse_uri_reference(const char *s, struct phos_uri *parsed);

/* forward declarations: normalization helpers */
static int		 hasprefix(const char *str, const char *prfx);
static char		*dotdot(char *point, char *start);
static void		 path_clean(struct phos_uri *uri);
static int		 merge_path(struct phos_uri *ret,
			    const struct phos_uri *base,
			    const struct phos_uri *ref);

static int		 phos_resolve_uri_from(const struct phos_uri *base,
			    const struct phos_uri *ref, struct phos_uri *ret);
/* common defs */
/*
 * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 *
 * Returns 1 when c is one of the general delimiters, 0 otherwise.
 */
static inline int
gen_delims(int c)
{
	switch (c) {
	case ':':
	case '/':
	case '?':
	case '#':
	case '[':
	case ']':
	case '@':
		return 1;
	default:
		return 0;
	}
}
/*
 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
 *            / "*" / "+" / "," / ";" / "="
 *
 * Returns 1 when c is one of the sub-delimiters, 0 otherwise.
 */
static inline int
sub_delims(int c)
{
	switch (c) {
	case '!':
	case '$':
	case '&':
	case '\'':
	case '(':
	case ')':
	case '*':
	case '+':
	case ',':
	case ';':
	case '=':
		return 1;
	default:
		return 0;
	}
}
/*
 * reserved = gen-delims / sub-delims
 *
 * Returns non-zero when c belongs to either delimiter class.
 */
static inline int
reserved(int c)
{
	return gen_delims(c) || sub_delims(c);
}
/*
 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
 *
 * Returns non-zero when c is an unreserved character.
 *
 * The ASCII ranges are spelled out instead of going through <ctype.h>:
 * callers hand us plain char values, which can be negative for bytes
 * >= 0x80, and isalpha()/isdigit() are undefined for negative arguments
 * other than EOF (and locale-dependent for the remaining ones).
 */
static inline int
unreserved(int c)
{
	if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'))
		return 1;
	if (c >= '0' && c <= '9')
		return 1;
	return c == '-' || c == '.' || c == '_' || c == '~';
}
/* subs */
/*
 * IP-literal = "[" ( IPv6address / IPvFuture ) "]"
 *
 * In reality we accept "[" followed by anything up to the first "]".
 * Returns the position right after the closing bracket, or NULL when s
 * does not start with "[" or the bracket is never closed.
 */
static const char *
sub_ip_literal(const char *s)
{
	const char *close;

	if (*s != '[')
		return NULL;

	if ((close = strchr(s, ']')) == NULL)
		return NULL;
	return close + 1;
}
/*
 * Relaxed host matcher: consume everything up to the first ':' or '/'
 * (or the end of the string).  Never fails; an empty match is returned
 * as s itself, since empty hosts are technically valid.
 */
static const char *
sub_host_dummy(const char *s)
{
	return s + strcspn(s, ":/");
}
/*
 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
 *
 * Returns the position after one pchar (one byte, or three for a
 * percent-encoded triplet), or NULL if s does not start with a pchar.
 */
static const char *
sub_pchar(const char *s)
{
	/*
	 * Look at the bytes as unsigned char: plain char may be negative
	 * and the <ctype.h> functions are undefined for such values.
	 */
	const unsigned char *u = (const unsigned char *)s;

	if (*u == '\0')
		return NULL;

	if (unreserved(*u) || sub_delims(*u) || *u == ':' || *u == '@')
		return s + 1;

	/* pct-encoded = "%" HEXDIG HEXDIG; && stops at a premature NUL */
	if (*u == '%' && isxdigit(u[1]) && isxdigit(u[2]))
		return s + 3;

	return NULL;
}
/*
 * segment = *pchar
 *
 * Skips as many pchar as possible and returns the position of the first
 * byte that is not one.  Never returns NULL.
 */
static const char *
sub_segment(const char *s)
{
	const char *next;

	while ((next = sub_pchar(s)) != NULL)
		s = next;
	return s;
}
/*
 * segment-nz = 1*pchar
 *
 * Like sub_segment, but at least one pchar is required: returns NULL
 * when s does not start with a pchar.
 */
static const char *
sub_segment_nz(const char *s)
{
	if ((s = sub_pchar(s)) == NULL)
		return NULL;
	return sub_segment(s);
}
/*
 * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
 *
 * so, 1*pchar excluding ":".
 *
 * Returns the position after the segment, or NULL if not even one
 * character could be consumed (the rule needs at least one).
 */
static const char *
sub_segment_nz_nc(const char *s)
{
	const char *start = s, *next;

	while (*s != ':' && (next = sub_pchar(s)) != NULL)
		s = next;

	/* zero-length match: covers a leading ':' as well */
	if (s == start)
		return NULL;
	return s;
}
/*
 * *( "/" segment )
 *
 * Skips any number of "/"-prefixed segments and returns the position of
 * the first byte that does not continue the sequence.  Never returns
 * NULL.
 */
static const char *
sub_path_common(const char *s)
{
	while (*s == '/')
		s = sub_segment(s + 1);
	return s;
}
/* parse fns */
231 /*
232 * scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
233 */
234 static const char *
235 parse_scheme(const char *s, struct phos_uri *parsed)
237 const char *start = s;
238 size_t len;
240 if (!isalpha(*s))
241 return NULL;
243 while (*s != '\0') {
244 if (isalpha(*s) ||
245 isdigit(*s) ||
246 *s == '+' ||
247 *s == '-' ||
248 *s == '.')
249 s++;
250 else
251 break;
254 if (*s == '\0')
255 return NULL;
257 len = s - start;
258 if (len >= sizeof(parsed->scheme))
259 return NULL;
261 memcpy(parsed->scheme, start, len);
262 return s;
265 /*
266 * host = IP-literal / IPv4address / reg-name
268 * rules IPv4address and reg-name are relaxed into parse_host_dummy.
269 */
270 static const char *
271 parse_host(const char *s, struct phos_uri *parsed)
273 const char *t;
274 size_t len;
276 if ((t = sub_ip_literal(s)) != NULL ||
277 (t = sub_host_dummy(s)) != NULL) {
278 len = t - s;
279 if (len >= sizeof(parsed->scheme))
280 return NULL;
281 memcpy(parsed->host, s, len);
282 return t;
285 return NULL;
288 /*
289 * port = *digit
290 */
291 static const char *
292 parse_port(const char *s, struct phos_uri *parsed)
294 const char *errstr, *start = s;
295 size_t len;
297 while (isdigit(*s))
298 s++;
300 if (s == start)
301 return NULL;
303 len = s - start;
304 if (len >= sizeof(parsed->port))
305 return NULL;
307 memcpy(parsed->port, start, len);
309 parsed->dec_port = strtonum(parsed->port, 0, 65535, &errstr);
310 if (errstr != NULL)
311 return NULL;
313 return s;
/*
 * authority = host [ ":" port ]
 *
 * (yep, blatantly ignore the userinfo stuff -- not relevant for Gemini)
 *
 * Returns the position after the authority, or NULL if the host or the
 * port following a ":" fails to parse.
 */
static const char *
parse_authority(const char *s, struct phos_uri *parsed)
{
	if ((s = parse_host(s, parsed)) == NULL)
		return NULL;

	if (*s != ':')
		return s;

	return parse_port(s + 1, parsed);
}
334 static inline const char *
335 set_path(const char *start, const char *end, struct phos_uri *parsed)
337 size_t len;
339 if (end == NULL)
340 return NULL;
342 len = end - start;
343 if (len >= sizeof(parsed->path))
344 return NULL;
345 memcpy(parsed->path, start, len);
346 return end;
/*
 * path-abempty = *( "/" segment )
 *
 * Stores the (possibly empty) path in parsed->path and returns the
 * position after it, or NULL if it does not fit.
 */
static const char *
parse_path_abempty(const char *s, struct phos_uri *parsed)
{
	return set_path(s, sub_path_common(s), parsed);
}
/*
 * path-absolute = "/" [ segment-nz *( "/" segment ) ]
 *
 * Stores the path in parsed->path and returns the position after it.
 * Returns NULL if s does not start with "/" or the path does not fit.
 */
static const char *
parse_path_absolute(const char *s, struct phos_uri *parsed)
{
	const char *start = s, *seg;

	if (*s != '/')
		return NULL;
	s++;

	/* the part after the leading slash is optional: a bare "/" is fine */
	if ((seg = sub_segment_nz(s)) != NULL)
		s = sub_path_common(seg);

	return set_path(start, s, parsed);
}
/*
 * path-noscheme = segment-nz-nc *( "/" segment )
 *
 * Stores the path in parsed->path and returns the position after it, or
 * NULL if the first segment does not match or the path does not fit.
 */
static const char *
parse_path_noscheme(const char *s, struct phos_uri *parsed)
{
	const char *end;

	if ((end = sub_segment_nz_nc(s)) == NULL)
		return NULL;
	end = sub_path_common(end);
	return set_path(s, end, parsed);
}
/*
 * path-rootless = segment-nz *( "/" segment )
 *
 * Stores the path in parsed->path and returns the position after it, or
 * NULL if the first segment is empty or the path does not fit.
 */
static const char *
parse_path_rootless(const char *s, struct phos_uri *parsed)
{
	const char *end;

	if ((end = sub_segment_nz(s)) == NULL)
		return NULL;
	end = sub_path_common(end);
	return set_path(s, end, parsed);
}
/*
 * path-empty = 0<pchar>
 *
 * Always matches without consuming anything; parsed is left untouched.
 */
static const char *
parse_path_empty(const char *s, struct phos_uri *parsed)
{
	(void)parsed;	/* kept for symmetry with the other parse_path_* */
	return s;
}
/*
 * hier-part = "//" authority path-abempty
 *           / path-absolute
 *           / path-rootless
 *           / path-empty
 *
 * Tries the alternatives in the order above.  Returns the position
 * after the matched part, or NULL if, after a leading "//", the
 * authority or the path fails to parse.
 */
static const char *
parse_hier_part(const char *s, struct phos_uri *parsed)
{
	const char *t;

	if (s[0] == '/' && s[1] == '/') {
		if ((s = parse_authority(s + 2, parsed)) == NULL)
			return NULL;
		return parse_path_abempty(s, parsed);
	}

	if ((t = parse_path_absolute(s, parsed)) != NULL)
		return t;

	if ((t = parse_path_rootless(s, parsed)) != NULL)
		return t;

	return parse_path_empty(s, parsed);
}
444 /*
445 * query = *( pchar / "/" / "?" )
446 */
447 static const char *
448 parse_query(const char *s, struct phos_uri *parsed)
450 const char *t, *start = s;
451 size_t len;
453 while (*s != '\0') {
454 if (*s == '/' || *s == '?') {
455 s++;
456 continue;
459 if ((t = sub_pchar(s)) == NULL)
460 break;
461 s = t;
464 len = s - start;
465 if (len >= sizeof(parsed->query))
466 return NULL;
468 memcpy(parsed->query, start, len);
469 return s;
472 /*
473 * fragment = *( pchar / "/" / "?" )
474 */
475 static const char *
476 parse_fragment(const char *s, struct phos_uri *parsed)
478 const char *start = s;
479 size_t len;
481 for (;;) {
482 if (*s == '\0')
483 break;
485 if (*s == '/' || *s == '?') {
486 s++;
487 continue;
490 if ((s = sub_pchar(s)) == NULL)
491 return NULL;
494 len = s - start;
495 if (len >= sizeof(parsed->fragment))
496 return NULL;
498 memcpy(parsed->fragment, start, len);
499 return s;
/*
 * URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 * Returns the position after the last component parsed, or NULL if any
 * component fails.  The caller checks whether the whole string was
 * consumed.
 */
static const char *
parse_uri(const char *s, struct phos_uri *parsed)
{
	if ((s = parse_scheme(s, parsed)) == NULL)
		return NULL;

	if (*s != ':')
		return NULL;
	s++;

	if ((s = parse_hier_part(s, parsed)) == NULL)
		return NULL;

	if (*s == '?') {
		if ((s = parse_query(s + 1, parsed)) == NULL)
			return NULL;
	}

	if (*s == '#') {
		if ((s = parse_fragment(s + 1, parsed)) == NULL)
			return NULL;
	}

	return s;
}
/*
 * relative-part = "//" authority path-abempty
 *               / path-absolute
 *               / path-noscheme
 *               / path-empty
 *
 * Tries the alternatives in the order above.  Returns the position
 * after the matched part, or NULL if, after a leading "//", the
 * authority or the path fails to parse.
 */
static const char *
parse_relative_part(const char *s, struct phos_uri *parsed)
{
	const char *t;

	if (s[0] == '/' && s[1] == '/') {
		if ((s = parse_authority(s + 2, parsed)) == NULL)
			return NULL;
		return parse_path_abempty(s, parsed);
	}

	if ((t = parse_path_absolute(s, parsed)) != NULL)
		return t;

	if ((t = parse_path_noscheme(s, parsed)) != NULL)
		return t;

	return parse_path_empty(s, parsed);
}
/*
 * relative-ref = relative-part [ "?" query ] [ "#" fragment ]
 *
 * Returns the position after the last component parsed, or NULL if any
 * component fails.
 */
static const char *
parse_relative_ref(const char *s, struct phos_uri *parsed)
{
	if ((s = parse_relative_part(s, parsed)) == NULL)
		return NULL;

	if (*s == '?') {
		if ((s = parse_query(s + 1, parsed)) == NULL)
			return NULL;
	}

	if (*s == '#') {
		if ((s = parse_fragment(s + 1, parsed)) == NULL)
			return NULL;
	}

	return s;
}
584 /*
585 * URI-reference = URI / relative-ref
586 */
587 static const char *
588 parse_uri_reference(const char *s, struct phos_uri *parsed)
590 const char *t;
592 if ((t = parse_uri(s, parsed)) != NULL)
593 return t;
594 memset(parsed, 0, sizeof(*parsed));
595 return parse_relative_ref(s, parsed);
/*
 * absolute-URI = scheme ":" hier-part [ "?" query ]
 *
 * Same as a URI, but without the fragment.  Returns the position after
 * the last component parsed, or NULL if any component fails.
 */
static const char *
parse_absolute_uri(const char *s, struct phos_uri *parsed)
{
	if ((s = parse_scheme(s, parsed)) == NULL)
		return NULL;

	if (*s != ':')
		return NULL;
	s++;

	if ((s = parse_hier_part(s, parsed)) == NULL)
		return NULL;

	if (*s == '?') {
		if ((s = parse_query(s + 1, parsed)) == NULL)
			return NULL;
	}

	return s;
}
/* normalizing fns */
/*
 * Returns 1 if str starts with prfx (an empty prfx always matches),
 * 0 otherwise.
 */
static int
hasprefix(const char *str, const char *prfx)
{
	return strncmp(str, prfx, strlen(prfx)) == 0;
}
/*
 * Drop the path segment that precedes point.
 *
 * Walks back from point to the nearest '/' strictly after start (or to
 * start itself if there is none), then moves the string at point,
 * terminator included, down to that position.  Returns the new position
 * of what used to be at point.
 */
static char *
dotdot(char *point, char *start)
{
	char *t = point;

	/*
	 * Step back before testing, so that no pointer below start is
	 * ever formed, not even when point == start.
	 */
	while (t > start) {
		t--;
		if (*t == '/')
			break;
	}

	memmove(t, point, strlen(point) + 1);
	return t;
}
652 /*
653 * This is the "Remove Dot Segments" straight outta RFC3986, section
654 * 5.2.4
655 */
656 static void
657 path_clean(struct phos_uri *uri)
659 char *in = uri->path;
661 while (in != NULL && *in != '\0') {
662 assert(in >= uri->path);
664 /* A) drop leading ../ or ./ */
665 if (hasprefix(in, "../"))
666 memmove(in, &in[3], strlen(&in[3])+1);
667 else if (hasprefix(in, "./"))
668 memmove(in, &in[2], strlen(&in[2])+1);
670 /* B) replace /./ or /. with / */
671 else if (hasprefix(in, "/./"))
672 memmove(&in[1], &in[3], strlen(&in[3])+1);
673 else if (!strcmp(in, "/."))
674 in[1] = '\0';
676 /* C) resolve dot-dot */
677 else if (hasprefix(in, "/../")) {
678 in = dotdot(in, uri->path);
679 memmove(&in[1], &in[4], strlen(&in[4])+1);
680 } else if (!strcmp(in, "/..")) {
681 in = dotdot(in, uri->path);
682 in[1] = '\0';
683 break;
686 /* D */
687 else if (!strcmp(in, "."))
688 *in = '\0';
689 else if (!strcmp(in, ".."))
690 *in = '\0';
692 /* E */
693 else
694 in = strchr(in+1, '/');
698 /*
699 * see RFC3986 5.3.3 "Merge Paths".
700 */
701 static int
702 merge_path(struct phos_uri *ret, const struct phos_uri *base,
703 const struct phos_uri *ref)
705 const char *s;
706 size_t len;
708 len = sizeof(ret->path);
710 s = strrchr(base->path, '/');
711 if ((*base->host != '\0' && *base->path == '\0') || s == NULL) {
712 strlcpy(ret->path, "/", len);
713 } else {
714 /* copy the / too */
715 memcpy(ret->path, base->path, s - base->path + 1);
718 return strlcat(ret->path, ref->path, len) < len;
/* public interface */
724 int
725 phos_parse_absolute_uri(const char *s, struct phos_uri *uri)
727 memset(uri, 0, sizeof(*uri));
729 if ((s = parse_absolute_uri(s, uri)) == NULL)
730 return 0;
731 if (*s != '\0')
732 return 0;
733 path_clean(uri);
734 return 1;
737 int
738 phos_parse_uri_reference(const char *s, struct phos_uri *uri)
740 memset(uri, 0, sizeof(*uri));
742 if ((s = parse_uri_reference(s, uri)) == NULL)
743 return 0;
744 if (*s != '\0')
745 return 0;
746 path_clean(uri);
747 return 1;
750 /*
751 * Implementation of the "transform references" algorithm from
752 * RFC3986, see 5.2.2.
754 * We expect base and ref to be URIs constructed by this library
755 * (because we emit only normalized URIs).
757 * ATM this is marked as private because:
758 * - let's say the URI is "."
759 * - one calls phos_parse_uri_references
760 * - it exists with success, but the path becomes ""
761 * - this routine does the right thing, but the outcome is not what expected.
763 * so users for now have to user resolve_uri_from_str, which parses
764 * the URI but not normalize it, and then call into us.
765 */
766 static int
767 phos_resolve_uri_from(const struct phos_uri *base, const struct phos_uri *ref,
768 struct phos_uri *ret)
770 memset(ret, 0, sizeof(*ret));
772 if (*ref->scheme != '\0') {
773 strlcpy(ret->scheme, ref->scheme, sizeof(ret->scheme));
774 strlcpy(ret->host, ref->host, sizeof(ret->host));
775 strlcpy(ret->port, ref->port, sizeof(ret->port));
776 ret->dec_port = ret->dec_port;
777 strlcpy(ret->path, ref->path, sizeof(ret->path));
778 strlcpy(ret->query, ref->query, sizeof(ret->query));
779 } else {
780 if (*ref->host != '\0') {
781 strlcpy(ret->host, ref->host, sizeof(ret->host));
782 strlcpy(ret->port, ref->port, sizeof(ret->port));
783 ret->dec_port = ret->dec_port;
784 strlcpy(ret->path, ref->path, sizeof(ret->path));
785 strlcpy(ret->query, ref->query, sizeof(ret->query));
786 } else {
787 if (*ref->path == '\0') {
788 strlcpy(ret->path, base->path, sizeof(ret->path));
789 if (*ref->query != '\0')
790 strlcpy(ret->query, ref->query, sizeof(ret->query));
791 else
792 strlcpy(ret->query, base->query, sizeof(ret->query));
793 } else {
794 if (*ref->path == '/')
795 strlcpy(ret->path, ref->path, sizeof(ret->path));
796 else {
797 if (!merge_path(ret, base, ref))
798 return 0;
800 path_clean(ret);
802 strlcpy(ret->query, ref->query, sizeof(ret->query));
805 strlcpy(ret->host, base->host, sizeof(ret->host));
806 strlcpy(ret->port, base->port, sizeof(ret->port));
807 ret->dec_port = base->dec_port;
810 strlcpy(ret->scheme, base->scheme, sizeof(ret->scheme));
813 strlcpy(ret->fragment, ref->fragment, sizeof(ret->fragment));
815 return 1;
818 int
819 phos_resolve_uri_from_str(const struct phos_uri *base, const char *refstr,
820 struct phos_uri *ret)
822 struct phos_uri ref;
824 memset(&ref, 0, sizeof(ref));
826 if ((refstr = parse_uri_reference(refstr, &ref)) == NULL)
827 return 0;
829 if (*refstr != '\0')
830 return 0;
832 return phos_resolve_uri_from(base, &ref, ret);
835 void
836 phos_uri_drop_empty_segments(struct phos_uri *uri)
838 char *i;
840 for (i = uri->path; *i; ++i) {
841 if (*i == '/' && *(i+1) == '/') {
842 memmove(i, i+1, strlen(i)); /* move also the \0 */
843 i--;
848 int
849 phos_uri_set_query(struct phos_uri *uri, const char *query)
851 char *out;
852 int t;
853 size_t len;
855 len = sizeof(uri->query);
856 out = uri->query;
857 memset(uri->query, 0, len);
859 for (; *query != '\0' && len > 0; ++query) {
860 if (*query == '/' ||
861 *query == '?' ||
862 *query == ':' ||
863 *query == '@' ||
864 unreserved(*query) ||
865 sub_delims(*query)) {
866 *out++ = *query;
867 len--;
868 } else {
869 if (len <= 4)
870 break;
871 len -= 3;
872 *out++ = '%';
873 t = *query;
874 sprintf(out, "%02X", t);
875 out += 2;
879 return *query == '\0';
882 int
883 phos_serialize_uri(const struct phos_uri *uri, char *buf, size_t len)
885 #define CAT(s) \
886 if (strlcat(buf, s, len) >= len) \
887 return 0;
889 strlcpy(buf, "", len);
891 if (*uri->scheme != '\0') {
892 CAT(uri->scheme);
893 CAT(":");
896 if (*uri->host != '\0' || strcmp(uri->scheme, "file") == 0) {
897 /*
898 * The file URI scheme has a quirk that even if a
899 * hostname is not present, we still have to append
900 * the two slashes. This is why we have
901 * file:///etc/hosts and not file:/etc/hosts
902 */
903 CAT("//");
904 CAT(uri->host);
907 if (*uri->port != '\0' && strcmp(uri->port, "1965")) {
908 CAT(":");
909 CAT(uri->port);
912 CAT(uri->path);
914 if (*uri->query != '\0') {
915 CAT("?");
916 CAT(uri->query);
919 if (*uri->fragment) {
920 CAT("#");
921 CAT(uri->fragment);
924 return 1;
926 #undef CAT