Blob


1 /*
2 * Copyright (c) 2021 Omar Polo <op@omarpolo.com>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
/*
 * TODOs:
 * - distinguish between an empty component and an undefined one
 * - ...
 */
23 #include <assert.h>
25 #include "compat.h"
27 #include "phos.h"
29 #include <ctype.h>
30 #include <stdint.h>
31 #include <stdio.h>
32 #include <stdlib.h>
33 #include <string.h>
35 static const char *sub_ip_literal(const char*);
36 static const char *sub_host_dummy(const char*);
37 static const char *sub_pchar(const char*);
39 static const char *sub_segment(const char*);
40 static const char *sub_segment_nz(const char*);
41 static const char *sub_segment_nz_nc(const char*);
42 static const char *sub_path_common(const char*);
44 static const char *parse_scheme(const char*, struct phos_uri*);
45 static const char *parse_host(const char*, struct phos_uri*);
46 static const char *parse_port(const char*, struct phos_uri*);
47 static const char *parse_authority(const char*, struct phos_uri*);
48 static const char *parse_path_abempty(const char*, struct phos_uri*);
49 static const char *parse_path_absolute(const char*, struct phos_uri*);
50 static const char *parse_path_noscheme(const char*, struct phos_uri*);
51 static const char *parse_path_rootless(const char*, struct phos_uri*);
52 static const char *parse_path_empty(const char*, struct phos_uri*);
53 static const char *parse_hier_part(const char*, struct phos_uri*);
54 static const char *parse_query(const char*, struct phos_uri*);
55 static const char *parse_fragment(const char*, struct phos_uri*);
56 static const char *parse_uri(const char*, struct phos_uri*);
57 static const char *parse_relative_part(const char*, struct phos_uri*);
58 static const char *parse_relative_ref(const char*, struct phos_uri*);
59 static const char *parse_uri_reference(const char*, struct phos_uri*);
61 static int hasprefix(const char*, const char*);
62 static char *dotdot(char*, char*);
63 static void path_clean(struct phos_uri*);
64 static int merge_path(struct phos_uri*, const struct phos_uri*, const struct phos_uri*);
66 static int phos_resolve_uri_from(const struct phos_uri*, const struct phos_uri*, struct phos_uri*);
69 /* common defs */
71 #if unused
/*
 * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 *
 * Returns 1 when c is one of the characters above, 0 otherwise.
 */
static inline int
gen_delims(int c)
{
	switch (c) {
	case ':':
	case '/':
	case '?':
	case '#':
	case '[':
	case ']':
	case '@':
		return 1;
	default:
		return 0;
	}
}
83 #endif
/*
 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
 *            / "*" / "+" / "," / ";" / "="
 *
 * Returns 1 when c is one of the characters above, 0 otherwise.
 */
static inline int
sub_delims(int c)
{
	switch (c) {
	case '!':
	case '$':
	case '&':
	case '\'':
	case '(':
	case ')':
	case '*':
	case '+':
	case ',':
	case ';':
	case '=':
		return 1;
	default:
		return 0;
	}
}
101 #if unused
/*
 * reserved = gen-delims / sub-delims
 *
 * Returns 1 when c belongs to either class, 0 otherwise.
 */
static inline int
reserved(int c)
{
	if (gen_delims(c))
		return 1;
	return sub_delims(c) != 0;
}
107 #endif
/*
 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
 *
 * Returns non-zero when c is an unreserved character, 0 otherwise.
 *
 * Callers pass plain `char' values, which may be negative; the
 * <ctype.h> classifiers are only defined for values representable as
 * unsigned char (or EOF), so convert before classifying.
 */
static inline int
unreserved(int c)
{
	unsigned char uc = (unsigned char)c;

	return isalpha(uc)
	    || isdigit(uc)
	    || c == '-'
	    || c == '.'
	    || c == '_'
	    || c == '~';
}
121 /* subs */
/*
 * IP-literal = "[" ( IPv6address / IPvFuture ) "]"
 *
 * The content between the brackets is not validated: anything of the
 * form [.*] is accepted.  Returns a pointer just past the closing
 * bracket, or NULL if s does not start with '[' or the bracket is
 * never closed.
 */
static const char *
sub_ip_literal(const char *s)
{
	const char *close;

	if (*s != '[')
		return NULL;

	close = strchr(s, ']');
	return close != NULL ? close + 1 : NULL;
}
/*
 * Relaxed host matcher: consume everything up to the first ':' or '/'
 * (or the end of the string).  It never fails; an empty host is
 * technically valid, so a zero-length match is fine.
 */
static const char *
sub_host_dummy(const char *s)
{
	return s + strcspn(s, ":/");
}
154 /*
155 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
156 */
157 static const char *
158 sub_pchar(const char *s)
160 if (*s == '\0')
161 return NULL;
163 if (unreserved(*s))
164 return ++s;
166 if (*s == '%') {
167 if (isxdigit(s[1]) && isxdigit(s[2]))
168 return s + 3;
171 if (sub_delims(*s))
172 return ++s;
174 if (*s == ':' || *s == '@')
175 return ++s;
177 return NULL;
180 /*
181 * segment = *pchar
182 */
183 static const char *
184 sub_segment(const char *s)
186 const char *t;
188 while ((t = sub_pchar(s)) != NULL)
189 s = t;
190 return s;
193 /* segment-nz = 1*pchar */
194 static const char *
195 sub_segment_nz(const char *s)
197 if ((s = sub_pchar(s)) == NULL)
198 return NULL;
199 return sub_segment(s);
202 /*
203 * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
205 * so, 1*pchar excluding ":"
206 */
207 static const char *
208 sub_segment_nz_nc(const char *s)
210 const char *t;
212 if (*s == ':')
213 return NULL;
215 while (*s != ':' && (t = sub_pchar(s)) != NULL)
216 s = t;
217 return s;
/*
 * *( "/" segment )
 *
 * Consume zero or more "/segment" groups; never fails.  Returns a
 * pointer to the first byte after the last group.
 */
static const char *
sub_path_common(const char *s)
{
	while (*s == '/')
		s = sub_segment(s + 1);
	return s;
}
233 /* parse fns */
235 /*
236 * scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
237 */
238 static const char *
239 parse_scheme(const char *s, struct phos_uri *parsed)
241 const char *start = s;
242 size_t len;
244 if (!isalpha(*s))
245 return NULL;
247 while (*s != '\0') {
248 if (isalpha(*s) ||
249 isdigit(*s) ||
250 *s == '+' ||
251 *s == '-' ||
252 *s == '.')
253 s++;
254 else
255 break;
258 if (*s == '\0')
259 return NULL;
261 len = s - start;
262 if (len >= sizeof(parsed->scheme))
263 return NULL;
265 memcpy(parsed->scheme, start, len);
266 return s;
269 /*
270 * host = IP-literal / IPv4address / reg-name
272 * rules IPv4address and reg-name are relaxed into parse_host_dummy.
273 */
274 static const char *
275 parse_host(const char *s, struct phos_uri *parsed)
277 const char *t;
278 size_t len;
280 if ((t = sub_ip_literal(s)) != NULL ||
281 (t = sub_host_dummy(s)) != NULL) {
282 len = t - s;
283 if (len >= sizeof(parsed->scheme))
284 return NULL;
285 memcpy(parsed->host, s, len);
286 return t;
289 return NULL;
292 /*
293 * port = *digit
294 */
295 static const char *
296 parse_port(const char *s, struct phos_uri *parsed)
298 const char *errstr, *start = s;
299 size_t len;
301 while (isdigit(*s))
302 s++;
304 if (s == start)
305 return NULL;
307 len = s - start;
308 if (len >= sizeof(parsed->port))
309 return NULL;
311 memcpy(parsed->port, start, len);
313 parsed->dec_port = strtonum(parsed->port, 0, 65535, &errstr);
314 if (errstr != NULL)
315 return NULL;
317 return s;
320 /*
321 * authority = host [ ":" port ]
322 * (yep, blatantly ignore the userinfo stuff -- not relevant for Gemini)
323 */
324 static const char *
325 parse_authority(const char *s, struct phos_uri *parsed)
327 if ((s = parse_host(s, parsed)) == NULL)
328 return NULL;
330 if (*s == ':') {
331 s++;
332 return parse_port(s, parsed);
335 return s;
338 static inline const char *
339 set_path(const char *start, const char *end, struct phos_uri *parsed)
341 size_t len;
343 if (end == NULL)
344 return NULL;
346 len = end - start;
347 if (len >= sizeof(parsed->path))
348 return NULL;
349 memcpy(parsed->path, start, len);
350 return end;
/*
 * path-abempty = *( "/" segment )
 *
 * Stores the matched text as the path; fails only if it does not fit.
 */
static const char *
parse_path_abempty(const char *s, struct phos_uri *parsed)
{
	return set_path(s, sub_path_common(s), parsed);
}
365 /*
366 * path-absolute = "/" [ segment-nz *( "/" segment ) ]
367 */
368 static const char *
369 parse_path_absolute(const char *s, struct phos_uri *parsed)
371 const char *t, *start = s;
373 if (*s != '/')
374 return NULL;
376 s++;
377 if ((t = sub_segment_nz(s)) == NULL)
378 return set_path(start, s, parsed);
380 s = sub_path_common(t);
381 return set_path(start, s, parsed);
384 /*
385 * path-noscheme = segment-nz-nc *( "/" segment )
386 */
387 static const char *
388 parse_path_noscheme(const char *s, struct phos_uri *parsed)
390 const char *start = s;
392 if ((s = sub_segment_nz_nc(s)) == NULL)
393 return NULL;
394 s = sub_path_common(s);
395 return set_path(start, s, parsed);
398 /*
399 * path-rootless = segment-nz *( "/" segment )
400 */
401 static const char *
402 parse_path_rootless(const char *s, struct phos_uri *parsed)
404 const char *start = s;
406 if ((s = sub_segment_nz(s)) == NULL)
407 return NULL;
408 s = sub_path_common(s);
409 return set_path(start, s, parsed);
/*
 * path-empty = 0<pchar>
 *
 * Always matches without consuming anything; parsed is left untouched.
 */
static const char *
parse_path_empty(const char *s, struct phos_uri *parsed)
{
	(void)parsed;
	return s;
}
421 /*
422 * hier-part = "//" authority path-abempty
423 * / path-absolute
424 * / path-rootless
425 * / path-empty
426 */
427 static const char *
428 parse_hier_part(const char *s, struct phos_uri *parsed)
430 const char *t;
432 if (s[0] == '/' && s[1] == '/') {
433 s += 2;
434 if ((s = parse_authority(s, parsed)) == NULL)
435 return NULL;
436 return parse_path_abempty(s, parsed);
439 if ((t = parse_path_absolute(s, parsed)) != NULL)
440 return t;
442 if ((t = parse_path_rootless(s, parsed)) != NULL)
443 return t;
445 return parse_path_empty(s, parsed);
448 /*
449 * query = *( pchar / "/" / "?" )
450 */
451 static const char *
452 parse_query(const char *s, struct phos_uri *parsed)
454 const char *t, *start = s;
455 size_t len;
457 while (*s != '\0') {
458 if (*s == '/' || *s == '?') {
459 s++;
460 continue;
463 if ((t = sub_pchar(s)) == NULL)
464 break;
465 s = t;
468 len = s - start;
469 if (len >= sizeof(parsed->query))
470 return NULL;
472 memcpy(parsed->query, start, len);
473 return s;
476 /*
477 * fragment = *( pchar / "/" / "?" )
478 */
479 static const char *
480 parse_fragment(const char *s, struct phos_uri *parsed)
482 const char *start = s;
483 size_t len;
485 for (;;) {
486 if (*s == '\0')
487 break;
489 if (*s == '/' || *s == '?') {
490 s++;
491 continue;
494 if ((s = sub_pchar(s)) == NULL)
495 return NULL;
498 len = s - start;
499 if (len >= sizeof(parsed->fragment))
500 return NULL;
502 memcpy(parsed->fragment, start, len);
503 return s;
506 /*
507 * URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
508 */
509 static const char *
510 parse_uri(const char *s, struct phos_uri *parsed)
512 if ((s = parse_scheme(s, parsed)) == NULL)
513 return NULL;
515 if (*s != ':')
516 return NULL;
518 s++;
519 if ((s = parse_hier_part(s, parsed)) == NULL)
520 return NULL;
522 if (*s == '?') {
523 s++;
524 if ((s = parse_query(s, parsed)) == NULL)
525 return NULL;
528 if (*s == '#') {
529 s++;
530 if ((s = parse_fragment(s, parsed)) == NULL)
531 return NULL;
534 return s;
537 /*
538 * relative-part = "//" authority path-abempty
539 * / path-absolute
540 * / path-noscheme
541 * / path-empty
542 */
543 static const char *
544 parse_relative_part(const char *s, struct phos_uri *parsed)
546 const char *t;
548 if (s[0] == '/' && s[1] == '/') {
549 s += 2;
550 if ((s = parse_authority(s, parsed)) == NULL)
551 return NULL;
552 return parse_path_abempty(s, parsed);
555 if ((t = parse_path_absolute(s, parsed)) != NULL)
556 return t;
558 if ((t = parse_path_noscheme(s, parsed)) != NULL)
559 return t;
561 return parse_path_empty(s, parsed);
564 /*
565 * relative-ref = relative-part [ "?" query ] [ "#" fragment ]
566 */
567 static const char *
568 parse_relative_ref(const char *s, struct phos_uri *parsed)
570 if ((s = parse_relative_part(s, parsed)) == NULL)
571 return NULL;
573 if (*s == '?') {
574 s++;
575 if ((s = parse_query(s, parsed)) == NULL)
576 return NULL;
579 if (*s == '#') {
580 s++;
581 if ((s = parse_fragment(s, parsed)) == NULL)
582 return NULL;
585 return s;
588 /*
589 * URI-reference = URI / relative-ref
590 */
591 static const char *
592 parse_uri_reference(const char *s, struct phos_uri *parsed)
594 const char *t;
596 if ((t = parse_uri(s, parsed)) != NULL)
597 return t;
598 memset(parsed, 0, sizeof(*parsed));
599 return parse_relative_ref(s, parsed);
603 /*
604 * absolute-URI = scheme ":" hier-part [ "?" query ]
605 */
606 static const char *
607 parse_absolute_uri(const char *s, struct phos_uri *parsed)
609 if ((s = parse_scheme(s, parsed)) == NULL)
610 return NULL;
612 if (*s != ':')
613 return NULL;
615 s++;
616 if ((s = parse_hier_part(s, parsed)) == NULL)
617 return NULL;
619 if (*s == '?') {
620 s++;
621 if ((s = parse_query(s, parsed)) == NULL)
622 return NULL;
625 return s;
629 /* normalizing fns */
/*
 * Returns 1 if str begins with prfx (an empty prfx always matches),
 * 0 otherwise.
 */
static int
hasprefix(const char *str, const char *prfx)
{
	return strncmp(str, prfx, strlen(prfx)) == 0;
}
/*
 * Drop the path segment that immediately precedes point.
 *
 * point and start must point into the same NUL-terminated buffer, with
 * start <= point.  Walk back from point to the closest '/' (or to
 * start if there is none), then shift the string at point, terminator
 * included, down onto that position.  Returns the position where the
 * shifted text now begins.
 *
 * The scan never steps before start, so no out-of-bounds pointer is
 * formed even when point == start.
 */
static char *
dotdot(char *point, char *start)
{
	char *t = point;

	while (t > start) {
		t--;
		if (*t == '/')
			break;
	}

	memmove(t, point, strlen(point) + 1);
	return t;
}
656 /*
657 * This is the "Remove Dot Segments" straight outta RFC3986, section
658 * 5.2.4
659 */
660 static void
661 path_clean(struct phos_uri *uri)
663 char *in = uri->path;
665 while (in != NULL && *in != '\0') {
666 assert(in >= uri->path);
668 /* A) drop leading ../ or ./ */
669 if (hasprefix(in, "../"))
670 memmove(in, &in[3], strlen(&in[3])+1);
671 else if (hasprefix(in, "./"))
672 memmove(in, &in[2], strlen(&in[2])+1);
674 /* B) replace /./ or /. with / */
675 else if (hasprefix(in, "/./"))
676 memmove(&in[1], &in[3], strlen(&in[3])+1);
677 else if (!strcmp(in, "/."))
678 in[1] = '\0';
680 /* C) resolve dot-dot */
681 else if (hasprefix(in, "/../")) {
682 in = dotdot(in, uri->path);
683 memmove(&in[1], &in[4], strlen(&in[4])+1);
684 } else if (!strcmp(in, "/..")) {
685 in = dotdot(in, uri->path);
686 in[1] = '\0';
687 break;
690 /* D */
691 else if (!strcmp(in, "."))
692 *in = '\0';
693 else if (!strcmp(in, ".."))
694 *in = '\0';
696 /* E */
697 else
698 in = strchr(in+1, '/');
702 /*
703 * see RFC3986 5.3.3 "Merge Paths".
704 */
705 static int
706 merge_path(struct phos_uri *ret, const struct phos_uri *base,
707 const struct phos_uri *ref)
709 const char *s;
710 size_t len;
712 len = sizeof(ret->path);
714 s = strrchr(base->path, '/');
715 if ((*base->host != '\0' && *base->path == '\0') || s == NULL) {
716 strlcpy(ret->path, "/", len);
717 } else {
718 /* copy the / too */
719 memcpy(ret->path, base->path, s - base->path + 1);
722 return strlcat(ret->path, ref->path, len) < len;
726 /* public interface */
728 int
729 phos_parse_absolute_uri(const char *s, struct phos_uri *uri)
731 memset(uri, 0, sizeof(*uri));
733 if ((s = parse_absolute_uri(s, uri)) == NULL)
734 return 0;
735 if (*s != '\0')
736 return 0;
737 path_clean(uri);
738 return 1;
741 int
742 phos_parse_uri_reference(const char *s, struct phos_uri *uri)
744 memset(uri, 0, sizeof(*uri));
746 if ((s = parse_uri_reference(s, uri)) == NULL)
747 return 0;
748 if (*s != '\0')
749 return 0;
750 path_clean(uri);
751 return 1;
754 /*
755 * Implementation of the "transform references" algorithm from
756 * RFC3986, see 5.2.2.
758 * We expect base and ref to be URIs constructed by this library
759 * (because we emit only normalized URIs).
761 * ATM this is marked as private because:
762 * - let's say the URI is "."
763 * - one calls phos_parse_uri_references
764 * - it exists with success, but the path becomes ""
765 * - this routine does the right thing, but the outcome is not what expected.
767 * so users for now have to user resolve_uri_from_str, which parses
768 * the URI but not normalize it, and then call into us.
769 */
770 static int
771 phos_resolve_uri_from(const struct phos_uri *base, const struct phos_uri *ref,
772 struct phos_uri *ret)
774 memset(ret, 0, sizeof(*ret));
776 if (*ref->scheme != '\0') {
777 strlcpy(ret->scheme, ref->scheme, sizeof(ret->scheme));
778 strlcpy(ret->host, ref->host, sizeof(ret->host));
779 strlcpy(ret->port, ref->port, sizeof(ret->port));
780 ret->dec_port = ret->dec_port;
781 strlcpy(ret->path, ref->path, sizeof(ret->path));
782 strlcpy(ret->query, ref->query, sizeof(ret->query));
783 } else {
784 if (*ref->host != '\0') {
785 strlcpy(ret->host, ref->host, sizeof(ret->host));
786 strlcpy(ret->port, ref->port, sizeof(ret->port));
787 ret->dec_port = ret->dec_port;
788 strlcpy(ret->path, ref->path, sizeof(ret->path));
789 strlcpy(ret->query, ref->query, sizeof(ret->query));
790 } else {
791 if (*ref->path == '\0') {
792 strlcpy(ret->path, base->path, sizeof(ret->path));
793 if (*ref->query != '\0')
794 strlcpy(ret->query, ref->query, sizeof(ret->query));
795 else
796 strlcpy(ret->query, base->query, sizeof(ret->query));
797 } else {
798 if (*ref->path == '/')
799 strlcpy(ret->path, ref->path, sizeof(ret->path));
800 else {
801 if (!merge_path(ret, base, ref))
802 return 0;
804 path_clean(ret);
806 strlcpy(ret->query, ref->query, sizeof(ret->query));
809 strlcpy(ret->host, base->host, sizeof(ret->host));
810 strlcpy(ret->port, base->port, sizeof(ret->port));
811 ret->dec_port = base->dec_port;
814 strlcpy(ret->scheme, base->scheme, sizeof(ret->scheme));
817 strlcpy(ret->fragment, ref->fragment, sizeof(ret->fragment));
819 return 1;
822 int
823 phos_resolve_uri_from_str(const struct phos_uri *base, const char *refstr,
824 struct phos_uri *ret)
826 struct phos_uri ref;
828 memset(&ref, 0, sizeof(ref));
830 if ((refstr = parse_uri_reference(refstr, &ref)) == NULL)
831 return 0;
833 if (*refstr != '\0')
834 return 0;
836 return phos_resolve_uri_from(base, &ref, ret);
839 void
840 phos_uri_drop_empty_segments(struct phos_uri *uri)
842 char *i;
844 for (i = uri->path; *i; ++i) {
845 if (*i == '/' && *(i+1) == '/') {
846 memmove(i, i+1, strlen(i)); /* move also the \0 */
847 i--;
852 int
853 phos_uri_set_query(struct phos_uri *uri, const char *query)
855 char *out;
856 int t;
857 size_t len;
859 len = sizeof(uri->query);
860 out = uri->query;
861 memset(uri->query, 0, len);
863 for (; *query != '\0' && len > 0; ++query) {
864 if (*query == '/' ||
865 *query == '?' ||
866 *query == ':' ||
867 *query == '@' ||
868 unreserved(*query) ||
869 sub_delims(*query)) {
870 *out++ = *query;
871 len--;
872 } else {
873 if (len <= 4)
874 break;
875 len -= 3;
876 *out++ = '%';
877 t = *query;
878 sprintf(out, "%02X", t);
879 out += 2;
883 return *query == '\0';
886 int
887 phos_serialize_uri(const struct phos_uri *uri, char *buf, size_t len)
889 #define CAT(s) \
890 if (strlcat(buf, s, len) >= len) \
891 return 0;
893 strlcpy(buf, "", len);
895 if (*uri->scheme != '\0') {
896 CAT(uri->scheme);
897 CAT(":");
900 if (*uri->host != '\0' || strcmp(uri->scheme, "file") == 0) {
901 /*
902 * The file URI scheme has a quirk that even if a
903 * hostname is not present, we still have to append
904 * the two slashes. This is why we have
905 * file:///etc/hosts and not file:/etc/hosts
906 */
907 CAT("//");
908 CAT(uri->host);
911 if (*uri->port != '\0' && strcmp(uri->port, "1965")) {
912 CAT(":");
913 CAT(uri->port);
916 CAT(uri->path);
918 if (*uri->query != '\0') {
919 CAT("?");
920 CAT(uri->query);
923 if (*uri->fragment) {
924 CAT("#");
925 CAT(uri->fragment);
928 return 1;
930 #undef CAT