Blob


1 /*
2 * Copyright (c) 2021 Omar Polo <op@omarpolo.com>
3 *
4 * Permission to use, copy, modify, and distribute this software for any
5 * purpose with or without fee is hereby granted, provided that the above
6 * copyright notice and this permission notice appear in all copies.
7 *
8 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
15 */
/*
 * TODOs:
 * - distinguish between an empty component and an undefined one
 * - ...
 */
23 #include <assert.h>
25 #include "compat.h"
27 #include "phos.h"
29 #include <ctype.h>
30 #include <stdint.h>
31 #include <stdlib.h>
32 #include <string.h>
/* grammar helpers: return the end of the match, they store nothing */
static const char	*sub_ip_literal(const char *s);
static const char	*sub_host_dummy(const char *s);
static const char	*sub_pchar(const char *s);
static const char	*sub_segment(const char *s);
static const char	*sub_segment_nz(const char *s);
static const char	*sub_segment_nz_nc(const char *s);
static const char	*sub_path_common(const char *s);

/* component parsers: return the end of the match and fill `parsed' */
static const char	*parse_scheme(const char *s, struct phos_uri *parsed);
static const char	*parse_host(const char *s, struct phos_uri *parsed);
static const char	*parse_port(const char *s, struct phos_uri *parsed);
static const char	*parse_authority(const char *s, struct phos_uri *parsed);
static const char	*parse_path_abempty(const char *s, struct phos_uri *parsed);
static const char	*parse_path_absolute(const char *s, struct phos_uri *parsed);
static const char	*parse_path_noscheme(const char *s, struct phos_uri *parsed);
static const char	*parse_path_rootless(const char *s, struct phos_uri *parsed);
static const char	*parse_path_empty(const char *s, struct phos_uri *parsed);
static const char	*parse_hier_part(const char *s, struct phos_uri *parsed);
static const char	*parse_query(const char *s, struct phos_uri *parsed);
static const char	*parse_fragment(const char *s, struct phos_uri *parsed);
static const char	*parse_uri(const char *s, struct phos_uri *parsed);
static const char	*parse_relative_part(const char *s, struct phos_uri *parsed);
static const char	*parse_relative_ref(const char *s, struct phos_uri *parsed);
static const char	*parse_uri_reference(const char *s, struct phos_uri *parsed);

/* normalization helpers */
static int		 hasprefix(const char *str, const char *prfx);
static char		*dotdot(char *point, char *start);
static void		 path_clean(struct phos_uri *uri);
static int		 merge_path(struct phos_uri *ret, const struct phos_uri *base,
			    const struct phos_uri *ref);

static int		 phos_resolve_uri_from(const struct phos_uri *base,
			    const struct phos_uri *ref, struct phos_uri *ret);
67 /* common defs */
/*
 * gen-delims = ":" / "/" / "?" / "#" / "[" / "]" / "@"
 *
 * Returns 1 if c is one of the characters above, 0 otherwise.
 */
static inline int
gen_delims(int c)
{
	switch (c) {
	case ':':
	case '/':
	case '?':
	case '#':
	case '[':
	case ']':
	case '@':
		return 1;
	default:
		return 0;
	}
}
/*
 * sub-delims = "!" / "$" / "&" / "'" / "(" / ")"
 *            / "*" / "+" / "," / ";" / "="
 *
 * Returns 1 if c is one of the characters above, 0 otherwise.
 */
static inline int
sub_delims(int c)
{
	switch (c) {
	case '!':
	case '$':
	case '&':
	case '\'':
	case '(':
	case ')':
	case '*':
	case '+':
	case ',':
	case ';':
	case '=':
		return 1;
	default:
		return 0;
	}
}
/*
 * reserved = gen-delims / sub-delims
 *
 * Returns 1 if c belongs to either delimiter set, 0 otherwise.
 */
static inline int
reserved(int c)
{
	if (gen_delims(c))
		return 1;
	return sub_delims(c) ? 1 : 0;
}
/*
 * unreserved = ALPHA / DIGIT / "-" / "." / "_" / "~"
 *
 * Returns 1 if c is an unreserved character, 0 otherwise.
 *
 * Callers hand us plain `char' values, which may be negative for
 * bytes >= 0x80; passing those to the <ctype.h> functions is
 * undefined.  ALPHA and DIGIT are ASCII-only anyway, so anything
 * outside 0..127 is rejected before the classification.
 */
static inline int
unreserved(int c)
{
	if (c < 0 || c > 127)
		return 0;

	return isalnum(c)
	    || c == '-'
	    || c == '.'
	    || c == '_'
	    || c == '~';
}
115 /* subs */
/*
 * IP-literal = "[" ( IPv6address / IPvFuture ) "]"
 *
 * In reality we accept "[" followed by anything up to the first "]".
 *
 * Returns a pointer just past the closing bracket, or NULL if s does
 * not start with "[" or the bracket is never closed.
 */
static const char *
sub_ip_literal(const char *s)
{
	const char *closing;

	if (*s != '[')
		return NULL;

	closing = strchr(s, ']');
	if (closing == NULL)
		return NULL;

	return closing + 1;
}
/*
 * Relaxed host matcher: take everything up to the first ":", "/",
 * "?" or "#" (or the end of the string).
 *
 * "?" and "#" have to stop the scan as well: an authority may be
 * followed directly by a query or a fragment (e.g. "//host?q"), and
 * those must not end up inside the host.
 *
 * NB: empty hosts are technically valid, so this never fails; it
 * returns a pointer to the first character that is not part of the
 * host.
 */
static const char *
sub_host_dummy(const char *s)
{
	return s + strcspn(s, ":/?#");
}
/*
 * pchar = unreserved / pct-encoded / sub-delims / ":" / "@"
 *
 * Returns a pointer past the matched pchar (one byte, or three for a
 * percent-encoded triplet), or NULL if s does not start with a pchar
 * (including when s is the empty string).
 *
 * The bytes are read as unsigned char so that values >= 0x80 are
 * never handed to the <ctype.h> functions as negative numbers.
 */
static const char *
sub_pchar(const char *s)
{
	const unsigned char *u = (const unsigned char *)s;

	if (*u == '\0')
		return NULL;

	if (unreserved(*u))
		return s + 1;

	if (*u == '%') {
		/* && short-circuits, so u[2] is read only if u[1] exists */
		if (isxdigit(u[1]) && isxdigit(u[2]))
			return s + 3;
		return NULL;
	}

	if (sub_delims(*u))
		return s + 1;

	if (*u == ':' || *u == '@')
		return s + 1;

	return NULL;
}
/*
 * segment = *pchar
 *
 * Consumes as many pchars as possible and returns a pointer to the
 * first byte that is not one.  Never fails: an empty match returns s.
 */
static const char *
sub_segment(const char *s)
{
	for (;;) {
		const char *next = sub_pchar(s);

		if (next == NULL)
			return s;
		s = next;
	}
}
/*
 * segment-nz = 1*pchar
 *
 * Like sub_segment, but at least one pchar is required: returns NULL
 * when s does not start with a pchar.
 */
static const char *
sub_segment_nz(const char *s)
{
	const char *rest = sub_pchar(s);

	if (rest == NULL)
		return NULL;
	return sub_segment(rest);
}
/*
 * segment-nz-nc = 1*( unreserved / pct-encoded / sub-delims / "@" )
 *
 * so, 1*pchar excluding ":"
 *
 * Returns a pointer past the matched run, or NULL when not even one
 * character matches: the grammar requires a non-empty segment, so an
 * empty match (s starting with ":" or with a non-pchar) is a failure.
 */
static const char *
sub_segment_nz_nc(const char *s)
{
	const char *start = s;
	const char *t;

	while (*s != ':' && (t = sub_pchar(s)) != NULL)
		s = t;

	if (s == start)
		return NULL;
	return s;
}
/*
 * *( "/" segment )
 *
 * Consumes zero or more "/"-prefixed segments and returns a pointer
 * to the first byte after them.  Never fails.
 */
static const char *
sub_path_common(const char *s)
{
	while (*s == '/')
		s = sub_segment(s + 1);
	return s;
}
227 /* parse fns */
229 /*
230 * scheme = ALPHA *( ALPHA / DIGIT / "+" / "-" / "." )
231 */
232 static const char *
233 parse_scheme(const char *s, struct phos_uri *parsed)
235 const char *start = s;
236 size_t len;
238 if (!isalpha(*s))
239 return NULL;
241 while (*s != '\0') {
242 if (isalpha(*s) ||
243 isdigit(*s) ||
244 *s == '+' ||
245 *s == '-' ||
246 *s == '.')
247 s++;
248 else
249 break;
252 if (*s == '\0')
253 return NULL;
255 len = s - start;
256 if (len >= sizeof(parsed->scheme))
257 return NULL;
259 memcpy(parsed->scheme, start, len);
260 return s;
263 /*
264 * host = IP-literal / IPv4address / reg-name
266 * rules IPv4address and reg-name are relaxed into parse_host_dummy.
267 */
268 static const char *
269 parse_host(const char *s, struct phos_uri *parsed)
271 const char *t;
272 size_t len;
274 if ((t = sub_ip_literal(s)) != NULL ||
275 (t = sub_host_dummy(s)) != NULL) {
276 len = t - s;
277 if (len >= sizeof(parsed->scheme))
278 return NULL;
279 memcpy(parsed->host, s, len);
280 return t;
283 return NULL;
286 /*
287 * port = *digit
288 */
289 static const char *
290 parse_port(const char *s, struct phos_uri *parsed)
292 const char *errstr, *start = s;
293 size_t len;
295 while (isdigit(*s))
296 s++;
298 if (s == start)
299 return NULL;
301 len = s - start;
302 if (len >= sizeof(parsed->port))
303 return NULL;
305 memcpy(parsed->port, start, len);
307 parsed->dec_port = strtonum(parsed->port, 0, 65535, &errstr);
308 if (errstr != NULL)
309 return NULL;
311 return s;
/*
 * authority = host [ ":" port ]
 * (yep, blatantly ignore the userinfo stuff -- not relevant for Gemini)
 *
 * Returns a pointer past the authority, or NULL if the host or the
 * port (when a ":" is present) fails to parse.
 */
static const char *
parse_authority(const char *s, struct phos_uri *parsed)
{
	const char *after_host;

	after_host = parse_host(s, parsed);
	if (after_host == NULL)
		return NULL;

	if (*after_host != ':')
		return after_host;

	return parse_port(after_host + 1, parsed);
}
332 static inline const char *
333 set_path(const char *start, const char *end, struct phos_uri *parsed)
335 size_t len;
337 if (end == NULL)
338 return NULL;
340 len = end - start;
341 if (len >= sizeof(parsed->path))
342 return NULL;
343 memcpy(parsed->path, start, len);
344 return end;
/*
 * path-abempty = *( "/" segment )
 *
 * Stores the (possibly empty) path and returns a pointer past it, or
 * NULL if it does not fit parsed->path.
 */
static const char *
parse_path_abempty(const char *s, struct phos_uri *parsed)
{
	return set_path(s, sub_path_common(s), parsed);
}
/*
 * path-absolute = "/" [ segment-nz *( "/" segment ) ]
 *
 * Stores the path and returns a pointer past it.  Returns NULL if s
 * does not start with "/" or the path does not fit parsed->path.
 */
static const char *
parse_path_absolute(const char *s, struct phos_uri *parsed)
{
	const char *end, *seg;

	if (*s != '/')
		return NULL;

	/* a lone "/" is a valid match; extend it if a segment follows */
	end = s + 1;
	seg = sub_segment_nz(end);
	if (seg != NULL)
		end = sub_path_common(seg);

	return set_path(s, end, parsed);
}
/*
 * path-noscheme = segment-nz-nc *( "/" segment )
 *
 * Stores the path and returns a pointer past it, or NULL if the
 * first segment is rejected or the path does not fit parsed->path.
 */
static const char *
parse_path_noscheme(const char *s, struct phos_uri *parsed)
{
	const char *first, *end;

	first = sub_segment_nz_nc(s);
	if (first == NULL)
		return NULL;

	end = sub_path_common(first);
	return set_path(s, end, parsed);
}
/*
 * path-rootless = segment-nz *( "/" segment )
 *
 * Stores the path and returns a pointer past it, or NULL if s does
 * not start with a non-empty segment or the path does not fit
 * parsed->path.
 */
static const char *
parse_path_rootless(const char *s, struct phos_uri *parsed)
{
	const char *first, *end;

	first = sub_segment_nz(s);
	if (first == NULL)
		return NULL;

	end = sub_path_common(first);
	return set_path(s, end, parsed);
}
/*
 * path-empty = 0<pchar>
 *
 * Matches nothing: returns s unchanged and leaves parsed untouched.
 */
static const char *
parse_path_empty(const char *s, struct phos_uri *parsed)
{
	(void)parsed;	/* nothing to store for an empty path */
	return s;
}
/*
 * hier-part = "//" authority path-abempty
 *           / path-absolute
 *           / path-rootless
 *           / path-empty
 *
 * Tries the alternatives in the order above.  Once "//" has been
 * seen the authority form is the only one attempted, so a failure
 * there yields NULL.
 */
static const char *
parse_hier_part(const char *s, struct phos_uri *parsed)
{
	const char *end;

	if (s[0] == '/' && s[1] == '/') {
		s = parse_authority(s + 2, parsed);
		if (s == NULL)
			return NULL;
		return parse_path_abempty(s, parsed);
	}

	end = parse_path_absolute(s, parsed);
	if (end == NULL)
		end = parse_path_rootless(s, parsed);
	if (end == NULL)
		end = parse_path_empty(s, parsed);
	return end;
}
442 /*
443 * query = *( pchar / "/" / "?" )
444 */
445 static const char *
446 parse_query(const char *s, struct phos_uri *parsed)
448 const char *t, *start = s;
449 size_t len;
451 while (*s != '\0') {
452 if (*s == '/' || *s == '?') {
453 s++;
454 continue;
457 if ((t = sub_pchar(s)) == NULL)
458 break;
459 s = t;
462 len = s - start;
463 if (len >= sizeof(parsed->query))
464 return NULL;
466 memcpy(parsed->query, start, len);
467 return s;
470 /*
471 * fragment = *( pchar / "/" / "?" )
472 */
473 static const char *
474 parse_fragment(const char *s, struct phos_uri *parsed)
476 const char *start = s;
477 size_t len;
479 for (;;) {
480 if (*s == '\0')
481 break;
483 if (*s == '/' || *s == '?') {
484 s++;
485 continue;
488 if ((s = sub_pchar(s)) == NULL)
489 return NULL;
492 len = s - start;
493 if (len >= sizeof(parsed->fragment))
494 return NULL;
496 memcpy(parsed->fragment, start, len);
497 return s;
/*
 * URI = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 *
 * Returns a pointer to where parsing stopped (the caller checks that
 * it is the end of the string), or NULL on failure.  parsed may have
 * been partially filled when NULL is returned.
 */
static const char *
parse_uri(const char *s, struct phos_uri *parsed)
{
	s = parse_scheme(s, parsed);
	if (s == NULL || *s != ':')
		return NULL;

	s = parse_hier_part(s + 1, parsed);
	if (s == NULL)
		return NULL;

	if (*s == '?' && (s = parse_query(s + 1, parsed)) == NULL)
		return NULL;

	if (*s == '#' && (s = parse_fragment(s + 1, parsed)) == NULL)
		return NULL;

	return s;
}
/*
 * relative-part = "//" authority path-abempty
 *               / path-absolute
 *               / path-noscheme
 *               / path-empty
 *
 * Tries the alternatives in the order above.  Once "//" has been
 * seen the authority form is the only one attempted, so a failure
 * there yields NULL.
 */
static const char *
parse_relative_part(const char *s, struct phos_uri *parsed)
{
	const char *end;

	if (s[0] == '/' && s[1] == '/') {
		s = parse_authority(s + 2, parsed);
		if (s == NULL)
			return NULL;
		return parse_path_abempty(s, parsed);
	}

	end = parse_path_absolute(s, parsed);
	if (end == NULL)
		end = parse_path_noscheme(s, parsed);
	if (end == NULL)
		end = parse_path_empty(s, parsed);
	return end;
}
/*
 * relative-ref = relative-part [ "?" query ] [ "#" fragment ]
 *
 * Returns a pointer to where parsing stopped, or NULL on failure.
 */
static const char *
parse_relative_ref(const char *s, struct phos_uri *parsed)
{
	s = parse_relative_part(s, parsed);
	if (s == NULL)
		return NULL;

	if (*s == '?' && (s = parse_query(s + 1, parsed)) == NULL)
		return NULL;

	if (*s == '#' && (s = parse_fragment(s + 1, parsed)) == NULL)
		return NULL;

	return s;
}
582 /*
583 * URI-reference = URI / relative-ref
584 */
585 static const char *
586 parse_uri_reference(const char *s, struct phos_uri *parsed)
588 const char *t;
590 if ((t = parse_uri(s, parsed)) != NULL)
591 return t;
592 memset(parsed, 0, sizeof(*parsed));
593 return parse_relative_ref(s, parsed);
/*
 * absolute-URI = scheme ":" hier-part [ "?" query ]
 *
 * Same as parse_uri but without the fragment: a "#" is left
 * unconsumed for the caller to reject.
 */
static const char *
parse_absolute_uri(const char *s, struct phos_uri *parsed)
{
	s = parse_scheme(s, parsed);
	if (s == NULL || *s != ':')
		return NULL;

	s = parse_hier_part(s + 1, parsed);
	if (s == NULL)
		return NULL;

	if (*s == '?' && (s = parse_query(s + 1, parsed)) == NULL)
		return NULL;

	return s;
}
623 /* normalizing fns */
/*
 * Returns 1 if str begins with prfx (the empty prefix always
 * matches), 0 otherwise.
 */
static int
hasprefix(const char *str, const char *prfx)
{
	return strncmp(str, prfx, strlen(prfx)) == 0;
}
/*
 * Drop the path segment that precedes `point'.
 *
 * start is the beginning of the buffer and point a position inside
 * it.  Walk back from point to the previous "/" (or to start if
 * there is none) and slide the rest of the string, point included,
 * over the removed segment.  Returns the new position of what used
 * to be at point.
 *
 * The walk decrements before looking, so no pointer below start is
 * ever formed, even when point == start.
 */
static char *
dotdot(char *point, char *start)
{
	char *t = point;

	while (t > start) {
		--t;
		if (*t == '/')
			break;
	}

	memmove(t, point, strlen(point)+1);
	return t;
}
650 /*
651 * This is the "Remove Dot Segments" straight outta RFC3986, section
652 * 5.2.4
653 */
654 static void
655 path_clean(struct phos_uri *uri)
657 char *in = uri->path;
659 while (in != NULL && *in != '\0') {
660 assert(in >= uri->path);
662 /* A) drop leading ../ or ./ */
663 if (hasprefix(in, "../"))
664 memmove(in, &in[3], strlen(&in[3])+1);
665 else if (hasprefix(in, "./"))
666 memmove(in, &in[2], strlen(&in[2])+1);
668 /* B) replace /./ or /. with / */
669 else if (hasprefix(in, "/./"))
670 memmove(&in[1], &in[3], strlen(&in[3])+1);
671 else if (!strcmp(in, "/."))
672 in[1] = '\0';
674 /* C) resolve dot-dot */
675 else if (hasprefix(in, "/../")) {
676 in = dotdot(in, uri->path);
677 memmove(&in[1], &in[4], strlen(&in[4])+1);
678 } else if (!strcmp(in, "/..")) {
679 in = dotdot(in, uri->path);
680 in[1] = '\0';
681 break;
684 /* D */
685 else if (!strcmp(in, "."))
686 *in = '\0';
687 else if (!strcmp(in, ".."))
688 *in = '\0';
690 /* E */
691 else
692 in = strchr(in+1, '/');
696 /*
697 * see RFC3986 5.3.3 "Merge Paths".
698 */
699 static int
700 merge_path(struct phos_uri *ret, const struct phos_uri *base,
701 const struct phos_uri *ref)
703 const char *s;
704 size_t len;
706 len = sizeof(ret->path);
708 s = strrchr(base->path, '/');
709 if ((*base->host != '\0' && *base->path == '\0') || s == NULL) {
710 strlcpy(ret->path, "/", len);
711 } else {
712 /* copy the / too */
713 memcpy(ret->path, base->path, s - base->path + 1);
716 return strlcat(ret->path, ref->path, len) < len;
720 /* public interface */
722 int
723 phos_parse_absolute_uri(const char *s, struct phos_uri *uri)
725 memset(uri, 0, sizeof(*uri));
727 if ((s = parse_absolute_uri(s, uri)) == NULL)
728 return 0;
729 if (*s != '\0')
730 return 0;
731 path_clean(uri);
732 return 1;
735 int
736 phos_parse_uri_reference(const char *s, struct phos_uri *uri)
738 memset(uri, 0, sizeof(*uri));
740 if ((s = parse_uri_reference(s, uri)) == NULL)
741 return 0;
742 if (*s != '\0')
743 return 0;
744 path_clean(uri);
745 return 1;
748 /*
749 * Implementation of the "transform references" algorithm from
750 * RFC3986, see 5.2.2.
752 * We expect base and ref to be URIs constructed by this library
753 * (because we emit only normalized URIs).
755 * ATM this is marked as private because:
756 * - let's say the URI is "."
757 * - one calls phos_parse_uri_references
758 * - it exists with success, but the path becomes ""
759 * - this routine does the right thing, but the outcome is not what expected.
761 * so users for now have to user resolve_uri_from_str, which parses
762 * the URI but not normalize it, and then call into us.
763 */
764 static int
765 phos_resolve_uri_from(const struct phos_uri *base, const struct phos_uri *ref,
766 struct phos_uri *ret)
768 memset(ret, 0, sizeof(*ret));
770 if (*ref->scheme != '\0') {
771 strlcpy(ret->scheme, ref->scheme, sizeof(ret->scheme));
772 strlcpy(ret->host, ref->host, sizeof(ret->host));
773 strlcpy(ret->port, ref->port, sizeof(ret->port));
774 ret->dec_port = ret->dec_port;
775 strlcpy(ret->path, ref->path, sizeof(ret->path));
776 strlcpy(ret->query, ref->query, sizeof(ret->query));
777 } else {
778 if (*ref->host != '\0') {
779 strlcpy(ret->host, ref->host, sizeof(ret->host));
780 strlcpy(ret->port, ref->port, sizeof(ret->port));
781 ret->dec_port = ret->dec_port;
782 strlcpy(ret->path, ref->path, sizeof(ret->path));
783 strlcpy(ret->query, ref->query, sizeof(ret->query));
784 } else {
785 if (*ref->path == '\0') {
786 strlcpy(ret->path, base->path, sizeof(ret->path));
787 if (*ref->query != '\0')
788 strlcpy(ret->query, ref->query, sizeof(ret->query));
789 else
790 strlcpy(ret->query, base->query, sizeof(ret->query));
791 } else {
792 if (*ref->path == '/')
793 strlcpy(ret->path, ref->path, sizeof(ret->path));
794 else {
795 if (!merge_path(ret, base, ref))
796 return 0;
798 path_clean(ret);
800 strlcpy(ret->query, ref->query, sizeof(ret->query));
803 strlcpy(ret->host, base->host, sizeof(ret->host));
804 strlcpy(ret->port, base->port, sizeof(ret->port));
805 ret->dec_port = base->dec_port;
808 strlcpy(ret->scheme, base->scheme, sizeof(ret->scheme));
811 strlcpy(ret->fragment, ref->fragment, sizeof(ret->fragment));
813 return 1;
816 int
817 phos_resolve_uri_from_str(const struct phos_uri *base, const char *refstr,
818 struct phos_uri *ret)
820 struct phos_uri ref;
822 memset(&ref, 0, sizeof(ref));
824 if ((refstr = parse_uri_reference(refstr, &ref)) == NULL)
825 return 0;
827 if (*refstr != '\0')
828 return 0;
830 return phos_resolve_uri_from(base, &ref, ret);
833 void
834 phos_uri_drop_empty_segments(struct phos_uri *uri)
836 char *i;
838 for (i = uri->path; *i; ++i) {
839 if (*i == '/' && *(i+1) == '/') {
840 memmove(i, i+1, strlen(i)); /* move also the \0 */
841 i--;
846 int
847 phos_serialize_uri(const struct phos_uri *uri, char *buf, size_t len)
849 #define CAT(s) \
850 if (strlcat(buf, s, len) >= len) \
851 return 0;
853 strlcpy(buf, "", len);
855 if (*uri->scheme != '\0') {
856 CAT(uri->scheme);
857 CAT(":");
860 if (*uri->host != '\0') {
861 CAT("//");
862 CAT(uri->host);
865 CAT(uri->path);
867 if (*uri->query != '\0') {
868 CAT("?");
869 CAT(uri->query);
872 if (*uri->fragment) {
873 CAT("#");
874 CAT(uri->fragment);
877 return 1;
879 #undef CAT