Blob


/*
 * Copyright (c) 2022, 2024 Omar Polo <op@omarpolo.com>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
17 #include "compat.h"
19 #include <ctype.h>
20 #include <errno.h>
21 #include <stddef.h>
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
27 #include "iri.h"
/* TODO: URI -> IRI: accept IRIs on input but always emit URIs. */
/*
 * Copy the bytes in [start, till) into buf and NUL-terminate the result.
 * len is the total size of buf.
 *
 * Returns 0 on success, or -1 (leaving buf untouched) when the span plus
 * its terminator does not fit in len bytes.
 */
static inline int
cpstr(const char *start, const char *till, char *buf, size_t len)
{
	size_t	slen = till - start;

	/*
	 * slen payload bytes plus one terminator are needed; a span of
	 * exactly len - 1 bytes therefore fits and must be accepted.
	 */
	if (slen >= len)
		return (-1);
	memcpy(buf, start, slen);
	buf[slen] = '\0';
	return (0);
}
/*
 * Return 1 when c is an ASCII letter or digit or one of "-._~",
 * 0 otherwise.
 */
static inline int
unreserved(int c)
{
	switch (c) {
	case '-':
	case '.':
	case '_':
	case '~':
		return (1);
	default:
		return (isalnum((unsigned char)c) != 0);
	}
}
/*
 * Return 1 when s begins with a percent-encoded triplet ('%' followed by
 * two hexadecimal digits), 0 otherwise.  Bytes past a mismatch are never
 * read, so a short NUL-terminated string is safe.
 */
static inline int
pctenc(const char *s)
{
	if (s[0] != '%')
		return (0);
	if (!isxdigit((unsigned char)s[1]))
		return (0);
	return (isxdigit((unsigned char)s[2]) != 0);
}
/*
 * Return 1 when c is one of the characters "!$&'()*+,;=", 0 otherwise.
 */
static inline int
sub_delims(int c)
{
	switch (c) {
	case '!':
	case '$':
	case '&':
	case '\'':
	case '(':
	case ')':
	case '*':
	case '+':
	case ',':
	case ';':
	case '=':
		return (1);
	default:
		return (0);
	}
}
/*
 * Consume one path character at s: an unreserved character, a sub-delim,
 * ':' or '@' (one byte), or a percent-encoded triplet (three bytes).
 *
 * Returns the position just past it, or NULL when s does not start with
 * such a character.
 */
static inline const char *
advance_pchar(const char *s)
{
	const int	c = *s;

	if (c == ':' || c == '@' || unreserved(c) || sub_delims(c))
		return (s + 1);
	return (pctenc(s) ? s + 3 : NULL);
}
/*
 * Skip a possibly empty run of path characters starting at s and return
 * the position of the first byte that advance_pchar() rejects.
 * Never returns NULL.
 */
static inline const char *
advance_segment(const char *s)
{
	const char	*next;

	for (next = advance_pchar(s); next != NULL; next = advance_pchar(s))
		s = next;
	return (s);
}
/*
 * Like advance_segment(), but at least one path character is required.
 * Returns NULL when s does not start with one.
 */
static inline const char *
advance_segment_nz(const char *s)
{
	const char	*rest = advance_pchar(s);

	return (rest == NULL ? NULL : advance_segment(rest));
}
/*
 * Skip a non-empty run of path characters that contains no ':'
 * (unreserved, sub-delims, '@' or percent-encoded triplets).
 *
 * Returns the end of the run, or NULL when the run is empty.
 */
static inline const char *
advance_segment_nz_nc(const char *s)
{
	const char	*cur = s;

	for (;;) {
		if (pctenc(cur)) {
			cur += 3;
			continue;
		}
		if (!unreserved(*cur) && !sub_delims(*cur) && *cur != '@')
			break;
		cur++;
	}

	return (cur == s ? NULL : cur);
}
118 static const char *
119 parse_scheme(const char *s, struct iri *iri)
121 const char *t = s;
123 if (!isalpha((unsigned char)*t))
124 return (NULL);
126 while (isalnum((unsigned char)*t) ||
127 *t == '+' ||
128 *t == '-' ||
129 *t == '.')
130 t++;
132 if (cpstr(s, t, iri->iri_scheme, sizeof(iri->iri_scheme)) == -1)
133 return (NULL);
135 iri->iri_flags |= IH_SCHEME;
136 return (t);
139 /* userinfo is always optional */
140 static const char *
141 parse_uinfo(const char *s, struct iri *iri)
143 const char *t = s;
145 for (;;) {
146 if (unreserved(*t) || sub_delims(*t) || *t == ':')
147 t++;
148 else if (pctenc(t))
149 t += 3;
150 else
151 break;
154 if (*t != '@')
155 return (s);
157 if (cpstr(s, t, iri->iri_uinfo, sizeof(iri->iri_uinfo)) == -1)
158 return (NULL);
159 iri->iri_flags |= IH_UINFO;
160 return (t + 1);
163 static const char *
164 parse_host(const char *s, struct iri *iri)
166 const char *t = s;
168 /*
169 * cheating a bit by relaxing and merging the rule for
170 * IPv6address and IPvFuture and by merging IPv4address and
171 * reg-name.
172 */
174 if (*t == '[') {
175 while (*t && *t != ']')
176 ++t;
177 if (*t == '\0')
178 return (NULL);
179 t++;
180 if (cpstr(s, t, iri->iri_host, sizeof(iri->iri_host)) == -1)
181 return (NULL);
182 iri->iri_flags |= IH_HOST;
183 return (t);
186 for (;;) {
187 if (unreserved(*t) || sub_delims(*t))
188 t++;
189 else if (pctenc(t))
190 t += 3;
191 else
192 break;
195 if (cpstr(s, t, iri->iri_host, sizeof(iri->iri_host)) == -1)
196 return (NULL);
197 iri->iri_flags |= IH_HOST;
198 return (t);
201 static const char *
202 parse_port(const char *s, struct iri *iri)
204 const char *t = s;
205 const char *errstr;
207 while (isdigit((unsigned char)*t))
208 t++;
209 if (cpstr(s, t, iri->iri_portstr, sizeof(iri->iri_portstr)) == -1)
210 return (NULL);
211 iri->iri_port = strtonum(iri->iri_portstr, 1, UINT16_MAX, &errstr);
212 if (errstr)
213 return (NULL);
214 iri->iri_flags |= IH_PORT;
215 return (t);
/*
 * Parse an authority at s: optional userinfo, a host, and an optional
 * ":port".  Returns the position after the authority, or NULL when any
 * of the sub-parsers fails.
 */
static const char *
parse_authority(const char *s, struct iri *iri)
{
	const char	*cur;

	cur = parse_uinfo(s, iri);
	if (cur != NULL)
		cur = parse_host(cur, iri);
	if (cur == NULL)
		return (NULL);

	return (*cur == ':' ? parse_port(cur + 1, iri) : cur);
}
235 static const char *
236 parse_path_abempty(const char *s, struct iri *iri)
238 const char *t = s;
240 while (*t == '/')
241 t = advance_segment(t + 1);
243 if (cpstr(s, t, iri->iri_path, sizeof(iri->iri_path)) == -1)
244 return (NULL);
245 iri->iri_flags |= IH_PATH;
246 return (t);
249 static const char *
250 parse_path_absolute(const char *s, struct iri *iri)
252 const char *t;
254 if (*s != '/')
255 return (NULL);
257 if ((t = advance_segment_nz(s + 1)) == NULL)
258 t = s + 1;
259 else {
260 while (*t == '/')
261 t = advance_segment(t + 1);
264 if (cpstr(s, t, iri->iri_path, sizeof(iri->iri_path)) == -1)
265 return (NULL);
266 iri->iri_flags |= IH_PATH;
267 return (t);
270 static const char *
271 parse_path_rootless(const char *s, struct iri *iri)
273 const char *t;
275 if ((t = advance_segment_nz(s)) == NULL)
276 return (NULL);
278 while (*t == '/')
279 t = advance_segment(t + 1);
281 if (cpstr(s, t, iri->iri_path, sizeof(iri->iri_path)) == -1)
282 return (NULL);
283 iri->iri_flags |= IH_PATH;
284 return (t);
287 static const char *
288 parse_path_noscheme(const char *s, struct iri *iri)
290 const char *t;
292 if ((t = advance_segment_nz_nc(s)) == NULL)
293 return (NULL);
295 while (*t == '/')
296 t = advance_segment(t + 1);
298 if (cpstr(s, t, iri->iri_path, sizeof(iri->iri_path)) == -1)
299 return (NULL);
300 iri->iri_flags |= IH_PATH;
301 return (t);
304 static const char *
305 parse_path_empty(const char *s, struct iri *iri)
307 iri->iri_path[0] = '\0';
308 iri->iri_flags |= IH_PATH;
309 return (s);
/*
 * Parse what follows "scheme:".  A leading "//" introduces an authority
 * followed by a path-abempty; otherwise the path-absolute, path-rootless
 * and empty-path forms are tried in that order.
 *
 * Returns the position after the parsed part, or NULL when the
 * authority or its path fails to parse.
 */
static const char *
parse_hier(const char *s, struct iri *iri)
{
	const char	*end;

	if (s[0] == '/' && s[1] == '/') {
		end = parse_authority(s + 2, iri);
		return (end == NULL ? NULL : parse_path_abempty(end, iri));
	}

	end = parse_path_absolute(s, iri);
	if (end == NULL)
		end = parse_path_rootless(s, iri);
	if (end == NULL)
		end = parse_path_empty(s, iri);
	return (end);
}
/*
 * Parse the leading part of a relative reference.  A leading "//"
 * introduces an authority followed by a path-abempty; otherwise the
 * path-absolute, path-noscheme and empty-path forms are tried in that
 * order.
 *
 * Returns the position after the parsed part, or NULL when the
 * authority or its path fails to parse.
 */
static const char *
parse_relative(const char *s, struct iri *iri)
{
	const char	*end;

	if (s[0] == '/' && s[1] == '/') {
		end = parse_authority(s + 2, iri);
		return (end == NULL ? NULL : parse_path_abempty(end, iri));
	}

	end = parse_path_absolute(s, iri);
	if (end == NULL)
		end = parse_path_noscheme(s, iri);
	if (end == NULL)
		end = parse_path_empty(s, iri);
	return (end);
}
352 static const char *
353 parse_qf(const char *s, int flag, struct iri *iri, char *buf, size_t bufsize)
355 const char *n, *t = s;
357 for (;;) {
358 if ((n = advance_pchar(t)) != NULL)
359 t = n;
360 else if (*t == '/' || *t == '?')
361 t++;
362 else
363 break;
366 if (cpstr(s, t, buf, bufsize) == -1)
367 return (NULL);
368 iri->iri_flags |= flag;
369 return (t);
372 static int
373 parse_uri(const char *s, struct iri *iri)
375 iri->iri_flags = 0;
377 if ((s = parse_scheme(s, iri)) == NULL)
378 return (-1);
380 if (*s != ':')
381 return (-1);
383 if ((s = parse_hier(s + 1, iri)) == NULL)
384 return (-1);
386 if (*s == '?') {
387 s = parse_qf(s + 1, IH_QUERY, iri, iri->iri_query,
388 sizeof(iri->iri_query));
389 if (s == NULL)
390 return (-1);
393 if (*s == '#') {
394 s = parse_qf(s + 1, IH_FRAGMENT, iri, iri->iri_fragment,
395 sizeof(iri->iri_fragment));
396 if (s == NULL)
397 return (-1);
400 if (*s == '\0')
401 return (0);
403 return (-1);
406 static int
407 parse_relative_ref(const char *s, struct iri *iri)
409 if ((s = parse_relative(s, iri)) == NULL)
410 return (-1);
412 if (*s == '?') {
413 s = parse_qf(s + 1, IH_QUERY, iri, iri->iri_query,
414 sizeof(iri->iri_query));
415 if (s == NULL)
416 return (-1);
419 if (*s == '#') {
420 s = parse_qf(s + 1, IH_FRAGMENT, iri, iri->iri_fragment,
421 sizeof(iri->iri_fragment));
422 if (s == NULL)
423 return (-1);
426 if (*s == '\0')
427 return (0);
429 return (-1);
432 static int
433 parse(const char *s, struct iri *iri)
435 iri->iri_flags = 0;
437 if (s == NULL)
438 return (0);
440 if (parse_uri(s, iri) == -1) {
441 iri->iri_flags = 0;
442 if (parse_relative_ref(s, iri) == -1)
443 return (-1);
446 return (0);
/*
 * Convert the NUL-terminated string s to lower case in place using
 * tolower().
 */
static inline void
lowerify(char *s)
{
	char	*p;

	for (p = s; *p != '\0'; p++)
		*p = tolower((unsigned char)*p);
}
456 static void
457 cpfields(struct iri *dest, const struct iri *src, int flags)
459 if (flags & IH_SCHEME) {
460 dest->iri_flags |= IH_SCHEME;
461 if (src->iri_flags & IH_SCHEME)
462 memcpy(dest->iri_scheme, src->iri_scheme,
463 sizeof(dest->iri_scheme));
464 lowerify(dest->iri_scheme);
466 if (flags & IH_UINFO) {
467 if (src->iri_flags & IH_UINFO) {
468 memcpy(dest->iri_uinfo, src->iri_uinfo,
469 sizeof(dest->iri_uinfo));
470 dest->iri_flags |= IH_UINFO;
473 if (flags & IH_HOST) {
474 dest->iri_flags |= IH_HOST;
475 if (src->iri_flags & IH_HOST)
476 memcpy(dest->iri_host, src->iri_host,
477 sizeof(dest->iri_host));
478 lowerify(dest->iri_host);
480 if (flags & IH_PORT) {
481 if (src->iri_flags & IH_PORT) {
482 dest->iri_port = src->iri_port;
483 memcpy(dest->iri_portstr, src->iri_portstr,
484 sizeof(dest->iri_portstr));
485 dest->iri_flags |= IH_PORT;
488 if (flags & IH_PATH) {
489 dest->iri_flags |= IH_PATH;
490 if (src->iri_flags & IH_PATH)
491 memcpy(dest->iri_path, src->iri_path,
492 sizeof(dest->iri_path));
494 if (flags & IH_QUERY) {
495 if (src->iri_flags & IH_QUERY) {
496 dest->iri_flags |= IH_QUERY;
497 memcpy(dest->iri_query, src->iri_query,
498 sizeof(dest->iri_query));
501 if (flags & IH_FRAGMENT) {
502 if (src->iri_flags & IH_FRAGMENT) {
503 dest->iri_flags |= IH_FRAGMENT;
504 memcpy(dest->iri_fragment, src->iri_fragment,
505 sizeof(dest->iri_fragment));
/*
 * Remove "." and ".." segments from the NUL-terminated path in buf,
 * rewriting it in place.  bufsize is the size of buf.
 *
 * "/." is dropped and "/.." additionally discards the previous output
 * segment.  When such a segment ends the path, a trailing '/' is kept
 * so that e.g. "/a/b/.." becomes "/a/" and "/a/.." becomes "/".
 *
 * Returns 0 on success.  Returns -1 with errno set to ENAMETOOLONG when
 * the end of the string is not reached within bufsize bytes.
 */
static inline int
remove_dot_segments(char *buf, ptrdiff_t bufsize)
{
	char	*p, *q;

	/* p reads the input, q writes the output; q never overtakes p */
	p = q = buf;
	while (*p && (q - buf < bufsize)) {
		if (p[0] == '/' && p[1] == '.' &&
		    (p[2] == '/' || p[2] == '\0')) {
			p += 2;
			if (*p != '/')
				*q++ = '/';
		} else if (p[0] == '/' && p[1] == '.' && p[2] == '.' &&
		    (p[3] == '/' || p[3] == '\0')) {
			p += 3;
			/* pop the last output segment, back to its '/' */
			while (q > buf && *--q != '/')
				continue;
			/*
			 * A trailing ".." must leave a '/' behind.  This
			 * also applies when the pop reached the start of
			 * the buffer: "/a/.." and "/.." resolve to "/",
			 * not to the empty string.
			 */
			if (*p != '/' && (q == buf || q[-1] != '/'))
				*q++ = '/';
		} else
			*q++ = *p++;
	}

	if ((*p == '\0') && (q - buf < bufsize)) {
		*q = '\0';
		return (0);
	}

	errno = ENAMETOOLONG;
	return (-1);
}
/*
 * Merge the reference path r onto the base path and store the result in
 * buf (bufsize bytes): everything in base up to and including its last
 * '/' is kept and r is appended.  When base contains a '/', one leading
 * '/' of r is skipped so the join does not double it.  A NULL or empty
 * base or r is treated as "/".
 *
 * abs is accepted for interface compatibility.  It used to select a
 * dedicated branch for a base of "/", but the generic join produces
 * exactly the same string for that base, so it is no longer consulted.
 *
 * Returns 0 on success.  Returns -1 with errno set to ENAMETOOLONG when
 * the merged path and its terminator do not fit in buf; buf then holds
 * an empty string (or is untouched when bufsize is 0).
 */
static inline int
mergepath(char *buf, size_t bufsize, int abs, const char *base, const char *r)
{
	const char	*slash;
	size_t		 dirlen = 0, rlen;

	(void)abs;

	if (base == NULL || *base == '\0')
		base = "/";
	if (r == NULL || *r == '\0')
		r = "/";

	if (bufsize == 0) {
		errno = ENAMETOOLONG;
		return (-1);
	}
	buf[0] = '\0';

	if ((slash = strrchr(base, '/')) != NULL) {
		/* keep the directory part of base, slash included */
		dirlen = (size_t)(slash + 1 - base);
		if (*r == '/')
			r++;
	}
	rlen = strlen(r);

	/* dirlen + rlen + 1 must fit; written to avoid overflow */
	if (dirlen >= bufsize || rlen >= bufsize - dirlen) {
		errno = ENAMETOOLONG;
		return (-1);
	}

	memcpy(buf, base, dirlen);
	memcpy(buf + dirlen, r, rlen + 1);
	return (0);
}
576 int
577 iri_parse(const char *base, const char *str, struct iri *iri)
579 static struct iri ibase, iparsed;
581 memset(iri, 0, sizeof(*iri));
583 if (base == NULL) {
584 ibase.iri_flags = 0;
585 if (parse_uri(str, &iparsed) == -1) {
586 errno = EINVAL;
587 return (-1);
589 } else {
590 if (parse_uri(base, &ibase) == -1 ||
591 parse(str, &iparsed) == -1) {
592 errno = EINVAL;
593 return (-1);
597 cpfields(iri, &iparsed, IH_FRAGMENT);
599 if (iparsed.iri_flags & IH_SCHEME) {
600 cpfields(iri, &iparsed, iparsed.iri_flags);
601 remove_dot_segments(iri->iri_path, sizeof(iri->iri_path));
602 return (0);
605 cpfields(iri, &ibase, IH_SCHEME);
607 if (iparsed.iri_flags & IH_HOST) {
608 cpfields(iri, &iparsed, IH_AUTHORITY|IH_PATH|IH_QUERY);
609 remove_dot_segments(iri->iri_path, sizeof(iri->iri_path));
610 return (0);
613 cpfields(iri, &ibase, IH_AUTHORITY);
615 if ((iparsed.iri_flags & IH_PATH) && *iparsed.iri_path == '\0') {
616 cpfields(iri, &ibase, IH_PATH);
617 if (iparsed.iri_flags & IH_QUERY)
618 cpfields(iri, &iparsed, IH_QUERY);
619 else
620 cpfields(iri, &ibase, IH_QUERY);
621 return (0);
624 cpfields(iri, &iparsed, IH_QUERY);
625 if ((iparsed.iri_flags & IH_PATH) && *iparsed.iri_path == '/')
626 cpfields(iri, &iparsed, IH_PATH);
627 else {
628 if (!(ibase.iri_flags & IH_PATH))
629 ibase.iri_path[0] = '\0';
630 if (!(iparsed.iri_flags & IH_PATH))
631 iparsed.iri_path[0] = '\0';
632 if (mergepath(iri->iri_path, sizeof(iri->iri_path),
633 ibase.iri_flags & IH_AUTHORITY, ibase.iri_path,
634 iparsed.iri_path) == -1)
635 return (-1);
636 iri->iri_flags |= IH_PATH;
638 if (remove_dot_segments(iri->iri_path, sizeof(iri->iri_path)) == -1)
639 return (-1);
640 return (0);
643 int
644 iri_unparse(const struct iri *i, char *buf, size_t buflen)
646 int need_ss, have_path, need_s;
647 int r;
649 /* file is a quirky scheme */
650 need_ss = (i->iri_flags & IH_AUTHORITY) ||
651 !strcmp(i->iri_scheme, "file");
652 have_path = i->iri_flags & IH_PATH;
653 need_s = have_path && (i->iri_flags & IH_AUTHORITY) &&
654 i->iri_path[0] != '/';
656 r = snprintf(buf, buflen, "%s%s%s%s%s%s%s%s%s%s%s%s%s%s",
657 (i->iri_flags & IH_SCHEME) ? i->iri_scheme : "",
658 (i->iri_flags & IH_SCHEME) ? ":" : "",
659 need_ss ? "//" : "",
660 (i->iri_flags & IH_UINFO) ? i->iri_uinfo : "",
661 (i->iri_flags & IH_UINFO) ? "@" : "",
662 (i->iri_flags & IH_HOST) ? i->iri_host : "",
663 (i->iri_flags & IH_PORT) ? ":" : "",
664 (i->iri_flags & IH_PORT) ? i->iri_portstr : "",
665 need_s ? "/" : "",
666 have_path ? i->iri_path : "",
667 (i->iri_flags & IH_QUERY) ? "?" : "",
668 (i->iri_flags & IH_QUERY) ? i->iri_query : "",
669 (i->iri_flags & IH_FRAGMENT) ? "#" : "",
670 (i->iri_flags & IH_FRAGMENT) ? i->iri_fragment : "");
671 if (r < 0 || (size_t)r >= buflen) {
672 errno = ENOBUFS;
673 return (-1);
676 return (0);
/*
 * Placeholder: no human-readable rendering is produced.  The output
 * buffer is zero-filled and -1 is always returned; iri is not examined.
 */
int
iri_human(const struct iri *iri, char *buf, size_t buflen)
{
	memset(buf, '\0', buflen);
	return (-1);
}
686 int
687 iri_setport(struct iri *iri, const char *portstr)
689 const char *errstr;
690 int port;
692 port = strtonum(portstr, 1, UINT16_MAX, &errstr);
693 if (errstr)
694 return (-1);
696 snprintf(iri->iri_portstr, sizeof(iri->iri_portstr), "%d", port);
697 iri->iri_port = port;
698 return (0);
701 int
702 iri_setquery(struct iri *iri, const char *p)
704 ptrdiff_t bufsize;
705 int r;
706 char *buf, *q, tmp[4];
708 buf = q = iri->iri_query;
709 bufsize = sizeof(iri->iri_query);
710 while (*p && (q - buf < bufsize)) {
711 if (unreserved(*p) || sub_delims(*p) || *p == ':' ||
712 *p == '@' || *p == '/' || *p == '?') {
713 *q++ = *p++;
714 continue;
717 if (q - buf >= bufsize - 3)
718 goto err;
719 r = snprintf(tmp, sizeof(tmp), "%%%02X", (int)*p);
720 if (r < 0 || (size_t)r > sizeof(tmp))
721 return (-1);
722 *q++ = tmp[0];
723 *q++ = tmp[1];
724 *q++ = tmp[2];
725 p++;
727 if ((*p == '\0') && (q - buf < bufsize)) {
728 iri->iri_flags |= IH_QUERY;
729 *q = '\0';
730 return (0);
733 err:
734 errno = ENOBUFS;
735 return (-1);
/*
 * Percent-encode path into buf (len bytes).  Unreserved characters,
 * sub-delims, ':', '@' and '/' are copied verbatim; every other byte is
 * written as '%' followed by two lower-case hex digits.
 *
 * Returns 0 on success with buf NUL-terminated, or -1 when the encoded
 * string and its terminator do not fit; buf may then hold a partial,
 * unterminated result.
 */
int
iri_urlescape(const char *path, char *buf, size_t len)
{
	static const char	 hex[] = "0123456789abcdef";
	/* explicit cast: char and uint8_t pointers are not compatible */
	const uint8_t		*p = (const uint8_t *)path;

	while (*p) {
		if (len == 0)
			break;

		if (unreserved(*p) || sub_delims(*p) ||
		    *p == ':' || *p == '@' ||
		    *p == '/') {
			*buf++ = *p++;
			len--;
			continue;
		}

		if (len < 3)
			break;
		*buf++ = '%';
		*buf++ = hex[*p >> 4];
		*buf++ = hex[*p & 0xf];
		len -= 3;
		p++;
	}

	/* either input is left over or there is no room for the NUL */
	if (len == 0 || *p)
		return (-1);

	*buf = '\0';
	return (0);
}
/*
 * Decode the percent-encoded string str into buf (len bytes).  Every
 * "%XX" triplet becomes the byte with that hexadecimal value; all other
 * bytes are copied unchanged.
 *
 * Returns 0 on success with buf NUL-terminated, or -1 when a '%' is not
 * followed by two hex digits or the decoded string and its terminator
 * do not fit in buf.
 */
int
iri_urlunescape(const char *str, char *buf, size_t len)
{
	char		 digits[3] = { '\0', '\0', '\0' };
	unsigned long	 byte;

	/* each iteration emits exactly one output byte */
	for (; *str != '\0'; len--) {
		if (len == 0)
			return (-1);

		if (*str != '%') {
			*buf++ = *str++;
			continue;
		}

		if (!isxdigit((unsigned char)str[1]) ||
		    !isxdigit((unsigned char)str[2]))
			return (-1);

		digits[0] = str[1];
		digits[1] = str[2];

		/* two validated hex digits always fit in one byte */
		byte = strtol(digits, NULL, 16);
		*buf++ = (unsigned char)byte;
		str += 3;
	}

	if (len == 0)
		return (-1);
	*buf = '\0';
	return (0);
}