2 70d237df 2021-07-25 op * Copyright (c) 2021 Omar Polo <op@omarpolo.com>
4 70d237df 2021-07-25 op * Permission to use, copy, modify, and distribute this software for any
5 70d237df 2021-07-25 op * purpose with or without fee is hereby granted, provided that the above
6 70d237df 2021-07-25 op * copyright notice and this permission notice appear in all copies.
8 70d237df 2021-07-25 op * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
9 70d237df 2021-07-25 op * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
10 70d237df 2021-07-25 op * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
11 70d237df 2021-07-25 op * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
12 70d237df 2021-07-25 op * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
13 70d237df 2021-07-25 op * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
14 70d237df 2021-07-25 op * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18 70d237df 2021-07-25 op * A streaming gemtext parser.
21 70d237df 2021-07-25 op * - handle NULs
25 70d237df 2021-07-25 op #include "compat.h"
27 70d237df 2021-07-25 op #include <ctype.h>
28 70d237df 2021-07-25 op #include <string.h>
29 70d237df 2021-07-25 op #include <stdlib.h>
31 70d237df 2021-07-25 op #include "defaults.h"
32 70d237df 2021-07-25 op #include "parser.h"
33 70d237df 2021-07-25 op #include "utf8.h"
35 70d237df 2021-07-25 op static int gemtext_parse(struct parser*, const char*, size_t);
36 70d237df 2021-07-25 op static int gemtext_foreach_line(struct parser*, const char*, size_t);
37 70d237df 2021-07-25 op static int gemtext_free(struct parser*);
39 70d237df 2021-07-25 op static int parse_text(struct parser*, enum line_type, const char*, size_t);
40 70d237df 2021-07-25 op static int parse_link(struct parser*, enum line_type, const char*, size_t);
41 70d237df 2021-07-25 op static int parse_title(struct parser*, enum line_type, const char*, size_t);
42 70d237df 2021-07-25 op static int parse_item(struct parser*, enum line_type, const char*, size_t);
43 70d237df 2021-07-25 op static int parse_quote(struct parser*, enum line_type, const char*, size_t);
44 70d237df 2021-07-25 op static int parse_pre_start(struct parser*, enum line_type, const char*, size_t);
45 70d237df 2021-07-25 op static int parse_pre_cnt(struct parser*, enum line_type, const char*, size_t);
46 70d237df 2021-07-25 op static int parse_pre_end(struct parser*, enum line_type, const char*, size_t);
47 70d237df 2021-07-25 op static void search_title(struct parser*, enum line_type);
49 70d237df 2021-07-25 op typedef int (parselinefn)(struct parser*, enum line_type, const char*, size_t);
51 70d237df 2021-07-25 op static parselinefn *parsers[] = {
52 70d237df 2021-07-25 op [LINE_TEXT] = parse_text,
53 70d237df 2021-07-25 op [LINE_LINK] = parse_link,
54 70d237df 2021-07-25 op [LINE_TITLE_1] = parse_title,
55 70d237df 2021-07-25 op [LINE_TITLE_2] = parse_title,
56 70d237df 2021-07-25 op [LINE_TITLE_3] = parse_title,
57 70d237df 2021-07-25 op [LINE_ITEM] = parse_item,
58 70d237df 2021-07-25 op [LINE_QUOTE] = parse_quote,
59 70d237df 2021-07-25 op [LINE_PRE_START] = parse_pre_start,
60 70d237df 2021-07-25 op [LINE_PRE_CONTENT] = parse_pre_cnt,
61 70d237df 2021-07-25 op [LINE_PRE_END] = parse_pre_end,
65 70d237df 2021-07-25 op gemtext_initparser(struct parser *p)
67 70d237df 2021-07-25 op memset(p, 0, sizeof(*p));
69 70d237df 2021-07-25 op p->name = "text/gemini";
70 70d237df 2021-07-25 op p->parse = &gemtext_parse;
71 70d237df 2021-07-25 op p->free = &gemtext_free;
73 78894e73 2021-08-12 op TAILQ_INIT(&p->head);
76 70d237df 2021-07-25 op static inline int
77 70d237df 2021-07-25 op emit_line(struct parser *p, enum line_type type, char *line, char *alt)
79 70d237df 2021-07-25 op struct line *l;
81 70d237df 2021-07-25 op if ((l = calloc(1, sizeof(*l))) == NULL)
84 70d237df 2021-07-25 op l->type = type;
85 70d237df 2021-07-25 op l->line = line;
88 70d237df 2021-07-25 op switch (l->type) {
89 70d237df 2021-07-25 op case LINE_PRE_START:
90 70d237df 2021-07-25 op case LINE_PRE_END:
91 70d237df 2021-07-25 op if (hide_pre_context)
92 70d237df 2021-07-25 op l->flags = L_HIDDEN;
93 70d237df 2021-07-25 op if (l->type == LINE_PRE_END &&
94 70d237df 2021-07-25 op hide_pre_closing_line)
95 70d237df 2021-07-25 op l->flags = L_HIDDEN;
97 70d237df 2021-07-25 op case LINE_PRE_CONTENT:
98 70d237df 2021-07-25 op if (hide_pre_blocks)
99 70d237df 2021-07-25 op l->flags = L_HIDDEN;
101 70d237df 2021-07-25 op case LINE_LINK:
102 70d237df 2021-07-25 op if (emojify_link &&
103 70d237df 2021-07-25 op !emojied_line(line, (const char **)&l->data))
104 70d237df 2021-07-25 op l->data = NULL;
110 32ac17a4 2021-08-12 op TAILQ_INSERT_TAIL(&p->head, l, lines);
116 70d237df 2021-07-25 op parse_text(struct parser *p, enum line_type t, const char *buf, size_t len)
120 70d237df 2021-07-25 op if ((l = calloc(1, len+1)) == NULL)
122 70d237df 2021-07-25 op memcpy(l, buf, len);
123 70d237df 2021-07-25 op return emit_line(p, t, l, NULL);
127 70d237df 2021-07-25 op parse_link(struct parser *p, enum line_type t, const char *buf, size_t len)
130 70d237df 2021-07-25 op const char *url_start;
132 70d237df 2021-07-25 op if (len <= 2)
133 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
137 70d237df 2021-07-25 op while (len > 0 && isspace(buf[0])) {
142 70d237df 2021-07-25 op if (len == 0)
143 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
145 70d237df 2021-07-25 op url_start = buf;
146 70d237df 2021-07-25 op while (len > 0 && !isspace(buf[0])) {
151 70d237df 2021-07-25 op if ((u = calloc(1, buf - url_start + 1)) == NULL)
153 70d237df 2021-07-25 op memcpy(u, url_start, buf - url_start);
155 70d237df 2021-07-25 op if (len == 0)
156 70d237df 2021-07-25 op goto nolabel;
158 70d237df 2021-07-25 op while (len > 0 && isspace(buf[0])) {
163 70d237df 2021-07-25 op if (len == 0)
164 70d237df 2021-07-25 op goto nolabel;
166 70d237df 2021-07-25 op if ((l = calloc(1, len + 1)) == NULL)
169 70d237df 2021-07-25 op memcpy(l, buf, len);
170 70d237df 2021-07-25 op return emit_line(p, t, l, u);
173 70d237df 2021-07-25 op if ((l = strdup(u)) == NULL)
175 70d237df 2021-07-25 op return emit_line(p, t, l, u);
179 70d237df 2021-07-25 op parse_title(struct parser *p, enum line_type t, const char *buf, size_t len)
184 70d237df 2021-07-25 op case LINE_TITLE_1:
185 70d237df 2021-07-25 op if (len <= 1)
186 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
190 70d237df 2021-07-25 op case LINE_TITLE_2:
191 70d237df 2021-07-25 op if (len <= 2)
192 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
196 70d237df 2021-07-25 op case LINE_TITLE_3:
197 70d237df 2021-07-25 op if (len <= 3)
198 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
203 70d237df 2021-07-25 op /* unreachable */
207 70d237df 2021-07-25 op while (len > 0 && isspace(buf[0])) {
212 70d237df 2021-07-25 op if (len == 0)
213 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
215 70d237df 2021-07-25 op if (t == LINE_TITLE_1 && *p->title == '\0')
216 70d237df 2021-07-25 op strncpy(p->title, buf, MIN(sizeof(p->title)-1, len));
218 70d237df 2021-07-25 op if ((l = calloc(1, len+1)) == NULL)
220 70d237df 2021-07-25 op memcpy(l, buf, len);
221 70d237df 2021-07-25 op return emit_line(p, t, l, NULL);
225 70d237df 2021-07-25 op parse_item(struct parser *p, enum line_type t, const char *buf, size_t len)
229 70d237df 2021-07-25 op if (len == 1)
230 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
235 70d237df 2021-07-25 op while (len > 0 && isspace(buf[0])) {
240 70d237df 2021-07-25 op if (len == 0)
241 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
243 70d237df 2021-07-25 op if ((l = calloc(1, len+1)) == NULL)
245 70d237df 2021-07-25 op memcpy(l, buf, len);
246 70d237df 2021-07-25 op return emit_line(p, t, l, NULL);
250 70d237df 2021-07-25 op parse_quote(struct parser *p, enum line_type t, const char *buf, size_t len)
254 70d237df 2021-07-25 op if (len == 1)
255 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
260 70d237df 2021-07-25 op while (len > 0 && isspace(buf[0])) {
265 70d237df 2021-07-25 op if (len == 0)
266 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
268 70d237df 2021-07-25 op if ((l = calloc(1, len+1)) == NULL)
270 70d237df 2021-07-25 op memcpy(l, buf, len);
271 70d237df 2021-07-25 op return emit_line(p, t, l, NULL);
275 70d237df 2021-07-25 op parse_pre_start(struct parser *p, enum line_type t, const char *buf, size_t len)
279 70d237df 2021-07-25 op if (len <= 3)
280 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
285 70d237df 2021-07-25 op while (len > 0 && isspace(buf[0])) {
290 70d237df 2021-07-25 op if (len == 0)
291 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
293 70d237df 2021-07-25 op if ((l = calloc(1, len+1)) == NULL)
296 70d237df 2021-07-25 op memcpy(l, buf, len);
297 70d237df 2021-07-25 op return emit_line(p, t, l, NULL);
301 70d237df 2021-07-25 op parse_pre_cnt(struct parser *p, enum line_type t, const char *buf, size_t len)
305 70d237df 2021-07-25 op if (len == 0)
306 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
308 70d237df 2021-07-25 op if ((l = calloc(1, len+1)) == NULL)
310 70d237df 2021-07-25 op memcpy(l, buf, len);
311 70d237df 2021-07-25 op return emit_line(p, t, l, NULL);
315 70d237df 2021-07-25 op parse_pre_end(struct parser *p, enum line_type t, const char *buf, size_t len)
317 70d237df 2021-07-25 op return emit_line(p, t, NULL, NULL);
320 70d237df 2021-07-25 op static inline enum line_type
321 70d237df 2021-07-25 op detect_line_type(const char *buf, size_t len, int in_pre)
323 70d237df 2021-07-25 op if (in_pre) {
324 70d237df 2021-07-25 op if (len >= 3 &&
325 70d237df 2021-07-25 op buf[0] == '`' && buf[1] == '`' && buf[2] == '`')
326 70d237df 2021-07-25 op return LINE_PRE_END;
328 70d237df 2021-07-25 op return LINE_PRE_CONTENT;
331 70d237df 2021-07-25 op if (len == 0)
332 70d237df 2021-07-25 op return LINE_TEXT;
334 70d237df 2021-07-25 op switch (*buf) {
336 db056a13 2021-10-26 op if (len > 1 && buf[1] == ' ')
337 db056a13 2021-10-26 op return LINE_ITEM;
339 70d237df 2021-07-25 op case '>': return LINE_QUOTE;
341 70d237df 2021-07-25 op if (len >= 1 && buf[1] == '>')
342 70d237df 2021-07-25 op return LINE_LINK;
345 70d237df 2021-07-25 op if (len == 1)
346 70d237df 2021-07-25 op return LINE_TEXT;
347 70d237df 2021-07-25 op if (buf[1] != '#')
348 70d237df 2021-07-25 op return LINE_TITLE_1;
349 70d237df 2021-07-25 op if (len == 2)
350 70d237df 2021-07-25 op return LINE_TEXT;
351 70d237df 2021-07-25 op if (buf[2] != '#')
352 70d237df 2021-07-25 op return LINE_TITLE_2;
353 70d237df 2021-07-25 op if (len == 3)
354 70d237df 2021-07-25 op return LINE_TEXT;
355 70d237df 2021-07-25 op return LINE_TITLE_3;
358 70d237df 2021-07-25 op return LINE_TEXT;
359 70d237df 2021-07-25 op if (buf[0] == '`' && buf[1] == '`' && buf[2] == '`')
360 70d237df 2021-07-25 op return LINE_PRE_START;
364 70d237df 2021-07-25 op return LINE_TEXT;
368 70d237df 2021-07-25 op gemtext_parse(struct parser *p, const char *buf, size_t size)
370 70d237df 2021-07-25 op return parser_foreach_line(p, buf, size, gemtext_foreach_line);
374 70d237df 2021-07-25 op gemtext_foreach_line(struct parser *p, const char *line, size_t linelen)
376 70d237df 2021-07-25 op enum line_type t;
378 70d237df 2021-07-25 op t = detect_line_type(line, linelen, p->flags & PARSER_IN_PRE);
379 70d237df 2021-07-25 op if (t == LINE_PRE_START)
380 70d237df 2021-07-25 op p->flags ^= PARSER_IN_PRE;
381 70d237df 2021-07-25 op if (t == LINE_PRE_END)
382 70d237df 2021-07-25 op p->flags ^= PARSER_IN_PRE;
383 70d237df 2021-07-25 op return parsers[t](p, t, line, linelen);
387 70d237df 2021-07-25 op gemtext_free(struct parser *p)
389 70d237df 2021-07-25 op enum line_type t;
391 70d237df 2021-07-25 op /* flush the buffer */
392 70d237df 2021-07-25 op if (p->len != 0) {
393 70d237df 2021-07-25 op t = detect_line_type(p->buf, p->len, p->flags & PARSER_IN_PRE);
394 70d237df 2021-07-25 op if (!parsers[t](p, t, p->buf, p->len))
396 70d237df 2021-07-25 op if ((p->flags & PARSER_IN_PRE) &&
397 70d237df 2021-07-25 op !emit_line(p, LINE_PRE_END, NULL, NULL))
401 70d237df 2021-07-25 op free(p->buf);
404 70d237df 2021-07-25 op * use the first level 2 or 3 header as page title if none
407 70d237df 2021-07-25 op if (*p->title == '\0')
408 70d237df 2021-07-25 op search_title(p, LINE_TITLE_2);
409 70d237df 2021-07-25 op if (*p->title == '\0')
410 70d237df 2021-07-25 op search_title(p, LINE_TITLE_3);
416 70d237df 2021-07-25 op search_title(struct parser *p, enum line_type level)
418 70d237df 2021-07-25 op struct line *l;
420 70d237df 2021-07-25 op TAILQ_FOREACH(l, &p->head, lines) {
421 70d237df 2021-07-25 op if (l->type == level) {
422 70d237df 2021-07-25 op if (l->line == NULL)
424 70d237df 2021-07-25 op strlcpy(p->title, l->line, sizeof(p->title));