Blame


1 3448adb0 2022-11-02 op if [ "$ENCODING" = "utf8" ]; then  # ENCODING is not set in this file; presumably provided by the caller -- confirm
2 3448adb0 2022-11-02 op UNIT="byte"  # unit word used in the generated text ("${UNIT}-offset", "in ${UNIT}s")
3 3448adb0 2022-11-02 op SUFFIX="_utf8"  # appended to the name of the function this page documents
4 3448adb0 2022-11-02 op ANTISUFFIX=""  # suffix of the counterpart function that the page cross-references via .Xr
5 3448adb0 2022-11-02 op else  # any value other than "utf8" selects the codepoint-array variant
6 3448adb0 2022-11-02 op UNIT="codepoint"
7 3448adb0 2022-11-02 op SUFFIX=""
8 3448adb0 2022-11-02 op ANTISUFFIX="_utf8"  # the counterpart of the codepoint variant is the _utf8 function
9 3448adb0 2022-11-02 op fi
10 3448adb0 2022-11-02 op
11 3448adb0 2022-11-02 op cat << EOF  # main mdoc page body; unquoted delimiter, so ${VAR} and $(...) in the text are expanded by the shell. The tr classes are quoted so they cannot glob against files in the working directory.
12 3448adb0 2022-11-02 op .Dd ${MAN_DATE}
13 3448adb0 2022-11-02 op .Dt GRAPHEME_NEXT_$(printf "%s_break%s" "$TYPE" "$SUFFIX" | tr '[:lower:]' '[:upper:]') 3
14 3448adb0 2022-11-02 op .Os suckless.org
15 3448adb0 2022-11-02 op .Sh NAME
16 3448adb0 2022-11-02 op .Nm grapheme_next_${TYPE}_break${SUFFIX}
17 3448adb0 2022-11-02 op .Nd determine ${UNIT}-offset to next ${REALTYPE} break
18 3448adb0 2022-11-02 op .Sh SYNOPSIS
19 3448adb0 2022-11-02 op .In grapheme.h
20 3448adb0 2022-11-02 op .Ft size_t
21 3448adb0 2022-11-02 op .Fn grapheme_next_${TYPE}_break${SUFFIX} "const $(if [ "$ENCODING" = "utf8" ]; then printf "char"; else printf "uint_least32_t"; fi) *str" "size_t len"
22 3448adb0 2022-11-02 op .Sh DESCRIPTION
23 3448adb0 2022-11-02 op The
24 3448adb0 2022-11-02 op .Fn grapheme_next_${TYPE}_break${SUFFIX}
25 3448adb0 2022-11-02 op function computes the offset (in ${UNIT}s) to the next ${REALTYPE}
26 3448adb0 2022-11-02 op break (see
27 3448adb0 2022-11-02 op .Xr libgrapheme 7 )
28 3448adb0 2022-11-02 op in the $(if [ "$ENCODING" = "utf8" ]; then printf "UTF-8-encoded string"; else printf "codepoint array"; fi)
29 3448adb0 2022-11-02 op .Va str
30 3448adb0 2022-11-02 op of length
31 3448adb0 2022-11-02 op .Va len .$(if [ "$TYPE" != "line" ]; then printf "\nIf a ${REALTYPE} begins at
32 3448adb0 2022-11-02 op .Va str
33 3448adb0 2022-11-02 op this offset is equal to the length of said ${REALTYPE}."; fi)
34 3448adb0 2022-11-02 op .Pp
35 3448adb0 2022-11-02 op If
36 3448adb0 2022-11-02 op .Va len
37 3448adb0 2022-11-02 op is set to
38 3448adb0 2022-11-02 op .Dv SIZE_MAX
39 3448adb0 2022-11-02 op (stdint.h is already included by grapheme.h) the string
40 3448adb0 2022-11-02 op .Va str
41 3448adb0 2022-11-02 op is interpreted to be NUL-terminated and processing stops when
42 3448adb0 2022-11-02 op a $(if [ "$ENCODING" = "utf8" ]; then printf "NUL-byte"; else printf "codepoint with the value 0"; fi) is encountered.
43 3448adb0 2022-11-02 op .Pp
44 3448adb0 2022-11-02 op For $(if [ "$ENCODING" != "utf8" ]; then printf "UTF-8-encoded"; else printf "non-UTF-8"; fi) input
45 3448adb0 2022-11-02 op data$(if [ "$TYPE" = "character" ] && [ "$ENCODING" = "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 and"; fi)
46 3448adb0 2022-11-02 op .Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3
47 3448adb0 2022-11-02 op can be used instead.
48 3448adb0 2022-11-02 op .Sh RETURN VALUES
49 3448adb0 2022-11-02 op The
50 3448adb0 2022-11-02 op .Fn grapheme_next_${TYPE}_break${SUFFIX}
51 3448adb0 2022-11-02 op function returns the offset (in ${UNIT}s) to the next ${REALTYPE}
52 3448adb0 2022-11-02 op break in
53 3448adb0 2022-11-02 op .Va str
54 3448adb0 2022-11-02 op or 0 if
55 3448adb0 2022-11-02 op .Va str
56 3448adb0 2022-11-02 op is
57 3448adb0 2022-11-02 op .Dv NULL .
58 3448adb0 2022-11-02 op EOF
59 3448adb0 2022-11-02 op
60 3448adb0 2022-11-02 op if [ "$ENCODING" = "utf8" ]; then  # the EXAMPLES section is emitted only for the UTF-8 variant of the page
61 3448adb0 2022-11-02 op cat << EOF  # unquoted delimiter: ${TYPE}/${REALTYPE} are expanded and every \\ pair in the text collapses to a single backslash in the output
62 3448adb0 2022-11-02 op .Sh EXAMPLES
63 3448adb0 2022-11-02 op .Bd -literal
64 3448adb0 2022-11-02 op /* cc (-static) -o example example.c -lgrapheme */
65 3448adb0 2022-11-02 op #include <grapheme.h>
66 3448adb0 2022-11-02 op #include <stdint.h>
67 3448adb0 2022-11-02 op #include <stdio.h>
68 3448adb0 2022-11-02 op
69 3448adb0 2022-11-02 op int
70 3448adb0 2022-11-02 op main(void)
71 3448adb0 2022-11-02 op {
72 3448adb0 2022-11-02 op /* UTF-8 encoded input */
73 3448adb0 2022-11-02 op char *s = "T\\\\xC3\\\\xABst \\\\xF0\\\\x9F\\\\x91\\\\xA8\\\\xE2\\\\x80\\\\x8D\\\\xF0"
74 3448adb0 2022-11-02 op "\\\\x9F\\\\x91\\\\xA9\\\\xE2\\\\x80\\\\x8D\\\\xF0\\\\x9F\\\\x91\\\\xA6 \\\\xF0"
75 3448adb0 2022-11-02 op "\\\\x9F\\\\x87\\\\xBA\\\\xF0\\\\x9F\\\\x87\\\\xB8 \\\\xE0\\\\xA4\\\\xA8\\\\xE0"
76 3448adb0 2022-11-02 op "\\\\xA5\\\\x80 \\\\xE0\\\\xAE\\\\xA8\\\\xE0\\\\xAE\\\\xBF!";
77 3448adb0 2022-11-02 op size_t ret, len, off;
78 3448adb0 2022-11-02 op
79 3448adb0 2022-11-02 op printf("Input: \\\\"%s\\\\"\\\\n", s);
80 3448adb0 2022-11-02 op
81 3448adb0 2022-11-02 op /* print each ${REALTYPE} with byte-length */
82 3448adb0 2022-11-02 op printf("${REALTYPE}s in NUL-delimited input:\\\\n");
83 3448adb0 2022-11-02 op for (off = 0; s[off] != '\\\\0'; off += ret) {
84 3448adb0 2022-11-02 op ret = grapheme_next_${TYPE}_break_utf8(s + off, SIZE_MAX);
85 3448adb0 2022-11-02 op printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
86 3448adb0 2022-11-02 op }
87 3448adb0 2022-11-02 op printf("\\\\n");
88 3448adb0 2022-11-02 op
89 3448adb0 2022-11-02 op /* do the same, but this time string is length-delimited */
90 3448adb0 2022-11-02 op len = 17;
91 3448adb0 2022-11-02 op printf("${REALTYPE}s in input delimited to %zu bytes:\\\\n", len);
92 3448adb0 2022-11-02 op for (off = 0; off < len; off += ret) {
93 3448adb0 2022-11-02 op ret = grapheme_next_${TYPE}_break_utf8(s + off, len - off);
94 3448adb0 2022-11-02 op printf("%2zu bytes | %.*s\\\\n", ret, (int)ret, s + off);
95 3448adb0 2022-11-02 op }
96 3448adb0 2022-11-02 op
97 3448adb0 2022-11-02 op return 0;
98 3448adb0 2022-11-02 op }
99 3448adb0 2022-11-02 op .Ed
100 3448adb0 2022-11-02 op EOF
101 3448adb0 2022-11-02 op fi  # end of the UTF-8-only EXAMPLES section
102 3448adb0 2022-11-02 op
103 3448adb0 2022-11-02 op cat << EOF  # closing sections (SEE ALSO, STANDARDS, AUTHORS); the non-UTF-8 "character" page gets an extra .Xr to grapheme_is_character_break
104 3448adb0 2022-11-02 op .Sh SEE ALSO$(if [ "$TYPE" = "character" ] && [ "$ENCODING" != "utf8" ]; then printf "\n.Xr grapheme_is_character_break 3 ,"; fi)
105 3448adb0 2022-11-02 op .Xr grapheme_next_${TYPE}_break${ANTISUFFIX} 3 ,
106 3448adb0 2022-11-02 op .Xr libgrapheme 7
107 3448adb0 2022-11-02 op .Sh STANDARDS
108 3448adb0 2022-11-02 op .Fn grapheme_next_${TYPE}_break${SUFFIX}
109 3448adb0 2022-11-02 op is compliant with the Unicode ${UNICODE_VERSION} specification.
110 3448adb0 2022-11-02 op .Sh AUTHORS
111 3448adb0 2022-11-02 op .An Laslo Hunhold Aq Mt dev@frign.de
112 3448adb0 2022-11-02 op EOF