Blame


1 3448adb0 2022-11-02 op cat << EOF
2 3448adb0 2022-11-02 op .Dd ${MAN_DATE}
3 3448adb0 2022-11-02 op .Dt GRAPHEME_DECODE_UTF8 3
4 3448adb0 2022-11-02 op .Os suckless.org
5 3448adb0 2022-11-02 op .Sh NAME
6 3448adb0 2022-11-02 op .Nm grapheme_decode_utf8
7 3448adb0 2022-11-02 op .Nd decode first codepoint in UTF-8-encoded string
8 3448adb0 2022-11-02 op .Sh SYNOPSIS
9 3448adb0 2022-11-02 op .In grapheme.h
10 3448adb0 2022-11-02 op .Ft size_t
11 3448adb0 2022-11-02 op .Fn grapheme_decode_utf8 "const char *str" "size_t len" "uint_least32_t *cp"
12 3448adb0 2022-11-02 op .Sh DESCRIPTION
13 3448adb0 2022-11-02 op The
14 3448adb0 2022-11-02 op .Fn grapheme_decode_utf8
15 3448adb0 2022-11-02 op function decodes the first codepoint in the UTF-8-encoded string
16 3448adb0 2022-11-02 op .Va str
17 3448adb0 2022-11-02 op of length
18 3448adb0 2022-11-02 op .Va len .
19 3448adb0 2022-11-02 op If the UTF-8 sequence is invalid (overlong encoding, unexpected byte,
20 3448adb0 2022-11-02 op string ends unexpectedly, empty string, etc.) the decoding is stopped
21 3448adb0 2022-11-02 op at the last processed byte and the decoded codepoint set to
22 3448adb0 2022-11-02 op .Dv GRAPHEME_INVALID_CODEPOINT .
23 3448adb0 2022-11-02 op .Pp
24 3448adb0 2022-11-02 op If
25 3448adb0 2022-11-02 op .Va cp
26 3448adb0 2022-11-02 op is not
27 3448adb0 2022-11-02 op .Dv NULL ,
28 3448adb0 2022-11-02 op the decoded codepoint is stored in the memory pointed to by
29 3448adb0 2022-11-02 op .Va cp .
30 3448adb0 2022-11-02 op .Pp
31 3448adb0 2022-11-02 op Given that NUL has a unique 1-byte representation, it is safe to operate on
32 3448adb0 2022-11-02 op NUL-terminated strings by setting
33 3448adb0 2022-11-02 op .Va len
34 3448adb0 2022-11-02 op to
35 3448adb0 2022-11-02 op .Dv SIZE_MAX
36 3448adb0 2022-11-02 op (stdint.h is already included by grapheme.h) and terminating when
37 3448adb0 2022-11-02 op .Va cp
38 3448adb0 2022-11-02 op is 0 (see
39 3448adb0 2022-11-02 op .Sx EXAMPLES
40 3448adb0 2022-11-02 op for an example).
41 3448adb0 2022-11-02 op .Sh RETURN VALUES
42 3448adb0 2022-11-02 op The
43 3448adb0 2022-11-02 op .Fn grapheme_decode_utf8
44 3448adb0 2022-11-02 op function returns the number of processed bytes, or 0 if
45 3448adb0 2022-11-02 op .Va str
46 3448adb0 2022-11-02 op is
47 3448adb0 2022-11-02 op .Dv NULL
48 3448adb0 2022-11-02 op or
49 3448adb0 2022-11-02 op .Va len
50 3448adb0 2022-11-02 op is 0.
51 3448adb0 2022-11-02 op If the string ends unexpectedly in a multibyte sequence, the length
52 3448adb0 2022-11-02 op needed for the full sequence (which is larger than
53 3448adb0 2022-11-02 op .Va len )
54 3448adb0 2022-11-02 op is returned.
55 3448adb0 2022-11-02 op .Sh EXAMPLES
56 3448adb0 2022-11-02 op .Bd -literal
57 3448adb0 2022-11-02 op /* cc (-static) -o example example.c -lgrapheme */
58 3448adb0 2022-11-02 op #include <grapheme.h>
59 3448adb0 2022-11-02 op #include <inttypes.h>
60 3448adb0 2022-11-02 op #include <stdio.h>
61 3448adb0 2022-11-02 op
62 3448adb0 2022-11-02 op void
63 3448adb0 2022-11-02 op print_cps(const char *str, size_t len)
64 3448adb0 2022-11-02 op {
65 3448adb0 2022-11-02 op size_t ret, off;
66 3448adb0 2022-11-02 op uint_least32_t cp;
67 3448adb0 2022-11-02 op
68 3448adb0 2022-11-02 op for (off = 0; off < len; off += ret) {
69 3448adb0 2022-11-02 op if ((ret = grapheme_decode_utf8(str + off,
70 3448adb0 2022-11-02 op len - off, &cp)) > (len - off)) {
71 3448adb0 2022-11-02 op /*
72 3448adb0 2022-11-02 op * string ended unexpectedly in the middle of a
73 3448adb0 2022-11-02 op * multibyte sequence and we have the choice
74 3448adb0 2022-11-02 op * here to possibly expand str by ret - len + off
75 3448adb0 2022-11-02 op * bytes to get a full sequence, but we just
76 3448adb0 2022-11-02 op * bail out in this case.
77 3448adb0 2022-11-02 op */
78 3448adb0 2022-11-02 op break;
79 3448adb0 2022-11-02 op }
80 3448adb0 2022-11-02 op printf("%"PRIxLEAST32"\\\\n", cp);
81 3448adb0 2022-11-02 op }
82 3448adb0 2022-11-02 op }
83 3448adb0 2022-11-02 op
84 3448adb0 2022-11-02 op void
85 3448adb0 2022-11-02 op print_cps_nul_terminated(const char *str)
86 3448adb0 2022-11-02 op {
87 3448adb0 2022-11-02 op size_t ret, off;
88 3448adb0 2022-11-02 op uint_least32_t cp;
89 3448adb0 2022-11-02 op
90 3448adb0 2022-11-02 op for (off = 0; (ret = grapheme_decode_utf8(str + off,
91 3448adb0 2022-11-02 op SIZE_MAX, &cp)) > 0 &&
92 3448adb0 2022-11-02 op cp != 0; off += ret) {
93 3448adb0 2022-11-02 op printf("%"PRIxLEAST32"\\\\n", cp);
94 3448adb0 2022-11-02 op }
95 3448adb0 2022-11-02 op }
96 3448adb0 2022-11-02 op .Ed
97 3448adb0 2022-11-02 op .Sh SEE ALSO
98 3448adb0 2022-11-02 op .Xr grapheme_encode_utf8 3 ,
99 3448adb0 2022-11-02 op .Xr libgrapheme 7
100 3448adb0 2022-11-02 op .Sh AUTHORS
101 3448adb0 2022-11-02 op .An Laslo Hunhold Aq Mt dev@frign.de
102 3448adb0 2022-11-02 op EOF