commit 174b3cdfa1168b20ec02e75e0161e5c802a05e85
from: Omar Polo <op@omarpolo.com>
date: Sun Mar 21 09:42:41 2021 UTC

cursor handling with utf8 support

The cursor-movement code now respects the display width of each
character (zero, one or two cells).

commit - 31aa9f59f584734a946777192b4a2ba28f7fdb6c
commit + 174b3cdfa1168b20ec02e75e0161e5c802a05e85
blob - 895d05682bc4f894fe4911bee0909e6718614c07
blob + 4007f563f30d68620480edb1cc429007d24b2598
--- Makefile.am
+++ Makefile.am
@@ -18,6 +18,7 @@ telescope_SOURCES =	compat.h	\
 			ui.c		\
 			url.c		\
 			url.h		\
+			utf8.c		\
 			util.c		\
 			wrap.c
 
blob - 2fca8a23aff5a367b4058ead948eed9814202332
blob + 319abdfb8a27f50215b9bbe4230af846808b62b9
--- telescope.h
+++ telescope.h
@@ -254,6 +254,13 @@ void		 ui_require_input(struct tab*, int);
 void		 ui_notify(const char*, ...) __attribute__((format(printf, 1, 2)));
 void		 ui_end(void);
 
+/* utf8.c */
+char		*utf8_nth(char*, size_t);
+size_t		 utf8_cplen(char*);
+size_t		 utf8_chwidth(uint32_t);
+size_t		 utf8_snwidth(const char*, size_t);
+size_t		 utf8_swidth(const char*);
+
 /* util.c */
 int		 mark_nonblock(int);
 int		 has_prefix(const char*, const char*);
blob - 563538712e3edc8a5b57815f58ee4ec067a115c1
blob + bf812fbd3a8ba6fd7bfddf8740a956b0930cbec9
--- ui.c
+++ ui.c
@@ -371,14 +371,12 @@ restore_cursor(struct tab *tab)
 	if (vl == NULL || vl->line == NULL)
 		tab->s.curs_x = tab->s.line_x = 0;
 	else
-		tab->s.line_x = MIN(tab->s.line_x, strlen(vl->line));
+		tab->s.curs_x = utf8_snwidth(vl->line, tab->s.line_x);
 
 	if (vl != NULL) {
 		prfx = line_prefixes[vl->parent->type].prfx1;
-		tab->s.curs_x = tab->s.line_x + strlen(prfx);
+		tab->s.curs_x += utf8_swidth(prfx);
 	}
-
-	wmove(body, tab->s.curs_y, tab->s.curs_x);
 }
 
 static void
@@ -477,7 +475,7 @@ cmd_move_end_of_line(struct tab *tab)
 	vl = tab->s.current_line;
 	if (vl->line == NULL)
 		return;
-	tab->s.line_x = body_cols;
+	tab->s.line_x = utf8_cplen(vl->line);
 	restore_cursor(tab);
 }
 
@@ -501,6 +499,7 @@ cmd_scroll_line_up(struct tab *tab)
 	print_vline(vl);
 
 	tab->s.current_line = TAILQ_PREV(tab->s.current_line, vhead, vlines);
+	restore_cursor(tab);
 }
 
 static void
@@ -522,6 +521,8 @@ cmd_scroll_line_down(struct tab *tab)
 	vl = nth_line(tab, tab->s.line_off + body_lines-1);
 	wmove(body, body_lines-1, 0);
 	print_vline(vl);
+
+	restore_cursor(tab);
 }
 
 static void
@@ -1453,7 +1454,6 @@ redraw_tab(struct tab *tab)
 	redraw_modeline(tab);
 	redraw_minibuffer();
 
-	restore_cursor(tab);
 	wrefresh(tabline);
 	wrefresh(modeline);
 
@@ -1487,6 +1487,8 @@ redraw_body(struct tab *tab)
 		if (line == body_lines)
 			break;
 	}
+
+	wmove(body, tab->s.curs_y, tab->s.curs_x);
 }
 
 static void
blob - 341da330169c7050c05b46fbe6337da7c133f590
blob + a77cea6636e1e8fd8f3183ad8a30cb9d87518ddd
--- utf8.c
+++ utf8.c
@@ -21,10 +21,12 @@
  * SOFTWARE.
  */
 
-#include "gmid.h"
+#include "telescope.h"
 
+#include <assert.h>
 #include <stddef.h>
 #include <stdint.h>
+#include <wchar.h>
 
 #define UTF8_ACCEPT 0
 #define UTF8_REJECT 1
@@ -47,7 +49,7 @@ static const uint8_t utf8d[] = {
 };
 
 static inline uint32_t
-utf8_decode(uint32_t* state, uint32_t* codep, uint8_t byte) {
+utf8_decode(uint32_t* restrict state, uint32_t* restrict codep, uint8_t byte) {
 	uint32_t type = utf8d[byte];
 
 	*codep = (*state != UTF8_ACCEPT) ?
@@ -58,26 +60,9 @@ utf8_decode(uint32_t* state, uint32_t* codep, uint8_t 
 	return *state;
 }
 
-/* for the iri parser.  Modelled after printCodePoints */
-int
-valid_multibyte_utf8(struct parser *p)
-{
-	uint32_t cp = 0, state = 0;
+
+/* end of the converter, utility functions ahead */
 
-        for (; *p->iri; p->iri++)
-		if (!utf8_decode(&state, &cp, *p->iri))
-			break;
-
-	/* reject the ASCII range */
-	if (state || cp <= 0x7F) {
-		/* XXX: do some error recovery? */
-		if (state)
-			p->err = "invalid UTF-8 character";
-		return 0;
-	}
-	return 1;
-}
-
 char *
 utf8_nth(char *s, size_t n)
 {
@@ -94,3 +79,61 @@ utf8_nth(char *s, size_t n)
 		return s;
 	return NULL;
 }
+
+size_t
+utf8_cplen(char *s)
+{
+	uint32_t cp = 0, state = 0;
+	size_t len;
+
+	len = 0;
+	for (; *s; ++s)
+		if (!utf8_decode(&state, &cp, *s))
+			len++;
+	return len;
+}
+
+/* 0, 1 or 2 cells; wcwidth's -1 (non-printable) is NOT handled.  assumes sizeof(wchar_t) is 4 */
+size_t
+utf8_chwidth(uint32_t cp)
+{
+	/* XXX: if we're running on a platform where sizeof(wchar_t)
+	 * == 2 what to do?  The manpage for wcwidth and wcs isn't
+	 * clear about the encoding, but if it's 16 bit wide I assume
+	 * it must use UTF-16... right? */
+	assert(sizeof(wchar_t) == 4);
+
+	return wcwidth((wchar_t)cp);
+}
+
+/* NOTE: n is the number of codepoints, NOT the byte length.  In
+ * other words, s MUST be NUL-terminated. */
+size_t
+utf8_snwidth(const char *s, size_t n)
+{
+	size_t i, tot;
+	uint32_t cp = 0, state = 0;
+
+	tot = 0;
+	for (i = 0; *s && i < n; ++s)
+		if (!utf8_decode(&state, &cp, *s)) {
+			i++;
+			tot += utf8_chwidth(cp);
+		}
+
+	return tot;
+}
+
+size_t
+utf8_swidth(const char *s)
+{
+	size_t tot;
+	uint32_t cp = 0, state = 0;
+
+	tot = 0;
+	for (; *s; ++s)
+		if (!utf8_decode(&state, &cp, *s))
+			tot += utf8_chwidth(cp);
+
+	return tot;
+}