Re: git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode
- Reply: Christos Margiolis : "Re: git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode"
- In reply to: Christos Margiolis : "git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode"
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Sat, 07 Oct 2023 21:04:19 UTC
Moin-moin! It breaks the build %( In file included from /usr/local/poudriere/jails/150aarch64/usr/src/sys/teken/teken.c:70: /usr/local/poudriere/jails/150aarch64/usr/src/sys/teken/teken_wcwidth.h:132:7: error: call to undeclared function 'bitcount'; ISO C99 and later do not support implicit function declarations [-Werror,-Wimplicit-function-declaration] if (bitcount(bytes[0] & 0xf0) != nbytes) ^ 1 error generated. *** [teken.o] Error code 1 make[5]: stopped in /usr/local/poudriere/jails/150aarch64/usr/src/stand/efi/libefi On 07.10.2023 21:00, Christos Margiolis wrote: > The branch main has been updated by christos: > > URL: https://cgit.FreeBSD.org/src/commit/?id=9e589b0938579f3f4d89fa5c051f845bf754184d > > commit 9e589b0938579f3f4d89fa5c051f845bf754184d > Author: Bojan Novković <bojan.novkovic@fer.hr> > AuthorDate: 2023-10-07 18:00:11 +0000 > Commit: Christos Margiolis <christos@FreeBSD.org> > CommitDate: 2023-10-07 18:00:11 +0000 > > tty: fix improper backspace behaviour for UTF8 characters when in canonical mode > > This patch adds additional logic in ttydisc_rubchar() to properly handle > backspace behaviour for UTF-8 characters. > > Currently, typing in a backspace after a UTF8 character will delete only > one byte from the byte sequence, leaving garbled output in the tty's > output queue. With this change all of the character's bytes are deleted. > This change is only active when the IUTF8 flag is set (see > 19054eb6053189144aa962b2ecc1bf5087758a3e "(s)tty: add support for IUTF8 > input flag") > > The code uses the teken_wcwidth() function to properly handle character > column widths for different code points, and adds the > teken_utf8_bytes_to_codepoint() function that converts a UTF-8 byte > sequence to a codepoint, as specified in RFC3629. 
> > Reported by: christos > Reviewed by: christos, imp > MFC after: 2 weeks > Differential Revision: https://reviews.freebsd.org/D42067 > --- > sys/kern/tty_ttydisc.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++ > sys/teken/teken_wcwidth.h | 30 +++++++++++++++++++ > 2 files changed, 104 insertions(+) > > diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c > index 665275ee93e7..eae7162e31c0 100644 > --- a/sys/kern/tty_ttydisc.c > +++ b/sys/kern/tty_ttydisc.c > @@ -43,6 +43,9 @@ > #include <sys/uio.h> > #include <sys/vnode.h> > > +#include <teken/teken.h> > +#include <teken/teken_wcwidth.h> > + > /* > * Standard TTYDISC `termios' line discipline. > */ > @@ -78,8 +81,13 @@ SYSCTL_ULONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD, > /* Character is alphanumeric. */ > #define CTL_ALNUM(c) (((c) >= '0' && (c) <= '9') || \ > ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) > +/* Character is UTF8-encoded. */ > +#define CTL_UTF8(c) (!!((c) & 0x80)) > +/* Character is a UTF8 continuation byte. */ > +#define CTL_UTF8_CONT(c) (((c) & 0xc0) == 0x80) > > #define TTY_STACKBUF 256 > +#define UTF8_STACKBUF 4 > > void > ttydisc_open(struct tty *tp) > @@ -800,6 +808,72 @@ ttydisc_rubchar(struct tty *tp) > ttyoutq_write_nofrag(&tp->t_outq, > "\b\b\b\b\b\b\b\b", tablen); > return (0); > + } else if ((tp->t_termios.c_iflag & IUTF8) != 0 && > + CTL_UTF8(c)) { > + uint8_t bytes[UTF8_STACKBUF] = { 0 }; > + int curidx = UTF8_STACKBUF - 1, cwidth = 1, > + nb = 0; > + teken_char_t codepoint; > + > + /* Save current byte. */ > + bytes[curidx] = c; > + curidx--; > + nb++; > + /* Loop back through inq until we hit the > + * leading byte. */ > + while (CTL_UTF8_CONT(c) && nb < UTF8_STACKBUF) { > + ttyinq_peekchar(&tp->t_inq, &c, &quote); > + ttyinq_unputchar(&tp->t_inq); > + bytes[curidx] = c; > + curidx--; > + nb++; > + } > + /* > + * Shift array so that the leading > + * byte ends up at idx 0. 
> + */ > + if (nb < UTF8_STACKBUF) > + memmove(&bytes[0], &bytes[curidx + 1], > + nb * sizeof(uint8_t)); > + /* Check for malformed UTF8 characters. */ > + if (nb == UTF8_STACKBUF && > + CTL_UTF8_CONT(bytes[0])) { > + /* > + * Place all bytes back into the inq and > + * delete the last byte only. > + */ > + ttyinq_write(&tp->t_inq, bytes, > + UTF8_STACKBUF, 0); > + } else { > + /* Find codepoint and width. */ > + codepoint = > + teken_utf8_bytes_to_codepoint(bytes, > + nb); > + if (codepoint != > + TEKEN_UTF8_INVALID_CODEPOINT) { > + cwidth = teken_wcwidth( > + codepoint); > + } else { > + /* > + * Place all bytes back into the > + * inq and fall back to > + * default behaviour. > + */ > + ttyinq_write(&tp->t_inq, bytes, > + nb, 0); > + } > + } > + tp->t_column -= cwidth; > + /* > + * Delete character by punching > + * 'cwidth' spaces over it. > + */ > + if (cwidth == 1) > + ttyoutq_write_nofrag(&tp->t_outq, > + "\b \b", 3); > + else if (cwidth == 2) > + ttyoutq_write_nofrag(&tp->t_outq, > + "\b\b \b\b", 6); > } else { > /* > * Remove a regular character by > diff --git a/sys/teken/teken_wcwidth.h b/sys/teken/teken_wcwidth.h > index f57a185c2433..f5a23dbc9679 100644 > --- a/sys/teken/teken_wcwidth.h > +++ b/sys/teken/teken_wcwidth.h > @@ -8,6 +8,8 @@ > * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c > */ > > +#define TEKEN_UTF8_INVALID_CODEPOINT -1 > + > struct interval { > teken_char_t first; > teken_char_t last; > @@ -116,3 +118,31 @@ static int teken_wcwidth(teken_char_t ucs) > (ucs >= 0x20000 && ucs <= 0x2fffd) || > (ucs >= 0x30000 && ucs <= 0x3fffd))); > } > + > +/* > + * Converts an UTF-8 byte sequence to a codepoint as specified in > + * https://datatracker.ietf.org/doc/html/rfc3629#section-3 . The function > + * expects the 'bytes' array to start with the leading character. > + */ > +static teken_char_t > +teken_utf8_bytes_to_codepoint(uint8_t bytes[4], int nbytes) > +{ > + > + /* Check for malformed characters. 
*/ > + if (bitcount(bytes[0] & 0xf0) != nbytes) > + return (TEKEN_UTF8_INVALID_CODEPOINT); > + > + switch (nbytes) { > + case 1: > + return (bytes[0] & 0x7f); > + case 2: > + return (bytes[0] & 0xf) << 6 | (bytes[1] & 0x3f); > + case 3: > + return (bytes[0] & 0xf) << 12 | (bytes[1] & 0x3f) << 6 | (bytes[2] & 0x3f); > + case 4: > + return (bytes[0] & 0x7) << 18 | (bytes[1] & 0x3f) << 12 | > + (bytes[2] & 0x3f) << 6 | (bytes[3] & 0x3f); > + default: > + return (TEKEN_UTF8_INVALID_CODEPOINT); > + } > +} > -- Sincerely, Dima (fluffy@FreeBSD.org, https://t.me/FluffyBSD) (desktop, kde, x11, office, ports-secteam)@FreeBSD team