Re: git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode

Reply: Christos Margiolis : "Re: git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode"
In reply to: Christos Margiolis : "git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode"
Go to: [ bottom of page ] [ top of archives ] [ this month ]
From: Dima Panov <fluffy_at_FreeBSD.org>
Date: Sat, 07 Oct 2023 21:04:19 UTC
Moin-moin!

It breaks the build %(


In file included from /usr/local/poudriere/jails/150aarch64/usr/src/sys/teken/teken.c:70:
/usr/local/poudriere/jails/150aarch64/usr/src/sys/teken/teken_wcwidth.h:132:7: error: call to undeclared function 'bitcount'; ISO C99 and later do not support implicit function declarations [-Werror,-Wimplicit-function-declaration]
   if (bitcount(bytes[0] & 0xf0) != nbytes)
       ^
1 error generated.
*** [teken.o] Error code 1

make[5]: stopped in /usr/local/poudriere/jails/150aarch64/usr/src/stand/efi/libefi



On 07.10.2023 21:00, Christos Margiolis wrote:
> The branch main has been updated by christos:
> 
> URL: https://cgit.FreeBSD.org/src/commit/?id=9e589b0938579f3f4d89fa5c051f845bf754184d
> 
> commit 9e589b0938579f3f4d89fa5c051f845bf754184d
> Author:     Bojan Novković <bojan.novkovic@fer.hr>
> AuthorDate: 2023-10-07 18:00:11 +0000
> Commit:     Christos Margiolis <christos@FreeBSD.org>
> CommitDate: 2023-10-07 18:00:11 +0000
> 
>      tty: fix improper backspace behaviour for UTF8 characters when in canonical mode
>      
>      This patch adds additional logic in ttydisc_rubchar() to properly handle
>      backspace behaviour for UTF-8 characters.
>      
>      Currently, typing in a backspace after a UTF8 character will delete only
>      one byte from the byte sequence, leaving garbled output in the tty's
>      output queue. With this change all of the character's bytes are deleted.
>      This change is only active when the IUTF8 flag is set (see
>      19054eb6053189144aa962b2ecc1bf5087758a3e "(s)tty: add support for IUTF8
>      input flag")
>      
>      The code uses the teken_wcwidth() function to properly handle character
>      column widths for different code points, and adds the
>      teken_utf8_bytes_to_codepoint() function that converts a UTF-8 byte
>      sequence to a codepoint, as specified in RFC3629.
>      
>      Reported by:    christos
>      Reviewed by:    christos, imp
>      MFC after:      2 weeks
>      Differential Revision:  https://reviews.freebsd.org/D42067
> ---
>   sys/kern/tty_ttydisc.c    | 74 +++++++++++++++++++++++++++++++++++++++++++++++
>   sys/teken/teken_wcwidth.h | 30 +++++++++++++++++++
>   2 files changed, 104 insertions(+)
> 
> diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c
> index 665275ee93e7..eae7162e31c0 100644
> --- a/sys/kern/tty_ttydisc.c
> +++ b/sys/kern/tty_ttydisc.c
> @@ -43,6 +43,9 @@
>   #include <sys/uio.h>
>   #include <sys/vnode.h>
>   
> +#include <teken/teken.h>
> +#include <teken/teken_wcwidth.h>
> +
>   /*
>    * Standard TTYDISC `termios' line discipline.
>    */
> @@ -78,8 +81,13 @@ SYSCTL_ULONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD,
>   /* Character is alphanumeric. */
>   #define CTL_ALNUM(c)	(((c) >= '0' && (c) <= '9') || \
>       ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
> +/* Character is UTF8-encoded. */
> +#define CTL_UTF8(c) (!!((c) & 0x80))
> +/* Character is a UTF8 continuation byte. */
> +#define CTL_UTF8_CONT(c) (((c) & 0xc0) == 0x80)
>   
>   #define	TTY_STACKBUF	256
> +#define UTF8_STACKBUF 4
>   
>   void
>   ttydisc_open(struct tty *tp)
> @@ -800,6 +808,72 @@ ttydisc_rubchar(struct tty *tp)
>   				ttyoutq_write_nofrag(&tp->t_outq,
>   				    "\b\b\b\b\b\b\b\b", tablen);
>   				return (0);
> +			} else if ((tp->t_termios.c_iflag & IUTF8) != 0 &&
> +			    CTL_UTF8(c)) {
> +				uint8_t bytes[UTF8_STACKBUF] = { 0 };
> +				int curidx = UTF8_STACKBUF - 1, cwidth = 1,
> +				    nb = 0;
> +				teken_char_t codepoint;
> +
> +				/* Save current byte. */
> +				bytes[curidx] = c;
> +				curidx--;
> +				nb++;
> +				/* Loop back through inq until we hit the
> +				 * leading byte. */
> +				while (CTL_UTF8_CONT(c) && nb < UTF8_STACKBUF) {
> +					ttyinq_peekchar(&tp->t_inq, &c, &quote);
> +					ttyinq_unputchar(&tp->t_inq);
> +					bytes[curidx] = c;
> +					curidx--;
> +					nb++;
> +				}
> +				/*
> +				 * Shift array so that the leading
> +				 * byte ends up at idx 0.
> +				 */
> +				if (nb < UTF8_STACKBUF)
> +					memmove(&bytes[0], &bytes[curidx + 1],
> +					    nb * sizeof(uint8_t));
> +				/* Check for malformed UTF8 characters. */
> +				if (nb == UTF8_STACKBUF &&
> +				    CTL_UTF8_CONT(bytes[0])) {
> +					/*
> +					 * Place all bytes back into the inq and
> +					 * delete the last byte only.
> +					 */
> +					ttyinq_write(&tp->t_inq, bytes,
> +					    UTF8_STACKBUF, 0);
> +				} else {
> +					/* Find codepoint and width. */
> +					codepoint =
> +					    teken_utf8_bytes_to_codepoint(bytes,
> +						nb);
> +					if (codepoint !=
> +					    TEKEN_UTF8_INVALID_CODEPOINT) {
> +						cwidth = teken_wcwidth(
> +						    codepoint);
> +					} else {
> +						/*
> +						 * Place all bytes back into the
> +						 * inq and fall back to
> +						 * default behaviour.
> +						 */
> +						ttyinq_write(&tp->t_inq, bytes,
> +						    nb, 0);
> +					}
> +				}
> +				tp->t_column -= cwidth;
> +				/*
> +				 * Delete character by punching
> +				 * 'cwidth' spaces over it.
> +				 */
> +				if (cwidth == 1)
> +					ttyoutq_write_nofrag(&tp->t_outq,
> +					    "\b \b", 3);
> +				else if (cwidth == 2)
> +					ttyoutq_write_nofrag(&tp->t_outq,
> +					    "\b\b  \b\b", 6);
>   			} else {
>   				/*
>   				 * Remove a regular character by
> diff --git a/sys/teken/teken_wcwidth.h b/sys/teken/teken_wcwidth.h
> index f57a185c2433..f5a23dbc9679 100644
> --- a/sys/teken/teken_wcwidth.h
> +++ b/sys/teken/teken_wcwidth.h
> @@ -8,6 +8,8 @@
>    * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
>    */
>   
> +#define TEKEN_UTF8_INVALID_CODEPOINT -1
> +
>   struct interval {
>     teken_char_t first;
>     teken_char_t last;
> @@ -116,3 +118,31 @@ static int teken_wcwidth(teken_char_t ucs)
>         (ucs >= 0x20000 && ucs <= 0x2fffd) ||
>         (ucs >= 0x30000 && ucs <= 0x3fffd)));
>   }
> +
> +/*
> + * Converts an UTF-8 byte sequence to a codepoint as specified in
> + * https://datatracker.ietf.org/doc/html/rfc3629#section-3 . The function
> + * expects the 'bytes' array to start with the leading character.
> + */
> +static teken_char_t
> +teken_utf8_bytes_to_codepoint(uint8_t bytes[4], int nbytes)
> +{
> +
> +  /* Check for malformed characters. */
> +  if (bitcount(bytes[0] & 0xf0) != nbytes)
> +    return (TEKEN_UTF8_INVALID_CODEPOINT);
> +
> +  switch (nbytes) {
> +  case 1:
> +    return (bytes[0] & 0x7f);
> +  case 2:
> +    return (bytes[0] & 0xf) << 6 | (bytes[1] & 0x3f);
> +  case 3:
> +    return (bytes[0] & 0xf) << 12 | (bytes[1] & 0x3f) << 6 | (bytes[2] & 0x3f);
> +  case 4:
> +    return (bytes[0] & 0x7) << 18 | (bytes[1] & 0x3f) << 12 |
> +	(bytes[2] & 0x3f) << 6 | (bytes[3] & 0x3f);
> +  default:
> +    return (TEKEN_UTF8_INVALID_CODEPOINT);
> +  }
> +}
> 

-- 
Sincerely,
Dima (fluffy@FreeBSD.org, https://t.me/FluffyBSD)
(desktop, kde, x11, office, ports-secteam)@FreeBSD team