Re: git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode
- Reply: Christos Margiolis : "Re: git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode"
- In reply to: Christos Margiolis : "git: 9e589b093857 - main - tty: fix improper backspace behaviour for UTF8 characters when in canonical mode"
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Sat, 07 Oct 2023 21:04:19 UTC
Moin-moin!
It breaks the build %(
In file included from /usr/local/poudriere/jails/150aarch64/usr/src/sys/teken/teken.c:70:
/usr/local/poudriere/jails/150aarch64/usr/src/sys/teken/teken_wcwidth.h:132:7: error: call to undeclared function 'bitcount'; ISO C99 and later do not support implicit function declarations [-Werror,-Wimplicit-function-declaration]
if (bitcount(bytes[0] & 0xf0) != nbytes)
^
1 error generated.
*** [teken.o] Error code 1
make[5]: stopped in /usr/local/poudriere/jails/150aarch64/usr/src/stand/efi/libefi
On 07.10.2023 21:00, Christos Margiolis wrote:
> The branch main has been updated by christos:
>
> URL: https://cgit.FreeBSD.org/src/commit/?id=9e589b0938579f3f4d89fa5c051f845bf754184d
>
> commit 9e589b0938579f3f4d89fa5c051f845bf754184d
> Author: Bojan Novković <bojan.novkovic@fer.hr>
> AuthorDate: 2023-10-07 18:00:11 +0000
> Commit: Christos Margiolis <christos@FreeBSD.org>
> CommitDate: 2023-10-07 18:00:11 +0000
>
> tty: fix improper backspace behaviour for UTF8 characters when in canonical mode
>
> This patch adds additional logic in ttydisc_rubchar() to properly handle
> backspace behaviour for UTF-8 characters.
>
> Currently, typing in a backspace after a UTF8 character will delete only
> one byte from the byte sequence, leaving garbled output in the tty's
> output queue. With this change all of the character's bytes are deleted.
> This change is only active when the IUTF8 flag is set (see
> 19054eb6053189144aa962b2ecc1bf5087758a3e "(s)tty: add support for IUTF8
> input flag")
>
> The code uses the teken_wcwidth() function to properly handle character
> column widths for different code points, and adds the
> teken_utf8_bytes_to_codepoint() function that converts a UTF-8 byte
> sequence to a codepoint, as specified in RFC3629.
>
> Reported by: christos
> Reviewed by: christos, imp
> MFC after: 2 weeks
> Differential Revision: https://reviews.freebsd.org/D42067
> ---
> sys/kern/tty_ttydisc.c | 74 +++++++++++++++++++++++++++++++++++++++++++++++
> sys/teken/teken_wcwidth.h | 30 +++++++++++++++++++
> 2 files changed, 104 insertions(+)
>
> diff --git a/sys/kern/tty_ttydisc.c b/sys/kern/tty_ttydisc.c
> index 665275ee93e7..eae7162e31c0 100644
> --- a/sys/kern/tty_ttydisc.c
> +++ b/sys/kern/tty_ttydisc.c
> @@ -43,6 +43,9 @@
> #include <sys/uio.h>
> #include <sys/vnode.h>
>
> +#include <teken/teken.h>
> +#include <teken/teken_wcwidth.h>
> +
> /*
> * Standard TTYDISC `termios' line discipline.
> */
> @@ -78,8 +81,13 @@ SYSCTL_ULONG(_kern, OID_AUTO, tty_nout, CTLFLAG_RD,
> /* Character is alphanumeric. */
> #define CTL_ALNUM(c) (((c) >= '0' && (c) <= '9') || \
> ((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z'))
> +/* Character is UTF8-encoded. */
> +#define CTL_UTF8(c) (!!((c) & 0x80))
> +/* Character is a UTF8 continuation byte. */
> +#define CTL_UTF8_CONT(c) (((c) & 0xc0) == 0x80)
>
> #define TTY_STACKBUF 256
> +#define UTF8_STACKBUF 4
>
> void
> ttydisc_open(struct tty *tp)
> @@ -800,6 +808,72 @@ ttydisc_rubchar(struct tty *tp)
> ttyoutq_write_nofrag(&tp->t_outq,
> "\b\b\b\b\b\b\b\b", tablen);
> return (0);
> + } else if ((tp->t_termios.c_iflag & IUTF8) != 0 &&
> + CTL_UTF8(c)) {
> + uint8_t bytes[UTF8_STACKBUF] = { 0 };
> + int curidx = UTF8_STACKBUF - 1, cwidth = 1,
> + nb = 0;
> + teken_char_t codepoint;
> +
> + /* Save current byte. */
> + bytes[curidx] = c;
> + curidx--;
> + nb++;
> + /* Loop back through inq until we hit the
> + * leading byte. */
> + while (CTL_UTF8_CONT(c) && nb < UTF8_STACKBUF) {
> + ttyinq_peekchar(&tp->t_inq, &c, &quote);
> + ttyinq_unputchar(&tp->t_inq);
> + bytes[curidx] = c;
> + curidx--;
> + nb++;
> + }
> + /*
> + * Shift array so that the leading
> + * byte ends up at idx 0.
> + */
> + if (nb < UTF8_STACKBUF)
> + memmove(&bytes[0], &bytes[curidx + 1],
> + nb * sizeof(uint8_t));
> + /* Check for malformed UTF8 characters. */
> + if (nb == UTF8_STACKBUF &&
> + CTL_UTF8_CONT(bytes[0])) {
> + /*
> + * Place all bytes back into the inq and
> + * delete the last byte only.
> + */
> + ttyinq_write(&tp->t_inq, bytes,
> + UTF8_STACKBUF, 0);
> + } else {
> + /* Find codepoint and width. */
> + codepoint =
> + teken_utf8_bytes_to_codepoint(bytes,
> + nb);
> + if (codepoint !=
> + TEKEN_UTF8_INVALID_CODEPOINT) {
> + cwidth = teken_wcwidth(
> + codepoint);
> + } else {
> + /*
> + * Place all bytes back into the
> + * inq and fall back to
> + * default behaviour.
> + */
> + ttyinq_write(&tp->t_inq, bytes,
> + nb, 0);
> + }
> + }
> + tp->t_column -= cwidth;
> + /*
> + * Delete character by punching
> + * 'cwidth' spaces over it.
> + */
> + if (cwidth == 1)
> + ttyoutq_write_nofrag(&tp->t_outq,
> + "\b \b", 3);
> + else if (cwidth == 2)
> + ttyoutq_write_nofrag(&tp->t_outq,
> + "\b\b \b\b", 6);
> } else {
> /*
> * Remove a regular character by
> diff --git a/sys/teken/teken_wcwidth.h b/sys/teken/teken_wcwidth.h
> index f57a185c2433..f5a23dbc9679 100644
> --- a/sys/teken/teken_wcwidth.h
> +++ b/sys/teken/teken_wcwidth.h
> @@ -8,6 +8,8 @@
> * Latest version: http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
> */
>
> +#define TEKEN_UTF8_INVALID_CODEPOINT -1
> +
> struct interval {
> teken_char_t first;
> teken_char_t last;
> @@ -116,3 +118,31 @@ static int teken_wcwidth(teken_char_t ucs)
> (ucs >= 0x20000 && ucs <= 0x2fffd) ||
> (ucs >= 0x30000 && ucs <= 0x3fffd)));
> }
> +
> +/*
> + * Converts an UTF-8 byte sequence to a codepoint as specified in
> + * https://datatracker.ietf.org/doc/html/rfc3629#section-3 . The function
> + * expects the 'bytes' array to start with the leading character.
> + */
> +static teken_char_t
> +teken_utf8_bytes_to_codepoint(uint8_t bytes[4], int nbytes)
> +{
> +
> + /* Check for malformed characters. */
> + if (bitcount(bytes[0] & 0xf0) != nbytes)
> + return (TEKEN_UTF8_INVALID_CODEPOINT);
> +
> + switch (nbytes) {
> + case 1:
> + return (bytes[0] & 0x7f);
> + case 2:
> + return (bytes[0] & 0xf) << 6 | (bytes[1] & 0x3f);
> + case 3:
> + return (bytes[0] & 0xf) << 12 | (bytes[1] & 0x3f) << 6 | (bytes[2] & 0x3f);
> + case 4:
> + return (bytes[0] & 0x7) << 18 | (bytes[1] & 0x3f) << 12 |
> + (bytes[2] & 0x3f) << 6 | (bytes[3] & 0x3f);
> + default:
> + return (TEKEN_UTF8_INVALID_CODEPOINT);
> + }
> +}
>
--
Sincerely,
Dima (fluffy@FreeBSD.org, https://t.me/FluffyBSD)
(desktop, kde, x11, office, ports-secteam)@FreeBSD team