git: 9971e6aff1be - main - vt: Improve multi lingual word separation.

From: Hans Petter Selasky <hselasky_at_FreeBSD.org>
Date: Mon, 27 Jun 2022 08:18:08 UTC
The branch main has been updated by hselasky:

URL: https://cgit.FreeBSD.org/src/commit/?id=9971e6aff1bef3d456172c41a3df3ce7266517cf

commit 9971e6aff1bef3d456172c41a3df3ce7266517cf
Author:     Hans Petter Selasky <hselasky@FreeBSD.org>
AuthorDate: 2022-06-25 09:17:44 +0000
Commit:     Hans Petter Selasky <hselasky@FreeBSD.org>
CommitDate: 2022-06-27 08:17:16 +0000

    vt: Improve multi lingual word separation.
    
    Suggested by:   Tomoaki AOKI <junchoon@dec.sakura.ne.jp>
    Differential Revision:  https://reviews.freebsd.org/D35552
    PR:             263084
    MFC after:      1 week
    Sponsored by:   NVIDIA Networking
---
 sys/dev/vt/vt_buf.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/sys/dev/vt/vt_buf.c b/sys/dev/vt/vt_buf.c
index fa6c7c8fec5f..b83db85f1cdb 100644
--- a/sys/dev/vt/vt_buf.c
+++ b/sys/dev/vt/vt_buf.c
@@ -747,6 +747,29 @@ vtbuf_get_marked_len(struct vt_buf *vb)
 	return (sz * sizeof(term_char_t));
 }
 
+static bool
+tchar_is_word_separator(term_char_t ch)
+{
+	/* Matches the Unicode word separator code points listed below: */
+	switch (TCHAR_CHARACTER(ch)) {
+	case 0x0020: /* SPACE */
+	case 0x180E: /* MONGOLIAN VOWEL SEPARATOR */
+	case 0x2002: /* EN SPACE (nut) */
+	case 0x2003: /* EM SPACE (mutton) */
+	case 0x2004: /* THREE-PER-EM SPACE (thick space) */
+	case 0x2005: /* FOUR-PER-EM SPACE (mid space) */
+	case 0x2006: /* SIX-PER-EM SPACE */
+	case 0x2008: /* PUNCTUATION SPACE */
+	case 0x2009: /* THIN SPACE */
+	case 0x200A: /* HAIR SPACE */
+	case 0x200B: /* ZERO WIDTH SPACE */
+	case 0x3000: /* IDEOGRAPHIC SPACE */
+		return (true);
+	default:
+		return (false);
+	}
+}
+
 void
 vtbuf_extract_marked(struct vt_buf *vb, term_char_t *buf, int sz)
 {
@@ -779,7 +802,7 @@ vtbuf_extract_marked(struct vt_buf *vb, term_char_t *buf, int sz)
 		if (r != e.tp_row) {
 			/* Trim trailing word separators, if any. */
 			for (; i != j; i--) {
-				if (TCHAR_CHARACTER(buf[i - 1]) != ' ')
+				if (!tchar_is_word_separator(buf[i - 1]))
 					break;
 			}
 			/* Add newline character as expected by TTY. */
@@ -824,7 +847,7 @@ vtbuf_set_mark(struct vt_buf *vb, int type, int col, int row)
 		    vtbuf_wth(vb, row);
 		r = vb->vb_rows[vb->vb_mark_start.tp_row];
 		for (i = col; i >= 0; i --) {
-			if (TCHAR_CHARACTER(r[i]) == ' ') {
+			if (tchar_is_word_separator(r[i])) {
 				vb->vb_mark_start.tp_col = i + 1;
 				break;
 			}
@@ -833,7 +856,7 @@ vtbuf_set_mark(struct vt_buf *vb, int type, int col, int row)
 		if (i == -1)
 			vb->vb_mark_start.tp_col = 0;
 		for (i = col; i < vb->vb_scr_size.tp_col; i++) {
-			if (TCHAR_CHARACTER(r[i]) == ' ') {
+			if (tchar_is_word_separator(r[i])) {
 				vb->vb_mark_end.tp_col = i;
 				break;
 			}