git: 8f7ed58a1555 - main - regex: mixed sets are misidentified as singletons

From: Yuri Pankov <yuripv_at_FreeBSD.org>
Date: Fri, 22 Dec 2023 05:21:32 UTC
The branch main has been updated by yuripv:

URL: https://cgit.FreeBSD.org/src/commit/?id=8f7ed58a15556bf567ff876e1999e4fe4d684e1d

commit 8f7ed58a15556bf567ff876e1999e4fe4d684e1d
Author:     Bill Sommerfeld <sommerfeld@hamachi.org>
AuthorDate: 2023-12-21 03:46:14 +0000
Commit:     Yuri Pankov <yuripv@FreeBSD.org>
CommitDate: 2023-12-22 05:19:59 +0000

    regex: mixed sets are misidentified as singletons
    
    Fix "singleton" function used by regcomp() to turn character set matches
    into exact character matches if a character set has exactly one
    element.
    
    The underlying cset representation is complex; most critically it
    records "small" characters (codepoint less than either 128
    or 256 depending on locale) in a bit vector, and "wide" characters in
    a secondary array.
    
    Unfortunately, the "singleton" function used to identify singleton sets
    treated a cset as a singleton if either the "small" or the "wide" set
    had exactly one element (it would then ignore the other set).
    
    The easiest way to demonstrate this bug:
    
            $ export LANG=C.UTF-8
            $ echo 'a' | grep '[abà]'
    
    It should match (and print "a"), but instead it doesn't match because
    the set is misinterpreted as a singleton containing only the accented
    character.
    
    Reviewed by:    kevans, yuripv
    Obtained from:  illumos
    Differential Revision:  https://reviews.freebsd.org/D43149
---
 lib/libc/regex/regcomp.c          | 25 ++++++++++++++++++-----
 lib/libc/tests/regex/multibyte.sh | 43 ++++++++++++++++++++++++++++++++++++++-
 2 files changed, 62 insertions(+), 6 deletions(-)

diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index ba803130a050..89b96b00fefb 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -1586,17 +1586,32 @@ singleton(cset *cs)
 {
 	wint_t i, s, n;
 
+	/* Exclude the complicated cases we don't want to deal with */
+	if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
+		return (OUT);
+
+	if (cs->nwides > 1)
+		return (OUT);
+
+	/* Count the number of characters present in the bitmap */
 	for (i = n = 0; i < NC; i++)
 		if (CHIN(cs, i)) {
 			n++;
 			s = i;
 		}
-	if (n == 1)
-		return (s);
-	if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
-	    cs->icase == 0)
+
+	if (n > 1)
+		return (OUT);
+
+	if (n == 1) {
+		if (cs->nwides == 0)
+			return (s);
+		else
+			return (OUT);
+	}
+	if (cs->nwides == 1)
 		return (cs->wides[0]);
-	/* Don't bother handling the other cases. */
+
 	return (OUT);
 }
 
diff --git a/lib/libc/tests/regex/multibyte.sh b/lib/libc/tests/regex/multibyte.sh
index a736352bf0a2..18323f500a2b 100755
--- a/lib/libc/tests/regex/multibyte.sh
+++ b/lib/libc/tests/regex/multibyte.sh
@@ -1,4 +1,3 @@
-
 atf_test_case bmpat
 bmpat_head()
 {
@@ -45,8 +44,50 @@ icase_body()
 	echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip"
 }
 
+atf_test_case mbset cleanup
+mbset_head()
+{
+	atf_set "descr" "Check multibyte sets matching"
+}
+mbset_body()
+{
+	export LC_CTYPE="C.UTF-8"
+
+	# This involved an erroneously implemented optimization which reduces
+	# single-element sets to an exact match with a single codepoint.
+	# Match sets record small-codepoint characters in a bitmap and
+	# large-codepoint characters in an array; the optimization would falsely
+	# trigger if either the bitmap or the array was a singleton, ignoring
+	# the members of the other side of the set.
+	#
+	# To exercise this, we construct sets which have one member of one side
+	# and one or more of the other, and verify that all members can be
+	# found.
+	printf "a" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
+	printf "à" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
+	printf "a" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+	printf "à" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+	printf "á" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+	printf "à" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+	printf "a" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+	printf "b" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+	printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
+	printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
+	printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+	printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+	printf "á" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+	printf "à" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+	printf "a" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+	printf "b" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+}
+mbset_cleanup()
+{
+	rm -f mbset
+}
+
 atf_init_test_cases()
 {
 	atf_add_test_case bmpat
 	atf_add_test_case icase
+	atf_add_test_case mbset
 }