git: 4f4860c9b07c - stable/14 - regex: mixed sets are misidentified as singletons
- Go to: [ bottom of page ] [ top of archives ] [ this month ]
Date: Wed, 25 Sep 2024 20:43:34 UTC
The branch stable/14 has been updated by kevans:
URL: https://cgit.FreeBSD.org/src/commit/?id=4f4860c9b07cc10cb6acbe6fbd71db45e344d2e6
commit 4f4860c9b07cc10cb6acbe6fbd71db45e344d2e6
Author: Bill Sommerfeld <sommerfeld@hamachi.org>
AuthorDate: 2023-12-21 03:46:14 +0000
Commit: Kyle Evans <kevans@FreeBSD.org>
CommitDate: 2024-09-25 20:42:25 +0000
regex: mixed sets are misidentified as singletons
Fix "singleton" function used by regcomp() to turn character set matches
into exact character matches if a character set has exactly one
element.
The underlying cset representation is complex; most critically it
records"small" characters (codepoint less than either 128
or 256 depending on locale) in a bit vector, and "wide" characters in
a secondary array.
Unfortunately the "singleton" function uses to identify singleton sets
treated a cset as a singleton if either the "small" or the "wide" sets
had exactly one element (it would then ignore the other set).
The easiest way to demonstrate this bug:
$ export LANG=C.UTF-8
$ echo 'a' | grep '[abà]'
It should match (and print "a") but instead it doesn't match because the
single accented character in the set is misinterpreted as a singleton.
PR: 281710
Reviewed by: kevans, yuripv
Obtained from: illumos
(cherry picked from commit 8f7ed58a15556bf567ff876e1999e4fe4d684e1d)
---
lib/libc/regex/regcomp.c | 25 ++++++++++++++++++-----
lib/libc/tests/regex/multibyte.sh | 43 ++++++++++++++++++++++++++++++++++++++-
2 files changed, 62 insertions(+), 6 deletions(-)
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index 55f96a2ccbd2..eae4d02657e8 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -1591,17 +1591,32 @@ singleton(cset *cs)
{
wint_t i, s, n;
+ /* Exclude the complicated cases we don't want to deal with */
+ if (cs->nranges != 0 || cs->ntypes != 0 || cs->icase != 0)
+ return (OUT);
+
+ if (cs->nwides > 1)
+ return (OUT);
+
+ /* Count the number of characters present in the bitmap */
for (i = n = 0; i < NC; i++)
if (CHIN(cs, i)) {
n++;
s = i;
}
- if (n == 1)
- return (s);
- if (cs->nwides == 1 && cs->nranges == 0 && cs->ntypes == 0 &&
- cs->icase == 0)
+
+ if (n > 1)
+ return (OUT);
+
+ if (n == 1) {
+ if (cs->nwides == 0)
+ return (s);
+ else
+ return (OUT);
+ }
+ if (cs->nwides == 1)
return (cs->wides[0]);
- /* Don't bother handling the other cases. */
+
return (OUT);
}
diff --git a/lib/libc/tests/regex/multibyte.sh b/lib/libc/tests/regex/multibyte.sh
index a736352bf0a2..18323f500a2b 100755
--- a/lib/libc/tests/regex/multibyte.sh
+++ b/lib/libc/tests/regex/multibyte.sh
@@ -1,4 +1,3 @@
-
atf_test_case bmpat
bmpat_head()
{
@@ -45,8 +44,50 @@ icase_body()
echo $c | atf_check -o "inline:$c\n" sed -ne "/$a/Ip"
}
+atf_test_case mbset cleanup
+mbset_head()
+{
+ atf_set "descr" "Check multibyte sets matching"
+}
+mbset_body()
+{
+ export LC_CTYPE="C.UTF-8"
+
+ # This involved an erroneously implemented optimization which reduces
+ # single-element sets to an exact match with a single codepoint.
+ # Match sets record small-codepoint characters in a bitmap and
+ # large-codepoint characters in an array; the optimization would falsely
+ # trigger if either the bitmap or the array was a singleton, ignoring
+ # the members of the other side of the set.
+ #
+ # To exercise this, we construct sets which have one member of one side
+ # and one or more of the other, and verify that all members can be
+ # found.
+ printf "a" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -ne '/[aà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+ printf "á" > mbset; atf_check -o not-empty sed -ne '/[aàá]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+ printf "b" > mbset; atf_check -o not-empty sed -ne '/[abà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+ printf "á" > mbset; atf_check -o not-empty sed -Ene '/[aàá]/p' mbset
+ printf "à" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+ printf "a" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+ printf "b" > mbset; atf_check -o not-empty sed -Ene '/[abà]/p' mbset
+}
+mbset_cleanup()
+{
+ rm -f mbset
+}
+
atf_init_test_cases()
{
atf_add_test_case bmpat
atf_add_test_case icase
+ atf_add_test_case mbset
}