git: 70233fc21258 - stable/12 - regex(3): Interpret many escaped ordinary characters as EESCAPE

Kyle Evans kevans at FreeBSD.org
Mon Dec 28 04:55:31 UTC 2020


The branch stable/12 has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=70233fc21258ab4347fd07401297d3d409a9c4a8

commit 70233fc21258ab4347fd07401297d3d409a9c4a8
Author:     Kyle Evans <kevans at FreeBSD.org>
AuthorDate: 2020-07-29 23:21:56 +0000
Commit:     Kyle Evans <kevans at FreeBSD.org>
CommitDate: 2020-12-28 04:20:27 +0000

    regex(3): Interpret many escaped ordinary characters as EESCAPE
    
    MFC NOTE: This only merged the infrastructure back; the new regcomp symbol
    that actually interprets these as EESCAPE was *dropped*. This is purely to
    make future commits for libregex easier to merge back, so that we can
    choose to use it later.
    
    In IEEE 1003.1-2008 [1] and earlier revisions, BRE/ERE grammar allows for
    any character to be escaped, but "ORD_CHAR preceded by an unescaped
    <backslash> character [gives undefined results]".
    
    Historically, we've interpreted an escaped ordinary character as the
    ordinary character itself. This becomes problematic when some extensions
    give special meanings to an otherwise ordinary character
    (e.g. GNU's \b, \s, \w), meaning we may have two different valid
    interpretations of the same sequence.
    
    To make this easier to deal with, and given that the standard calls this
    undefined, we should throw an error (EESCAPE) if we run into this scenario.
    This eases the transition into a state where some escaped ordinaries are
    blessed with a special meaning: a pattern will either error out or have
    extended behavior, rather than having two entirely different versions of
    undefined behavior that leave the consumer of regex(3) guessing which
    behavior will be used, or leave them with false impressions.
    
    This change bumps the symbol version of regcomp to FBSD_1.6 and provides the
    old escape semantics for legacy applications, just in case one has an older
    application that would immediately turn into a pumpkin because of an
    extraneous escape that's embedded or otherwise critical to its operation.
    
    This is the final piece needed before enhancing libregex with GNU extensions
    and flipping the switch on bsdgrep.
    
    [1] http://pubs.opengroup.org/onlinepubs/9699919799.2016edition/
    
    (cherry picked from commit adeebf4cd47c3e85155d92f386bda5e519b75ab2)
---
 contrib/netbsd-tests/lib/libc/regex/data/meta.in   |   4 +-
 contrib/netbsd-tests/lib/libc/regex/data/subexp.in |   2 +-
 lib/libc/regex/regcomp.c                           | 103 +++++++++++++++++----
 3 files changed, 90 insertions(+), 19 deletions(-)

diff --git a/contrib/netbsd-tests/lib/libc/regex/data/meta.in b/contrib/netbsd-tests/lib/libc/regex/data/meta.in
index 4533d3591bc6..eb24075aea62 100644
--- a/contrib/netbsd-tests/lib/libc/regex/data/meta.in
+++ b/contrib/netbsd-tests/lib/libc/regex/data/meta.in
@@ -4,7 +4,9 @@ a[bc]d		&	abd	abd
 a\*c		&	a*c	a*c
 a\\b		&	a\b	a\b
 a\\\*b		&	a\*b	a\*b
-a\bc		&	abc	abc
+# Begin FreeBSD
+a\bc		&C	EESCAPE
+# End FreeBSD
 a\		&C	EESCAPE
 a\\bc		&	a\bc	a\bc
 \{		bC	BADRPT
diff --git a/contrib/netbsd-tests/lib/libc/regex/data/subexp.in b/contrib/netbsd-tests/lib/libc/regex/data/subexp.in
index d3efe2eab270..e3d376bb7cb3 100644
--- a/contrib/netbsd-tests/lib/libc/regex/data/subexp.in
+++ b/contrib/netbsd-tests/lib/libc/regex/data/subexp.in
@@ -12,7 +12,7 @@ a(b+)c		-	abbbc	abbbc	bbb
 a(b*)c		-	ac	ac	@c
 (a|ab)(bc([de]+)f|cde)	-	abcdef	abcdef	a,bcdef,de
 # Begin FreeBSD
-a\(b\|c\)d	b	ab|cd	ab|cd	b|c
+a\(b|c\)d	b	ab|cd	ab|cd	b|c
 # End FreeBSD
 # the regression tester only asks for 9 subexpressions
 a(b)(c)(d)(e)(f)(g)(h)(i)(j)k	-	abcdefghijk	abcdefghijk	b,c,d,e,f,g,h,i,j
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index b76e58a26082..5cda77da1a6a 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -102,11 +102,14 @@ struct parse {
 	sopno pend[NPAREN];	/* -> ) ([0] unused) */
 	bool allowbranch;	/* can this expression branch? */
 	bool bre;		/* convenience; is this a BRE? */
+	int pflags;		/* other parsing flags -- legacy escapes? */
 	bool (*parse_expr)(struct parse *, struct branchc *);
 	void (*pre_parse)(struct parse *, struct branchc *);
 	void (*post_parse)(struct parse *, struct branchc *);
 };
 
+#define PFLAG_LEGACY_ESC	0x00000001
+
 /* ========= begin header generated by ./mkh ========= */
 #ifdef __cplusplus
 extern "C" {
@@ -132,6 +135,7 @@ static void p_b_cclass(struct parse *p, cset *cs);
 static void p_b_eclass(struct parse *p, cset *cs);
 static wint_t p_b_symbol(struct parse *p);
 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
+static bool may_escape(struct parse *p, const wint_t ch);
 static wint_t othercase(wint_t ch);
 static void bothcases(struct parse *p, wint_t ch);
 static void ordinary(struct parse *p, wint_t ch);
@@ -199,22 +203,10 @@ static char nuls[10];		/* place to point scanner in event of error */
 /* Macro used by computejump()/computematchjump() */
 #define MIN(a,b)	((a)<(b)?(a):(b))
 
-/*
- - regcomp - interface for parser and compilation
- = extern int regcomp(regex_t *, const char *, int);
- = #define	REG_BASIC	0000
- = #define	REG_EXTENDED	0001
- = #define	REG_ICASE	0002
- = #define	REG_NOSUB	0004
- = #define	REG_NEWLINE	0010
- = #define	REG_NOSPEC	0020
- = #define	REG_PEND	0040
- = #define	REG_DUMP	0200
- */
-int				/* 0 success, otherwise REG_something */
-regcomp(regex_t * __restrict preg,
+static int				/* 0 success, otherwise REG_something */
+regcomp_internal(regex_t * __restrict preg,
 	const char * __restrict pattern,
-	int cflags)
+	int cflags, int pflags)
 {
 	struct parse pa;
 	struct re_guts *g;
@@ -273,6 +265,7 @@ regcomp(regex_t * __restrict preg,
 	p->end = p->next + len;
 	p->error = 0;
 	p->ncsalloc = 0;
+	p->pflags = pflags;
 	for (i = 0; i < NPAREN; i++) {
 		p->pbegin[i] = 0;
 		p->pend[i] = 0;
@@ -345,6 +338,27 @@ regcomp(regex_t * __restrict preg,
 	return(p->error);
 }
 
+/*
+ - regcomp - interface for parser and compilation
+ = extern int regcomp(regex_t *, const char *, int);
+ = #define	REG_BASIC	0000
+ = #define	REG_EXTENDED	0001
+ = #define	REG_ICASE	0002
+ = #define	REG_NOSUB	0004
+ = #define	REG_NEWLINE	0010
+ = #define	REG_NOSPEC	0020
+ = #define	REG_PEND	0040
+ = #define	REG_DUMP	0200
+ */
+int				/* 0 success, otherwise REG_something */
+regcomp(regex_t * __restrict preg,
+	const char * __restrict pattern,
+	int cflags)
+{
+	/* Always pass PFLAG_LEGACY_ESC so may_escape() accepts any escape. */
+	return (regcomp_internal(preg, pattern, cflags, PFLAG_LEGACY_ESC));
+}
+
 /*
  - p_ere_exp - parse one subERE, an atom possibly followed by a repetition op,
  - return whether we should terminate or not
@@ -435,7 +449,10 @@ p_ere_exp(struct parse *p, struct branchc *bc)
 			EMIT(OEOW, 0);
 			break;
 		default:
-			ordinary(p, wc);
+			if (may_escape(p, wc))
+				ordinary(p, wc);
+			else
+				SETERROR(REG_EESCAPE);
 			break;
 		}
 		break;
@@ -797,7 +814,10 @@ p_simp_re(struct parse *p, struct branchc *bc)
 			return (false);	/* Definitely not $... */
 		p->next--;
 		wc = WGETNEXT();
-		ordinary(p, wc);
+		if ((c & BACKSL) == 0 || may_escape(p, wc))
+			ordinary(p, wc);
+		else
+			SETERROR(REG_EESCAPE);
 		break;
 	}
 
@@ -1094,6 +1114,55 @@ p_b_coll_elem(struct parse *p,
 	return(0);
 }
 
+/*
+ - may_escape - determine whether 'ch' is escape-able in the current context
+ == static bool may_escape(struct parse *p, const wint_t ch);
+ */
+static bool
+may_escape(struct parse *p, const wint_t ch)
+{
+	/* Legacy mode accepts any escape; else reject letters, ' and `. */
+	if ((p->pflags & PFLAG_LEGACY_ESC) != 0)
+		return (true);
+	if (iswalpha(ch) || ch == '\'' || ch == '`')
+		return (false);
+	return (true);
+#ifdef NOTYET
+	/*
+	 * Build a whitelist of characters that may be escaped to produce an
+	 * ordinary in the current context. This assumes that these have not
+	 * been otherwise interpreted as a special character. Escaping an
+	 * ordinary character yields undefined results according to
+	 * IEEE 1003.1-2008. Some extensions (notably, some GNU extensions) take
+	 * advantage of this and use escaped ordinary characters to provide
+	 * special meaning, e.g. \b, \B, \w, \W, \s, \S.
+	 */
+	switch(ch) {
+	case '|':
+	case '+':
+	case '?':
+		/* The above characters may not be escaped in BREs */
+		if (!(p->g->cflags&REG_EXTENDED))
+			return (false);
+		/* Fallthrough */
+	case '(':
+	case ')':
+	case '{':
+	case '}':
+	case '.':
+	case '[':
+	case ']':
+	case '\\':
+	case '*':
+	case '^':
+	case '$':
+		return (true);
+	default:
+		return (false);
+	}
+#endif
+}
+
 /*
  - othercase - return the case counterpart of an alphabetic
  == static wint_t othercase(wint_t ch);


More information about the dev-commits-src-all mailing list