git: 06e63004abb0 - stable/12 - libregex: implement GNU extensions

Kyle Evans kevans at FreeBSD.org
Mon Dec 28 04:55:30 UTC 2020


The branch stable/12 has been updated by kevans:

URL: https://cgit.FreeBSD.org/src/commit/?id=06e63004abb0abc801e9f8af066ef10095189b10

commit 06e63004abb0abc801e9f8af066ef10095189b10
Author:     Kyle Evans <kevans at FreeBSD.org>
AuthorDate: 2020-08-04 02:14:51 +0000
Commit:     Kyle Evans <kevans at FreeBSD.org>
CommitDate: 2020-12-28 04:34:53 +0000

    libregex: implement GNU extensions
    
    18a1e2e9: libregex: Implement a subset of the GNU extensions
    
    The entire patch-set is not yet mature enough for commit, but this usable
    subset is generally enough to keep googletest happy, and its pieces mostly
    map to existing concepts, so they're not as invasive.
    
    The specific changes included here are:
    
    - Branching in BREs with \|
    - \w and \W for [[:alnum:]] and [^[:alnum:]] respectively
    - \s and \S for [[:space:]] and [^[:space:]] respectively
    - Additional quantifiers in BREs, \? and \+ (self-explanatory)
    
    There's some #ifdef'd out work for allowing empty branches as a match-all.
    This is a feature that's under assessment... future work will determine
    how standard this behavior is and act accordingly.
    
    61898cde: libregex: disable some of the unimplemented test cases for now
    
    This should allow the tests to actually pass. Future work will uncomment the
    unimplemented tests as they're implemented.
    
    7518fb34: libc: regex: factor out ISBOW/ISEOW macros
    
    These will be reused for \b (word boundary, which matches both sides).
    
    No functional change.
    
    ca53e5ae: libregex: implement \` and \' (begin-of-subj, end-of-subj)
    
    These are GNU extensions, generally equivalent to ^ and $ except that, for
    a multi-line subject, the new syntax will not match the beginning of any
    line after the first or the end of any line before the absolute last.
    
    6b986646: libregex: implement \b and \B (word boundary, not word boundary)
    
    This is the last of the needed GNU expressions before we can unleash bsdgrep
    by default. \b is effectively a direction-agnostic equivalent of \< and \>,
    while \B will match every position that isn't a transition from
    non-word char -> word char or word char -> non-word char.
    
    4afa7dd6: libc: regex: retire internal EMPTBR ("Empty branch present")
    
    It was realized just a little too late that this was a hack that belonged in
    individual regex(3)-using applications. It was surrounded in NOTYET and not
    implemented in the engine, so remove it.
    
    4f1efa30: libc: regex: partial revert of r368358 (6b986646)
    
    MFC NOTE: Altered to match the legacy behavior of a\bc => abc.
    
    Part of the libregex functionality leaked into the tests it shares with
    the standard regex(3). Introduce a P flag to set the REG_POSIX cflag, which
    libc regex effectively ignores while libregex honors it by compiling the
    pattern without the GNU extensions.
    
    This unbreaks the libc/regex test run.
    
    (cherry picked from commit 18a1e2e9b9f109a78c5a9274e4cfb4777801b4fb)
    (cherry picked from commit 61898cde69374d5a9994e2074605bc4101aff72d)
    (cherry picked from commit 7518fb346fe9603f99d2406a073b30fb8e4a270c)
    (cherry picked from commit ca53e5aedfebcc1b4091b68e01b2d5cae923f85e)
    (cherry picked from commit 6b986646d434baa21ae3d74d6a662ad206c7ddbd)
    (cherry picked from commit 4afa7dd61a3a1454a5b3cf5e6de2029c7e2d9a84)
    (cherry picked from commit 4f1efa309ca48a088595dd57969ae6a397dd49d1)
---
 contrib/netbsd-tests/lib/libc/regex/README       |   1 +
 contrib/netbsd-tests/lib/libc/regex/data/meta.in |   2 +-
 contrib/netbsd-tests/lib/libc/regex/main.c       |   5 +-
 lib/libc/regex/engine.c                          | 115 ++++++--
 lib/libc/regex/regcomp.c                         | 336 +++++++++++++++++------
 lib/libc/regex/regex2.h                          |   4 +
 lib/libregex/tests/gnuext.in                     |   4 +
 lib/libregex/tests/libregex_test.sh              |   4 -
 8 files changed, 365 insertions(+), 106 deletions(-)

diff --git a/contrib/netbsd-tests/lib/libc/regex/README b/contrib/netbsd-tests/lib/libc/regex/README
index 6d9a28cf60ca..0e8a09a764ff 100644
--- a/contrib/netbsd-tests/lib/libc/regex/README
+++ b/contrib/netbsd-tests/lib/libc/regex/README
@@ -28,6 +28,7 @@ The full list of flags:
   $	REG_NOTEOL
   #	REG_STARTEND (see below)
   p	REG_PEND
+  P	REG_POSIX
 
 For REG_STARTEND, the start/end offsets are those of the substring
 enclosed in ().
diff --git a/contrib/netbsd-tests/lib/libc/regex/data/meta.in b/contrib/netbsd-tests/lib/libc/regex/data/meta.in
index eb24075aea62..61e432353f6b 100644
--- a/contrib/netbsd-tests/lib/libc/regex/data/meta.in
+++ b/contrib/netbsd-tests/lib/libc/regex/data/meta.in
@@ -5,7 +5,7 @@ a\*c		&	a*c	a*c
 a\\b		&	a\b	a\b
 a\\\*b		&	a\*b	a\*b
 # Begin FreeBSD
-a\bc		&C	EESCAPE
+a\bc		&P	abc	abc
 # End FreeBSD
 a\		&C	EESCAPE
 a\\bc		&	a\bc	a\bc
diff --git a/contrib/netbsd-tests/lib/libc/regex/main.c b/contrib/netbsd-tests/lib/libc/regex/main.c
index eac4e2d9b51e..243c8dc5ff80 100644
--- a/contrib/netbsd-tests/lib/libc/regex/main.c
+++ b/contrib/netbsd-tests/lib/libc/regex/main.c
@@ -338,7 +338,7 @@ options(int type, char *s)
 {
 	char *p;
 	int o = (type == 'c') ? copts : eopts;
-	const char *legal = (type == 'c') ? "bisnmp" : "^$#tl";
+	const char *legal = (type == 'c') ? "bisnmpP" : "^$#tl";
 
 	for (p = s; *p != '\0'; p++)
 		if (strchr(legal, *p) != NULL)
@@ -362,6 +362,9 @@ options(int type, char *s)
 			case 'p':
 				o |= REG_PEND;
 				break;
+			case 'P':
+				o |= REG_POSIX;
+				break;
 			case '^':
 				o |= REG_NOTBOL;
 				break;
diff --git a/lib/libc/regex/engine.c b/lib/libc/regex/engine.c
index e7da4cbc2a5d..bb40018c07e1 100644
--- a/lib/libc/regex/engine.c
+++ b/lib/libc/regex/engine.c
@@ -109,7 +109,7 @@ static int matcher(struct re_guts *g, const char *string, size_t nmatch, regmatc
 static const char *dissect(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst);
 static const char *backref(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, sopno lev, int);
 static const char *walk(struct match *m, const char *start, const char *stop, sopno startst, sopno stopst, bool fast);
-static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft);
+static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_t ch, states aft, int sflags);
 #define MAX_RECURSION	100
 #define	BOL	(OUT-1)
 #define	EOL	(BOL-1)
@@ -118,7 +118,12 @@ static states step(struct re_guts *g, sopno start, sopno stop, states bef, wint_
 #define	BOW	(BOL-4)
 #define	EOW	(BOL-5)
 #define	BADCHAR	(BOL-6)
+#define	NWBND	(BOL-7)
 #define	NONCHAR(c)	((c) <= OUT)
+/* sflags */
+#define	SBOS	0x0001
+#define	SEOS	0x0002
+
 #ifdef REDEBUG
 static void print(struct match *m, const char *caption, states st, int ch, FILE *d);
 #endif
@@ -457,6 +462,10 @@ dissect(struct match *m,
 		case OEOL:
 		case OBOW:
 		case OEOW:
+		case OBOS:
+		case OEOS:
+		case OWBND:
+		case ONWBND:
 			break;
 		case OANY:
 		case OANYOF:
@@ -589,6 +598,17 @@ dissect(struct match *m,
 	return(sp);
 }
 
+#define	ISBOW(m, sp)					\
+    (sp < m->endp && ISWORD(*sp) &&			\
+    ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||	\
+    (sp > m->offp && !ISWORD(*(sp-1)))))
+#define	ISEOW(m, sp)					\
+    (((sp == m->endp && !(m->eflags&REG_NOTEOL)) ||	\
+    (sp < m->endp && *sp == '\n' &&			\
+    (m->g->cflags&REG_NEWLINE)) ||			\
+    (sp < m->endp && !ISWORD(*sp)) ) &&			\
+    (sp > m->beginp && ISWORD(*(sp-1))))		\
+
 /*
  - backref - figure out what matched what, figuring in back references
  == static const char *backref(struct match *m, const char *start, \
@@ -646,6 +666,18 @@ backref(struct match *m,
 			if (wc == BADCHAR || !CHIN(cs, wc))
 				return(NULL);
 			break;
+		case OBOS:
+			if (sp == m->beginp && (m->eflags & REG_NOTBOL) == 0)
+				{ /* yes */ }
+			else
+				return(NULL);
+			break;
+		case OEOS:
+			if (sp == m->endp && (m->eflags & REG_NOTEOL) == 0)
+				{ /* yes */ }
+			else
+				return(NULL);
+			break;
 		case OBOL:
 			if ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
 			    (sp > m->offp && sp < m->endp &&
@@ -662,20 +694,29 @@ backref(struct match *m,
 			else
 				return(NULL);
 			break;
+		case OWBND:
+			if (ISBOW(m, sp) || ISEOW(m, sp))
+				{ /* yes */ }
+			else
+				return(NULL);
+			break;
+		case ONWBND:
+			if (((sp == m->beginp) && !ISWORD(*sp)) ||
+			    (sp == m->endp && !ISWORD(*(sp - 1))))
+				{ /* yes, beginning/end of subject */ }
+			else if (ISWORD(*(sp - 1)) == ISWORD(*sp))
+				{ /* yes, beginning/end of subject */ }
+			else
+				return(NULL);
+			break;
 		case OBOW:
-			if (sp < m->endp && ISWORD(*sp) &&
-			    ((sp == m->beginp && !(m->eflags&REG_NOTBOL)) ||
-			    (sp > m->offp && !ISWORD(*(sp-1)))))
+			if (ISBOW(m, sp))
 				{ /* yes */ }
 			else
 				return(NULL);
 			break;
 		case OEOW:
-			if (( (sp == m->endp && !(m->eflags&REG_NOTEOL)) ||
-					(sp < m->endp && *sp == '\n' &&
-						(m->g->cflags&REG_NEWLINE)) ||
-					(sp < m->endp && !ISWORD(*sp)) ) &&
-					(sp > m->beginp && ISWORD(*(sp-1))) )
+			if (ISEOW(m, sp))
 				{ /* yes */ }
 			else
 				return(NULL);
@@ -814,15 +855,16 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
 	wint_t c;
 	wint_t lastc;		/* previous c */
 	wint_t flagch;
-	int i;
+	int i, sflags;
 	const char *matchp;	/* last p at which a match ended */
 	size_t clen;
 
+	sflags = 0;
 	AT("slow", start, stop, startst, stopst);
 	CLEAR(st);
 	SET1(st, startst);
 	SP("sstart", st, *p);
-	st = step(m->g, startst, stopst, st, NOTHING, st);
+	st = step(m->g, startst, stopst, st, NOTHING, st, sflags);
 	if (fast)
 		ASSIGN(fresh, st);
 	matchp = NULL;
@@ -839,6 +881,7 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
 	for (;;) {
 		/* next character */
 		lastc = c;
+		sflags = 0;
 		if (p == m->endp) {
 			c = OUT;
 			clen = 0;
@@ -861,9 +904,20 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
 			flagch = (flagch == BOL) ? BOLEOL : EOL;
 			i += m->g->neol;
 		}
+		if (lastc == OUT && (m->eflags & REG_NOTBOL) == 0) {
+			sflags |= SBOS;
+			/* Step one more for BOS. */
+			i++;
+		}
+		if (c == OUT && (m->eflags & REG_NOTEOL) == 0) {
+			sflags |= SEOS;
+			/* Step one more for EOS. */
+			i++;
+		}
 		if (i != 0) {
 			for (; i > 0; i--)
-				st = step(m->g, startst, stopst, st, flagch, st);
+				st = step(m->g, startst, stopst, st, flagch, st,
+				    sflags);
 			SP("sboleol", st, c);
 		}
 
@@ -877,9 +931,20 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
 			flagch = EOW;
 		}
 		if (flagch == BOW || flagch == EOW) {
-			st = step(m->g, startst, stopst, st, flagch, st);
+			st = step(m->g, startst, stopst, st, flagch, st, sflags);
 			SP("sboweow", st, c);
 		}
+		if (lastc != OUT && c != OUT &&
+		    ISWORD(lastc) == ISWORD(c)) {
+			flagch = NWBND;
+		} else if ((lastc == OUT && !ISWORD(c)) ||
+		    (c == OUT && !ISWORD(lastc))) {
+			flagch = NWBND;
+		}
+		if (flagch == NWBND) {
+			st = step(m->g, startst, stopst, st, flagch, st, sflags);
+			SP("snwbnd", st, c);
+		}
 
 		/* are we done? */
 		if (ISSET(st, stopst)) {
@@ -898,9 +963,10 @@ walk(struct match *m, const char *start, const char *stop, sopno startst,
 		else
 			ASSIGN(st, empty);
 		assert(c != OUT);
-		st = step(m->g, startst, stopst, tmp, c, st);
+		st = step(m->g, startst, stopst, tmp, c, st, sflags);
 		SP("saft", st, c);
-		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
+		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st, sflags),
+		    st));
 		p += clen;
 	}
 
@@ -934,7 +1000,8 @@ step(struct re_guts *g,
 	sopno stop,		/* state after stop state within strip */
 	states bef,		/* states reachable before */
 	wint_t ch,		/* character or NONCHAR code */
-	states aft)		/* states already known reachable after */
+	states aft,		/* states already known reachable after */
+	int sflags)		/* state flags */
 {
 	cset *cs;
 	sop s;
@@ -955,6 +1022,14 @@ step(struct re_guts *g,
 			if (ch == OPND(s))
 				FWD(aft, bef, 1);
 			break;
+		case OBOS:
+			if ((ch == BOL || ch == BOLEOL) && (sflags & SBOS) != 0)
+				FWD(aft, bef, 1);
+			break;
+		case OEOS:
+			if ((ch == EOL || ch == BOLEOL) && (sflags & SEOS) != 0)
+				FWD(aft, bef, 1);
+			break;
 		case OBOL:
 			if (ch == BOL || ch == BOLEOL)
 				FWD(aft, bef, 1);
@@ -971,6 +1046,14 @@ step(struct re_guts *g,
 			if (ch == EOW)
 				FWD(aft, bef, 1);
 			break;
+		case OWBND:
+			if (ch == BOW || ch == EOW)
+				FWD(aft, bef, 1);
+			break;
+		case ONWBND:
+			if (ch == NWBND)
+				FWD(aft, aft, 1);
+			break;
 		case OANY:
 			if (!NONCHAR(ch))
 				FWD(aft, bef, 1);
diff --git a/lib/libc/regex/regcomp.c b/lib/libc/regex/regcomp.c
index 5cda77da1a6a..00ab6a77141b 100644
--- a/lib/libc/regex/regcomp.c
+++ b/lib/libc/regex/regcomp.c
@@ -92,6 +92,7 @@ struct parse {
 	const char *next;	/* next character in RE */
 	const char *end;	/* end of string (-> NUL normally) */
 	int error;		/* has an error been seen? */
+	int gnuext;
 	sop *strip;		/* malloced strip */
 	sopno ssize;		/* malloced strip size (allocated) */
 	sopno slen;		/* malloced strip length (used) */
@@ -131,7 +132,9 @@ static int p_count(struct parse *p);
 static void p_bracket(struct parse *p);
 static int p_range_cmp(wchar_t c1, wchar_t c2);
 static void p_b_term(struct parse *p, cset *cs);
+static int p_b_pseudoclass(struct parse *p, char c);
 static void p_b_cclass(struct parse *p, cset *cs);
+static void p_b_cclass_named(struct parse *p, cset *cs, const char[]);
 static void p_b_eclass(struct parse *p, cset *cs);
 static wint_t p_b_symbol(struct parse *p);
 static wint_t p_b_coll_elem(struct parse *p, wint_t endc);
@@ -181,6 +184,7 @@ static char nuls[10];		/* place to point scanner in event of error */
 #define	SEESPEC(a)	(p->bre ? SEETWO('\\', a) : SEE(a))
 #define	EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
 #define	EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
+#define	EATSPEC(a)	(p->bre ? EATTWO('\\', a) : EAT(a))
 #define	NEXT()	(p->next++)
 #define	NEXT2()	(p->next += 2)
 #define	NEXTn(n)	(p->next += (n))
@@ -270,14 +274,22 @@ regcomp_internal(regex_t * __restrict preg,
 		p->pbegin[i] = 0;
 		p->pend[i] = 0;
 	}
+#ifdef LIBREGEX
+	if (cflags&REG_POSIX) {
+		p->gnuext = false;
+		p->allowbranch = (cflags & REG_EXTENDED) != 0;
+	} else
+		p->gnuext = p->allowbranch = true;
+#else
+	p->gnuext = false;
+	p->allowbranch = (cflags & REG_EXTENDED) != 0;
+#endif
 	if (cflags & REG_EXTENDED) {
-		p->allowbranch = true;
 		p->bre = false;
 		p->parse_expr = p_ere_exp;
 		p->pre_parse = NULL;
 		p->post_parse = NULL;
 	} else {
-		p->allowbranch = false;
 		p->bre = true;
 		p->parse_expr = p_simp_re;
 		p->pre_parse = p_bre_pre_parse;
@@ -372,6 +384,10 @@ p_ere_exp(struct parse *p, struct branchc *bc)
 	sopno pos;
 	int count;
 	int count2;
+#ifdef LIBREGEX
+	int i;
+	int handled;
+#endif
 	sopno subno;
 	int wascaret = 0;
 
@@ -379,6 +395,9 @@ p_ere_exp(struct parse *p, struct branchc *bc)
 	assert(MORE());		/* caller should have ensured this */
 	c = GETNEXT();
 
+#ifdef LIBREGEX
+	handled = 0;
+#endif
 	pos = HERE();
 	switch (c) {
 	case '(':
@@ -441,6 +460,59 @@ p_ere_exp(struct parse *p, struct branchc *bc)
 	case '\\':
 		(void)REQUIRE(MORE(), REG_EESCAPE);
 		wc = WGETNEXT();
+#ifdef LIBREGEX
+		if (p->gnuext) {
+			handled = 1;
+			switch (wc) {
+			case '`':
+				EMIT(OBOS, 0);
+				break;
+			case '\'':
+				EMIT(OEOS, 0);
+				break;
+			case 'B':
+				EMIT(ONWBND, 0);
+				break;
+			case 'b':
+				EMIT(OWBND, 0);
+				break;
+			case 'W':
+			case 'w':
+			case 'S':
+			case 's':
+				p_b_pseudoclass(p, wc);
+				break;
+			case '1':
+			case '2':
+			case '3':
+			case '4':
+			case '5':
+			case '6':
+			case '7':
+			case '8':
+			case '9':
+				i = wc - '0';
+				assert(i < NPAREN);
+				if (p->pend[i] != 0) {
+					assert(i <= p->g->nsub);
+					EMIT(OBACK_, i);
+					assert(p->pbegin[i] != 0);
+					assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
+					assert(OP(p->strip[p->pend[i]]) == ORPAREN);
+					(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
+					EMIT(O_BACK, i);
+				} else
+					SETERROR(REG_ESUBREG);
+				p->g->backrefs = 1;
+				break;
+			default:
+				handled = 0;
+			}
+			/* Don't proceed to the POSIX bits if we've already handled it */
+			if (handled)
+				break;
+		}
+#endif
 		switch (wc) {
 		case '<':
 			EMIT(OBOW, 0);
@@ -551,7 +623,7 @@ p_branch_eat_delim(struct parse *p, struct branchc *bc)
 
 	(void)bc;
 	nskip = 0;
-	while (EAT('|'))
+	while (EATSPEC('|'))
 		++nskip;
 	return (nskip);
 }
@@ -697,7 +769,11 @@ p_re(struct parse *p,
 		}
 		if (p->post_parse != NULL)
 			p->post_parse(p, &bc);
-		(void) REQUIRE(HERE() != bc.start, REG_EMPTY);
+		(void) REQUIRE(p->gnuext || HERE() != bc.start, REG_EMPTY);
+#ifdef LIBREGEX
+		if (HERE() == bc.start && !p_branch_empty(p, &bc))
+			break;
+#endif
 		if (!p->allowbranch)
 			break;
 		/*
@@ -724,101 +800,134 @@ static bool			/* was the simple RE an unbackslashed $? */
 p_simp_re(struct parse *p, struct branchc *bc)
 {
 	int c;
+	int cc;			/* convenient/control character */
 	int count;
 	int count2;
 	sopno pos;
+	bool handled;
 	int i;
 	wint_t wc;
 	sopno subno;
 #	define	BACKSL	(1<<CHAR_BIT)
 
 	pos = HERE();		/* repetition op, if any, covers from here */
+	handled = false;
 
 	assert(MORE());		/* caller should have ensured this */
 	c = GETNEXT();
 	if (c == '\\') {
 		(void)REQUIRE(MORE(), REG_EESCAPE);
-		c = BACKSL | GETNEXT();
+		cc = GETNEXT();
+		c = BACKSL | cc;
+#ifdef LIBREGEX
+		if (p->gnuext) {
+			handled = true;
+			switch (c) {
+			case BACKSL|'`':
+				EMIT(OBOS, 0);
+				break;
+			case BACKSL|'\'':
+				EMIT(OEOS, 0);
+				break;
+			case BACKSL|'B':
+				EMIT(ONWBND, 0);
+				break;
+			case BACKSL|'b':
+				EMIT(OWBND, 0);
+				break;
+			case BACKSL|'W':
+			case BACKSL|'w':
+			case BACKSL|'S':
+			case BACKSL|'s':
+				p_b_pseudoclass(p, cc);
+				break;
+			default:
+				handled = false;
+			}
+		}
+#endif
 	}
-	switch (c) {
-	case '.':
-		if (p->g->cflags&REG_NEWLINE)
-			nonnewline(p);
-		else
-			EMIT(OANY, 0);
-		break;
-	case '[':
-		p_bracket(p);
-		break;
-	case BACKSL|'<':
-		EMIT(OBOW, 0);
-		break;
-	case BACKSL|'>':
-		EMIT(OEOW, 0);
-		break;
-	case BACKSL|'{':
-		SETERROR(REG_BADRPT);
-		break;
-	case BACKSL|'(':
-		p->g->nsub++;
-		subno = p->g->nsub;
-		if (subno < NPAREN)
-			p->pbegin[subno] = HERE();
-		EMIT(OLPAREN, subno);
-		/* the MORE here is an error heuristic */
-		if (MORE() && !SEETWO('\\', ')'))
-			p_re(p, '\\', ')');
-		if (subno < NPAREN) {
-			p->pend[subno] = HERE();
-			assert(p->pend[subno] != 0);
+	if (!handled) {
+		switch (c) {
+		case '.':
+			if (p->g->cflags&REG_NEWLINE)
+				nonnewline(p);
+			else
+				EMIT(OANY, 0);
+			break;
+		case '[':
+			p_bracket(p);
+			break;
+		case BACKSL|'<':
+			EMIT(OBOW, 0);
+			break;
+		case BACKSL|'>':
+			EMIT(OEOW, 0);
+			break;
+		case BACKSL|'{':
+			SETERROR(REG_BADRPT);
+			break;
+		case BACKSL|'(':
+			p->g->nsub++;
+			subno = p->g->nsub;
+			if (subno < NPAREN)
+				p->pbegin[subno] = HERE();
+			EMIT(OLPAREN, subno);
+			/* the MORE here is an error heuristic */
+			if (MORE() && !SEETWO('\\', ')'))
+				p_re(p, '\\', ')');
+			if (subno < NPAREN) {
+				p->pend[subno] = HERE();
+				assert(p->pend[subno] != 0);
+			}
+			EMIT(ORPAREN, subno);
+			(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
+			break;
+		case BACKSL|')':	/* should not get here -- must be user */
+			SETERROR(REG_EPAREN);
+			break;
+		case BACKSL|'1':
+		case BACKSL|'2':
+		case BACKSL|'3':
+		case BACKSL|'4':
+		case BACKSL|'5':
+		case BACKSL|'6':
+		case BACKSL|'7':
+		case BACKSL|'8':
+		case BACKSL|'9':
+			i = (c&~BACKSL) - '0';
+			assert(i < NPAREN);
+			if (p->pend[i] != 0) {
+				assert(i <= p->g->nsub);
+				EMIT(OBACK_, i);
+				assert(p->pbegin[i] != 0);
+				assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
+				assert(OP(p->strip[p->pend[i]]) == ORPAREN);
+				(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
+				EMIT(O_BACK, i);
+			} else
+				SETERROR(REG_ESUBREG);
+			p->g->backrefs = 1;
+			break;
+		case '*':
+			/*
+			 * Ordinary if used as the first character beyond BOL anchor of
+			 * a (sub-)expression, counts as a bad repetition operator if it
+			 * appears otherwise.
+			 */
+			(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
+			/* FALLTHROUGH */
+		default:
+			if (p->error != 0)
+				return (false);	/* Definitely not $... */
+			p->next--;
+			wc = WGETNEXT();
+			if ((c & BACKSL) == 0 || may_escape(p, wc))
+				ordinary(p, wc);
+			else
+				SETERROR(REG_EESCAPE);
+			break;
 		}
-		EMIT(ORPAREN, subno);
-		(void)REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
-		break;
-	case BACKSL|')':	/* should not get here -- must be user */
-		SETERROR(REG_EPAREN);
-		break;
-	case BACKSL|'1':
-	case BACKSL|'2':
-	case BACKSL|'3':
-	case BACKSL|'4':
-	case BACKSL|'5':
-	case BACKSL|'6':
-	case BACKSL|'7':
-	case BACKSL|'8':
-	case BACKSL|'9':
-		i = (c&~BACKSL) - '0';
-		assert(i < NPAREN);
-		if (p->pend[i] != 0) {
-			assert(i <= p->g->nsub);
-			EMIT(OBACK_, i);
-			assert(p->pbegin[i] != 0);
-			assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
-			assert(OP(p->strip[p->pend[i]]) == ORPAREN);
-			(void) dupl(p, p->pbegin[i]+1, p->pend[i]);
-			EMIT(O_BACK, i);
-		} else
-			SETERROR(REG_ESUBREG);
-		p->g->backrefs = 1;
-		break;
-	case '*':
-		/*
-		 * Ordinary if used as the first character beyond BOL anchor of
-		 * a (sub-)expression, counts as a bad repetition operator if it
-		 * appears otherwise.
-		 */
-		(void)REQUIRE(bc->nchain == 0, REG_BADRPT);
-		/* FALLTHROUGH */
-	default:
-		if (p->error != 0)
-			return (false);	/* Definitely not $... */
-		p->next--;
-		wc = WGETNEXT();
-		if ((c & BACKSL) == 0 || may_escape(p, wc))
-			ordinary(p, wc);
-		else
-			SETERROR(REG_EESCAPE);
-		break;
 	}
 
 	if (EAT('*')) {		/* implemented as +? */
@@ -827,6 +936,14 @@ p_simp_re(struct parse *p, struct branchc *bc)
 		ASTERN(O_PLUS, pos);
 		INSERT(OQUEST_, pos);
 		ASTERN(O_QUEST, pos);
+#ifdef LIBREGEX
+	} else if (p->gnuext && EATTWO('\\', '?')) {
+		INSERT(OQUEST_, pos);
+		ASTERN(O_QUEST, pos);
+	} else if (p->gnuext && EATTWO('\\', '+')) {
+		INSERT(OPLUS_, pos);
+		ASTERN(O_PLUS, pos);
+#endif
 	} else if (EATTWO('\\', '{')) {
 		count = p_count(p);
 		if (EAT(',')) {
@@ -1018,6 +1135,41 @@ p_b_term(struct parse *p, cset *cs)
 	}
 }
 
+/*
+ - p_b_pseudoclass - parse a pseudo-class (\w, \W, \s, \S)
+ == static int p_b_pseudoclass(struct parse *p, char c)
+ */
+static int
+p_b_pseudoclass(struct parse *p, char c) {
+	cset *cs;
+
+	if ((cs = allocset(p)) == NULL)
+		return(0);
+
+	if (p->g->cflags&REG_ICASE)
+		cs->icase = 1;
+
+	switch (c) {
+	case 'W':
+		cs->invert = 1;
+		/* FALLTHROUGH - \W is \w with the set inverted */
+	case 'w':
+		p_b_cclass_named(p, cs, "alnum");
+		break;
+	case 'S':
+		cs->invert = 1;
+		/* FALLTHROUGH - \S is \s with the set inverted */
+	case 's':
+		p_b_cclass_named(p, cs, "space");
+		break;
+	default:
+		return(0);
+	}
+
+	EMIT(OANYOF, (int)(cs - p->g->sets));
+	return(1);
+}
+
 /*
  - p_b_cclass - parse a character-class name and deal with it
  == static void p_b_cclass(struct parse *p, cset *cs);
@@ -1027,7 +1179,6 @@ p_b_cclass(struct parse *p, cset *cs)
 {
 	const char *sp = p->next;
 	size_t len;
-	wctype_t wct;
 	char clname[16];
 
 	while (MORE() && isalpha((uch)PEEK()))
@@ -1039,6 +1190,17 @@ p_b_cclass(struct parse *p, cset *cs)
 	}
 	memcpy(clname, sp, len);
 	clname[len] = '\0';
+
+	p_b_cclass_named(p, cs, clname);
+}
+/*
+ - p_b_cclass_named - deal with a named character class
+ == static void p_b_cclass_named(struct parse *p, cset *cs, const char []);
+ */
+static void
+p_b_cclass_named(struct parse *p, cset *cs, const char clname[]) {
+	wctype_t wct;
+
 	if ((wct = wctype(clname)) == 0) {
 		SETERROR(REG_ECTYPE);
 		return;
@@ -1718,6 +1880,10 @@ findmust(struct parse *p, struct re_guts *g)
 		case OEOW:
 		case OBOL:
 		case OEOL:
+		case OBOS:
+		case OEOS:
+		case OWBND:
+		case ONWBND:
 		case O_QUEST:
 		case O_CH:
 		case OEND:
@@ -1869,6 +2035,8 @@ altoffset(sop *scan, int offset)
 			try++;
 		case OBOW:
 		case OEOW:
+		case OWBND:
+		case ONWBND:
 		case OLPAREN:
 		case ORPAREN:
 		case OOR2:
diff --git a/lib/libc/regex/regex2.h b/lib/libc/regex/regex2.h
index a7c45683229c..19e3b7992982 100644
--- a/lib/libc/regex/regex2.h
+++ b/lib/libc/regex/regex2.h
@@ -104,6 +104,10 @@ typedef unsigned long sopno;
 #define	O_CH	(18L<<OPSHIFT)	/* end choice	back to OOR1		*/
 #define	OBOW	(19L<<OPSHIFT)	/* begin word	-			*/
 #define	OEOW	(20L<<OPSHIFT)	/* end word	-			*/
+#define	OBOS	(21L<<OPSHIFT)	/* begin subj.  -			*/
+#define	OEOS	(22L<<OPSHIFT)	/* end subj.	-			*/
+#define	OWBND	(23L<<OPSHIFT)	/* word bound	-			*/
+#define	ONWBND	(24L<<OPSHIFT)	/* not bound	-			*/
 
 /*
  * Structures for [] character-set representation.
diff --git a/lib/libregex/tests/gnuext.in b/lib/libregex/tests/gnuext.in
index 86afe499f50a..8f49854235a9 100644
--- a/lib/libregex/tests/gnuext.in
+++ b/lib/libregex/tests/gnuext.in
@@ -23,8 +23,12 @@ a\|b\|c	b	abc	a
 \B[abc]\B	&	<abc>	b
 \B[abc]+	-	<abc>	bc
 \B[abc]\+	b	<abc>	bc
+\`abc	&	abc	abc
+abc\'	&	abc	abc
 \`abc\'	&	abc	abc
 \`.+\'	-	abNc	abNc
 \`.\+\'	b	abNc	abNc
 (\`a)	-	Na
+(a\`)	-	aN
 (a\')	-	aN
+(\'a)	-	Na
diff --git a/lib/libregex/tests/libregex_test.sh b/lib/libregex/tests/libregex_test.sh
index 071f407cdb10..8ebe9b64ab63 100755
--- a/lib/libregex/tests/libregex_test.sh
+++ b/lib/libregex/tests/libregex_test.sh
@@ -29,10 +29,6 @@ check()
 {
 	local dataname="${1}"; shift
 
-	if [ "${dataname}" == "gnuext" ]; then
-		atf_expect_fail "GNU extensions are not currently implemented"
-	fi
-
 	prog="$(atf_get_srcdir)/h_regex"
 	data="$(atf_get_srcdir)/data/${dataname}.in"
 


More information about the dev-commits-src-all mailing list