svn commit: r225318 - user/gabor/grep/trunk/regex
Gabor Kovesdan
gabor at FreeBSD.org
Fri Sep 2 01:55:03 UTC 2011
Author: gabor
Date: Fri Sep 2 01:55:02 2011
New Revision: 225318
URL: http://svn.freebsd.org/changeset/base/225318
Log:
- Add a finer-grained parser to allow a wider range of patterns to be
matched with the fast algorithm
Modified:
user/gabor/grep/trunk/regex/glue.h
user/gabor/grep/trunk/regex/tre-fastmatch.c
Modified: user/gabor/grep/trunk/regex/glue.h
==============================================================================
--- user/gabor/grep/trunk/regex/glue.h Thu Sep 1 21:42:54 2011 (r225317)
+++ user/gabor/grep/trunk/regex/glue.h Fri Sep 2 01:55:02 2011 (r225318)
@@ -22,7 +22,7 @@
#define REG_LITERAL 0020
#define REG_WORD 0100
#define REG_GNU 0400
-#define _REG_HEUR 1000
+#define _REG_HEUR 01000
#define REG_OK 0
Modified: user/gabor/grep/trunk/regex/tre-fastmatch.c
==============================================================================
--- user/gabor/grep/trunk/regex/tre-fastmatch.c Thu Sep 1 21:42:54 2011 (r225317)
+++ user/gabor/grep/trunk/regex/tre-fastmatch.c Fri Sep 2 01:55:02 2011 (r225318)
@@ -368,13 +368,18 @@ static int fastcmp(const void *, const v
* Copies the pattern pat having lenght n to p and stores
* the size in l.
*/
-#define SAVE_PATTERN(p, l) \
- l = (n == 0) ? tre_strlen(pat) : n; \
- p = xmalloc((l + 1) * sizeof(tre_char_t)); \
- if (p == NULL) \
- return REG_ESPACE; \
- memcpy(p, pat, l * sizeof(tre_char_t)); \
- p[l] = TRE_CHAR('\0');
+#define SAVE_PATTERN(src, srclen, dst, dstlen) \
+ dstlen = srclen; \
+ if (dstlen == 0) \
+ dst = TRE_CHAR(""); \
+ else \
+ { \
+ dst = xmalloc((dstlen + 1) * sizeof(tre_char_t)); \
+ if (dst == NULL) \
+ return REG_ESPACE; \
+ memcpy(dst, src, dstlen * sizeof(tre_char_t)); \
+ dst[dstlen] = TRE_CHAR('\0'); \
+ }
/*
* Initializes pattern compiling.
@@ -413,10 +418,10 @@ tre_compile_literal(fastmatch_t *fg, con
return REG_BADPAT;
#ifdef TRE_WCHAR
- SAVE_PATTERN(fg->wpattern, fg->wlen);
+ SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
STORE_MBS_PAT;
#else
- SAVE_PATTERN(fg->pattern, fg->len);
+ SAVE_PATTERN(pat, n, fg->pattern, fg->len);
#endif
DPRINT(("tre_compile_literal: pattern: %s, icase: %c, word: %c, "
@@ -440,14 +445,11 @@ int
tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
int cflags)
{
- INIT_COMP;
+ tre_char_t *tmp;
+ size_t pos = 0;
+ bool escaped = false;
- /* Remove end-of-line character ('$'). */
- if ((n > 0) && (pat[n - 1] == TRE_CHAR('$')))
- {
- fg->eol = true;
- n--;
- }
+ INIT_COMP;
/* Remove beginning-of-line character ('^'). */
if (pat[0] == TRE_CHAR('^'))
@@ -472,34 +474,138 @@ tre_compile_fast(fastmatch_t *fg, const
if (fg->word && (TRE_MB_CUR_MAX > 1))
return REG_BADPAT;
- /* Look for ways to cheat...er...avoid the full regex engine. */
- for (unsigned int i = 0; i < n; i++)
+ tmp = xmalloc((n + 1) * sizeof(tre_char_t));
+ if (tmp == NULL)
+ return REG_ESPACE;
+
+#define STORE_CHAR \
+ do \
+ { \
+ tmp[pos++] = pat[i]; \
+ escaped = false; \
+ continue; \
+ } while (0)
+
+ /*
+ * Used for heuristic, only beginning ^, trailing $ and . are treated
+ * as special.
+ */
+ if (cflags & _REG_HEUR)
{
- /* Can still cheat? */
- if (!(cflags & _REG_HEUR) &&
- ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) ||
- (pat[i] == TRE_CHAR('_')) || (pat[i] == TRE_CHAR(',')) ||
- (pat[i] == TRE_CHAR('=')) || (pat[i] == TRE_CHAR('-')) ||
- (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/'))))
+ for (int i = 0; i < n; i++)
+ switch (pat[i])
+ {
+ case TRE_CHAR('.'):
+ fg->hasdot = true;
+ STORE_CHAR;
+ break;
+ case TRE_CHAR('$'):
+ if (i == n - 1)
+ fg->eol = true;
+ else
+ STORE_CHAR;
+ break;
+ default:
+ STORE_CHAR;
+ }
+ }
+ else
+ for (int i = 0; i < n; i++)
+ {
+ switch (pat[i])
+ {
+ case TRE_CHAR('\\'):
+ if (escaped)
+ STORE_CHAR;
+ else
+ escaped = true;
+ break;
+ case TRE_CHAR('['):
+ if (escaped)
+ STORE_CHAR;
+ else
+ goto badpat;
+ break;
+ case TRE_CHAR('*'):
+ if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
+ STORE_CHAR;
+ else
+ goto badpat;
+ break;
+ case TRE_CHAR('+'):
+ case TRE_CHAR('?'):
+ if ((cflags & REG_EXTENDED) && (i == 0))
+ continue;
+ else if ((cflags & REG_EXTENDED) ^ !escaped)
+ STORE_CHAR;
+ else
+ goto badpat;
+ case TRE_CHAR('.'):
+ if (escaped)
+ goto badpat;
+ else
+ {
+ fg->hasdot = true;
+ STORE_CHAR;
+ }
+ break;
+ case TRE_CHAR('^'):
+ STORE_CHAR;
+ break;
+ case TRE_CHAR('$'):
+ if (!escaped && (i == n - 1))
+ fg->eol = true;
+ else
+ STORE_CHAR;
+ break;
+ case TRE_CHAR('('):
+ if ((cflags & REG_EXTENDED) ^ escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ break;
+ case TRE_CHAR('{'):
+ if (escaped && (i == 0))
+ STORE_CHAR;
+ else if (!(cflags & REG_EXTENDED) && (i == 0))
+ STORE_CHAR;
+ else if ((cflags & REG_EXTENDED) && (i == 0))
+ continue;
+ else
+ goto badpat;
+ break;
+ case TRE_CHAR('|'):
+ if ((cflags & REG_EXTENDED) ^ (!escaped))
+ goto badpat;
+ else
+ STORE_CHAR;
+ break;
+ default:
+ if (escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ }
continue;
- else if (pat[i] == TRE_CHAR('.'))
- fg->hasdot = i;
- else
+badpat:
+ xfree(tmp);
return REG_BADPAT;
- }
+ }
/*
- * pat has been adjusted earlier to not include '^', '$' or
- * the word match character classes at the beginning and ending
- * of the string respectively.
+ * The pattern has been processed and copied to tmp as a literal string
+ * with escapes, anchors (^$) and the word boundary match character
+ * classes stripped out.
*/
#ifdef TRE_WCHAR
- SAVE_PATTERN(fg->wpattern, fg->wlen);
+ SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
STORE_MBS_PAT;
#else
- SAVE_PATTERN(fg->pattern, fg->len);
+ SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
#endif
+ xfree(tmp);
+
DPRINT(("tre_compile_fast: pattern: %s, bol %c, eol %c, "
"icase: %c, word: %c, newline %c\n", fg->pattern,
fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
More information about the svn-src-user
mailing list