svn commit: r225347 - user/gabor/tre-integration/contrib/tre/lib
Gabor Kovesdan
gabor at FreeBSD.org
Fri Sep 2 18:18:24 UTC 2011
Author: gabor
Date: Fri Sep 2 18:18:24 2011
New Revision: 225347
URL: http://svn.freebsd.org/changeset/base/225347
Log:
- Merge some improvements and fixes from grep
- Fix a cast [1]
Submitted by: ache [1]
Modified:
user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
user/gabor/tre-integration/contrib/tre/lib/regcomp.c
user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c
Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Fri Sep 2 18:13:46 2011 (r225346)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Fri Sep 2 18:18:24 2011 (r225347)
@@ -1,3 +1,5 @@
+/* $FreeBSD$ */
+
/*-
* Copyright (C) 2011 Gabor Kovesdan <gabor at FreeBSD.org>
* All rights reserved.
@@ -27,7 +29,9 @@
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif /* HAVE_CONFIG_H */
+#include <errno.h>
#include <fastmatch.h>
+#include <regex.h>
#include <string.h>
#include "tre-fastmatch.h"
@@ -36,67 +40,72 @@
/* XXX: avoid duplication */
#define CONV_PAT \
- int ret; \
- tre_char_t *wregex; \
- size_t wlen; \
- \
- wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); \
- if (wregex == NULL) \
- return REG_ESPACE; \
- \
- if (TRE_MB_CUR_MAX == 1) \
- { \
- unsigned int i; \
- const unsigned char *str = (const unsigned char *)regex; \
- tre_char_t *wstr = wregex; \
- \
- for (i = 0; i < n; i++) \
- *(wstr++) = *(str++); \
- wlen = n; \
- } \
- else \
- { \
- int consumed; \
- tre_char_t *wcptr = wregex; \
- mbstate_t state; \
- memset(&state, '\0', sizeof(state)); \
- while (n > 0) \
- { \
- consumed = tre_mbrtowc(wcptr, regex, n, &state); \
+ { \
+ wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); \
+ if (wregex == NULL) \
+ return REG_ESPACE; \
\
- switch (consumed) \
- { \
- case 0: \
- if (*regex == '\0') \
- consumed = 1; \
- else \
- { \
- xfree(wregex); \
- return REG_BADPAT; \
- } \
- break; \
- case -1: \
- DPRINT(("mbrtowc: error %d: %s.\n", errno, \
- strerror(errno))); \
- xfree(wregex); \
- return REG_BADPAT; \
- case -2: \
- consumed = n; \
- break; \
- } \
- regex += consumed; \
- n -= consumed; \
- wcptr++; \
- } \
- wlen = wcptr - wregex; \
- } \
+ if (TRE_MB_CUR_MAX == 1) \
+ { \
+ unsigned int i; \
+ const unsigned char *str = (const unsigned char *)regex; \
+ tre_char_t *wstr = wregex; \
+ \
+ for (i = 0; i < n; i++) \
+ *(wstr++) = *(str++); \
+ wlen = n; \
+ } \
+ else \
+ { \
+ int consumed; \
+ tre_char_t *wcptr = wregex; \
+ mbstate_t state; \
+ memset(&state, '\0', sizeof(state)); \
+ while (n > 0) \
+ { \
+ consumed = tre_mbrtowc(wcptr, regex, n, &state); \
+ \
+ switch (consumed) \
+ { \
+ case 0: \
+ if (*regex == '\0') \
+ consumed = 1; \
+ else \
+ { \
+ xfree(wregex); \
+ return REG_BADPAT; \
+ } \
+ break; \
+ case -1: \
+ DPRINT(("mbrtowc: error %d: %s.\n", errno, \
+ strerror(errno))); \
+ xfree(wregex); \
+ return REG_BADPAT; \
+ case -2: \
+ consumed = n; \
+ break; \
+ } \
+ regex += consumed; \
+ n -= consumed; \
+ wcptr++; \
+ } \
+ wlen = wcptr - wregex; \
+ } \
\
- wregex[wlen] = L'\0';
+ wregex[wlen] = L'\0'; \
+ }
int
tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
{
- CONV_PAT;
+ int ret;
+ tre_char_t *wregex;
+ size_t wlen;
+
+ if (n != 0)
+ CONV_PAT
+ else
+ return tre_compile_literal(preg, NULL, 0, cflags);
ret = tre_compile_literal(preg, wregex, wlen, cflags);
xfree(wregex);
@@ -107,7 +116,14 @@ tre_fixncomp(fastmatch_t *preg, const ch
int
tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
{
- CONV_PAT;
+ int ret;
+ tre_char_t *wregex;
+ size_t wlen;
+
+ if (n != 0)
+ CONV_PAT
+ else
+ return tre_compile_literal(preg, NULL, 0, cflags);
ret = (cflags & REG_LITERAL) ?
tre_compile_literal(preg, wregex, wlen, cflags) :
@@ -121,34 +137,13 @@ tre_fastncomp(fastmatch_t *preg, const c
int
tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags)
{
- size_t len;
-
- if (cflags & REG_PEND)
- {
- if (preg->re_endp >= regex)
- len = preg->re_endp - regex
- else
- len = preg ? strlen(regex) : 0;
- return tre_fixncomp(preg, regex, len, cflags);
- }
- else
- return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+ return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
}
int
tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags)
{
- size_t len;
-
- if (cflags & REG_PEND)
- {
- len = (preg->re_endp >= regex)
- ? preg->re_endp - regex
- : 0;
- return tre_fastncomp(preg, regex, len ? strlen(regex) : 0, cflags);
- }
- else
- return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+ return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
}
int
Modified: user/gabor/tre-integration/contrib/tre/lib/regcomp.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/regcomp.c Fri Sep 2 18:13:46 2011 (r225346)
+++ user/gabor/tre-integration/contrib/tre/lib/regcomp.c Fri Sep 2 18:18:24 2011 (r225347)
@@ -117,11 +117,10 @@ tre_regcomp(regex_t *preg, const char *r
if (cflags & REG_PEND)
{
if (preg->re_endp >= regex)
- len = preg->re_endp - regex
+ len = preg->re_endp - regex;
else
len = regex ? strlen(regex) : 0;
- )
- return tre_regncomp(preg, regex, len, cflags);
+ return tre_regncomp(preg, regex, len, cflags);
}
else
return tre_regncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
Modified: user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Fri Sep 2 18:13:46 2011 (r225346)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Fri Sep 2 18:18:24 2011 (r225347)
@@ -208,14 +208,14 @@ static int fastcmp(const void *, const v
fg->qsBc[i] = fg->len - fg->hasdot; \
for (int i = fg->hasdot + 1; i < fg->len; i++) \
{ \
- fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i; \
+ fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i; \
DPRINT(("BC shift for char %c is %d\n", fg->pattern[i], \
fg->len - i)); \
if (fg->icase) \
{ \
char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i]) \
: tolower(fg->pattern[i]); \
- fg->qsBc[(unsigned)c] = fg->len - i; \
+ fg->qsBc[(unsigned char)c] = fg->len - i; \
DPRINT(("BC shift for char %c is %d\n", c, fg->len - i)); \
} \
}
@@ -370,13 +370,18 @@ static int fastcmp(const void *, const v
* Copies the pattern pat having length n to p and stores
* the size in l.
*/
-#define SAVE_PATTERN(p, l) \
- l = (n == 0) ? tre_strlen(pat) : n; \
- p = xmalloc((l + 1) * sizeof(tre_char_t)); \
- if (p == NULL) \
- return REG_ESPACE; \
- memcpy(p, pat, l * sizeof(tre_char_t)); \
- p[l] = TRE_CHAR('\0');
+#define SAVE_PATTERN(src, srclen, dst, dstlen) \
+ dstlen = srclen; \
+ if (dstlen == 0) \
+ dst = TRE_CHAR(""); \
+ else \
+ { \
+ dst = xmalloc((dstlen + 1) * sizeof(tre_char_t)); \
+ if (dst == NULL) \
+ return REG_ESPACE; \
+ memcpy(dst, src, dstlen * sizeof(tre_char_t)); \
+ dst[dstlen] = TRE_CHAR('\0'); \
+ }
/*
* Initializes pattern compiling.
@@ -392,12 +397,18 @@ static int fastcmp(const void *, const v
if (n == 0) \
{ \
fg->matchall = true; \
+ fg->pattern = ""; \
+ fg->wpattern = TRE_CHAR(""); \
+ DPRINT(("Matching every input\n")); \
return REG_OK; \
- }
+ } \
\
/* Cannot handle REG_ICASE with MB string */ \
if (fg->icase && (TRE_MB_CUR_MAX > 1)) \
- return REG_BADPAT; \
+ { \
+ DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n")); \
+ return REG_BADPAT; \
+ }
/*
* Returns: REG_OK on success, error code otherwise
@@ -413,14 +424,14 @@ tre_compile_literal(fastmatch_t *fg, con
return REG_BADPAT;
#ifdef TRE_WCHAR
- SAVE_PATTERN(fg->wpattern, fg->wlen);
+ SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
STORE_MBS_PAT;
#else
- SAVE_PATTERN(fg->pattern, fg->len);
+ SAVE_PATTERN(pat, n, fg->pattern, fg->len);
#endif
- DPRINT(("tre_compile_literal: pattern: %s, icase: %c, word: %c, "
- "newline %c\n", fg->pattern, fg->icase ? 'y' : 'n',
+ DPRINT(("tre_compile_literal: pattern: %s, len %u, icase: %c, word: %c, "
+ "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
FILL_QSBC;
@@ -440,14 +451,11 @@ int
tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
int cflags)
{
- INIT_COMP;
+ tre_char_t *tmp;
+ size_t pos = 0;
+ bool escaped = false;
- /* Remove end-of-line character ('$'). */
- if ((n > 0) && (pat[n - 1] == TRE_CHAR('$')))
- {
- fg->eol = true;
- n--;
- }
+ INIT_COMP;
/* Remove beginning-of-line character ('^'). */
if (pat[0] == TRE_CHAR('^'))
@@ -472,36 +480,140 @@ tre_compile_fast(fastmatch_t *fg, const
if (fg->word && (TRE_MB_CUR_MAX > 1))
return REG_BADPAT;
- /* Look for ways to cheat...er...avoid the full regex engine. */
- for (unsigned int i = 0; i < n; i++)
+ tmp = xmalloc((n + 1) * sizeof(tre_char_t));
+ if (tmp == NULL)
+ return REG_ESPACE;
+
+#define STORE_CHAR \
+ do \
+ { \
+ tmp[pos++] = pat[i]; \
+ escaped = false; \
+ continue; \
+ } while (0)
+
+ /*
+ * Used for heuristic, only beginning ^, trailing $ and . are treated
+ * as special.
+ */
+ if (cflags & _REG_HEUR)
{
- /* Can still cheat? */
- if (!(cflags & _REG_HEUR) &&
- ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) ||
- (pat[i] == TRE_CHAR('_')) || (pat[i] == TRE_CHAR(',')) ||
- (pat[i] == TRE_CHAR('=')) || (pat[i] == TRE_CHAR('-')) ||
- (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/'))))
+ for (int i = 0; i < n; i++)
+ switch (pat[i])
+ {
+ case TRE_CHAR('.'):
+ fg->hasdot = i;
+ STORE_CHAR;
+ break;
+ case TRE_CHAR('$'):
+ if (i == n - 1)
+ fg->eol = true;
+ else
+ STORE_CHAR;
+ break;
+ default:
+ STORE_CHAR;
+ }
+ }
+ else
+ for (int i = 0; i < n; i++)
+ {
+ switch (pat[i])
+ {
+ case TRE_CHAR('\\'):
+ if (escaped)
+ STORE_CHAR;
+ else
+ escaped = true;
+ break;
+ case TRE_CHAR('['):
+ if (escaped)
+ STORE_CHAR;
+ else
+ goto badpat;
+ break;
+ case TRE_CHAR('*'):
+ if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
+ STORE_CHAR;
+ else
+ goto badpat;
+ break;
+ case TRE_CHAR('+'):
+ case TRE_CHAR('?'):
+ if ((cflags & REG_EXTENDED) && (i == 0))
+ continue;
+ else if ((cflags & REG_EXTENDED) ^ !escaped)
+ STORE_CHAR;
+ else
+ goto badpat;
+ case TRE_CHAR('.'):
+ if (escaped)
+ goto badpat;
+ else
+ {
+ fg->hasdot = true;
+ STORE_CHAR;
+ }
+ break;
+ case TRE_CHAR('^'):
+ STORE_CHAR;
+ break;
+ case TRE_CHAR('$'):
+ if (!escaped && (i == n - 1))
+ fg->eol = true;
+ else
+ STORE_CHAR;
+ break;
+ case TRE_CHAR('('):
+ if ((cflags & REG_EXTENDED) ^ escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ break;
+ case TRE_CHAR('{'):
+ if (escaped && (i == 0))
+ STORE_CHAR;
+ else if (!(cflags & REG_EXTENDED) && (i == 0))
+ STORE_CHAR;
+ else if ((cflags & REG_EXTENDED) && (i == 0))
+ continue;
+ else
+ goto badpat;
+ break;
+ case TRE_CHAR('|'):
+ if ((cflags & REG_EXTENDED) ^ (!escaped))
+ goto badpat;
+ else
+ STORE_CHAR;
+ break;
+ default:
+ if (escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ }
continue;
- else if (pat[i] == TRE_CHAR('.'))
- fg->hasdot = i;
- else
+badpat:
+ xfree(tmp);
return REG_BADPAT;
- }
+ }
/*
- * pat has been adjusted earlier to not include '^', '$' or
- * the word match character classes at the beginning and ending
- * of the string respectively.
+ * The pattern has been processed and copied to tmp as a literal string
+ * with escapes, anchors (^$) and the word boundary match character
+ * classes stripped out.
*/
#ifdef TRE_WCHAR
- SAVE_PATTERN(fg->wpattern, fg->wlen);
+ SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
STORE_MBS_PAT;
#else
- SAVE_PATTERN(fg->pattern, fg->len);
+ SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
#endif
- DPRINT(("tre_compile_fast: pattern: %s, bol %c, eol %c, "
- "icase: %c, word: %c, newline %c\n", fg->pattern,
+ xfree(tmp);
+
+ DPRINT(("tre_compile_fast: pattern: %s, len %u, bol %c, eol %c, "
+ "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
fg->newline ? 'y' : 'n'));
@@ -593,7 +705,7 @@ tre_match_fast(const fastmatch_t *fg, co
const tre_char_t *str_wide = data;
/* Calculate length if unspecified. */
- if (len == (unsigned)-1)
+ if (len == (size_t)-1)
switch (type)
{
case STR_WIDE:
More information about the svn-src-user
mailing list