svn commit: r225702 - in user/gabor/grep/trunk: . regex
Gabor Kovesdan
gabor at FreeBSD.org
Tue Sep 20 21:54:43 UTC 2011
Author: gabor
Date: Tue Sep 20 21:54:43 2011
New Revision: 225702
URL: http://svn.freebsd.org/changeset/base/225702
Log:
- Merge improvements from TRE
Modified:
user/gabor/grep/trunk/Makefile
user/gabor/grep/trunk/regex/fastmatch.c
user/gabor/grep/trunk/regex/glue.h
user/gabor/grep/trunk/regex/tre-fastmatch.c
Modified: user/gabor/grep/trunk/Makefile
==============================================================================
--- user/gabor/grep/trunk/Makefile Tue Sep 20 21:53:46 2011 (r225701)
+++ user/gabor/grep/trunk/Makefile Tue Sep 20 21:54:43 2011 (r225702)
@@ -17,7 +17,7 @@ SRCS= file.c grep.c queue.c util.c
# Extra files backported from some regex improvements
.PATH: ${.CURDIR}/regex
-SRCS+= fastmatch.c hashtable.c tre-fastmatch.c xmalloc.c
+SRCS+= fastmatch.c hashtable.c tre-compile.c tre-fastmatch.c xmalloc.c
CFLAGS+=-I${.CURDIR}/regex
.if ${MK_BSD_GREP} == "yes"
Modified: user/gabor/grep/trunk/regex/fastmatch.c
==============================================================================
--- user/gabor/grep/trunk/regex/fastmatch.c Tue Sep 20 21:53:46 2011 (r225701)
+++ user/gabor/grep/trunk/regex/fastmatch.c Tue Sep 20 21:54:43 2011 (r225702)
@@ -36,63 +36,6 @@
#include "tre-fastmatch.h"
#include "xmalloc.h"
-/* XXX: avoid duplication */
-#define CONV_PAT \
- { \
- wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); \
- if (wregex == NULL) \
- return REG_ESPACE; \
- \
- if (TRE_MB_CUR_MAX == 1) \
- { \
- unsigned int i; \
- const unsigned char *str = (const unsigned char *)regex; \
- tre_char_t *wstr = wregex; \
- \
- for (i = 0; i < n; i++) \
- *(wstr++) = *(str++); \
- wlen = n; \
- } \
- else \
- { \
- int consumed; \
- tre_char_t *wcptr = wregex; \
- mbstate_t state; \
- memset(&state, '\0', sizeof(state)); \
- while (n > 0) \
- { \
- consumed = tre_mbrtowc(wcptr, regex, n, &state); \
- \
- switch (consumed) \
- { \
- case 0: \
- if (*regex == '\0') \
- consumed = 1; \
- else \
- { \
- xfree(wregex); \
- return REG_BADPAT; \
- } \
- break; \
- case -1: \
- DPRINT(("mbrtowc: error %d: %s.\n", errno, \
- strerror(errno))); \
- xfree(wregex); \
- return REG_BADPAT; \
- case -2: \
- consumed = n; \
- break; \
- } \
- regex += consumed; \
- n -= consumed; \
- wcptr++; \
- } \
- wlen = wcptr - wregex; \
- } \
- \
- wregex[wlen] = L'\0'; \
- }
-
int
tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
{
@@ -101,14 +44,17 @@ tre_fixncomp(fastmatch_t *preg, const ch
size_t wlen;
if (n != 0)
- CONV_PAT
+ {
+ ret = tre_convert_pattern(regex, n, &wregex, &wlen);
+ if (ret != REG_OK)
+ return ret;
+ else
+ ret = tre_compile_literal(preg, wregex, wlen, cflags);
+ tre_free_pattern(wregex);
+ return ret;
+ }
else
return tre_compile_literal(preg, NULL, 0, cflags);
-
- ret = tre_compile_literal(preg, wregex, wlen, cflags);
- xfree(wregex);
-
- return ret;
}
int
@@ -119,16 +65,19 @@ tre_fastncomp(fastmatch_t *preg, const c
size_t wlen;
if (n != 0)
- CONV_PAT
+ {
+ ret = tre_convert_pattern(regex, n, &wregex, &wlen);
+ if (ret != REG_OK)
+ return ret;
+ else
+ ret = (cflags & REG_LITERAL)
+ ? tre_compile_literal(preg, wregex, wlen, cflags)
+ : tre_compile_fast(preg, wregex, wlen, cflags);
+ tre_free_pattern(wregex);
+ return ret;
+ }
else
return tre_compile_literal(preg, NULL, 0, cflags);
-
- ret = (cflags & REG_LITERAL) ?
- tre_compile_literal(preg, wregex, wlen, cflags) :
- tre_compile_fast(preg, wregex, wlen, cflags);
- xfree(wregex);
-
- return ret;
}
@@ -176,30 +125,6 @@ tre_fastfree(fastmatch_t *preg)
tre_free_fast(preg);
}
-/* XXX: avoid duplication */
-#define ADJUST_OFFSETS \
- { \
- size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \
- size_t offset = pmatch[0].rm_so; \
- int ret; \
- \
- if ((pmatch[0].rm_so < 0) || (pmatch[0].rm_eo < 0)) \
- return REG_NOMATCH; \
- if ((len != (unsigned)-1) && ((unsigned long)pmatch[0].rm_eo > len))\
- return REG_NOMATCH; \
- if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \
- return REG_NOMATCH; \
- ret = tre_match_fast(preg, &string[offset], slen, type, nmatch, \
- pmatch, eflags); \
- for (unsigned i = 0; (i == 0) || (!(eflags & REG_NOSUB) && \
- (i < nmatch)); i++) \
- { \
- pmatch[i].rm_so += offset; \
- pmatch[i].rm_eo += offset; \
- } \
- return ret; \
- }
-
int
tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len,
size_t nmatch, regmatch_t pmatch[], int eflags)
@@ -207,7 +132,8 @@ tre_fastnexec(const fastmatch_t *preg, c
tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
if (eflags & REG_STARTEND)
- ADJUST_OFFSETS
+ CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen,
+ type, nmatch, pmatch, eflags));
else
return tre_match_fast(preg, string, len, type, nmatch,
pmatch, eflags);
@@ -227,7 +153,8 @@ tre_fastwnexec(const fastmatch_t *preg,
tre_str_type_t type = STR_WIDE;
if (eflags & REG_STARTEND)
- ADJUST_OFFSETS
+ CALL_WITH_OFFSET(tre_match_fast(preg, &string[offset], slen,
+ type, nmatch, pmatch, eflags));
else
return tre_match_fast(preg, string, len, type, nmatch,
pmatch, eflags);
Modified: user/gabor/grep/trunk/regex/glue.h
==============================================================================
--- user/gabor/grep/trunk/regex/glue.h Tue Sep 20 21:53:46 2011 (r225701)
+++ user/gabor/grep/trunk/regex/glue.h Tue Sep 20 21:54:43 2011 (r225702)
@@ -11,6 +11,7 @@
#define TRE_WCHAR 1
#define TRE_MULTIBYTE 1
+#define HAVE_MBSTATE_T 1
#define TRE_CHAR(n) L##n
@@ -37,4 +38,29 @@
#define MAX(a,b) ((a > b) ? (a) : (b))
typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
+
+#define CALL_WITH_OFFSET(fn) \
+ do \
+ { \
+ size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \
+ size_t offset = pmatch[0].rm_so; \
+ int ret; \
+ \
+ if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \
+ return REG_NOMATCH; \
+ ret = fn; \
+ for (unsigned i = 0; (!(eflags & REG_NOSUB) && (i < nmatch)); i++)\
+ { \
+ pmatch[i].rm_so += offset; \
+ pmatch[i].rm_eo += offset; \
+ } \
+ return ret; \
+ } while (0 /*CONSTCOND*/)
+
+int
+tre_convert_pattern(const char *regex, size_t n, tre_char_t **w,
+ size_t *wn);
+
+void
+tre_free_pattern(tre_char_t *wregex);
#endif
Modified: user/gabor/grep/trunk/regex/tre-fastmatch.c
==============================================================================
--- user/gabor/grep/trunk/regex/tre-fastmatch.c Tue Sep 20 21:53:46 2011 (r225701)
+++ user/gabor/grep/trunk/regex/tre-fastmatch.c Tue Sep 20 21:54:43 2011 (r225702)
@@ -42,8 +42,8 @@
#include "tre-fastmatch.h"
#include "xmalloc.h"
-static int fastcmp(const void *, const bool *, const void *, size_t,
- tre_str_type_t, bool, bool);
+static int fastcmp(const fastmatch_t *fg, const void *data,
+ tre_str_type_t type);
/*
* Clean up if pattern compilation fails.
@@ -97,24 +97,6 @@ static int fastcmp(const void *, const b
fg->pattern[siz] = '\0'; \
} \
-/*
- * Compares the pattern to the input string at the position
- * stored in startptr.
- */
-#define COMPARE \
- switch (type) \
- { \
- case STR_WIDE: \
- mismatch = fastcmp(fg->wpattern, fg->wescmap, startptr, \
- fg->wlen, type, \
- fg->icase, fg->newline); \
- break; \
- default: \
- mismatch = fastcmp(fg->pattern, fg->escmap, startptr, \
- fg->len, type, \
- fg->icase, fg->newline); \
- } \
-
#define IS_OUT_OF_BOUNDS \
((!fg->reversed \
? ((type == STR_WIDE) ? ((j + fg->wlen) > len) \
@@ -154,7 +136,7 @@ static int fastcmp(const void *, const b
gs = fg->bmGs[mismatch]; \
} \
bc = (r == HASH_OK) ? bc : fg->defBc; \
- DPRINT(("tre_fast_match: mismatch on character %lc, " \
+ DPRINT(("tre_fast_match: mismatch on character" CHF ", " \
"BC %d, GS %d\n", \
((const tre_char_t *)startptr)[mismatch + 1], \
bc, gs)); \
@@ -297,7 +279,7 @@ static int fastcmp(const void *, const b
r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
if ((r == HASH_FAIL) || (r == HASH_FULL)) \
FAIL_COMP(REG_ESPACE); \
- DPRINT(("BC shift for wide char %lc is %d\n", fg->wpattern[i], \
+ DPRINT(("BC shift for wide char " CHF " is %d\n", fg->wpattern[i],\
k)); \
if (fg->icase) \
{ \
@@ -306,7 +288,7 @@ static int fastcmp(const void *, const b
r = hashtable_put(fg->qsBc_table, &wc, &k); \
if ((r == HASH_FAIL) || (r == HASH_FULL)) \
FAIL_COMP(REG_ESPACE); \
- DPRINT(("BC shift for wide char %lc is %d\n", wc, k)); \
+ DPRINT(("BC shift for wide char " CHF " is %d\n", wc, k)); \
} \
}
@@ -327,7 +309,7 @@ static int fastcmp(const void *, const b
r = hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
if ((r == HASH_FAIL) || (r == HASH_FULL)) \
FAIL_COMP(REG_ESPACE); \
- DPRINT(("Reverse BC shift for wide char %lc is %d\n", \
+ DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", \
fg->wpattern[i], k)); \
if (fg->icase) \
{ \
@@ -336,7 +318,8 @@ static int fastcmp(const void *, const b
r = hashtable_put(fg->qsBc_table, &wc, &k); \
if ((r == HASH_FAIL) || (r == HASH_FULL)) \
FAIL_COMP(REG_ESPACE); \
- DPRINT(("Reverse BC shift for wide char %lc is %d\n", wc, k));\
+ DPRINT(("Reverse BC shift for wide char " CHF " is %d\n", wc, \
+ k)); \
} \
}
@@ -853,7 +836,7 @@ badpat:
*/
int
tre_match_fast(const fastmatch_t *fg, const void *data, size_t len,
- tre_str_type_t type, int nmatch __unused, regmatch_t pmatch[], int eflags)
+ tre_str_type_t type, int nmatch, regmatch_t pmatch[], int eflags)
{
unsigned int shift, u = 0, v = 0;
ssize_t j = 0;
@@ -878,7 +861,7 @@ tre_match_fast(const fastmatch_t *fg, co
/* Shortcut for empty pattern */
if (fg->matchall)
{
- if (!fg->nosub)
+ if (!fg->nosub && nmatch >= 1)
{
pmatch[0].rm_so = 0;
pmatch[0].rm_eo = len;
@@ -932,12 +915,12 @@ tre_match_fast(const fastmatch_t *fg, co
/* Determine where in data to start search at. */
j = fg->eol ? len - (type == STR_WIDE ? fg->wlen : fg->len) : 0;
SKIP_CHARS(j);
- COMPARE;
+ mismatch = fastcmp(fg, startptr, type);
if (mismatch == REG_OK)
{
if (fg->word && !IS_ON_WORD_BOUNDARY)
return ret;
- if (!fg->nosub)
+ if (!fg->nosub && nmatch >= 1)
{
pmatch[0].rm_so = j;
pmatch[0].rm_eo = j + (type == STR_WIDE ? fg->wlen : fg->len);
@@ -952,7 +935,7 @@ tre_match_fast(const fastmatch_t *fg, co
do
{
SKIP_CHARS(j);
- COMPARE;
+ mismatch = fastcmp(fg, startptr, type);
if (mismatch == REG_OK)
{
if (fg->word)
@@ -961,7 +944,7 @@ tre_match_fast(const fastmatch_t *fg, co
CHECK_BOL_ANCHOR;
if (fg->eol)
CHECK_EOL_ANCHOR;
- if (!fg->nosub)
+ if (!fg->nosub && nmatch >= 1)
{
pmatch[0].rm_so = j;
pmatch[0].rm_eo = j + ((type == STR_WIDE) ? fg->wlen : fg->len);
@@ -1008,14 +991,15 @@ tre_free_fast(fastmatch_t *fg)
* REG_OK on success
*/
static inline int
-fastcmp(const void *pat, const bool *escmap, const void *data, size_t len,
- tre_str_type_t type, bool icase, bool newline)
+fastcmp(const fastmatch_t *fg, const void *data, tre_str_type_t type)
{
const char *str_byte = data;
- const char *pat_byte = pat;
- int ret = REG_OK;
+ const char *pat_byte = fg->pattern;
const tre_char_t *str_wide = data;
- const tre_char_t *pat_wide = pat;
+ const tre_char_t *pat_wide = fg->wpattern;
+ const bool *escmap = (type == STR_WIDE) ? fg->wescmap : fg->escmap;
+ size_t len = (type == STR_WIDE) ? fg->wlen : fg->len;
+ int ret = REG_OK;
/* Compare the pattern and the input char-by-char from the last position. */
for (int i = len - 1; i >= 0; i--) {
@@ -1024,23 +1008,25 @@ fastcmp(const void *pat, const bool *esc
case STR_WIDE:
/* Check dot */
- if (pat_wide[i] == TRE_CHAR('.') && (!escmap || !escmap[i]) &&
- (!newline || (str_wide[i] != TRE_CHAR('\n'))))
+ if (fg->hasdot && pat_wide[i] == TRE_CHAR('.') &&
+ (!escmap || !escmap[i]) &&
+ (!fg->newline || (str_wide[i] != TRE_CHAR('\n'))))
continue;
/* Compare */
- if (icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
+ if (fg->icase ? (towlower(pat_wide[i]) == towlower(str_wide[i]))
: (pat_wide[i] == str_wide[i]))
continue;
break;
default:
/* Check dot */
- if (pat_byte[i] == '.' && (!escmap || !escmap[i]) &&
- (!newline || (str_byte[i] != '\n')))
+ if (fg->hasdot && pat_byte[i] == '.' &&
+ (!escmap || !escmap[i]) &&
+ (!fg->newline || (str_byte[i] != '\n')))
continue;
/* Compare */
- if (icase ? (tolower(pat_byte[i]) == tolower(str_byte[i]))
+ if (fg->icase ? (tolower(pat_byte[i]) == tolower(str_byte[i]))
: (pat_byte[i] == str_byte[i]))
continue;
}
More information about the svn-src-user
mailing list