svn commit: r231897 - user/gabor/tre-integration/contrib/tre/lib
Gabor Kovesdan
gabor at FreeBSD.org
Sat Feb 18 19:37:02 UTC 2012
Author: gabor
Date: Sat Feb 18 19:37:02 2012
New Revision: 231897
URL: http://svn.freebsd.org/changeset/base/231897
Log:
Implement multi-pattern heuristic matching. Not yet connected to the build
and totally untested.
Modified:
user/gabor/tre-integration/contrib/tre/lib/mregcomp.c
user/gabor/tre-integration/contrib/tre/lib/mregexec.c
user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c
user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h
Modified: user/gabor/tre-integration/contrib/tre/lib/mregcomp.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/mregcomp.c Sat Feb 18 16:54:01 2012 (r231896)
+++ user/gabor/tre-integration/contrib/tre/lib/mregcomp.c Sat Feb 18 19:37:02 2012 (r231897)
@@ -54,81 +54,102 @@ __weak_reference(tre_mregfree, mregfree)
int
tre_mcompile(mregex_t *preg, size_t nr, const tre_char_t *wregex[],
- size_t wn[], int cflags)
+ size_t wn[], const char *regex[], size_t n, int cflags)
{
int ret;
- size_t mfrag = 0;
tre_char_t **frags;
size_t *siz;
wmsearch_t *wm;
preg->k = nr;
+ preg->cflags = cflags;
preg->patterns = xmalloc(nr * sizeof(regex_t));
if (!preg->patterns)
return REG_ESPACE;
+ /* Compile NFA, BM and heuristic for each pattern. */
for (int i = 0; i < nr; i++)
{
- ret = tre_compile_nfa(&preg->patterns[i], wregex[i], wn[i], cflags);
+ ret = tre_compile(&preg->patterns[i], wregex[i], wn[i],
+ regex[i], n[i], cflags);
if (ret != REG_OK)
- goto err;
- ret = tre_compile_heur(&preg->patterns[i], wregex[i], wn[i], cflags);
- if (ret != REG_OK)
- goto err;
+ return ret;
}
- for (mfrag = 0; mfrag < nr; mfrag++)
- for (int j = 0; j < nr; j++)
- if (((heur_t)preg->patterns[j]->heur)->arr[mfrag] == NULL)
- goto out;
-out:
-
- preg->mfrag = mfrag;
-
- /* Worst case, not all patterns have a literal prefix */
- if (mfrag == 0)
- return REG_OK;
-
- wm = xmalloc(mfrag * sizeof(wmsearch_t));
- if (!wm)
- goto err;
+ /* If not literal, check if any of them have fixed-length prefix. */
+ if (!(cflags & REG_LITERAL))
+ for (int i = 0; i < nr; i++)
+ if ((preg->patterns[i]->heur == NULL) ||
+ (((heur_t)preg->patterns[i]->heur)->arr[0] == NULL))
+ {
+ preg->type = MHEUR_NONE;
+ goto finish;
+ }
- frags = xmalloc(nr * sizeof(char *));
- if (!frags)
- goto err;
+ /*
+ * Set frag and siz to point to the fragments to compile and
+ * their respective sizes.
+ */
+ if (!(cflags & REG_LITERAL))
+ {
+ frags = xmalloc(nr * sizeof(char *));
+ if (!frags)
+ goto err;
- siz = xmalloc(nr * sizeof(size_t));
- // XXX: check NULL
+ siz = xmalloc(nr * sizeof(size_t));
+ if (!siz)
+ goto err;
- for (int i = 0; i < mfrag; i++)
- {
for (int j = 0; j < nr; j++)
{
- frags[j] = &((heur_t)preg->patterns[j]->heur)->arr[i];
- siz[j] = ((heur_t)preg->patterns[j]->heur)->siz[i];
+ frags[j] = &((heur_t)preg->patterns[j]->heur)->arr[0];
+ siz[j] = ((heur_t)preg->patterns[j]->heur)->siz[0];
}
- ret = tre_wmcomp(&wm[i], nr, frags, siz, cflags);
- if (ret != REG_OK)
- goto err;
+ }
+ else
+ {
+ frags = wregex;
+ siz = wn;
}
+ /* Alloc and compile the fragments for Wu-Manber algorithm. */
+ wm = xmalloc(sizeof(wmsearch_t));
+ if (!wm)
+ goto err;
+ ret = tre_wmcomp(wm, nr, frags, siz, cflags);
+ if (ret != REG_OK)
+ goto err;
preg->searchdata = wm;
- return REG_OK;
+ /* Set the specific type of matching. */
+ if (cflags & REG_LITERAL)
+ preg->type = MHEUR_LITERAL;
+ else if (cflags & REG_NEWLINE)
+ preg->type = MHEUR_LONGEST;
+ else
+ preg->type = MHEUR_PREFIX;
+
+ goto finish;
err:
if (preg->patterns)
- xfree(preg->patterns);
+ {
+ for (int i = 1; i < nr; i++)
+ if (preg->patterns[i])
+ tre_regfree(preg->patterns[i]);
+ xfree(preg->patterns);
+ }
if (wm)
+ xfree(wm);
+
+finish:
+ if (!(cflags & REG_LITERAL))
{
- for (int i = 0; i < mfrag; i++)
- tre_wmfree(&wm[i]);
- xfree(wm);
- }
- if (frags)
- xfree(frags);
- if (siz)
- xfree(siz);
+ if (frag)
+ xfree(frag);
+ if (siz)
+ xfree(siz);
+ }
return ret;
}
Modified: user/gabor/tre-integration/contrib/tre/lib/mregexec.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/mregexec.c Sat Feb 18 16:54:01 2012 (r231896)
+++ user/gabor/tre-integration/contrib/tre/lib/mregexec.c Sat Feb 18 19:37:02 2012 (r231897)
@@ -80,8 +80,230 @@ tre_mmatch(const void *str, size_t len,
size_t nmatch, regmatch_t pmatch[], int eflags,
const mregex_t *preg)
{
-
- // XXX: Wu-Manber search with specific cases
+ tre_char_t *str_wide = str;
+ char *str_byte = str;
+ int ret;
+ bool need_offsets;
+
+ need_offsets = (preg->cflags & REG_NOSUB) && (nmatch > 0);
+
+#define INPUT(pos) ((type == STR_WIDE) ? str_wide[pos] : str_byte[pos])
+
+ /*
+ * Worst case: at least one pattern does not have a literal
+ * prefix so the Wu-Manber algorithm cannot be used to speed
+ * up the match. There is no trivial best solution either,
+ * so just try matching for each of the patterns and return
+ * the earliest.
+ */
+ if (preg->type == MHEUR_NONE)
+ {
+ regmatch_t **pm;
+ int i, first = -1;
+
+ /* Alloc one regmatch_t for each pattern */
+ if (need_offsets)
+ {
+ pm = xmalloc(preg->k * sizeof(regmatch_t *));
+ if (!pm)
+ return REG_ESPACE;
+ for (i = 0; i < preg->k; i++)
+ {
+ pm[i] = xmalloc(nmatch * sizeof(regmatch_t));
+ if (!pm[i])
+ goto finish;
+ }
+ }
+
+ /* Run matches for each pattern and save first matching offsets. */
+ for (i = 0; i < preg->k; i++)
+ {
+ ret = tre_match(&preg->patterns[i], str, len, type,
+ need_offsets ? nmatch : 0, pm[i], eflags);
+
+ /* Mark if there is no match for the pattern. */
+ if (!need_offsets)
+ {
+ if (ret == REG_OK)
+ return REG_OK;
+ }
+ else if (ret == REG_NOMATCH)
+ pm[i][0].rm_so = -1;
+ else if (ret != REG_OK)
+ goto finish;
+ }
+
+ if (!need_offsets)
+ return REG_NOMATCH;
+
+ /* Check whether there has been a match at all. */
+ for (i; i < preg->k; i++)
+ if (pm[i][0].rm_so != -1)
+ {
+ first = i;
+ break;
+ }
+
+ if (first == -1)
+ ret = REG_NOMATCH;
+
+ /* If there are matches, find the first one. */
+ else
+ {
+ for (i = first + 1; i < preg->k; i++)
+ if ((pm[i][0].rm_so < pm[first][0].rm_so) || (pm[i][0].rm_eo > pm[first][0].rm_eo))
+ {
+ first = i;
+ }
+ }
+
+ /* Fill int the offsets before returning. */
+ for (i = 0; need_offsets && (i < 0); i++)
+ {
+ pmatch[i].rm_so = pm[first][i].rm_so;
+ pmatch[i].rm_eo = pm[first][i].rm_eo;
+ pmatch[i].p = i;
+ }
+ ret = REG_OK;
+
+finish:
+ if (pm)
+ {
+ for (i = 0; i < preg->k; i++)
+ if (pm[i])
+ xfree(pm[i]);
+ xfree(pm);
+ }
+ return ret;
+ }
+
+ /*
+ * REG_NEWLINE: only searching the longest fragment of each
+ * pattern and then isolating the line and calling the
+ * automaton.
+ */
+ else if (preg->type == MHEUR_LONGEST)
+ {
+ regmatch_t *pm, rpm;
+ size_t st = 0, bl, el;
+
+ /* Alloc regmatch_t structures for results */
+ if (need_offsets)
+ {
+ pm = xmalloc(nmatch * sizeof(regmatch_t *));
+ if (!pm)
+ return REG_ESPACE;
+ }
+
+ while (st < len)
+ {
+ /* Look for a possible match. */
+ ret = tre_wmexec(INPUT(st), len, type, 1, &rpm,
+ eflags, preg->wm);
+ if (ret != REG_OK)
+ goto finish;
+
+ /* Need to start from here if this fails. */
+ st += rpm.rm_so + 1;
+
+ /* Look for the beginning of the line. */
+ for (bl = st; bl > 0; bl--)
+ if ((type == STR_WIDE) ? (str_wide[bl] == TRE_CHAR('\n'))
+ (str_byte[bl] == '\n')
+ break;
+
+ /* Look for the end of the line. */
+ for (el = st; el < len; el++)
+ if ((type == STR_WIDE) ? (str_wide[el] == TRE_CHAR('\n'))
+ (str_byte[el] == '\n')
+ break;
+
+ /* Try to match the pattern on the line. */
+ ret = tre_match(&preg->patterns[rpm.p], INPUT(bl), el - bl,
+ type, need_offsets ? nmatch : 0, &pm, eflags);
+
+ /* Evaluate result. */
+ if (ret == REG_NOMATCH)
+ continue;
+ else if (ret == REG_OK)
+ {
+ if (!need_offsets)
+ return REG_OK;
+ else
+ {
+ for (int i = 0; i < nmatch; i++)
+ {
+ pmatch[i].rm_so = pm[i].rm_so;
+ pmatch[i].rm_eo = pm[i].rm_eo;
+ pmatch[i].p = rpm.p;
+ goto finish;
+ }
+ }
+ }
+ else
+ goto finish;
+ }
+finish:
+ if (!pm)
+ xfree(pm)
+ return ret;
+ }
+
+ /*
+ * Literal case. It is enough to search with Wu-Manber and immediately
+ * return the match.
+ */
+ else if (preg->type == MHEUR_LITERAL)
+ {
+ return tre_wmexec(str, len, type, nmatch, pmatch, eflags, preg->wm);
+ }
+
+ /*
+ * General case. Look for the beginning of any of the patterns with the
+ * Wu-Manber algorithm and try to match from there with the automaton.
+ */
+ else
+ {
+ regmatch_t *pm, rpm;
+ size_t st = 0;
+
+ /* Alloc regmatch_t structures for results */
+ if (need_offsets)
+ {
+ pm = xmalloc(nmatch * sizeof(regmatch_t *));
+ if (!pm)
+ return REG_ESPACE;
+ }
+
+ while (st < len)
+ {
+ ret = tre_wmexec(INPUT(st), len, type, nmatch, &rpm, eflags, preg->wm);
+ if (ret != REG_OK)
+ return ret;
+
+ ret = tre_match(&preg->patterns[rpm.p], INPUT(rpm.rm_so),
+ len - rpm.rm_so, type, need_offsets ? nmatch : 0,
+ pm, eflags);
+ if ((ret == REG_OK) && (pm[0].rm_so == 0))
+ {
+ for (int i = 0; i < nmatch; i++)
+ {
+ pm[i].rm_so += st;
+ pm[i].rm_eo += eo;
+ pm[i].p = rpm.p;
+ }
+ goto finish;
+ }
+ else if ((ret != REG_NOMATCH) || (ret != REG_OK))
+ goto finish;
+ st += pm1.rm_so + 1;
+ }
+
+finish:
+ if (pm)
+ xfree(pm);
+ return ret;
+ }
}
Modified: user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c Sat Feb 18 16:54:01 2012 (r231896)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c Sat Feb 18 19:37:02 2012 (r231897)
@@ -232,11 +232,14 @@ fail:
#define MATCH(beg, end, p) \
do \
{ \
- match->rm_so = beg; \
- match->rm_eo = end; \
- match->p = p; \
- err = REG_OK; \
- goto finish; \
+ if (!(preg->cflags & REG_NOSUB) && (nmatch > 0)) \
+ { \
+ pmatch->rm_so = beg; \
+ pmatch->rm_eo = end; \
+ pmatch->p = p; \
+ err = REG_OK; \
+ goto finish; \
+ } \
} while (0);
#define _WMSEARCH(data, pats, sizes, mlen, tbl, dshift) \
@@ -292,7 +295,7 @@ fail:
int
tre_wmexec(const void *str, size_t len, tre_str_type_t type,
size_t nmatch, regmatch_t pmatch[], int eflags,
- const wmsearch_t *wm, regmatch_t *match)
+ const wmsearch_t *wm)
{
wmentry_t *s_entry, *p_entry;
tre_char_t *wide_str = str;
Modified: user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h Sat Feb 18 16:54:01 2012 (r231896)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h Sat Feb 18 19:37:02 2012 (r231897)
@@ -9,7 +9,13 @@
#define WM_MAXPAT 64
+#define MHEUR_NONE 0
+#define MHEUR_PREFIX 1
+#define MHEUR_LONGEST 2
+#define MHEUR_LITERAL 3
+
typedef struct {
+ int cflags;
char **pat; /* Patterns */
size_t *siz; /* Pattern sizes */
size_t n; /* No of patterns */
More information about the svn-src-user
mailing list