svn commit: r231897 - user/gabor/tre-integration/contrib/tre/lib

Gabor Kovesdan gabor at FreeBSD.org
Sat Feb 18 19:37:02 UTC 2012


Author: gabor
Date: Sat Feb 18 19:37:02 2012
New Revision: 231897
URL: http://svn.freebsd.org/changeset/base/231897

Log:
  Implement multi-pattern heuristic matching.  Not yet connected to the build
  and totally untested.

Modified:
  user/gabor/tre-integration/contrib/tre/lib/mregcomp.c
  user/gabor/tre-integration/contrib/tre/lib/mregexec.c
  user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c
  user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h

Modified: user/gabor/tre-integration/contrib/tre/lib/mregcomp.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/mregcomp.c	Sat Feb 18 16:54:01 2012	(r231896)
+++ user/gabor/tre-integration/contrib/tre/lib/mregcomp.c	Sat Feb 18 19:37:02 2012	(r231897)
@@ -54,81 +54,102 @@ __weak_reference(tre_mregfree, mregfree)
 
 int
 tre_mcompile(mregex_t *preg, size_t nr, const tre_char_t *wregex[],
-	     size_t wn[], int cflags)
+	     size_t wn[], const char *regex[], size_t n, int cflags)
 {
   int ret;
-  size_t mfrag = 0;
   tre_char_t **frags;
   size_t *siz;
   wmsearch_t *wm;
 
   preg->k = nr;
+  preg->cflags = cflags;
   preg->patterns = xmalloc(nr * sizeof(regex_t));
   if (!preg->patterns)
     return REG_ESPACE;
 
+  /* Compile NFA, BM and heuristic for each pattern. */
   for (int i = 0; i < nr; i++)
     {
-      ret = tre_compile_nfa(&preg->patterns[i], wregex[i], wn[i], cflags);
+      ret = tre_compile(&preg->patterns[i], wregex[i], wn[i],
+			regex[i], n[i], cflags);
       if (ret != REG_OK)
-	goto err;
-      ret = tre_compile_heur(&preg->patterns[i], wregex[i], wn[i], cflags);
-      if (ret != REG_OK)
-	goto err;
+	return ret;
     }
 
-  for (mfrag = 0; mfrag < nr; mfrag++)
-    for (int j = 0; j < nr; j++)
-      if (((heur_t)preg->patterns[j]->heur)->arr[mfrag] == NULL)
-	goto out;
-out:
-
-  preg->mfrag = mfrag;
-
-  /* Worst case, not all patterns have a literal prefix */
-  if (mfrag == 0)
-    return REG_OK;
-
-  wm = xmalloc(mfrag * sizeof(wmsearch_t));
-  if (!wm)
-    goto err;
+  /* If not literal, check if any of them have fixed-length prefix. */
+  if (!(cflags & REG_LITERAL))
+    for (int i = 0; i < nr; i++)
+      if ((preg->patterns[i]->heur == NULL) ||
+	 (((heur_t)preg->patterns[i]->heur)->arr[0] == NULL))
+	{
+	  preg->type = MHEUR_NONE;
+	  goto finish;
+	}
 
-  frags = xmalloc(nr * sizeof(char *));
-  if (!frags)
-    goto err;
+  /*
+   * Set frag and siz to point to the fragments to compile and
+   * their respective sizes.
+   */
+  if (!(cflags & REG_LITERAL))
+    {
+      frags = xmalloc(nr * sizeof(char *));
+      if (!frags)
+        goto err;
 
-  siz = xmalloc(nr * sizeof(size_t));
-  // XXX: check NULL
+      siz = xmalloc(nr * sizeof(size_t));
+      if (!siz)
+	goto err;
 
-  for (int i = 0; i < mfrag; i++)
-    {
       for (int j = 0; j < nr; j++)
 	{
-	  frags[j] = &((heur_t)preg->patterns[j]->heur)->arr[i];
-	  siz[j] = ((heur_t)preg->patterns[j]->heur)->siz[i];
+	  frags[j] = &((heur_t)preg->patterns[j]->heur)->arr[0];
+	  siz[j] = ((heur_t)preg->patterns[j]->heur)->siz[0];
 	}
-      ret = tre_wmcomp(&wm[i], nr, frags, siz, cflags);
-      if (ret != REG_OK)
-	goto err;
+    }
+  else
+    {
+      frags = wregex;
+      siz = wn;
     }
 
+  /* Alloc and compile the fragments for Wu-Manber algorithm. */
+  wm = xmalloc(sizeof(wmsearch_t));
+  if (!wm)
+    goto err;
+  ret = tre_wmcomp(wm, nr, frags, siz, cflags);
+    if (ret != REG_OK)
+      goto err;
   preg->searchdata = wm;
 
-  return REG_OK;
+  /* Set the specific type of matching. */
+  if (cflags & REG_LITERAL)
+    preg->type = MHEUR_LITERAL;
+  else if (cflags & REG_NEWLINE)
+    preg->type = MHEUR_LONGEST;
+  else
+    preg->type = MHEUR_PREFIX;
+
+  goto finish;
 
 err:
   if (preg->patterns)
-    xfree(preg->patterns);
+    {
+      for (int i = 1; i < nr; i++)
+	if (preg->patterns[i])
+	  tre_regfree(preg->patterns[i]);
+      xfree(preg->patterns);
+    }
   if (wm)
+    xfree(wm);
+
+finish:
+  if (!(cflags & REG_LITERAL))
     {
-      for (int i = 0; i < mfrag; i++)
-	tre_wmfree(&wm[i]);
-      xfree(wm);
-    }
-  if (frags)
-    xfree(frags);
-  if (siz)
-    xfree(siz);
+      if (frag)
+	xfree(frag);
+      if (siz)
+	xfree(siz);
+    }
   return ret;
 }
 

Modified: user/gabor/tre-integration/contrib/tre/lib/mregexec.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/mregexec.c	Sat Feb 18 16:54:01 2012	(r231896)
+++ user/gabor/tre-integration/contrib/tre/lib/mregexec.c	Sat Feb 18 19:37:02 2012	(r231897)
@@ -80,8 +80,230 @@ tre_mmatch(const void *str, size_t len, 
 	   size_t nmatch, regmatch_t pmatch[], int eflags,
 	   const mregex_t *preg)
 {
-
-  // XXX: Wu-Manber search with specific cases
+  tre_char_t *str_wide = str;
+  char *str_byte = str;
+  int ret;
+  bool need_offsets;
+
+  need_offsets = (preg->cflags & REG_NOSUB) && (nmatch > 0);
+
+#define INPUT(pos) ((type == STR_WIDE) ? str_wide[pos] : str_byte[pos])
+
+  /*
+   * Worst case: at least one pattern does not have a literal
+   * prefix so the Wu-Manber algorithm cannot be used to speed
+   * up the match.  There is no trivial best solution either,
+   * so just try matching for each of the patterns and return
+   * the earliest.
+   */
+  if (preg->type == MHEUR_NONE)
+    {
+      regmatch_t **pm;
+      int i, first = -1;
+
+      /* Alloc one regmatch_t for each pattern */
+      if (need_offsets)
+	{
+	  pm = xmalloc(preg->k * sizeof(regmatch_t *));
+	  if (!pm)
+	    return REG_ESPACE;
+	  for (i = 0; i < preg->k; i++)
+	    {
+	      pm[i] = xmalloc(nmatch * sizeof(regmatch_t));
+	      if (!pm[i])
+		goto finish;
+	    }
+	}
+
+      /* Run matches for each pattern and save first matching offsets. */
+      for (i = 0; i < preg->k; i++)
+	{
+	  ret = tre_match(&preg->patterns[i], str, len, type,
+			  need_offsets ? nmatch : 0, pm[i], eflags);
+
+	  /* Mark if there is no match for the pattern. */
+	  if (!need_offsets)
+	    {
+	      if (ret == REG_OK)
+		return REG_OK;
+	    }
+	  else if (ret == REG_NOMATCH)
+	    pm[i][0].rm_so = -1;
+	  else if (ret != REG_OK)
+	    goto finish;
+	}
+
+      if (!need_offsets)
+	return REG_NOMATCH;
+
+      /* Check whether there has been a match at all. */
+      for (i; i < preg->k; i++)
+	if (pm[i][0].rm_so != -1)
+	  {
+	    first = i;
+	    break;
+	  }
+
+      if (first == -1)
+	ret = REG_NOMATCH;
+
+      /* If there are matches, find the first one. */
+      else
+	{
+	  for (i = first + 1; i < preg->k; i++)
+	    if ((pm[i][0].rm_so < pm[first][0].rm_so) || (pm[i][0].rm_eo > pm[first][0].rm_eo))
+	      {
+		first = i;
+	      }
+	}
+
+      /* Fill int the offsets before returning. */
+      for (i = 0; need_offsets && (i < 0); i++)
+	{
+	  pmatch[i].rm_so = pm[first][i].rm_so;
+	  pmatch[i].rm_eo = pm[first][i].rm_eo;
+	  pmatch[i].p = i;
+	}
+      ret = REG_OK;
+
+finish:
+      if (pm)
+	{
+	  for (i = 0; i < preg->k; i++)
+	    if (pm[i])
+	      xfree(pm[i]);
+	  xfree(pm);
+	}
+      return ret;
+    }
+
+  /*
+   * REG_NEWLINE: only searching the longest fragment of each
+   * pattern and then isolating the line and calling the
+   * automaton.
+   */
+  else if (preg->type == MHEUR_LONGEST)
+    {
+      regmatch_t *pm, rpm;
+      size_t st = 0, bl, el;
+
+      /* Alloc regmatch_t structures for results */
+      if (need_offsets)
+	{
+	  pm = xmalloc(nmatch * sizeof(regmatch_t *));
+	  if (!pm)
+	    return REG_ESPACE;
+        }
+
+      while (st < len)
+	{
+	  /* Look for a possible match. */
+	  ret = tre_wmexec(INPUT(st), len, type, 1, &rpm,
+			   eflags, preg->wm);
+	  if (ret != REG_OK)
+	    goto finish;
+
+	  /* Need to start from here if this fails. */
+	  st += rpm.rm_so + 1;
+
+	  /* Look for the beginning of the line. */
+	  for (bl = st; bl > 0; bl--)
+	    if ((type == STR_WIDE) ? (str_wide[bl] == TRE_CHAR('\n'))
+		(str_byte[bl] == '\n')
+	      break;
+
+	  /* Look for the end of the line. */
+	  for (el = st; el < len; el++)
+	   if ((type == STR_WIDE) ? (str_wide[el] == TRE_CHAR('\n'))
+	        (str_byte[el] == '\n')
+	      break;
+
+	  /* Try to match the pattern on the line. */
+	  ret = tre_match(&preg->patterns[rpm.p], INPUT(bl), el - bl,
+			  type, need_offsets ? nmatch : 0, &pm, eflags);
+
+	  /* Evaluate result. */
+	  if (ret == REG_NOMATCH)
+	    continue;
+	  else if (ret == REG_OK)
+	    {
+	      if (!need_offsets)
+		return REG_OK;
+	      else
+		{
+		  for (int i = 0; i < nmatch; i++)
+		    {
+		      pmatch[i].rm_so = pm[i].rm_so;
+		      pmatch[i].rm_eo = pm[i].rm_eo;
+		      pmatch[i].p = rpm.p;
+		      goto finish;
+		    }
+		}
+	    }
+	  else
+	    goto finish;
+	}
+finish:
+      if (!pm)
+	xfree(pm)
+      return ret;
+    }
+
+  /*
+   * Literal case.  It is enough to search with Wu-Manber and immediately
+   * return the match.
+   */
+  else if (preg->type == MHEUR_LITERAL)
+    {
+      return tre_wmexec(str, len, type, nmatch, pmatch, eflags, preg->wm);
+    }
+
+  /*
+   * General case. Look for the beginning of any of the patterns with the
+   * Wu-Manber algorithm and try to match from there with the automaton.
+   */
+  else
+    {
+      regmatch_t *pm, rpm;
+      size_t st = 0;
+
+      /* Alloc regmatch_t structures for results */
+      if (need_offsets)
+	{
+	  pm = xmalloc(nmatch * sizeof(regmatch_t *));
+	  if (!pm)
+	    return REG_ESPACE;
+	}
+
+      while (st < len)
+	{
+	  ret = tre_wmexec(INPUT(st), len, type, nmatch, &rpm, eflags, preg->wm);
+	  if (ret != REG_OK)
+	    return ret;
+
+	  ret = tre_match(&preg->patterns[rpm.p], INPUT(rpm.rm_so),
+			  len - rpm.rm_so, type, need_offsets ? nmatch : 0,
+			  pm, eflags);
+	  if ((ret == REG_OK) && (pm[0].rm_so == 0))
+	    {
+	      for (int i = 0; i < nmatch; i++)
+		{
+		  pm[i].rm_so += st;
+		  pm[i].rm_eo += eo;
+		  pm[i].p = rpm.p;
+		}
+	      goto finish;
+	    }
+	  else if ((ret != REG_NOMATCH) || (ret != REG_OK))
+	    goto finish;
+	  st += pm1.rm_so + 1;
+	}
+
+finish:
+      if (pm)
+	xfree(pm);
+      return ret;
+    }
 
 }
 

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c	Sat Feb 18 16:54:01 2012	(r231896)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.c	Sat Feb 18 19:37:02 2012	(r231897)
@@ -232,11 +232,14 @@ fail:
 #define MATCH(beg, end, p)						\
   do									\
     {									\
-      match->rm_so = beg;						\
-      match->rm_eo = end;						\
-      match->p = p;							\
-      err = REG_OK;							\
-      goto finish;							\
+      if (!(preg->cflags & REG_NOSUB) && (nmatch > 0))			\
+	{								\
+	  pmatch->rm_so = beg;						\
+	  pmatch->rm_eo = end;						\
+	  pmatch->p = p;						\
+	  err = REG_OK;							\
+	  goto finish;							\
+	}								\
     } while (0);
 
 #define _WMSEARCH(data, pats, sizes, mlen, tbl, dshift)			\
@@ -292,7 +295,7 @@ fail:
 int
 tre_wmexec(const void *str, size_t len, tre_str_type_t type,
 	   size_t nmatch, regmatch_t pmatch[], int eflags,
-	   const wmsearch_t *wm, regmatch_t *match)
+	   const wmsearch_t *wm)
 {
   wmentry_t *s_entry, *p_entry;
   tre_char_t *wide_str = str;

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h	Sat Feb 18 16:54:01 2012	(r231896)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-mfastmatch.h	Sat Feb 18 19:37:02 2012	(r231897)
@@ -9,7 +9,13 @@
 
 #define WM_MAXPAT 64
 
+#define MHEUR_NONE 0
+#define MHEUR_PREFIX 1
+#define MHEUR_LONGEST 2
+#define MHEUR_LITERAL 3
+
 typedef struct {
+	int cflags;
 	char **pat;		/* Patterns */
 	size_t *siz;		/* Pattern sizes */
 	size_t n;		/* No of patterns */


More information about the svn-src-user mailing list