svn commit: r231668 - user/gabor/tre-integration/contrib/tre/lib

Gabor Kovesdan gabor at FreeBSD.org
Tue Feb 14 11:16:13 UTC 2012


Author: gabor
Date: Tue Feb 14 11:16:13 2012
New Revision: 231668
URL: http://svn.freebsd.org/changeset/base/231668

Log:
  - Fix a bug that always resulted in a HEUR_PREFIX_ARRAY heuristic
  - Store the fragments for later processing (will be used for the Wu-Manber
    algorithm to match more patterns at a time)
  - Refactor the code for better readability

Modified:
  user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
  user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Tue Feb 14 10:51:24 2012	(r231667)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Tue Feb 14 11:16:13 2012	(r231668)
@@ -126,7 +126,10 @@
 int
 tre_compile_heur(heur_t *h, const tre_char_t *regex, size_t len, int cflags)
 {
-  tre_char_t *arr[MAX_FRAGMENTS], *heur;
+  tre_char_t **arr, *heur;
+  tre_char_t **farr;
+  char **barr;
+  size_t *bsiz, *fsiz;
   size_t length[MAX_FRAGMENTS];
   ssize_t tlen = 0;
   int errcode, j = 0, pos = 0, st = 0;
@@ -136,6 +139,13 @@ tre_compile_heur(heur_t *h, const tre_ch
   if (!heur)
     return REG_ESPACE;
 
+  arr = xmalloc(MAX_FRAGMENTS * sizeof(tre_char_t *));
+  if (!arr)
+    {
+      errcode = REG_ESPACE;
+      goto err;
+    }
+
   h->type = HEUR_ARRAY;
 
   while (true)
@@ -316,7 +326,7 @@ tre_compile_heur(heur_t *h, const tre_ch
       st = len;
  
 end_segment:
-
+      /* Check if pattern is open-ended */
       if (st == len && pos == 0)
 	{
 	  if (j == 0)
@@ -327,6 +337,7 @@ end_segment:
 	  h->type = HEUR_PREFIX_ARRAY;
 	  goto ok;
 	}
+      /* Continue if we got some variable-length part */
       else if (pos == 0)
 	continue;
 
@@ -349,95 +360,202 @@ end_segment:
       length[j] = pos;
       j++;
       pos = 0;
+
+      /* Check whether there is more input */
+      if (st == len)
+	goto ok;
     }
 
 ok:
   {
-    size_t m = 1;
-    int ret;
+    size_t m = 0;
+    int i, ret;
 
     h->tlen = tlen;
 
     /* Look up maximum length fragment */
-    for (int i = 1; i < j; i++)
+    for (i = 1; i < j; i++)
       m = (length[i] > length[m]) ? i : m;
 
+    /* Will hold the final fragments that we actually use */
+    farr = xmalloc(4 * sizeof(tre_char_t *));
+    if (!farr)
+      {
+	errcode = REG_ESPACE;
+	goto err;
+      }
+
+    /* Sizes for the final fragments */
+    fsiz = xmalloc(4 * sizeof(size_t));
+    if (!fsiz)
+      {
+        errcode = REG_ESPACE;
+        goto err;
+      }
+
     /*
-     * If possible, store prefix, maximum internal fragment and suffix.
-     * If not possible, store prefix and either maximum internal fragment
-     * or suffix if it is the same.  In the worst case, only prefix is
-     * stored.  The closing element is always NULL.
+     * Only save the longest fragment if match is line-based.
      */
-    for (int i = 0; i < MIN(3, j + 1); i++)
+    if (cflags & REG_NEWLINE)
       {
-	h->heurs[i] = xmalloc(sizeof(fastmatch_t));
-	if (!h->heurs[i])
+	farr[0] = arr[m];
+	arr[m] = NULL;
+	fsiz[0] = length[0];
+	farr[1] = NULL;
+      }
+    /*
+     * Otherwise try to save up to three fragments: beginning, longest
+     * intermediate pattern, ending.  If either the beginning or the
+     * ending fragment is longer than any intermediate fragments, we will
+     * not save any intermediate one.  The point here is to always maximize
+     * the possible shifting when searching in the input.  Measurements
+     * have shown that the eager approach works best.
+     */
+    else
+      {
+	size_t idx = 0;
+
+	/* Always start by saving the beginning */
+	farr[idx] = arr[0];
+	arr[0] = NULL;
+	fsiz[idx++] = length[0];
+
+	/*
+	 * If the longest pattern is not the beginning nor the ending,
+	 * save it.
+	 */
+	if ((m != 1) && (m != j - 1))
+	  {
+	    farr[idx] = arr[m];
+	    fsiz[idx++] = length[m];
+	    arr[m] = NULL;
+	  }
+
+	/*
+	 * If we have an ending pattern (even if the pattern is
+	 * "open-ended"), save it.
+	 */
+	if (j > 1)
+	  {
+	    farr[idx] = arr[j - 1];
+	    fsiz[idx++] = length[j - 1];
+	    arr[j - 1] = NULL;
+	  }
+
+	farr[idx] = NULL;
+      }
+
+    /* Once necessary pattern saved, free original array */
+    for (i = 0; i < j; i++)
+      if (arr[i])
+	xfree(arr[i]);
+    xfree(arr);
+
+/*
+ * Store the array in single-byte and wide char forms in the
+ * heur_t struct for later reuse.  When supporting whcar_t
+ * convert the fragments to single-byte string because
+ * conversion is probably faster than processing the patterns
+ * again in single-byte form.
+ */
+#ifdef TRE_WCHAR
+    barr = xmalloc(4 * sizeof(char *));
+    if (!barr)
+      {
+	errcode = REG_ESPACE;
+	goto err;
+      }
+
+    bsiz = xmalloc(4 * sizeof(size_t));
+      if (!bsiz)
+	{
+	  errcode = REG_ESPACE;
+	  goto err;
+	}
+
+    for (i = 0; farr[i] != NULL; i++)
+      {
+	bsiz[i] = mbstowcs(farr[i], NULL, 0);
+	barr[i] = xmalloc(bsiz[i] + 1);
+	if (!barr[i])
 	  {
 	    errcode = REG_ESPACE;
 	    goto err;
 	  }
+	mbstowcs(farr[i], barr[i], bsiz[i]);
+	barr[i][bsiz[i]] = '\0';
       }
+    barr[i] = NULL;
 
-#define CHECK_ERR							\
-  if (ret != REG_OK)							\
-    {									\
-      errcode = REG_BADPAT;						\
-      goto err2;							\
-    }
+    h->warr = farr;
+    h->wsiz = fsiz;
+    h->arr = barr;
+    h->siz = bsiz;
+#else
+    h->arr = farr;
+    h->siz = fsiz;
+#endif
 
-    if (cflags & REG_NEWLINE)
+    /*
+     * Compile all the useful fragments for actual matching.
+     */
+    h->heurs = xmalloc(4 * sizeof(fastmatch_t *));
+    if (!h->heurs)
       {
-	/* For REG_NEWLINE, only store longest fragment. */
-	ret = tre_compile_literal(h->heurs[0], arr[m], length[m], 0);
-	CHECK_ERR
-	h->type = HEUR_LONGEST;
+	errcode = REG_ESPACE;
+	goto err;
       }
-    else
+    for (i = 0; farr[i] != NULL; i++)
       {
-	/*
-	 * If possible, store prefix, maximum internal fragment and suffix.
-	 * If not possible, store prefix and either maximum internal fragment
-	 * or suffix if it is the same.  In the worst case, only prefix is
-	 * stored.  The closing element is always NULL.
-	 */
-	ret = tre_compile_literal(h->heurs[0], arr[0], length[0], 0);
-	CHECK_ERR
-	if (j == 1)
+	h->heurs[i] = xmalloc(sizeof(fastmatch_t));
+	if (!h->heurs[i])
 	  {
-	    free(h->heurs[1]);
-	    h->heurs[1] = NULL;
-	    errcode = REG_OK;
-	    goto finish;
+	    errcode = REG_ESPACE;
+	    goto err;
 	  }
-	else
-	  ret = tre_compile_literal(h->heurs[1], arr[m], length[m], 0);
-	CHECK_ERR
-	if ((h->type == HEUR_PREFIX_ARRAY) || (m == j - 1))
+	ret = tre_compile_literal(h->heurs[i], farr[i], fsiz[i], 0);
+	if (ret != REG_OK)
 	  {
-	    xfree(h->heurs[2]);
-	    h->heurs[2] = NULL;
-	    errcode = REG_OK;
-	    goto finish;
+	    errcode = REG_BADPAT;
+	    goto err;
 	  }
-	else
-	  ret = tre_compile_literal(h->heurs[2], arr[j - 1], length[j - 1], 0);
-	CHECK_ERR
-	h->heurs[3] = NULL;
       }
 
-      errcode = REG_OK;
-      goto finish;
+    h->heurs[i] = NULL;
+    errcode = REG_OK;
+    goto finish;
   }
 
-err2:
-  for (int i = 0; h->heurs[i] != NULL; i++)
-    tre_free_fast(h->heurs[i]);
-  xfree(h->heurs);
 err:
+#ifdef TRE_WCHAR
+  if (barr)
+    {
+      for (int i = 0; i < 4; i++)
+	if (barr[i])
+	  xfree(barr[i]);
+      xfree(barr);
+    }
+  if (bsiz)
+    xfree(bsiz);
+#endif
+  if (farr)
+    {
+      for (int i = 0; i < j; i++)
+	if (farr[i])
+	  xfree(farr[i]);
+      xfree(farr);
+    }
+  if (fsiz)
+    xfree(fsiz);
+  if (h->heurs)
+    {
+      for (int i = 0; h->heurs[i] != NULL; i++)
+	tre_free_fast(h->heurs[i]);
+      xfree(h->heurs);
+    }
 finish:
-  for (int i = 0; i < j; i++)
-    xfree(arr[i]);
-  xfree(heur);
+  if (heur)
+    xfree(heur);
   return errcode;
 }
 
@@ -450,5 +568,24 @@ tre_free_heur(heur_t *h)
   for (int i = 0; h->heurs[i] != NULL; i++)
     tre_free_fast(h->heurs[i]);
 
+  if (h->arr)
+    {
+      for (int i = 0; h->arr[i] != NULL; i++)
+	if (h->arr[i])
+	  xfree(h->arr[i]);
+      xfree(h->arr);
+    }
+
+#ifdef TRE_WCHAR
+  if (h->warr)
+    {
+      for (int i = 0; h->warr[i] != NULL; i++)
+	if (h->warr[i])
+	  xfree(h->warr[i]);
+      xfree(h->warr);
+    }
+
+#endif
+
   DPRINT("tre_free_heur: resources are freed\n");
 }

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h	Tue Feb 14 10:51:24 2012	(r231667)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h	Tue Feb 14 11:16:13 2012	(r231668)
@@ -12,7 +12,13 @@
 #define HEUR_LONGEST		2
 
 typedef struct {
-  fastmatch_t *heurs[4];
+  char **arr;
+  size_t *siz;
+#ifdef TRE_WCHAR
+  tre_char_t **warr;
+  size_t *wsiz;
+#endif
+  fastmatch_t **heurs;
   ssize_t tlen;
   int type;
 } heur_t;


More information about the svn-src-user mailing list