svn commit: r231668 - user/gabor/tre-integration/contrib/tre/lib
Gabor Kovesdan
gabor at FreeBSD.org
Tue Feb 14 11:16:13 UTC 2012
Author: gabor
Date: Tue Feb 14 11:16:13 2012
New Revision: 231668
URL: http://svn.freebsd.org/changeset/base/231668
Log:
- Fix a bug that always resulted in a HEUR_PREFIX_ARRAY heuristic
- Store the fragments for later processing (will be used for the Wu-Manber
algorithm to match more patterns at a time)
- Refactor the code for better readability
Modified:
user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h
Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c Tue Feb 14 10:51:24 2012 (r231667)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c Tue Feb 14 11:16:13 2012 (r231668)
@@ -126,7 +126,10 @@
int
tre_compile_heur(heur_t *h, const tre_char_t *regex, size_t len, int cflags)
{
- tre_char_t *arr[MAX_FRAGMENTS], *heur;
+ tre_char_t **arr, *heur;
+ tre_char_t **farr;
+ char **barr;
+ size_t *bsiz, *fsiz;
size_t length[MAX_FRAGMENTS];
ssize_t tlen = 0;
int errcode, j = 0, pos = 0, st = 0;
@@ -136,6 +139,13 @@ tre_compile_heur(heur_t *h, const tre_ch
if (!heur)
return REG_ESPACE;
+ arr = xmalloc(MAX_FRAGMENTS * sizeof(tre_char_t *));
+ if (!arr)
+ {
+ errcode = REG_ESPACE;
+ goto err;
+ }
+
h->type = HEUR_ARRAY;
while (true)
@@ -316,7 +326,7 @@ tre_compile_heur(heur_t *h, const tre_ch
st = len;
end_segment:
-
+ /* Check if pattern is open-ended */
if (st == len && pos == 0)
{
if (j == 0)
@@ -327,6 +337,7 @@ end_segment:
h->type = HEUR_PREFIX_ARRAY;
goto ok;
}
+ /* Continue if we got some variable-length part */
else if (pos == 0)
continue;
@@ -349,95 +360,202 @@ end_segment:
length[j] = pos;
j++;
pos = 0;
+
+ /* Check whether there is more input */
+ if (st == len)
+ goto ok;
}
ok:
{
- size_t m = 1;
- int ret;
+ size_t m = 0;
+ int i, ret;
h->tlen = tlen;
/* Look up maximum length fragment */
- for (int i = 1; i < j; i++)
+ for (i = 1; i < j; i++)
m = (length[i] > length[m]) ? i : m;
+ /* Will hold the final fragments that we actually use */
+ farr = xmalloc(4 * sizeof(tre_char_t *));
+ if (!farr)
+ {
+ errcode = REG_ESPACE;
+ goto err;
+ }
+
+ /* Sizes for the final fragments */
+ fsiz = xmalloc(4 * sizeof(size_t));
+ if (!fsiz)
+ {
+ errcode = REG_ESPACE;
+ goto err;
+ }
+
/*
- * If possible, store prefix, maximum internal fragment and suffix.
- * If not possible, store prefix and either maximum internal fragment
- * or suffix if it is the same. In the worst case, only prefix is
- * stored. The closing element is always NULL.
+ * Only save the longest fragment if match is line-based.
*/
- for (int i = 0; i < MIN(3, j + 1); i++)
+ if (cflags & REG_NEWLINE)
{
- h->heurs[i] = xmalloc(sizeof(fastmatch_t));
- if (!h->heurs[i])
+ farr[0] = arr[m];
+ arr[m] = NULL;
+ fsiz[0] = length[0];
+ farr[1] = NULL;
+ }
+ /*
+ * Otherwise try to save up to three fragments: beginning, longest
+ * intermediate pattern, ending. If either the beginning or the
+ * ending fragment is longer than any intermediate fragments, we will
+ * not save any intermediate one. The point here is to always maximize
+ * the possible shifting when searching in the input. Measurements
+ * have shown that the eager approach works best.
+ */
+ else
+ {
+ size_t idx = 0;
+
+ /* Always start by saving the beginning */
+ farr[idx] = arr[0];
+ arr[0] = NULL;
+ fsiz[idx++] = length[0];
+
+ /*
+ * If the longest pattern is not the beginning nor the ending,
+ * save it.
+ */
+ if ((m != 1) && (m != j - 1))
+ {
+ farr[idx] = arr[m];
+ fsiz[idx++] = length[m];
+ arr[m] = NULL;
+ }
+
+ /*
+ * If we have an ending pattern (even if the pattern is
+ * "open-ended"), save it.
+ */
+ if (j > 1)
+ {
+ farr[idx] = arr[j - 1];
+ fsiz[idx++] = length[j - 1];
+ arr[j - 1] = NULL;
+ }
+
+ farr[idx] = NULL;
+ }
+
+ /* Once necessary pattern saved, free original array */
+ for (i = 0; i < j; i++)
+ if (arr[i])
+ xfree(arr[i]);
+ xfree(arr);
+
+/*
+ * Store the array in single-byte and wide char forms in the
+ * heur_t struct for later reuse. When supporting whcar_t
+ * convert the fragments to single-byte string because
+ * conversion is probably faster than processing the patterns
+ * again in single-byte form.
+ */
+#ifdef TRE_WCHAR
+ barr = xmalloc(4 * sizeof(char *));
+ if (!barr)
+ {
+ errcode = REG_ESPACE;
+ goto err;
+ }
+
+ bsiz = xmalloc(4 * sizeof(size_t));
+ if (!bsiz)
+ {
+ errcode = REG_ESPACE;
+ goto err;
+ }
+
+ for (i = 0; farr[i] != NULL; i++)
+ {
+ bsiz[i] = mbstowcs(farr[i], NULL, 0);
+ barr[i] = xmalloc(bsiz[i] + 1);
+ if (!barr[i])
{
errcode = REG_ESPACE;
goto err;
}
+ mbstowcs(farr[i], barr[i], bsiz[i]);
+ barr[i][bsiz[i]] = '\0';
}
+ barr[i] = NULL;
-#define CHECK_ERR \
- if (ret != REG_OK) \
- { \
- errcode = REG_BADPAT; \
- goto err2; \
- }
+ h->warr = farr;
+ h->wsiz = fsiz;
+ h->arr = barr;
+ h->siz = bsiz;
+#else
+ h->arr = farr;
+ h->siz = fsiz;
+#endif
- if (cflags & REG_NEWLINE)
+ /*
+ * Compile all the useful fragments for actual matching.
+ */
+ h->heurs = xmalloc(4 * sizeof(fastmatch_t *));
+ if (!h->heurs)
{
- /* For REG_NEWLINE, only store longest fragment. */
- ret = tre_compile_literal(h->heurs[0], arr[m], length[m], 0);
- CHECK_ERR
- h->type = HEUR_LONGEST;
+ errcode = REG_ESPACE;
+ goto err;
}
- else
+ for (i = 0; farr[i] != NULL; i++)
{
- /*
- * If possible, store prefix, maximum internal fragment and suffix.
- * If not possible, store prefix and either maximum internal fragment
- * or suffix if it is the same. In the worst case, only prefix is
- * stored. The closing element is always NULL.
- */
- ret = tre_compile_literal(h->heurs[0], arr[0], length[0], 0);
- CHECK_ERR
- if (j == 1)
+ h->heurs[i] = xmalloc(sizeof(fastmatch_t));
+ if (!h->heurs[i])
{
- free(h->heurs[1]);
- h->heurs[1] = NULL;
- errcode = REG_OK;
- goto finish;
+ errcode = REG_ESPACE;
+ goto err;
}
- else
- ret = tre_compile_literal(h->heurs[1], arr[m], length[m], 0);
- CHECK_ERR
- if ((h->type == HEUR_PREFIX_ARRAY) || (m == j - 1))
+ ret = tre_compile_literal(h->heurs[i], farr[i], fsiz[i], 0);
+ if (ret != REG_OK)
{
- xfree(h->heurs[2]);
- h->heurs[2] = NULL;
- errcode = REG_OK;
- goto finish;
+ errcode = REG_BADPAT;
+ goto err;
}
- else
- ret = tre_compile_literal(h->heurs[2], arr[j - 1], length[j - 1], 0);
- CHECK_ERR
- h->heurs[3] = NULL;
}
- errcode = REG_OK;
- goto finish;
+ h->heurs[i] = NULL;
+ errcode = REG_OK;
+ goto finish;
}
-err2:
- for (int i = 0; h->heurs[i] != NULL; i++)
- tre_free_fast(h->heurs[i]);
- xfree(h->heurs);
err:
+#ifdef TRE_WCHAR
+ if (barr)
+ {
+ for (int i = 0; i < 4; i++)
+ if (barr[i])
+ xfree(barr[i]);
+ xfree(barr);
+ }
+ if (bsiz)
+ xfree(bsiz);
+#endif
+ if (farr)
+ {
+ for (int i = 0; i < j; i++)
+ if (farr[i])
+ xfree(farr[i]);
+ xfree(farr);
+ }
+ if (fsiz)
+ xfree(fsiz);
+ if (h->heurs)
+ {
+ for (int i = 0; h->heurs[i] != NULL; i++)
+ tre_free_fast(h->heurs[i]);
+ xfree(h->heurs);
+ }
finish:
- for (int i = 0; i < j; i++)
- xfree(arr[i]);
- xfree(heur);
+ if (heur)
+ xfree(heur);
return errcode;
}
@@ -450,5 +568,24 @@ tre_free_heur(heur_t *h)
for (int i = 0; h->heurs[i] != NULL; i++)
tre_free_fast(h->heurs[i]);
+ if (h->arr)
+ {
+ for (int i = 0; h->arr[i] != NULL; i++)
+ if (h->arr[i])
+ xfree(h->arr[i]);
+ xfree(h->arr);
+ }
+
+#ifdef TRE_WCHAR
+ if (h->warr)
+ {
+ for (int i = 0; h->warr[i] != NULL; i++)
+ if (h->warr[i])
+ xfree(h->warr[i]);
+ xfree(h->warr);
+ }
+
+#endif
+
DPRINT("tre_free_heur: resources are freed\n");
}
Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h Tue Feb 14 10:51:24 2012 (r231667)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.h Tue Feb 14 11:16:13 2012 (r231668)
@@ -12,7 +12,13 @@
#define HEUR_LONGEST 2
typedef struct {
- fastmatch_t *heurs[4];
+ char **arr;
+ size_t *siz;
+#ifdef TRE_WCHAR
+ tre_char_t **warr;
+ size_t *wsiz;
+#endif
+ fastmatch_t **heurs;
ssize_t tlen;
int type;
} heur_t;
More information about the svn-src-user
mailing list