svn commit: r224713 - user/gabor/tre-integration/contrib/tre/lib
Gabor Kovesdan
gabor at FreeBSD.org
Mon Aug 8 14:40:24 UTC 2011
Author: gabor
Date: Mon Aug 8 14:40:23 2011
New Revision: 224713
URL: http://svn.freebsd.org/changeset/base/224713
Log:
- Use the Turbo Boyer-Moore algorithm for literal patterns
- Patterns containing . will still use the quick sort algorithm because it is
harder to adapt the TBM algorithm for this case but this is broken
atm anyway
- Use a larger hash table to reduce hash collisions at insertion
Modified:
user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
user/gabor/tre-integration/contrib/tre/lib/fastmatch.h
Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Mon Aug 8 14:02:08 2011 (r224712)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.c Mon Aug 8 14:40:23 2011 (r224713)
@@ -116,30 +116,85 @@ static void revs(char *str, int len);
#define SHIFT \
CHECKBOUNDS; \
{ \
- int k = 0, r = -1; \
+ int bc = 0, gs = 0, ts, r = -1; \
\
switch (type) \
{ \
case STR_BYTE: \
case STR_MBS: \
- k = fg->qsBc[(unsigned char)((char *)startptr) \
+ if (!fg->hasdot) \
+ { \
+ if (u != 0 && mismatch == fg->len - 1 - shift) \
+ mismatch -= u; \
+ v = fg->len - 1 - mismatch; \
+ gs = fg->sbmGs[mismatch]; \
+ } \
+ bc = fg->qsBc[(unsigned char)((char *)startptr) \
[mismatch + 1]]; \
break; \
case STR_WIDE: \
- r = hashtable_get(fg->qsBc_table, \
- &((wchar_t *)startptr)[mismatch + 1], &k); \
- k = (r == 0) ? k : fg->defBc; \
+ if (!fg->hasdot) \
+ { \
+ if (u != 0 && mismatch == fg->wlen - 1 - shift) \
+ mismatch -= u; \
+ v = fg->wlen - 1 - mismatch; \
+ r = hashtable_get(fg->qsBc_table, \
+ &((wchar_t *)startptr)[mismatch + 1], &bc); \
+ gs = fg->bmGs[mismatch]; \
+ } \
+ bc = (r == 0) ? bc : fg->defBc; \
break; \
default: \
/* XXX */ \
break; \
} \
- j += k; \
+ if (fg->hasdot) \
+ shift = bc; \
+ else \
+ { \
+ ts = u - v; \
+ shift = MAX(ts, bc); \
+ shift = MAX(shift, gs); \
+ if (shift == gs) \
+ u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - \
+ shift, v); \
+ else \
+ { \
+ if (ts < bc) \
+ shift = MAX(shift, u + 1); \
+ u = 0; \
+ } \
+ } \
+ j += shift; \
}
#else
#define SHIFT \
CHECKBOUNDS; \
- j += fg->qsBc[(unsigned char)((char *)startptr)[mismatch + 1]];
+ { \
+ int bc, gs; \
+ bc = fg->qsBc[(unsigned char)((char *)startptr)[mismatch + 1]]; \
+ if (fg->hasdot) \
+ shift = bc; \
+ else \
+ { \
+ gs = fg->bmGs[mismatch]; \
+ if (u != 0 && mismatch == fg->wlen - 1 - shift) \
+ mismatch -= u; \
+ v = fg->wlen - 1 - mismatch; \
+ ts = u - v; \
+ shift = MAX(ts, bc); \
+ shift = MAX(shift, gs); \
+ if (shift == gs) \
+ u = MIN(fg->wlen - shift, v); \
+ else \
+ { \
+ if (ts < bc) \
+ shift = MAX(shift, u + 1); \
+ u = 0; \
+ } \
+ } \
+ j += shift; \
+ }
#endif
/*
@@ -163,8 +218,8 @@ static void revs(char *str, int len);
#define FILL_ARRAY(pat, plen) \
for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
- fg->qsBc[i] = plen - hasDot; \
- for (int i = hasDot + 1; i < plen; i++) \
+ fg->qsBc[i] = plen - fg->hasdot; \
+ for (int i = fg->hasdot + 1; i < plen; i++) \
{ \
fg->qsBc[(unsigned)pat[i]] = plen - i; \
if (fg->icase) \
@@ -179,12 +234,12 @@ static void revs(char *str, int len);
#ifdef TRE_WCHAR
#define FILL_QSBC \
/* Adjust the shift based on location of the last dot ('.'). */ \
- fg->defBc = fg->wlen - hasDot; \
+ fg->defBc = fg->wlen - fg->hasdot; \
\
/* Preprocess pattern. */ \
- fg->qsBc_table = hashtable_init(fg->wlen, sizeof(tre_char_t), \
+ fg->qsBc_table = hashtable_init(fg->wlen * 4, sizeof(tre_char_t), \
sizeof(int)); \
- for (unsigned int i = hasDot + 1; i < fg->wlen; i++) \
+ for (unsigned int i = fg->hasdot + 1; i < fg->wlen; i++) \
{ \
int k = fg->wlen - i; \
hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
@@ -201,6 +256,76 @@ static void revs(char *str, int len);
#define FILL_QSBC FILL_ARRAY(fg->wpattern, fg->wlen);
#endif
+#define FILL_BMGS(arr, pat, plen, wide) \
+ { \
+ char *p; \
+ wchar_t *wp; \
+ \
+ if (wide) \
+ { \
+ if (fg->icase) \
+ { \
+ wp = alloca(plen * sizeof(wint_t)); \
+ for (int i = 0; i < plen; i++) \
+ wp[i] = towlower(pat[i]); \
+ _FILL_BMGS(arr, wp, plen); \
+ } \
+ else \
+ _FILL_BMGS(arr, pat, plen); \
+ } \
+ else \
+ { \
+ if (fg->icase) \
+ { \
+ p = alloca(plen); \
+ for (int i = 0; i < plen; i++) \
+ p[i] = tolower(pat[i]); \
+ _FILL_BMGS(arr, p, plen); \
+ } \
+ else \
+ _FILL_BMGS(arr, pat, plen); \
+ } \
+ }
+
+#define _FILL_BMGS(arr, pat, plen) \
+ { \
+ int f, g; \
+ \
+ int *suff = xmalloc(plen * sizeof(int)); \
+ if (suff == NULL) \
+ return REG_ESPACE; \
+ \
+ suff[plen - 1] = plen; \
+ g = plen - 1; \
+ for (int i = plen - 2; i >= 0; i--) \
+ { \
+ if (i > g && suff[i + plen - 1 - f] < i - g) \
+ suff[i] = suff[i + plen - 1 - f]; \
+ else \
+ { \
+ if (i < g) \
+ g = i; \
+ f = i; \
+ while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \
+ g--; \
+ suff[i] = f - g; \
+ } \
+ } \
+ \
+ for (int i = 0; i < plen; i++) \
+ arr[i] = plen; \
+ g = 0; \
+ for (int i = plen - 1; i >= 0; i--) \
+ if (suff[i] == i + 1) \
+ for(; g < plen - 1 - i; g++) \
+ if (arr[g] == plen) \
+ arr[g] = plen - 1 - i; \
+ for (int i = 0; i <= plen - 2; i++) \
+ arr[plen - 1 - suff[i]] = plen - 1 - i; \
+ \
+ free(suff); \
+ }
+
#define REVFUNC(name, argtype) \
static inline void \
name(argtype *str, int len) \
@@ -218,7 +343,6 @@ name(argtype *str, int len) \
REVFUNC(revstr, tre_char_t)
REVFUNC(revs, char)
-
/*
* Returns: REG_OK on success, error code otherwise
*/
@@ -226,8 +350,6 @@ int
tre_fastcomp_literal(fastmatch_t *fg, const tre_char_t *wpat, size_t n,
int cflags)
{
- int hasDot = 0;
-
/* Initialize. */
memset(fg, 0, sizeof(*fg));
fg->icase = (cflags & REG_ICASE);
@@ -246,6 +368,10 @@ tre_fastcomp_literal(fastmatch_t *fg, co
#endif
FILL_QSBC;
+ FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);
+#ifdef TRE_WCHAR
+ FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);
+#endif
return REG_OK;
}
@@ -259,7 +385,6 @@ tre_fastcomp(fastmatch_t *fg, const tre_
{
int firstHalfDot = -1;
int firstLastHalfDot = -1;
- int hasDot = 0;
int lastHalfDot = 0;
/* Initialize. */
@@ -316,7 +441,7 @@ tre_fastcomp(fastmatch_t *fg, const tre_
(fg->wpattern[i] == TRE_CHAR(':')) || (fg->wpattern[i] == TRE_CHAR('/'))) {
continue;
} else if (fg->wpattern[i] == TRE_CHAR('.')) {
- hasDot = i;
+ fg->hasdot = i;
if (i < fg->wlen / 2) {
if (firstHalfDot < 0)
/* Closest dot to the beginning */
@@ -343,19 +468,25 @@ tre_fastcomp(fastmatch_t *fg, const tre_
* Determine if a reverse search would be faster based on the placement
* of the dots.
*/
- if ((!(fg->bol || fg->eol)) &&
- (lastHalfDot && ((firstHalfDot < 0) ||
- ((fg->wlen - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) {
- fg->reversed = true;
- hasDot = fg->wlen - (firstHalfDot < 0 ?
- firstLastHalfDot : firstHalfDot) - 1;
- revstr(fg->wpattern, fg->wlen);
-#ifdef TRE_WCHAR
- revs(fg->pattern, fg->len);
-#endif
- }
+// if ((!(fg->bol || fg->eol)) &&
+// (lastHalfDot && ((firstHalfDot < 0) ||
+// ((fg->wlen - (lastHalfDot + 1)) < (size_t)firstHalfDot)))) {
+// fg->reversed = true;
+// fg->hasdot = fg->wlen - (firstHalfDot < 0 ?
+// firstLastHalfDot : firstHalfDot) - 1;
+// revstr(fg->wpattern, fg->wlen);
+//#ifdef TRE_WCHAR
+// revs(fg->pattern, fg->len);
+//#endif
+// }
FILL_QSBC;
+ if (!fg->hasdot)
+ FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);
+#ifdef TRE_WCHAR
+ if (!fg->hasdot)
+ FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);
+#endif
/*
* Put pattern back to normal after pre-processing to allow for easy
@@ -378,7 +509,7 @@ tre_fastexec(const fastmatch_t *fg, cons
{
unsigned int j;
int ret = REG_NOMATCH;
- int mismatch;
+ int mismatch, shift, u = 0, v;
const char *str_byte = data;
const void *startptr = NULL;
#ifdef TRE_WCHAR
@@ -406,6 +537,16 @@ tre_fastexec(const fastmatch_t *fg, cons
if (len < fg->len)
return ret;
+ switch (type)
+ {
+ case STR_WIDE:
+ shift = fg->wlen;
+ break;
+ default:
+ shift = fg->len;
+ break;
+ }
+
/* Only try once at the beginning or ending of the line. */
if (fg->bol || fg->eol) {
/* Simple text comparison. */
Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.h
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.h Mon Aug 8 14:02:08 2011 (r224712)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.h Mon Aug 8 14:40:23 2011 (r224713)
@@ -35,16 +35,21 @@
#include "tre.h"
#include "tre-internal.h"
+#define BM_MAX_LEN 1024
+
typedef struct {
size_t wlen;
size_t len;
tre_char_t *wpattern;
- char *pattern;
+ int hasdot;
+ int qsBc[UCHAR_MAX + 1];
+ int bmGs[BM_MAX_LEN];
#ifdef TRE_WCHAR
+ char *pattern;
int defBc;
hashtable *qsBc_table;
+ int sbmGs[BM_MAX_LEN];
#endif
- int qsBc[UCHAR_MAX + 1];
/* flags */
bool bol;
bool eol;
More information about the svn-src-user
mailing list