svn commit: r225434 - in user/gabor/tre-integration:
contrib/tre/lib include
Gabor Kovesdan
gabor at FreeBSD.org
Wed Sep 7 12:53:19 UTC 2011
Author: gabor
Date: Wed Sep 7 12:53:18 2011
New Revision: 225434
URL: http://svn.freebsd.org/changeset/base/225434
Log:
- Make the heuristic code loosely coupled to the fast matcher by providing
properly escaped patterns instead of using an internal flag
- Add some struct fields for escaped dots, forgotten in a previous commit
Modified:
user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c
user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
user/gabor/tre-integration/include/fastmatch.h
user/gabor/tre-integration/include/regex.h
Modified: user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Wed Sep 7 07:52:45 2011 (r225433)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c Wed Sep 7 12:53:18 2011 (r225434)
@@ -263,6 +263,15 @@ static int fastcmp(const void *, const b
} \
}
+#ifdef TRE_DEBUG
+#define DPRINT_BMGS(len, fmt_str, sh) \
+ for (int i = 0; i < len; i++) \
+ DPRINT((fmt_str, i, sh[i]));
+#else
+#define DPRINT_BMGS(len, fmt_str, sh) \
+ do { } while(/*CONSTCOND*/0)
+#endif
+
/*
* Fills in the good suffix table for SB/MB strings.
*/
@@ -276,6 +285,7 @@ static int fastcmp(const void *, const b
fg->sbmGs[0] = 1; \
else \
_FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \
+ DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs); \
}
/*
@@ -291,6 +301,8 @@ static int fastcmp(const void *, const b
fg->bmGs[0] = 1; \
else \
_FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \
+ DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n", \
+ fg->bmGs); \
}
#define _FILL_BMGS(arr, pat, plen, wide) \
@@ -496,121 +508,99 @@ tre_compile_fast(fastmatch_t *fg, const
continue; \
} while (0)
- /*
- * Used for heuristic, only beginning ^, trailing $ and . are treated
- * as special.
- */
- if (cflags & _REG_HEUR)
+ for (int i = 0; i < n; i++)
{
- for (int i = 0; i < n; i++)
- switch (pat[i])
- {
- case TRE_CHAR('.'):
- fg->hasdot = i;
+ switch (pat[i])
+ {
+ case TRE_CHAR('\\'):
+ if (escaped)
STORE_CHAR;
- break;
- case TRE_CHAR('$'):
- if (i == n - 1)
- fg->eol = true;
- else
- STORE_CHAR;
- break;
- default:
+ else
+ escaped = true;
+ continue;
+ case TRE_CHAR('['):
+ if (escaped)
STORE_CHAR;
- }
- }
- else
- for (int i = 0; i < n; i++)
- {
- switch (pat[i])
- {
- case TRE_CHAR('\\'):
- if (escaped)
- STORE_CHAR;
- else
- escaped = true;
- break;
- case TRE_CHAR('['):
- if (escaped)
- STORE_CHAR;
- else
- goto badpat;
- break;
- case TRE_CHAR('*'):
- if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
- STORE_CHAR;
- else
- goto badpat;
- break;
- case TRE_CHAR('+'):
- case TRE_CHAR('?'):
- if ((cflags & REG_EXTENDED) && (i == 0))
- continue;
- else if ((cflags & REG_EXTENDED) ^ !escaped)
- STORE_CHAR;
- else
- goto badpat;
- case TRE_CHAR('.'):
- if (escaped)
- {
- if (!_escmap)
- _escmap = xmalloc(n * sizeof(bool));
- if (!_escmap)
- {
- xfree(tmp);
- return REG_ESPACE;
- }
- _escmap[i] = true;
- STORE_CHAR;
- }
- else
- {
- fg->hasdot = i;
- STORE_CHAR;
- }
- break;
- case TRE_CHAR('^'):
+ else
+ goto badpat;
+ continue;
+ case TRE_CHAR('*'):
+ if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
STORE_CHAR;
- break;
- case TRE_CHAR('$'):
- if (!escaped && (i == n - 1))
- fg->eol = true;
- else
- STORE_CHAR;
- break;
- case TRE_CHAR('('):
- if ((cflags & REG_EXTENDED) ^ escaped)
- goto badpat;
- else
- STORE_CHAR;
- break;
- case TRE_CHAR('{'):
- if (escaped && (i == 0))
- STORE_CHAR;
- else if (!(cflags & REG_EXTENDED) && (i == 0))
- STORE_CHAR;
- else if ((cflags & REG_EXTENDED) && (i == 0))
- continue;
- else
- goto badpat;
- break;
- case TRE_CHAR('|'):
- if ((cflags & REG_EXTENDED) ^ (!escaped))
- goto badpat;
- else
- STORE_CHAR;
- break;
- default:
- if (escaped)
- goto badpat;
- else
+ else
+ goto badpat;
+ continue;
+ case TRE_CHAR('+'):
+ case TRE_CHAR('?'):
+ if ((cflags & REG_EXTENDED) && (i == 0))
+ continue;
+ else if ((cflags & REG_EXTENDED) ^ !escaped)
+ STORE_CHAR;
+ else
+ goto badpat;
+ continue;
+ case TRE_CHAR('.'):
+ if (escaped)
+ {
+ if (!_escmap)
+ _escmap = xmalloc(n * sizeof(bool));
+ if (!_escmap)
+ {
+ xfree(tmp);
+ return REG_ESPACE;
+ }
+ _escmap[i] = true;
+ STORE_CHAR;
+ }
+ else
+ {
+ fg->hasdot = i;
STORE_CHAR;
- }
- continue;
+ }
+ continue;
+ case TRE_CHAR('^'):
+ STORE_CHAR;
+ continue;
+ case TRE_CHAR('$'):
+ if (!escaped && (i == n - 1))
+ fg->eol = true;
+ else
+ STORE_CHAR;
+ continue;
+ case TRE_CHAR('('):
+ if ((cflags & REG_EXTENDED) ^ escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ continue;
+ case TRE_CHAR('{'):
+ if (!(cflags & REG_EXTENDED) ^ escaped)
+ STORE_CHAR;
+ else if (!(cflags & REG_EXTENDED) && (i == 0))
+ STORE_CHAR;
+ else if ((cflags & REG_EXTENDED) && (i == 0))
+ continue;
+ else
+ goto badpat;
+ continue;
+ case TRE_CHAR('|'):
+ if ((cflags & REG_EXTENDED) ^ escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ continue;
+ default:
+ if (escaped)
+ goto badpat;
+ else
+ STORE_CHAR;
+ continue;
+ }
+ continue;
badpat:
- xfree(tmp);
- return REG_BADPAT;
- }
+ xfree(tmp);
+ return REG_BADPAT;
+ }
/*
* The pattern has been processed and copied to tmp as a literal string
Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c Wed Sep 7 07:52:45 2011 (r225433)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c Wed Sep 7 12:53:18 2011 (r225434)
@@ -127,11 +127,14 @@
goto end_segment; \
} while (0)
-#define STORE_CHAR \
+#define STORE_CHAR(esc) \
do \
{ \
- escaped = false; \
+ if (esc) \
+ heur[pos++] = TRE_CHAR('\\'); \
heur[pos++] = regex[i]; \
+ escaped = false; \
+ continue; \
} while (0)
@@ -178,11 +181,13 @@ tre_compile_heur(heur_t *h, const tre_ch
* bracket is escaped.
*/
case TRE_CHAR('['):
- PARSE_BRACKETS;
if (escaped)
- STORE_CHAR;
+ STORE_CHAR(true);
else
- heur[pos++] = TRE_CHAR('.');
+ {
+ PARSE_BRACKETS;
+ heur[pos++] = TRE_CHAR('.');
+ }
continue;
/*
@@ -192,9 +197,9 @@ tre_compile_heur(heur_t *h, const tre_ch
*/
case TRE_CHAR('{'):
if (escaped && (i == 1))
- STORE_CHAR;
+ STORE_CHAR(true);
else if ((i == 0) && !(cflags & REG_EXTENDED))
- STORE_CHAR;
+ STORE_CHAR(true);
else if ((i == 0) && (cflags & REG_EXTENDED))
continue;
@@ -205,7 +210,7 @@ tre_compile_heur(heur_t *h, const tre_ch
END_SEGMENT;
}
else
- STORE_CHAR;
+ STORE_CHAR(cflags & REG_EXTENDED);
continue;
/*
@@ -213,11 +218,13 @@ tre_compile_heur(heur_t *h, const tre_ch
* otherwise treated as a normal character.
*/
case TRE_CHAR('('):
- PARSE_UNIT('(', ')');
if (escaped ^ (cflags & REG_EXTENDED))
- END_SEGMENT;
+ {
+ PARSE_UNIT('(', ')');
+ END_SEGMENT;
+ }
else
- STORE_CHAR;
+ STORE_CHAR(cflags & REG_EXTENDED);
continue;
/*
@@ -227,9 +234,9 @@ tre_compile_heur(heur_t *h, const tre_ch
*/
case TRE_CHAR('\\'):
if (escaped)
- STORE_CHAR;
+ STORE_CHAR(true);
else
- escaped = !escaped;
+ escaped = true;
continue;
/*
@@ -240,7 +247,7 @@ tre_compile_heur(heur_t *h, const tre_ch
*/
case TRE_CHAR('*'):
if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
- STORE_CHAR;
+ STORE_CHAR(true);
else if ((i != 0))
{
pos--;
@@ -262,7 +269,7 @@ tre_compile_heur(heur_t *h, const tre_ch
else if ((cflags & REG_EXTENDED) ^ escaped)
END_SEGMENT;
else
- STORE_CHAR;
+ STORE_CHAR(cflags & REG_EXTENDED);
continue;
/*
@@ -281,7 +288,7 @@ tre_compile_heur(heur_t *h, const tre_ch
END_SEGMENT;
}
else
- STORE_CHAR;
+ STORE_CHAR(true);
continue;
/*
@@ -296,7 +303,7 @@ tre_compile_heur(heur_t *h, const tre_ch
else if (!(cflags & REG_EXTENDED) && escaped)
END_SEGMENT;
else
- STORE_CHAR;
+ STORE_CHAR(cflags & REG_EXTENDED);
continue;
/*
@@ -304,10 +311,7 @@ tre_compile_heur(heur_t *h, const tre_ch
* cannot handle it.
*/
case TRE_CHAR('.'):
- if (escaped)
- END_SEGMENT;
- else
- STORE_CHAR;
+ STORE_CHAR(escaped);
continue;
/*
@@ -319,7 +323,7 @@ tre_compile_heur(heur_t *h, const tre_ch
if (escaped)
END_SEGMENT;
else
- STORE_CHAR;
+ STORE_CHAR(false);
continue;
}
}
@@ -352,7 +356,7 @@ end_segment:
goto space1;
}
- ret = tre_compile_fast(h->start, heur, pos, _REG_HEUR);
+ ret = tre_compile_fast(h->start, heur, pos, 0);
if (ret != REG_OK)
{
errcode = REG_BADPAT;
@@ -386,7 +390,7 @@ end_segment:
goto space2;
}
- ret = tre_compile_fast(h->end, heur, pos, _REG_HEUR);
+ ret = tre_compile_fast(h->end, heur, pos, 0);
if (ret != REG_OK)
{
xfree(h->end);
Modified: user/gabor/tre-integration/include/fastmatch.h
==============================================================================
--- user/gabor/tre-integration/include/fastmatch.h Wed Sep 7 07:52:45 2011 (r225433)
+++ user/gabor/tre-integration/include/fastmatch.h Wed Sep 7 12:53:18 2011 (r225434)
@@ -12,10 +12,12 @@ typedef struct {
size_t wlen;
size_t len;
wchar_t *wpattern;
+ bool *wescmap;
int hasdot;
int qsBc[UCHAR_MAX + 1];
int *bmGs;
char *pattern;
+ bool *escmap;
int defBc;
void *qsBc_table;
int *sbmGs;
Modified: user/gabor/tre-integration/include/regex.h
==============================================================================
--- user/gabor/tre-integration/include/regex.h Wed Sep 7 07:52:45 2011 (r225433)
+++ user/gabor/tre-integration/include/regex.h Wed Sep 7 12:53:18 2011 (r225434)
@@ -110,7 +110,6 @@ typedef enum {
#define REG_PEND (REG_UNGREEDY << 1)
#define REG_GNU (REG_PEND << 1)
#define REG_WORD (REG_GNU << 1)
-#define _REG_HEUR (REG_WORD << 1)
/* POSIX tre_regexec() flags. */
#define REG_NOTBOL 1
More information about the svn-src-user
mailing list