svn commit: r224980 - in user/gabor/grep/trunk: . regex
Gabor Kovesdan
gabor at FreeBSD.org
Thu Aug 18 17:09:10 UTC 2011
Author: gabor
Date: Thu Aug 18 17:09:10 2011
New Revision: 224980
URL: http://svn.freebsd.org/changeset/base/224980
Log:
- Backport the fast matching algorithm from TRE that has been written
during this summer as a part of GSoC 2011. This only improves the
performance slightly but expected to be much better tested and
much more robust.
Added:
user/gabor/grep/trunk/regex/
user/gabor/grep/trunk/regex/fastmatch.c (contents, props changed)
user/gabor/grep/trunk/regex/fastmatch.h (contents, props changed)
user/gabor/grep/trunk/regex/glue.h (contents, props changed)
user/gabor/grep/trunk/regex/hashtable.c (contents, props changed)
user/gabor/grep/trunk/regex/hashtable.h (contents, props changed)
user/gabor/grep/trunk/regex/tre-fastmatch.c (contents, props changed)
user/gabor/grep/trunk/regex/tre-fastmatch.h (contents, props changed)
user/gabor/grep/trunk/regex/xmalloc.c (contents, props changed)
user/gabor/grep/trunk/regex/xmalloc.h (contents, props changed)
Deleted:
user/gabor/grep/trunk/fastgrep.c
Modified:
user/gabor/grep/trunk/Makefile
user/gabor/grep/trunk/grep.c
user/gabor/grep/trunk/grep.h
user/gabor/grep/trunk/util.c
Modified: user/gabor/grep/trunk/Makefile
==============================================================================
--- user/gabor/grep/trunk/Makefile Thu Aug 18 16:57:51 2011 (r224979)
+++ user/gabor/grep/trunk/Makefile Thu Aug 18 17:09:10 2011 (r224980)
@@ -9,7 +9,12 @@ PROG= grep
.else
PROG= bsdgrep
.endif
-SRCS= fastgrep.c file.c grep.c queue.c util.c
+SRCS= file.c grep.c queue.c util.c
+
+# Extra files ported backported form some regex improvements
+.PATH: ${.CURDIR}/regex
+SRCS+= fastmatch.c hashtable.c tre-fastmatch.c xmalloc.c
+CFLAGS+=-I${.CURDIR}/regex
.if ${MK_BSD_GREP} == "yes"
LINKS= ${BINDIR}/grep ${BINDIR}/egrep \
Modified: user/gabor/grep/trunk/grep.c
==============================================================================
--- user/gabor/grep/trunk/grep.c Thu Aug 18 16:57:51 2011 (r224979)
+++ user/gabor/grep/trunk/grep.c Thu Aug 18 17:09:10 2011 (r224980)
@@ -48,6 +48,7 @@ __FBSDID("$FreeBSD$");
#include <string.h>
#include <unistd.h>
+#include "fastmatch.h"
#include "grep.h"
#ifndef WITHOUT_NLS
@@ -83,7 +84,7 @@ bool matchall;
unsigned int patterns, pattern_sz;
char **pattern;
regex_t *r_pattern;
-fastgrep_t *fg_pattern;
+fastmatch_t *fg_pattern;
/* Filename exclusion/inclusion patterns */
unsigned int fpatterns, fpattern_sz;
@@ -662,10 +663,10 @@ main(int argc, char *argv[])
/* Check if cheating is allowed (always is for fgrep). */
if (grepbehave == GREP_FIXED) {
for (i = 0; i < patterns; ++i)
- fgrepcomp(&fg_pattern[i], pattern[i]);
+ fixcomp(&fg_pattern[i], pattern[i], cflags);
} else {
for (i = 0; i < patterns; ++i) {
- if (fastcomp(&fg_pattern[i], pattern[i])) {
+ if (fastcomp(&fg_pattern[i], pattern[i], cflags) != 0) {
/* Fall back to full regex library */
c = regcomp(&r_pattern[i], pattern[i], cflags);
if (c != 0) {
Modified: user/gabor/grep/trunk/grep.h
==============================================================================
--- user/gabor/grep/trunk/grep.h Thu Aug 18 16:57:51 2011 (r224979)
+++ user/gabor/grep/trunk/grep.h Thu Aug 18 17:09:10 2011 (r224980)
@@ -36,6 +36,8 @@
#include <stdio.h>
#include <zlib.h>
+#include "fastmatch.h"
+
#ifdef WITHOUT_NLS
#define getstr(n) errstr[n]
#else
@@ -95,17 +97,6 @@ struct epat {
int mode;
};
-typedef struct {
- size_t len;
- unsigned char *pattern;
- int qsBc[UCHAR_MAX + 1];
- /* flags */
- bool bol;
- bool eol;
- bool reversed;
- bool word;
-} fastgrep_t;
-
/* Flags passed to regcomp() and regexec() */
extern int cflags, eflags;
@@ -125,7 +116,7 @@ extern unsigned int dpatterns, fpatterns
extern char **pattern;
extern struct epat *dpattern, *fpattern;
extern regex_t *er_pattern, *r_pattern;
-extern fastgrep_t *fg_pattern;
+extern fastmatch_t *fg_pattern;
/* For regex errors */
#define RE_ERROR_BUF 512
@@ -150,8 +141,3 @@ void clearqueue(void);
void grep_close(struct file *f);
struct file *grep_open(const char *path);
char *grep_fgetln(struct file *f, size_t *len);
-
-/* fastgrep.c */
-int fastcomp(fastgrep_t *, const char *);
-void fgrepcomp(fastgrep_t *, const char *);
-int grep_search(fastgrep_t *, const unsigned char *, size_t, regmatch_t *);
Added: user/gabor/grep/trunk/regex/fastmatch.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/fastmatch.c Thu Aug 18 17:09:10 2011 (r224980)
@@ -0,0 +1,227 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (C) 2011 Gabor Kovesdan <gabor at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "glue.h"
+
+#include <fastmatch.h>
+#include <regex.h>
+#include <string.h>
+
+#include "tre-fastmatch.h"
+#include "xmalloc.h"
+
+/* XXX: avoid duplication */
+#define CONV_PAT \
+ int ret; \
+ tre_char_t *wregex; \
+ size_t wlen; \
+ \
+ wregex = xmalloc(sizeof(tre_char_t) * (n + 1)); \
+ if (wregex == NULL) \
+ return REG_ESPACE; \
+ \
+ if (TRE_MB_CUR_MAX == 1) \
+ { \
+ unsigned int i; \
+ const unsigned char *str = (const unsigned char *)regex; \
+ tre_char_t *wstr = wregex; \
+ \
+ for (i = 0; i < n; i++) \
+ *(wstr++) = *(str++); \
+ wlen = n; \
+ } \
+ else \
+ { \
+ int consumed; \
+ tre_char_t *wcptr = wregex; \
+ mbstate_t state; \
+ memset(&state, '\0', sizeof(state)); \
+ while (n > 0) \
+ { \
+ consumed = tre_mbrtowc(wcptr, regex, n, &state); \
+ \
+ switch (consumed) \
+ { \
+ case 0: \
+ if (*regex == '\0') \
+ consumed = 1; \
+ else \
+ { \
+ xfree(wregex); \
+ return REG_BADPAT; \
+ } \
+ break; \
+ case -1: \
+ DPRINT(("mbrtowc: error %d: %s.\n", errno, \
+ strerror(errno))); \
+ xfree(wregex); \
+ return REG_BADPAT; \
+ case -2: \
+ consumed = n; \
+ break; \
+ } \
+ regex += consumed; \
+ n -= consumed; \
+ wcptr++; \
+ } \
+ wlen = wcptr - wregex; \
+ } \
+ \
+ wregex[wlen] = L'\0';
+
+int
+tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
+{
+ CONV_PAT;
+
+ ret = tre_compile_literal(preg, wregex, wlen, cflags);
+ xfree(wregex);
+
+ return ret;
+}
+
+int
+tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
+{
+ CONV_PAT;
+
+ ret = (cflags & REG_LITERAL) ?
+ tre_compile_literal(preg, wregex, wlen, cflags) :
+ tre_compile_fast(preg, wregex, wlen, cflags);
+ xfree(wregex);
+
+ return ret;
+}
+
+
+int
+tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags)
+{
+ return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+}
+
+int
+tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags)
+{
+ return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+}
+
+int
+tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags)
+{
+ return tre_compile_literal(preg, regex, n, cflags);
+}
+
+int
+tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t n, int cflags)
+{
+ return (cflags & REG_LITERAL) ?
+ tre_compile_literal(preg, regex, n, cflags) :
+ tre_compile_fast(preg, regex, n, cflags);
+}
+
+int
+tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags)
+{
+ return tre_fixwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags);
+}
+
+int
+tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags)
+{
+ return tre_fastwncomp(preg, regex, regex ? tre_strlen(regex) : 0, cflags);
+}
+
+void
+tre_fastfree(fastmatch_t *preg)
+{
+ tre_free_fast(preg);
+}
+
+/* XXX: avoid duplication */
+#define ADJUST_OFFSETS \
+ { \
+ size_t slen = (size_t)(pmatch[0].rm_eo - pmatch[0].rm_so); \
+ size_t offset = pmatch[0].rm_so; \
+ int ret; \
+ \
+ if ((len != (unsigned)-1) && (pmatch[0].rm_eo > len)) \
+ return REG_NOMATCH; \
+ if ((long long)pmatch[0].rm_eo - pmatch[0].rm_so < 0) \
+ return REG_NOMATCH; \
+ ret = tre_match_fast(preg, &string[offset], slen, type, nmatch, \
+ pmatch, eflags); \
+ for (unsigned i = 0; (i == 0) || (!(eflags & REG_NOSUB) && \
+ (i < nmatch)); i++) \
+ { \
+ pmatch[i].rm_so += offset; \
+ pmatch[i].rm_eo += offset; \
+ } \
+ return ret; \
+ }
+
+int
+tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len,
+ size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+ tre_str_type_t type = (TRE_MB_CUR_MAX == 1) ? STR_BYTE : STR_MBS;
+
+ if (eflags & REG_STARTEND)
+ ADJUST_OFFSETS
+ else
+ return tre_match_fast(preg, string, len, type, nmatch,
+ pmatch, eflags);
+}
+
+int
+tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch,
+ regmatch_t pmatch[], int eflags)
+{
+ return tre_fastnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags);
+}
+
+int
+tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len,
+ size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+ tre_str_type_t type = STR_WIDE;
+
+ if (eflags & REG_STARTEND)
+ ADJUST_OFFSETS
+ else
+ return tre_match_fast(preg, string, len, type, nmatch,
+ pmatch, eflags);
+}
+
+int
+tre_fastwexec(const fastmatch_t *preg, const wchar_t *string,
+ size_t nmatch, regmatch_t pmatch[], int eflags)
+{
+ return tre_fastwnexec(preg, string, (size_t)-1, nmatch, pmatch, eflags);
+}
+
Added: user/gabor/grep/trunk/regex/fastmatch.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/fastmatch.h Thu Aug 18 17:09:10 2011 (r224980)
@@ -0,0 +1,103 @@
+/* $FreeBSD$ */
+
+#ifndef FASTMATCH_H
+#define FASTMATCH_H 1
+
+#include <hashtable.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <wchar.h>
+
+typedef struct {
+ size_t wlen;
+ size_t len;
+ wchar_t *wpattern;
+ int hasdot;
+ int qsBc[UCHAR_MAX + 1];
+ int *bmGs;
+ char *pattern;
+ int defBc;
+ hashtable *qsBc_table;
+ int *sbmGs;
+
+ /* flags */
+ bool bol;
+ bool eol;
+ bool word;
+ bool icase;
+ bool newline;
+} fastmatch_t;
+
+extern int
+tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags);
+
+extern int
+tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags);
+
+extern int
+tre_fastexec(const fastmatch_t *preg, const char *string, size_t nmatch,
+ regmatch_t pmatch[], int eflags);
+
+extern void
+tre_fastfree(fastmatch_t *preg);
+
+extern int
+tre_fixwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags);
+
+extern int
+tre_fastwcomp(fastmatch_t *preg, const wchar_t *regex, int cflags);
+
+extern int
+tre_fastwexec(const fastmatch_t *preg, const wchar_t *string,
+ size_t nmatch, regmatch_t pmatch[], int eflags);
+
+/* Versions with a maximum length argument and therefore the capability to
+ handle null characters in the middle of the strings. */
+extern int
+tre_fixncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags);
+
+extern int
+tre_fastncomp(fastmatch_t *preg, const char *regex, size_t len, int cflags);
+
+extern int
+tre_fastnexec(const fastmatch_t *preg, const char *string, size_t len,
+ size_t nmatch, regmatch_t pmatch[], int eflags);
+
+extern int
+tre_fixwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags);
+
+extern int
+tre_fastwncomp(fastmatch_t *preg, const wchar_t *regex, size_t len, int cflags);
+
+extern int
+tre_fastwnexec(const fastmatch_t *preg, const wchar_t *string, size_t len,
+ size_t nmatch, regmatch_t pmatch[], int eflags);
+
+#define fixncomp tre_fixncomp
+#define fastncomp tre_fastncomp
+#define fixcomp tre_fixcomp
+#define fastcomp tre_fastcomp
+#define fixwncomp tre_fixwncomp
+#define fastwncomp tre_fastwncomp
+#define fixwcomp tre_fixwcomp
+#define fastwcomp tre_fastwcomp
+#define fastfree tre_fastfree
+#define fastnexec tre_fastnexec
+#define fastexec tre_fastexec
+#define fastwnexec tre_fastwnexec
+#define fastwexec tre_fastwexec
+#define fixcomp tre_fixcomp
+#define fastcomp tre_fastcomp
+#define fastexec tre_fastexec
+#define fastfree tre_fastfree
+#define fixwcomp tre_fixwcomp
+#define fastwcomp tre_fastwcomp
+#define fastwexec tre_fastwexec
+#define fixncomp tre_fixncomp
+#define fastncomp tre_fastncomp
+#define fastnexec tre_fastnexec
+#define fixwncomp tre_fixwncomp
+#define fastwncomp tre_fastwncomp
+#define fastwnexec tre_fastwnexec
+#endif /* FASTMATCH_H */
Added: user/gabor/grep/trunk/regex/glue.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/glue.h Thu Aug 18 17:09:10 2011 (r224980)
@@ -0,0 +1,30 @@
+/* $FreeBSD$ */
+
+#ifndef GLUE_H
+#define GLUE_H
+
+#include <regex.h>
+
+#define TRE_WCHAR 1
+#define TRE_MULTIBYTE 1
+
+#define tre_char_t wchar_t
+#define tre_mbrtowc(pwc, s, n, ps) (mbrtowc((pwc), (s), (n), (ps)))
+#define tre_strlen wcslen
+#define tre_isspace iswspace
+#define tre_isalnum iswalnum
+
+#define REG_LITERAL 0020
+#define REG_WORD 0100
+#define REG_GNU 0400
+
+#define REG_OK 0
+
+#define TRE_MB_CUR_MAX MB_CUR_MAX
+
+#define DPRINT(msg)
+#define MIN(a,b) ((a > b) ? (b) : (a))
+#define MAX(a,b) ((a > b) ? (a) : (b))
+
+typedef enum { STR_WIDE, STR_BYTE, STR_MBS, STR_USER } tre_str_type_t;
+#endif
Added: user/gabor/grep/trunk/regex/hashtable.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/hashtable.c Thu Aug 18 17:09:10 2011 (r224980)
@@ -0,0 +1,159 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (C) 2011 Gabor Kovesdan <gabor at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/hash.h>
+#include <hashtable.h>
+#include <stdlib.h>
+#include <string.h>
+
+hashtable
+*hashtable_init(size_t table_size, size_t key_size, size_t value_size)
+{
+ hashtable *tbl;
+
+ tbl = malloc(sizeof(hashtable));
+ if (tbl == NULL)
+ return (NULL);
+
+ tbl->entries = calloc(sizeof(hashtable_entry *), table_size);
+ if (tbl->entries == NULL) {
+ free(tbl);
+ return (NULL);
+ }
+
+ tbl->table_size = table_size;
+ tbl->usage = 0;
+ tbl->key_size = key_size;
+ tbl->value_size = value_size;
+
+ return (tbl);
+}
+
+int
+hashtable_put(hashtable *tbl, const void *key, const void *value)
+{
+ uint32_t hash = 0;
+
+ if (tbl->table_size == tbl->usage)
+ return (-1);
+
+ hash = hash32_buf(key, tbl->key_size, hash);
+ hash %= tbl->table_size;
+
+ while (tbl->entries[hash] != NULL)
+ hash = (hash >= tbl->table_size) ? 0 : hash + 1;
+
+ tbl->entries[hash] = malloc(sizeof(hashtable_entry));
+ if (tbl->entries[hash] == NULL)
+ return (-1);
+
+ tbl->entries[hash]->key = malloc(tbl->key_size);
+ if (tbl->entries[hash]->key == NULL) {
+ free(tbl->entries[hash]);
+ return (-1);
+ }
+
+ tbl->entries[hash]->value = malloc(tbl->value_size);
+ if (tbl->entries[hash]->value == NULL) {
+ free(tbl->entries[hash]->key);
+ free(tbl->entries[hash]);
+ return (-1);
+ }
+
+ memcpy(&tbl->entries[hash]->key, key, tbl->key_size);
+ memcpy(&tbl->entries[hash]->value, value, tbl->value_size);
+ tbl->usage++;
+
+ return (0);
+}
+
+static hashtable_entry
+*hashtable_lookup(const hashtable *tbl, const void *key)
+{
+ uint32_t hash = 0;
+
+ hash = hash32_buf(key, tbl->key_size, hash);
+ hash %= tbl->table_size;
+
+ for (;;) {
+ if (tbl->entries[hash] == NULL)
+ return (NULL);
+ else if (memcmp(key, &tbl->entries[hash]->key,
+ tbl->key_size) == 0)
+ return (tbl->entries[hash]);
+
+ hash = (hash == tbl->table_size) ? 0 : hash + 1;
+ }
+}
+
+int
+hashtable_get(hashtable *tbl, const void *key, void *value)
+{
+ hashtable_entry *entry;
+
+ entry = hashtable_lookup(tbl, key);
+ if (entry == NULL)
+ return (-1);
+
+ memcpy(value, &entry->value, tbl->value_size);
+ return (0);
+}
+
+int
+hashtable_remove(hashtable *tbl, const void *key)
+{
+ hashtable_entry *entry;
+
+ entry = hashtable_lookup(tbl, key);
+ if (entry == NULL)
+ return (-1);
+
+// free(entry->key);
+// free(entry->value);
+ free(entry);
+
+ tbl->usage--;
+
+ return (0);
+}
+
+void
+hashtable_free(hashtable *tbl)
+{
+
+ if (tbl == NULL)
+ return;
+
+ for (unsigned int i = 0; i < tbl->table_size; i++)
+ if (tbl->entries[i] != NULL) {
+// free(tbl->entries[i]->key);
+// free(tbl->entries[i]->value);
+// free(tbl->entries[i]);
+ }
+ free(tbl->entries);
+}
Added: user/gabor/grep/trunk/regex/hashtable.h
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/hashtable.h Thu Aug 18 17:09:10 2011 (r224980)
@@ -0,0 +1,27 @@
+/* $FreeBSD$ */
+
+#ifndef HASHTABLE_H
+#define HASHTABLE_H 1
+
+#include <sys/types.h>
+
+typedef struct {
+ void *key;
+ void *value;
+} hashtable_entry;
+
+typedef struct {
+ size_t key_size;
+ size_t table_size;
+ size_t usage;
+ size_t value_size;
+ hashtable_entry **entries;
+} hashtable;
+
+void hashtable_free(hashtable *);
+int hashtable_get(hashtable *, const void *, void *);
+hashtable *hashtable_init(size_t, size_t, size_t);
+int hashtable_put(hashtable *, const void *, const void *);
+int hashtable_remove(hashtable *, const void *);
+
+#endif /* HASHTABLE.H */
Added: user/gabor/grep/trunk/regex/tre-fastmatch.c
==============================================================================
--- /dev/null 00:00:00 1970 (empty, because file is newly added)
+++ user/gabor/grep/trunk/regex/tre-fastmatch.c Thu Aug 18 17:09:10 2011 (r224980)
@@ -0,0 +1,704 @@
+/* $FreeBSD$ */
+
+/*-
+ * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
+ * Copyright (C) 2008-2011 Gabor Kovesdan <gabor at FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include "glue.h"
+
+#include <ctype.h>
+#include <hashtable.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#ifdef TRE_WCHAR
+#include <wchar.h>
+#include <wctype.h>
+#endif
+
+#include "hashtable.h"
+#include "tre-fastmatch.h"
+#include "xmalloc.h"
+
+static int fastcmp(const void *, const void *, size_t,
+ tre_str_type_t, bool, bool);
+
+/*
+ * We will work with wide characters if they are supported
+ */
+#ifdef TRE_WCHAR
+#define TRE_CHAR(n) L##n
+#else
+#define TRE_CHAR(n) n
+#endif
+
+/*
+ * Skips n characters in the input string and assigns the start
+ * address to startptr. Note: as per IEEE Std 1003.1-2008
+ * matching is based on bit pattern not character representations
+ * so we can handle MB strings as byte sequences just like
+ * SB strings.
+ */
+#define SKIP_CHARS(n) \
+ switch (type) \
+ { \
+ case STR_WIDE: \
+ startptr = str_wide + n; \
+ break; \
+ default: \
+ startptr = str_byte + n; \
+ }
+
+/*
+ * Converts the wide string pattern to SB/MB string and stores
+ * it in fg->pattern. Sets fg->len to the byte length of the
+ * converted string.
+ */
+#define STORE_MBS_PAT \
+ { \
+ size_t siz; \
+ \
+ siz = wcstombs(NULL, fg->wpattern, 0); \
+ if (siz == (size_t)-1) \
+ return REG_BADPAT; \
+ fg->len = siz; \
+ fg->pattern = xmalloc(siz + 1); \
+ if (fg->pattern == NULL) \
+ return REG_ESPACE; \
+ wcstombs(fg->pattern, fg->wpattern, siz); \
+ fg->pattern[siz] = '\0'; \
+ } \
+
+/*
+ * Compares the pattern to the input string at the position
+ * stored in startptr.
+ */
+#define COMPARE \
+ switch (type) \
+ { \
+ case STR_WIDE: \
+ mismatch = fastcmp(fg->wpattern, startptr, fg->wlen, type, \
+ fg->icase, fg->newline); \
+ break; \
+ default: \
+ mismatch = fastcmp(fg->pattern, startptr, fg->len, type, \
+ fg->icase, fg->newline); \
+ } \
+
+#define IS_OUT_OF_BOUNDS \
+ ((type == STR_WIDE) ? ((j + fg->wlen) > len) : ((j + fg->len) > len))
+
+/*
+ * Checks whether the new position after shifting in the input string
+ * is out of the bounds and break out from the loop if so.
+ */
+#define CHECKBOUNDS \
+ if (IS_OUT_OF_BOUNDS) \
+ break; \
+
+/*
+ * Shifts in the input string after a mismatch. The position of the
+ * mismatch is stored in the mismatch variable.
+ */
+#define SHIFT \
+ CHECKBOUNDS; \
+ \
+ { \
+ int bc = 0, gs = 0, ts, r = -1; \
+ \
+ switch (type) \
+ { \
+ case STR_WIDE: \
+ if (!fg->hasdot) \
+ { \
+ if (u != 0 && mismatch == fg->wlen - 1 - shift) \
+ mismatch -= u; \
+ v = fg->wlen - 1 - mismatch; \
+ r = hashtable_get(fg->qsBc_table, \
+ &((tre_char_t *)startptr)[mismatch + 1], &bc); \
+ gs = fg->bmGs[mismatch]; \
+ } \
+ bc = (r == 0) ? bc : fg->defBc; \
+ break; \
+ default: \
+ if (!fg->hasdot) \
+ { \
+ if (u != 0 && mismatch == fg->len - 1 - shift) \
+ mismatch -= u; \
+ v = fg->len - 1 - mismatch; \
+ gs = fg->sbmGs[mismatch]; \
+ } \
+ bc = fg->qsBc[((unsigned char *)startptr)[mismatch + 1]]; \
+ } \
+ if (fg->hasdot) \
+ shift = bc; \
+ else \
+ { \
+ ts = u - v; \
+ shift = MAX(ts, bc); \
+ shift = MAX(shift, gs); \
+ if (shift == gs) \
+ u = MIN((type == STR_WIDE ? fg->wlen : fg->len) - shift, v); \
+ else \
+ { \
+ if (ts < bc) \
+ shift = MAX(shift, u + 1); \
+ u = 0; \
+ } \
+ } \
+ j += shift; \
+ }
+
+/*
+ * Normal Quick Search would require a shift based on the position the
+ * next character after the comparison is within the pattern. With
+ * wildcards, the position of the last dot effects the maximum shift
+ * distance.
+ * The closer to the end the wild card is the slower the search.
+ *
+ * Examples:
+ * Pattern Max shift
+ * ------- ---------
+ * this 5
+ * .his 4
+ * t.is 3
+ * th.s 2
+ * thi. 1
+ */
+
+/*
+ * Fills in the bad character shift array for SB/MB strings.
+ */
+#define FILL_QSBC \
+ for (unsigned int i = 0; i <= UCHAR_MAX; i++) \
+ fg->qsBc[i] = fg->len - fg->hasdot; \
+ for (int i = fg->hasdot + 1; i < fg->len; i++) \
+ { \
+ fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i; \
+ if (fg->icase) \
+ { \
+ char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i]) \
+ : tolower(fg->pattern[i]); \
+ fg->qsBc[(unsigned)c] = fg->len - i; \
+ } \
+ }
+
+/*
+ * Fills in the bad character shifts into a hastable for wide strings.
+ * With wide characters it is not possible any more to use a normal
+ * array because there are too many characters and we could not
+ * provide enough memory. Fortunately, we only have to store distinct
+ * values for so many characters as the number of distinct characters
+ * in the pattern, so we can store them in a hashtable and store a
+ * default shift value for the rest.
+ */
+#define FILL_QSBC_WIDE \
+ /* Adjust the shift based on location of the last dot ('.'). */ \
+ fg->defBc = fg->wlen - fg->hasdot; \
+ \
+ /* Preprocess pattern. */ \
+ fg->qsBc_table = hashtable_init(fg->wlen * 4, sizeof(tre_char_t), \
+ sizeof(int)); \
+ for (unsigned int i = fg->hasdot + 1; i < fg->wlen; i++) \
+ { \
+ int k = fg->wlen - i; \
+ hashtable_put(fg->qsBc_table, &fg->wpattern[i], &k); \
+ if (fg->icase) \
+ { \
+ tre_char_t wc = iswlower(fg->wpattern[i]) ? \
+ towupper(fg->wpattern[i]) : towlower(fg->wpattern[i]); \
+ hashtable_put(fg->qsBc_table, &wc, &k); \
+ } \
+ }
+
+/*
+ * Fills in the good suffix table for SB/MB strings.
+ */
+#define FILL_BMGS \
+ if (!fg->hasdot) \
+ { \
+ fg->sbmGs = xmalloc(fg->len * sizeof(int)); \
+ if (!fg->sbmGs) \
+ return REG_ESPACE; \
+ _FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false); \
+ }
+
+/*
+ * Fills in the good suffix table for wide strings.
+ */
+#define FILL_BMGS_WIDE \
+ if (!fg->hasdot) \
+ { \
+ fg->bmGs = xmalloc(fg->wlen * sizeof(int)); \
+ if (!fg->bmGs) \
+ return REG_ESPACE; \
+ _FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true); \
+ }
+
+#define _FILL_BMGS(arr, pat, plen, wide) \
+ { \
+ char *p; \
+ tre_char_t *wp; \
+ \
+ if (wide) \
+ { \
+ if (fg->icase) \
+ { \
+ wp = xmalloc(plen * sizeof(tre_char_t)); \
+ if (wp == NULL) \
+ return REG_ESPACE; \
+ for (int i = 0; i < plen; i++) \
+ wp[i] = towlower(pat[i]); \
+ _CALC_BMGS(arr, wp, plen); \
+ xfree(wp); \
+ } \
+ else \
+ _CALC_BMGS(arr, pat, plen); \
+ } \
+ else \
+ { \
+ if (fg->icase) \
+ { \
+ p = xmalloc(plen); \
+ if (p == NULL) \
+ return REG_ESPACE; \
+ for (int i = 0; i < plen; i++) \
+ p[i] = tolower(pat[i]); \
+ _CALC_BMGS(arr, p, plen); \
+ xfree(p); \
+ } \
+ else \
+ _CALC_BMGS(arr, pat, plen); \
+ } \
+ }
+
+#define _CALC_BMGS(arr, pat, plen) \
+ { \
+ int f, g; \
+ \
+ int *suff = xmalloc(plen * sizeof(int)); \
+ if (suff == NULL) \
+ return REG_ESPACE; \
+ \
+ suff[plen - 1] = plen; \
+ g = plen - 1; \
+ for (int i = plen - 2; i >= 0; i--) \
+ { \
+ if (i > g && suff[i + plen - 1 - f] < i - g) \
+ suff[i] = suff[i + plen - 1 - f]; \
+ else \
+ { \
+ if (i < g) \
+ g = i; \
+ f = i; \
+ while (g >= 0 && pat[g] == pat[g + plen - 1 - f]) \
+ g--; \
+ suff[i] = f - g; \
+ } \
+ } \
+ \
+ for (int i = 0; i < plen; i++) \
+ arr[i] = plen; \
+ g = 0; \
+ for (int i = plen - 1; i >= 0; i--) \
+ if (suff[i] == i + 1) \
+ for(; g < plen - 1 - i; g++) \
+ if (arr[g] == plen) \
+ arr[g] = plen - 1 - i; \
+ for (int i = 0; i <= plen - 2; i++) \
+ arr[plen - 1 - suff[i]] = plen - 1 - i; \
+ \
+ xfree(suff); \
+ }
+
+/*
+ * Copies the pattern pat having lenght n to p and stores
+ * the size in l.
*** DIFF OUTPUT TRUNCATED AT 1000 LINES ***
More information about the svn-src-user
mailing list