svn commit: r225347 - user/gabor/tre-integration/contrib/tre/lib

Gabor Kovesdan gabor at FreeBSD.org
Fri Sep 2 18:18:24 UTC 2011


Author: gabor
Date: Fri Sep  2 18:18:24 2011
New Revision: 225347
URL: http://svn.freebsd.org/changeset/base/225347

Log:
  - Merge some improvements and fixes from grep
  - Fix a cast [1]
  
  Submitted by:	ache [1]

Modified:
  user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
  user/gabor/tre-integration/contrib/tre/lib/regcomp.c
  user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c

Modified: user/gabor/tre-integration/contrib/tre/lib/fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/fastmatch.c	Fri Sep  2 18:13:46 2011	(r225346)
+++ user/gabor/tre-integration/contrib/tre/lib/fastmatch.c	Fri Sep  2 18:18:24 2011	(r225347)
@@ -1,3 +1,5 @@
+/* $FreeBSD$ */
+
 /*-
  * Copyright (C) 2011 Gabor Kovesdan <gabor at FreeBSD.org>
  * All rights reserved.
@@ -27,7 +29,9 @@
 #ifdef HAVE_CONFIG_H
 #include <config.h>
 #endif /* HAVE_CONFIG_H */
+#include <errno.h>
 #include <fastmatch.h>
+#include <regex.h>
 #include <string.h>
 
 #include "tre-fastmatch.h"
@@ -36,67 +40,72 @@
 
 /* XXX: avoid duplication */
 #define CONV_PAT							\
-  int ret;								\
-  tre_char_t *wregex;							\
-  size_t wlen;								\
-									\
-  wregex = xmalloc(sizeof(tre_char_t) * (n + 1));			\
-  if (wregex == NULL)							\
-    return REG_ESPACE;							\
-									\
-  if (TRE_MB_CUR_MAX == 1)						\
-    {									\
-      unsigned int i;							\
-      const unsigned char *str = (const unsigned char *)regex;		\
-      tre_char_t *wstr = wregex;					\
-									\
-      for (i = 0; i < n; i++)						\
-        *(wstr++) = *(str++);						\
-      wlen = n;								\
-    }									\
-  else									\
-    {									\
-      int consumed;							\
-      tre_char_t *wcptr = wregex;					\
-      mbstate_t state;							\
-      memset(&state, '\0', sizeof(state));				\
-      while (n > 0)							\
-        {								\
-          consumed = tre_mbrtowc(wcptr, regex, n, &state);		\
+  {									\
+    wregex = xmalloc(sizeof(tre_char_t) * (n + 1));			\
+    if (wregex == NULL)							\
+      return REG_ESPACE;						\
 									\
-          switch (consumed)						\
-            {								\
-            case 0:							\
-              if (*regex == '\0')					\
-                consumed = 1;						\
-              else							\
-                {							\
-                  xfree(wregex);					\
-                  return REG_BADPAT;					\
-                }							\
-              break;							\
-            case -1:							\
-              DPRINT(("mbrtowc: error %d: %s.\n", errno,		\
-		strerror(errno)));					\
-              xfree(wregex);						\
-              return REG_BADPAT;					\
-            case -2:							\
-              consumed = n;						\
-              break;							\
-            }								\
-          regex += consumed;						\
-          n -= consumed;						\
-          wcptr++;							\
-        }								\
-      wlen = wcptr - wregex;						\
-    }									\
+    if (TRE_MB_CUR_MAX == 1)						\
+      {									\
+	unsigned int i;							\
+	const unsigned char *str = (const unsigned char *)regex;	\
+	tre_char_t *wstr = wregex;					\
+									\
+	for (i = 0; i < n; i++)						\
+	  *(wstr++) = *(str++);						\
+	wlen = n;							\
+      }									\
+    else								\
+      {									\
+	int consumed;							\
+	tre_char_t *wcptr = wregex;					\
+	mbstate_t state;						\
+	memset(&state, '\0', sizeof(state));				\
+	while (n > 0)							\
+	  {								\
+	    consumed = tre_mbrtowc(wcptr, regex, n, &state);		\
+									\
+	    switch (consumed)						\
+	      {								\
+		case 0:							\
+		  if (*regex == '\0')					\
+		    consumed = 1;					\
+		  else							\
+		    {							\
+		      xfree(wregex);					\
+		      return REG_BADPAT;				\
+		    }							\
+		  break;						\
+		case -1:						\
+		  DPRINT(("mbrtowc: error %d: %s.\n", errno,		\
+		  strerror(errno)));					\
+		  xfree(wregex);					\
+		  return REG_BADPAT;					\
+		case -2:						\
+		  consumed = n;						\
+		  break;						\
+	      }								\
+	    regex += consumed;						\
+	    n -= consumed;						\
+	    wcptr++;							\
+	}								\
+        wlen = wcptr - wregex;						\
+      }									\
 									\
-  wregex[wlen] = L'\0';
+    wregex[wlen] = L'\0';						\
+  }
 
 int
 tre_fixncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
 {
-  CONV_PAT;
+  int ret;
+  tre_char_t *wregex;
+  size_t wlen;
+
+  if (n != 0)
+    CONV_PAT
+  else
+    return tre_compile_literal(preg, NULL, 0, cflags);
 
   ret = tre_compile_literal(preg, wregex, wlen, cflags);
   xfree(wregex);
@@ -107,7 +116,14 @@ tre_fixncomp(fastmatch_t *preg, const ch
 int
 tre_fastncomp(fastmatch_t *preg, const char *regex, size_t n, int cflags)
 {
-  CONV_PAT;
+  int ret;
+  tre_char_t *wregex;
+  size_t wlen;
+
+  if (n != 0)
+    CONV_PAT
+  else
+    return tre_compile_literal(preg, NULL, 0, cflags);
 
   ret = (cflags & REG_LITERAL) ?
     tre_compile_literal(preg, wregex, wlen, cflags) :
@@ -121,34 +137,13 @@ tre_fastncomp(fastmatch_t *preg, const c
 int
 tre_fixcomp(fastmatch_t *preg, const char *regex, int cflags)
 {
-  size_t len;
-
-  if (cflags & REG_PEND)
-    {
-      if (preg->re_endp >= regex)
-	len = preg->re_endp - regex
-      else
-	len = preg ? strlen(regex) : 0;
-     return tre_fixncomp(preg, regex, len, cflags);
-   }
-  else
-    return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+  return tre_fixncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
 }
 
 int
 tre_fastcomp(fastmatch_t *preg, const char *regex, int cflags)
 {
-  size_t len;
-
-  if (cflags & REG_PEND)
-    {
-      len = (preg->re_endp >= regex)
-        ? preg->re_endp - regex
-        : 0;
-     return tre_fastncomp(preg, regex, len ? strlen(regex) : 0, cflags);
-   }
-  else
-    return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
+  return tre_fastncomp(preg, regex, regex ? strlen(regex) : 0, cflags);
 }
 
 int

Modified: user/gabor/tre-integration/contrib/tre/lib/regcomp.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/regcomp.c	Fri Sep  2 18:13:46 2011	(r225346)
+++ user/gabor/tre-integration/contrib/tre/lib/regcomp.c	Fri Sep  2 18:18:24 2011	(r225347)
@@ -117,11 +117,10 @@ tre_regcomp(regex_t *preg, const char *r
   if (cflags & REG_PEND)
     {
       if (preg->re_endp >= regex)
-	len = preg->re_endp - regex
+	len = preg->re_endp - regex;
       else
 	len = regex ? strlen(regex) : 0;
-      )
-     return tre_regncomp(preg, regex, len, cflags);
+      return tre_regncomp(preg, regex, len, cflags);
    }
   else
     return tre_regncomp(preg, regex, regex ? strlen(regex) : 0, cflags);

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c	Fri Sep  2 18:13:46 2011	(r225346)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c	Fri Sep  2 18:18:24 2011	(r225347)
@@ -208,14 +208,14 @@ static int	fastcmp(const void *, const v
     fg->qsBc[i] = fg->len - fg->hasdot;					\
   for (int i = fg->hasdot + 1; i < fg->len; i++)			\
     {									\
-      fg->qsBc[(unsigned)fg->pattern[i]] = fg->len - i;			\
+      fg->qsBc[(unsigned char)fg->pattern[i]] = fg->len - i;		\
       DPRINT(("BC shift for char %c is %d\n", fg->pattern[i],		\
 	     fg->len - i));						\
       if (fg->icase)							\
         {								\
           char c = islower(fg->pattern[i]) ? toupper(fg->pattern[i])	\
             : tolower(fg->pattern[i]);					\
-          fg->qsBc[(unsigned)c] = fg->len - i;				\
+          fg->qsBc[(unsigned char)c] = fg->len - i;			\
 	  DPRINT(("BC shift for char %c is %d\n", c, fg->len - i));	\
         }								\
     }
@@ -370,13 +370,18 @@ static int	fastcmp(const void *, const v
 * Copies the pattern pat having length n to p and stores
  * the size in l.
  */
-#define SAVE_PATTERN(p, l)						\
-  l = (n == 0) ? tre_strlen(pat) : n;					\
-  p = xmalloc((l + 1) * sizeof(tre_char_t));				\
-  if (p == NULL)							\
-    return REG_ESPACE;							\
-  memcpy(p, pat, l * sizeof(tre_char_t));				\
-  p[l] = TRE_CHAR('\0');
+#define SAVE_PATTERN(src, srclen, dst, dstlen)				\
+  dstlen = srclen;							\
+  if (dstlen == 0)							\
+    dst = TRE_CHAR("");							\
+  else									\
+    {									\
+      dst = xmalloc((dstlen + 1) * sizeof(tre_char_t));			\
+      if (dst == NULL)							\
+	return REG_ESPACE;						\
+      memcpy(dst, src, dstlen * sizeof(tre_char_t));			\
+      dst[dstlen] = TRE_CHAR('\0');					\
+    }
 
 /*
  * Initializes pattern compiling.
@@ -392,12 +397,18 @@ static int	fastcmp(const void *, const v
   if (n == 0)								\
     {									\
       fg->matchall = true;						\
+      fg->pattern = "";							\
+      fg->wpattern = TRE_CHAR("");					\
+      DPRINT(("Matching every input\n"));				\
       return REG_OK;							\
-    }
+    }									\
 									\
   /* Cannot handle REG_ICASE with MB string */				\
   if (fg->icase && (TRE_MB_CUR_MAX > 1))				\
-    return REG_BADPAT;							\
+    {									\
+      DPRINT(("Cannot use fast matcher for MBS with REG_ICASE\n"));	\
+      return REG_BADPAT;						\
+    }
 
 /*
  * Returns: REG_OK on success, error code otherwise
@@ -413,14 +424,14 @@ tre_compile_literal(fastmatch_t *fg, con
     return REG_BADPAT;
 
 #ifdef TRE_WCHAR
-  SAVE_PATTERN(fg->wpattern, fg->wlen);
+  SAVE_PATTERN(pat, n, fg->wpattern, fg->wlen);
   STORE_MBS_PAT;
 #else
-  SAVE_PATTERN(fg->pattern, fg->len);
+  SAVE_PATTERN(pat, n, fg->pattern, fg->len);
 #endif
 
-  DPRINT(("tre_compile_literal: pattern: %s, icase: %c, word: %c, "
-	 "newline %c\n", fg->pattern, fg->icase ? 'y' : 'n',
+  DPRINT(("tre_compile_literal: pattern: %s, len %u, icase: %c, word: %c, "
+	 "newline %c\n", fg->pattern, fg->len, fg->icase ? 'y' : 'n',
 	 fg->word ? 'y' : 'n', fg->newline ? 'y' : 'n'));
 
   FILL_QSBC;
@@ -440,14 +451,11 @@ int
 tre_compile_fast(fastmatch_t *fg, const tre_char_t *pat, size_t n,
 		 int cflags)
 {
-  INIT_COMP;
+  tre_char_t *tmp;
+  size_t pos = 0;
+  bool escaped = false;
 
-  /* Remove end-of-line character ('$'). */
-  if ((n > 0) && (pat[n - 1] == TRE_CHAR('$')))
-    {
-      fg->eol = true;
-      n--;
-    }
+  INIT_COMP;
 
   /* Remove beginning-of-line character ('^'). */
   if (pat[0] == TRE_CHAR('^'))
@@ -472,36 +480,140 @@ tre_compile_fast(fastmatch_t *fg, const 
   if (fg->word && (TRE_MB_CUR_MAX > 1))
     return REG_BADPAT;
 
-  /* Look for ways to cheat...er...avoid the full regex engine. */
-  for (unsigned int i = 0; i < n; i++)
+  tmp = xmalloc((n + 1) * sizeof(tre_char_t));
+  if (tmp == NULL)
+    return REG_ESPACE;
+
+#define STORE_CHAR							\
+  do									\
+    {									\
+      tmp[pos++] = pat[i];						\
+      escaped = false;							\
+      continue;								\
+    } while (0)
+
+  /*
+   * When used for the heuristic, only a leading ^, a trailing $ and .
+   * are treated as special.
+   */
+  if (cflags & _REG_HEUR)
     {
-      /* Can still cheat? */
-      if (!(cflags & _REG_HEUR) &&
-	  ((tre_isalnum(pat[i])) || tre_isspace(pat[i]) ||
-	  (pat[i] == TRE_CHAR('_')) || (pat[i] == TRE_CHAR(',')) ||
-	  (pat[i] == TRE_CHAR('=')) || (pat[i] == TRE_CHAR('-')) ||
-	  (pat[i] == TRE_CHAR(':')) || (pat[i] == TRE_CHAR('/'))))
+      for (int i = 0; i < n; i++)
+	switch (pat[i])
+	  {
+	    case TRE_CHAR('.'):
+	      fg->hasdot = i;
+	      STORE_CHAR;
+	      break;
+	    case TRE_CHAR('$'):
+	      if (i == n - 1)
+		fg->eol = true;
+	      else
+		STORE_CHAR;
+	      break;
+	    default:
+	      STORE_CHAR;
+	  }
+    }
+  else
+    for (int i = 0; i < n; i++)
+      {
+	switch (pat[i])
+	  {
+	    case TRE_CHAR('\\'):
+	      if (escaped)
+		STORE_CHAR;
+	      else
+		escaped = true;
+	      break;
+	    case TRE_CHAR('['):
+	      if (escaped)
+		STORE_CHAR;
+	      else
+		goto badpat;
+	      break;
+	    case TRE_CHAR('*'):
+	      if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
+		STORE_CHAR;
+	      else
+		goto badpat;
+	      break;
+	    case TRE_CHAR('+'):
+	    case TRE_CHAR('?'):
+	      if ((cflags & REG_EXTENDED) && (i == 0))
+		continue;
+	      else if ((cflags & REG_EXTENDED) ^ !escaped)
+		STORE_CHAR;
+	      else
+		goto badpat;
+	    case TRE_CHAR('.'):
+	      if (escaped)
+		goto badpat;
+	      else
+		{
+		  fg->hasdot = true;
+		  STORE_CHAR;
+		}
+	      break;
+	    case TRE_CHAR('^'):
+	      STORE_CHAR;
+	      break;
+	    case TRE_CHAR('$'):
+	      if (!escaped && (i == n - 1))
+		fg->eol = true;
+	      else
+		STORE_CHAR;
+	      break;
+	    case TRE_CHAR('('):
+	      if ((cflags & REG_EXTENDED) ^ escaped)
+		goto badpat;
+	      else
+		STORE_CHAR;
+	      break;
+	    case TRE_CHAR('{'):
+	      if (escaped && (i == 0))
+		STORE_CHAR;
+	      else if (!(cflags & REG_EXTENDED) && (i == 0))
+		STORE_CHAR;
+	      else if ((cflags & REG_EXTENDED) && (i == 0))
+		continue;
+	      else
+		goto badpat;
+	      break;
+	    case TRE_CHAR('|'):
+	      if ((cflags & REG_EXTENDED) ^ (!escaped))
+		goto badpat;
+	      else
+		STORE_CHAR;
+	      break;
+	    default:
+	      if (escaped)
+		goto badpat;
+	      else
+		STORE_CHAR;
+	  }
 	continue;
-      else if (pat[i] == TRE_CHAR('.'))
-	fg->hasdot = i;
-      else
+badpat:
+	xfree(tmp);
 	return REG_BADPAT;
-  }
+      }
 
   /*
-   * pat has been adjusted earlier to not include '^', '$' or
-   * the word match character classes at the beginning and ending
-   * of the string respectively.
+   * The pattern has been processed and copied to tmp as a literal string
+   * with escapes, anchors (^$) and the word boundary match character
+   * classes stripped out.
    */
 #ifdef TRE_WCHAR
-  SAVE_PATTERN(fg->wpattern, fg->wlen);
+  SAVE_PATTERN(tmp, pos, fg->wpattern, fg->wlen);
   STORE_MBS_PAT;
 #else
-  SAVE_PATTERN(fg->pattern, fg->len);
+  SAVE_PATTERN(tmp, pos, fg->pattern, fg->len);
 #endif
 
-  DPRINT(("tre_compile_fast: pattern: %s, bol %c, eol %c, "
-	 "icase: %c, word: %c, newline %c\n", fg->pattern,
+  xfree(tmp);
+
+  DPRINT(("tre_compile_fast: pattern: %s, len %u, bol %c, eol %c, "
+	 "icase: %c, word: %c, newline %c\n", fg->pattern, fg->len,
 	 fg->bol ? 'y' : 'n', fg->eol ? 'y' : 'n',
 	 fg->icase ? 'y' : 'n', fg->word ? 'y' : 'n',
 	 fg->newline ? 'y' : 'n'));
@@ -593,7 +705,7 @@ tre_match_fast(const fastmatch_t *fg, co
   const tre_char_t *str_wide = data;
 
   /* Calculate length if unspecified. */
-  if (len == (unsigned)-1)
+  if (len == (size_t)-1)
     switch (type)
       {
 	case STR_WIDE:


More information about the svn-src-user mailing list