svn commit: r225434 - in user/gabor/tre-integration: contrib/tre/lib include

Gabor Kovesdan gabor at FreeBSD.org
Wed Sep 7 12:53:19 UTC 2011


Author: gabor
Date: Wed Sep  7 12:53:18 2011
New Revision: 225434
URL: http://svn.freebsd.org/changeset/base/225434

Log:
  - Make the heuristic code loosely coupled to the fast matcher by providing
    properly escaped patterns instead of using an internal flag
  - Add some struct fields for escaped dots, forgotten in a previous commit

Modified:
  user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c
  user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
  user/gabor/tre-integration/include/fastmatch.h
  user/gabor/tre-integration/include/regex.h

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c	Wed Sep  7 07:52:45 2011	(r225433)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-fastmatch.c	Wed Sep  7 12:53:18 2011	(r225434)
@@ -263,6 +263,15 @@ static int	fastcmp(const void *, const b
 	}								\
     }
 
+#ifdef TRE_DEBUG
+#define DPRINT_BMGS(len, fmt_str, sh)					\
+  for (int i = 0; i < len; i++)						\
+    DPRINT((fmt_str, i, sh[i]));
+#else
+#define DPRINT_BMGS(len, fmt_str, sh)					\
+  do { } while(/*CONSTCOND*/0)
+#endif
+
 /*
  * Fills in the good suffix table for SB/MB strings.
  */
@@ -276,6 +285,7 @@ static int	fastcmp(const void *, const b
 	fg->sbmGs[0] = 1;						\
       else								\
 	_FILL_BMGS(fg->sbmGs, fg->pattern, fg->len, false);		\
+      DPRINT_BMGS(fg->len, "GS shift for pos %d is %d\n", fg->sbmGs);	\
     }
 
 /*
@@ -291,6 +301,8 @@ static int	fastcmp(const void *, const b
 	fg->bmGs[0] = 1;						\
       else								\
 	_FILL_BMGS(fg->bmGs, fg->wpattern, fg->wlen, true);		\
+      DPRINT_BMGS(fg->wlen, "GS shift (wide) for pos %d is %d\n",	\
+		  fg->bmGs);						\
     }
 
 #define _FILL_BMGS(arr, pat, plen, wide)				\
@@ -496,121 +508,99 @@ tre_compile_fast(fastmatch_t *fg, const 
       continue;								\
     } while (0)
 
-  /*
-   * Used for heuristic, only beginning ^, trailing $ and . are treated
-   * as special.
-   */
-  if (cflags & _REG_HEUR)
+  for (int i = 0; i < n; i++)
     {
-      for (int i = 0; i < n; i++)
-	switch (pat[i])
-	  {
-	    case TRE_CHAR('.'):
-	      fg->hasdot = i;
+      switch (pat[i])
+	{
+	  case TRE_CHAR('\\'):
+	    if (escaped)
 	      STORE_CHAR;
-	      break;
-	    case TRE_CHAR('$'):
-	      if (i == n - 1)
-		fg->eol = true;
-	      else
-		STORE_CHAR;
-	      break;
-	    default:
+	    else
+	      escaped = true;
+	    continue;
+	  case TRE_CHAR('['):
+	    if (escaped)
 	      STORE_CHAR;
-	  }
-    }
-  else
-    for (int i = 0; i < n; i++)
-      {
-	switch (pat[i])
-	  {
-	    case TRE_CHAR('\\'):
-	      if (escaped)
-		STORE_CHAR;
-	      else
-		escaped = true;
-	      break;
-	    case TRE_CHAR('['):
-	      if (escaped)
-		STORE_CHAR;
-	      else
-		goto badpat;
-	      break;
-	    case TRE_CHAR('*'):
-	      if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
-		STORE_CHAR;
-	      else
-		goto badpat;
-	      break;
-	    case TRE_CHAR('+'):
-	    case TRE_CHAR('?'):
-	      if ((cflags & REG_EXTENDED) && (i == 0))
-		continue;
-	      else if ((cflags & REG_EXTENDED) ^ !escaped)
-		STORE_CHAR;
-	      else
-		goto badpat;
-	    case TRE_CHAR('.'):
-	      if (escaped)
-		{
-		  if (!_escmap)
-		    _escmap = xmalloc(n * sizeof(bool));
-		  if (!_escmap)
-		    {
-		      xfree(tmp);
-		      return REG_ESPACE;
-		    }
-		  _escmap[i] = true;
-		  STORE_CHAR;
-		}
-	      else
-		{
-		  fg->hasdot = i;
-		  STORE_CHAR;
-		}
-	      break;
-	    case TRE_CHAR('^'):
+	    else
+	      goto badpat;
+	    continue;
+	  case TRE_CHAR('*'):
+	    if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
 	      STORE_CHAR;
-	      break;
-	    case TRE_CHAR('$'):
-	      if (!escaped && (i == n - 1))
-		fg->eol = true;
-	      else
-		STORE_CHAR;
-	      break;
-	    case TRE_CHAR('('):
-	      if ((cflags & REG_EXTENDED) ^ escaped)
-		goto badpat;
-	      else
-		STORE_CHAR;
-	      break;
-	    case TRE_CHAR('{'):
-	      if (escaped && (i == 0))
-		STORE_CHAR;
-	      else if (!(cflags & REG_EXTENDED) && (i == 0))
-		STORE_CHAR;
-	      else if ((cflags & REG_EXTENDED) && (i == 0))
-		continue;
-	      else
-		goto badpat;
-	      break;
-	    case TRE_CHAR('|'):
-	      if ((cflags & REG_EXTENDED) ^ (!escaped))
-		goto badpat;
-	      else
-		STORE_CHAR;
-	      break;
-	    default:
-	      if (escaped)
-		goto badpat;
-	      else
+	    else
+	      goto badpat;
+	    continue;
+	  case TRE_CHAR('+'):
+	  case TRE_CHAR('?'):
+	    if ((cflags & REG_EXTENDED) && (i == 0))
+	      continue;
+	    else if ((cflags & REG_EXTENDED) ^ !escaped)
+	      STORE_CHAR;
+	    else
+	      goto badpat;
+	    continue;
+	  case TRE_CHAR('.'):
+	    if (escaped)
+	      {
+		if (!_escmap)
+		  _escmap = xmalloc(n * sizeof(bool));
+		if (!_escmap)
+		  {
+		    xfree(tmp);
+		    return REG_ESPACE;
+		  }
+		_escmap[i] = true;
+		STORE_CHAR;
+	      }
+	    else
+	      {
+		fg->hasdot = i;
 		STORE_CHAR;
-	  }
-	continue;
+	      }
+	    continue;
+	  case TRE_CHAR('^'):
+	    STORE_CHAR;
+	    continue;
+	  case TRE_CHAR('$'):
+	    if (!escaped && (i == n - 1))
+	      fg->eol = true;
+	    else
+	      STORE_CHAR;
+	    continue;
+	  case TRE_CHAR('('):
+	    if ((cflags & REG_EXTENDED) ^ escaped)
+	      goto badpat;
+	    else
+	      STORE_CHAR;
+	    continue;
+	  case TRE_CHAR('{'):
+	    if (!(cflags & REG_EXTENDED) ^ escaped)
+	      STORE_CHAR;
+	    else if (!(cflags & REG_EXTENDED) && (i == 0))
+	      STORE_CHAR;
+	    else if ((cflags & REG_EXTENDED) && (i == 0))
+	      continue;
+	    else
+	      goto badpat;
+	    continue;
+	  case TRE_CHAR('|'):
+	    if ((cflags & REG_EXTENDED) ^ escaped)
+	      goto badpat;
+	    else
+	      STORE_CHAR;
+	    continue;
+	  default:
+	    if (escaped)
+	      goto badpat;
+	    else
+	      STORE_CHAR;
+	    continue;
+	}
+      continue;
 badpat:
-	xfree(tmp);
-	return REG_BADPAT;
-      }
+      xfree(tmp);
+      return REG_BADPAT;
+    }
 
   /*
    * The pattern has been processed and copied to tmp as a literal string

Modified: user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c
==============================================================================
--- user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Wed Sep  7 07:52:45 2011	(r225433)
+++ user/gabor/tre-integration/contrib/tre/lib/tre-heuristic.c	Wed Sep  7 12:53:18 2011	(r225434)
@@ -127,11 +127,14 @@
       goto end_segment;							\
     } while (0)
 
-#define STORE_CHAR							\
+#define STORE_CHAR(esc)							\
   do									\
     {									\
-      escaped = false;							\
+      if (esc)								\
+	heur[pos++] = TRE_CHAR('\\');					\
       heur[pos++] = regex[i];						\
+      escaped = false;							\
+      continue;								\
     } while (0)
 
 
@@ -178,11 +181,13 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       * bracket is escaped.
 	       */
 	      case TRE_CHAR('['):
-		PARSE_BRACKETS;
 		if (escaped)
-		  STORE_CHAR;
+		  STORE_CHAR(true);
 		else
-		  heur[pos++] = TRE_CHAR('.');
+		  {
+		    PARSE_BRACKETS;
+		    heur[pos++] = TRE_CHAR('.');
+		  }
 		continue;
 
 	      /*
@@ -192,9 +197,9 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       */
 	      case TRE_CHAR('{'):
 		if (escaped && (i == 1))
-		  STORE_CHAR;
+		  STORE_CHAR(true);
 		else if ((i == 0) && !(cflags & REG_EXTENDED))
-		  STORE_CHAR;
+		  STORE_CHAR(true);
 		else if ((i == 0) && (cflags & REG_EXTENDED))
 		  continue;
 
@@ -205,7 +210,7 @@ tre_compile_heur(heur_t *h, const tre_ch
 		    END_SEGMENT;
 		  }
 		else
-		  STORE_CHAR;
+		  STORE_CHAR(cflags & REG_EXTENDED);
 		continue;
 
 	      /*
@@ -213,11 +218,13 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       * otherwise treated as a normal character.
 	       */
 	      case TRE_CHAR('('):
-		PARSE_UNIT('(', ')');
 		if (escaped ^ (cflags & REG_EXTENDED))
-		  END_SEGMENT;
+		  {
+		    PARSE_UNIT('(', ')');
+		    END_SEGMENT;
+		  }
 		else
-		  STORE_CHAR;
+		  STORE_CHAR(cflags & REG_EXTENDED);
 		continue;
 
 	      /*
@@ -227,9 +234,9 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       */
 	      case TRE_CHAR('\\'):
 		if (escaped)
-		  STORE_CHAR;
+		  STORE_CHAR(true);
 		else
-		  escaped = !escaped;
+		  escaped = true;
 		continue;
 
 	      /*
@@ -240,7 +247,7 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       */
 	      case TRE_CHAR('*'):
 		if (escaped || (!(cflags & REG_EXTENDED) && (i == 0)))
-		  STORE_CHAR;
+		  STORE_CHAR(true);
 		else if ((i != 0))
 		  {
 		    pos--;
@@ -262,7 +269,7 @@ tre_compile_heur(heur_t *h, const tre_ch
 		else if ((cflags & REG_EXTENDED) ^ escaped)
 		  END_SEGMENT;
 		else 
-		  STORE_CHAR;
+		  STORE_CHAR(cflags & REG_EXTENDED);
 		continue;
 
 	      /*
@@ -281,7 +288,7 @@ tre_compile_heur(heur_t *h, const tre_ch
 		    END_SEGMENT;
 		  }
 		else
-		  STORE_CHAR;
+		  STORE_CHAR(true);
 		continue;
 
 	      /*
@@ -296,7 +303,7 @@ tre_compile_heur(heur_t *h, const tre_ch
 		else if (!(cflags & REG_EXTENDED) && escaped)
 		  END_SEGMENT;
 		else
-		  STORE_CHAR;
+		  STORE_CHAR(cflags & REG_EXTENDED);
 		continue;
 
 	      /*
@@ -304,10 +311,7 @@ tre_compile_heur(heur_t *h, const tre_ch
 	       * cannot handle it.
 	       */
 	      case TRE_CHAR('.'):
-		if (escaped)
-		  END_SEGMENT;
-		else
-		  STORE_CHAR;
+		STORE_CHAR(escaped);
 		continue;
 
 	      /*
@@ -319,7 +323,7 @@ tre_compile_heur(heur_t *h, const tre_ch
 		if (escaped)
 		  END_SEGMENT;
 		else
-		  STORE_CHAR;
+		  STORE_CHAR(false);
 		continue;
 	    }
 	}
@@ -352,7 +356,7 @@ end_segment:
 	      goto space1;
 	    }
 
-	  ret = tre_compile_fast(h->start, heur, pos, _REG_HEUR);
+	  ret = tre_compile_fast(h->start, heur, pos, 0);
 	  if (ret != REG_OK)
 	    {
 	      errcode = REG_BADPAT;
@@ -386,7 +390,7 @@ end_segment:
 	      goto space2;
 	    }
 	    
-	  ret = tre_compile_fast(h->end, heur, pos, _REG_HEUR);
+	  ret = tre_compile_fast(h->end, heur, pos, 0);
 	  if (ret != REG_OK)
 	    {
 	      xfree(h->end);

Modified: user/gabor/tre-integration/include/fastmatch.h
==============================================================================
--- user/gabor/tre-integration/include/fastmatch.h	Wed Sep  7 07:52:45 2011	(r225433)
+++ user/gabor/tre-integration/include/fastmatch.h	Wed Sep  7 12:53:18 2011	(r225434)
@@ -12,10 +12,12 @@ typedef struct {
   size_t	 wlen;
   size_t	 len;
   wchar_t	*wpattern;
+  bool		*wescmap;
   int		 hasdot;
   int		 qsBc[UCHAR_MAX + 1];
   int		*bmGs;
   char		*pattern;
+  bool		*escmap;
   int		 defBc;
   void		*qsBc_table;
   int		*sbmGs;

Modified: user/gabor/tre-integration/include/regex.h
==============================================================================
--- user/gabor/tre-integration/include/regex.h	Wed Sep  7 07:52:45 2011	(r225433)
+++ user/gabor/tre-integration/include/regex.h	Wed Sep  7 12:53:18 2011	(r225434)
@@ -110,7 +110,6 @@ typedef enum {
 #define REG_PEND	(REG_UNGREEDY << 1)
 #define REG_GNU         (REG_PEND << 1)
 #define REG_WORD	(REG_GNU << 1)
-#define _REG_HEUR	(REG_WORD << 1)
 
 /* POSIX tre_regexec() flags. */
 #define REG_NOTBOL 1


More information about the svn-src-user mailing list