bin/71367: regex multibyte support is really slow

Kuang-che Wu kcwu at csie.org
Sat Sep 4 05:00:49 PDT 2004


The following reply was made to PR bin/71367; it has been noted by GNATS.

From: Kuang-che Wu <kcwu at csie.org>
To: freebsd-gnats-submit at freebsd.org
Cc:  
Subject: Re: bin/71367: regex multibyte support is really slow
Date: Sat, 4 Sep 2004 20:00:49 +0800

 On Sat, Sep 04, 2004 at 09:36:16PM +1000, Tim Robbins wrote:
 > On Sat, Sep 04, 2004 at 01:21:22PM +0200, Simon L. Nielsen wrote:
 
 > Do you have any non-standard options in /etc/make.conf? Have you changed
 > the C library at all locally? Can you confirm that the system you ran
 > this on was idle?
 The system is idle, and the C library has not been changed locally.
 The only related option in /etc/make.conf is COMPAT4X=yes.
 
 > Could you please try this patch?
 I tested it with the program below.
 Without the patch:
 case 0: 0.000000s
 case 1: 7.390625s
 case 2: (matched)0.000000s
 case 3: 0.000000s
 case 4: (matched)0.125000s
 case 5: 0.000000s
 case 6: 7.398438s
 case 7: 0.000000s
 case 8: 0.000000s
 
 with the patch:
 case 0: 0.000000s
 case 1: 0.000000s
 case 2: (matched)0.000000s
 case 3: 0.000000s
 case 4: (matched)0.000000s
 case 5: 0.000000s
 case 6: 0.000000s
 case 7: 0.000000s
 case 8: 0.000000s
 
 --------------------------
 #include <stdio.h>
 #include <locale.h>
 #include <regex.h>
 #include <time.h>
 
 /*
  * Fragments from which the sample subject strings are assembled.  Each
  * buffer below repeats one fragment and leaves the rest of its 1024 bytes
  * zero-filled, so every buffer is a NUL-terminated string.
  */
 #define EN "blah"             /* ASCII letters */
 #define XX "!@#$"             /* ASCII punctuation */
 #define U8 0xe6, 0x85, 0xa2   /* UTF-8 character (three bytes) */
 #define B5 0xa6, 0x72         /* Big5 character (two bytes) */
 
 /* Ten copies of EN: 40 bytes of text. */
 char en[1024] =
   EN EN EN EN EN
   EN EN EN EN EN;
 
 /* Ten copies of XX: 40 bytes of text. */
 char xx[1024] =
   XX XX XX XX XX
   XX XX XX XX XX;
 
 /* Twenty copies of U8: 60 bytes, followed by an explicit terminator. */
 char utf8[1024] = {
   U8, U8, U8, U8, U8,
   U8, U8, U8, U8, U8,
   U8, U8, U8, U8, U8,
   U8, U8, U8, U8, U8,
   0
 };
 
 /* Twenty copies of B5: 40 bytes, followed by an explicit terminator. */
 char big5[1024] = {
   B5, B5, B5, B5, B5,
   B5, B5, B5, B5, B5,
   B5, B5, B5, B5, B5,
   B5, B5, B5, B5, B5,
   0
 };
 /*
  * One timing scenario: the LC_CTYPE locale to select, the pattern handed
  * to regcomp(), the subject string handed to regexec(), and the regcomp()
  * cflags.
  */
 struct T {
   char *locale;
   char *pattern;
   char *text;
   int flag;
 };
 
 /*
  * Scenario table; the index of an entry is the "case N" number printed by
  * main().  NOTE: the table has no terminating all-zero entry.
  */
 struct T test[] = {
   /* 0 */ { "C",           "[[:alnum:]]",  utf8, REG_EXTENDED | REG_ICASE },
   /* 1 */ { "zh_TW.UTF-8", "[[:alnum:]]",  utf8, REG_EXTENDED | REG_ICASE },
   /* 2 */ { "zh_TW.UTF-8", "[[:alnum:]]",  en,   REG_EXTENDED | REG_ICASE },
   /* 3 */ { "zh_TW.UTF-8", "[[:alnum:]]",  xx,   REG_EXTENDED | REG_ICASE },
   /* 4 */ { "zh_TW.UTF-8", "[^[:alnum:]]", utf8, REG_EXTENDED | REG_ICASE },
   /* 5 */ { "zh_TW.Big5",  "[[:alnum:]]",  big5, REG_EXTENDED | REG_ICASE },
   /* 6 */ { "en_US.UTF-8", "[[:alnum:]]",  utf8, REG_EXTENDED | REG_ICASE },
   /* 7: REG_EXTENDED is not set for this case */
           { "en_US.UTF-8", "[A-Za-z0-9]",  utf8, REG_ICASE },
   /* 8: REG_ICASE is not set for this case */
           { "en_US.UTF-8", "[[:alnum:]]",  utf8, REG_EXTENDED },
 };
 int main(void)
 {
   int i;
   clock_t st;
   regex_t re;
 
   for(i=0; test[i].locale; i++) {
     printf("case %d: ",i);
     if(setlocale(LC_CTYPE,test[i].locale)==NULL)
       return 1;
 
     if(regcomp(&re,test[i].pattern,test[i].flag)!=0)
       return 2;
     st=clock();
     if(regexec(&re,test[i].text,0,NULL,0)==0)
       printf("(matched)");
     printf("%fs\n",(double)(clock()-st)/CLOCKS_PER_SEC);
   }
 
   return 0;
 }


More information about the freebsd-bugs mailing list