svn commit: r315883 - in head/textproc/clucene: . files

Jung-uk Kim jkim at FreeBSD.org
Tue Apr 16 18:37:05 UTC 2013


Author: jkim
Date: Tue Apr 16 18:37:03 2013
New Revision: 315883
URL: http://svnweb.freebsd.org/changeset/ports/315883

Log:
  - Implement an efficient BitSet::nextSetBit() to reduce diff against upstream.
  
  http://clucene.git.sourceforge.net/git/gitweb.cgi?p=clucene/clucene;a=commitdiff;h=17e53d7
  
  - Fix a buffer overflow in CJKAnalyzer.  Somehow upstream missed this fix
  in the 2.3.3.4 branch.
  
  http://clucene.svn.sourceforge.net/viewvc/clucene?view=revision&revision=2630
  
  - Fix potential memory leaks in libstemmer.  Merged from Snowball changes.
  
  http://svn.tartarus.org/snowball/trunk/snowball/libstemmer/libstemmer_c.in?r1=409&r2=520&view=patch
  
  - Implement SnowballAnalyzer::reusableTokenStream(). [1]  Also, this patch
  fixes memory leaks found by the submitter.
  
  Submitted by:	Kishore Ramareddy (kishore at niksun dot com)
  		(initial version) [1]
  Feature safe:	yes

Added:
  head/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h   (contents, props changed)
  head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp   (contents, props changed)
  head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h   (contents, props changed)
  head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c   (contents, props changed)
  head/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp   (contents, props changed)
  head/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h   (contents, props changed)
Modified:
  head/textproc/clucene/Makefile

Modified: head/textproc/clucene/Makefile
==============================================================================
--- head/textproc/clucene/Makefile	Tue Apr 16 18:37:00 2013	(r315882)
+++ head/textproc/clucene/Makefile	Tue Apr 16 18:37:03 2013	(r315883)
@@ -3,7 +3,7 @@
 
 PORTNAME=	clucene
 PORTVERSION=	2.3.3.4
-PORTREVISION=	1
+PORTREVISION=	2
 CATEGORIES=	textproc
 MASTER_SITES=	SF/${PORTNAME}/${PORTNAME}-core-unstable/2.3
 DISTNAME=	${PORTNAME}-core-${PORTVERSION}

Added: head/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/textproc/clucene/files/patch-src__contribs-lib__CLucene__analysis__cjk__CJKAnalyzer.h	Tue Apr 16 18:37:03 2013	(r315883)
@@ -0,0 +1,11 @@
+--- src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/analysis/cjk/CJKAnalyzer.h	2013-03-29 18:46:22.000000000 -0400
+@@ -39,7 +39,7 @@
+      * character buffer, store the characters which are used to compose <br>
+      * the returned Token
+      */
+-    TCHAR buffer[LUCENE_MAX_WORD_LEN];
++    TCHAR buffer[LUCENE_MAX_WORD_LEN+1];
+ 
+     /**
+      * I/O buffer, used to store the content of the input(one of the <br>

Added: head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__Snowball.cpp	Tue Apr 16 18:37:03 2013	(r315883)
@@ -0,0 +1,74 @@
+--- src/contribs-lib/CLucene/snowball/Snowball.cpp.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/Snowball.cpp	2013-04-01 19:14:15.000000000 -0400
+@@ -19,16 +19,31 @@
+ 
+ CL_NS_DEF2(analysis,snowball)
+ 
++  class SnowballAnalyzer::SavedStreams : public TokenStream {
++  public:
++    StandardTokenizer* tokenStream;
++    TokenStream* filteredTokenStream;
++
++    SavedStreams():tokenStream(NULL), filteredTokenStream(NULL) {}
++    void close(){}
++    Token* next(Token* token) {return NULL;}
++  };
++  
+   /** Builds the named analyzer with no stop words. */
+   SnowballAnalyzer::SnowballAnalyzer(const TCHAR* language) {
+     this->language = STRDUP_TtoT(language);
+ 	stopSet = NULL;
+   }
+ 
+-  SnowballAnalyzer::~SnowballAnalyzer(){
+-	  _CLDELETE_CARRAY(language);
+-	  if ( stopSet != NULL )
+-		  _CLDELETE(stopSet);
++  SnowballAnalyzer::~SnowballAnalyzer() {
++    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
++    if (streams != NULL) {
++      _CLDELETE(streams->filteredTokenStream);
++      _CLDELETE(streams);
++    }
++    _CLDELETE_CARRAY(language);
++    if (stopSet != NULL)
++      _CLDELETE(stopSet);
+   }
+ 
+   /** Builds the named analyzer with the given stop words.
+@@ -62,12 +77,29 @@
+     result = _CLNEW SnowballFilter(result, language, true);
+     return result;
+   }
+-  
+-  
+-  
+-  
+-  
+-  
++
++  TokenStream* SnowballAnalyzer::reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader) {
++    SavedStreams* streams = reinterpret_cast<SavedStreams*>(getPreviousTokenStream());
++
++    if (streams == NULL) {
++      streams = _CLNEW SavedStreams();
++      BufferedReader* bufferedReader = reader->__asBufferedReader();
++
++      if (bufferedReader == NULL)
++        streams->tokenStream = _CLNEW StandardTokenizer(_CLNEW FilteredBufferedReader(reader, false), true);
++      else
++        streams->tokenStream = _CLNEW StandardTokenizer(bufferedReader);
++
++      streams->filteredTokenStream = _CLNEW StandardFilter(streams->tokenStream, true);
++      streams->filteredTokenStream = _CLNEW LowerCaseFilter(streams->filteredTokenStream, true);
++      if (stopSet != NULL)
++        streams->filteredTokenStream = _CLNEW StopFilter(streams->filteredTokenStream, true, stopSet);
++      streams->filteredTokenStream = _CLNEW SnowballFilter(streams->filteredTokenStream, language, true);
++      setPreviousTokenStream(streams);
++    } else
++      streams->tokenStream->reset(reader);
++    return streams->filteredTokenStream;
++  }
+   
+     /** Construct the named stemming filter.
+    *

Added: head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__SnowballAnalyzer.h	Tue Apr 16 18:37:03 2013	(r315883)
@@ -0,0 +1,19 @@
+--- src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/SnowballAnalyzer.h	2013-04-01 18:25:10.000000000 -0400
+@@ -22,6 +22,7 @@
+ class CLUCENE_CONTRIBS_EXPORT SnowballAnalyzer: public Analyzer {
+   TCHAR* language;
+   CLTCSetList* stopSet;
++  class SavedStreams;
+ 
+ public:
+   /** Builds the named analyzer with no stop words. */
+@@ -37,6 +38,8 @@
+       StandardFilter}, a {@link LowerCaseFilter} and a {@link StopFilter}. */
+   TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+   TokenStream* tokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader, bool deleteReader);
++
++  TokenStream* reusableTokenStream(const TCHAR* fieldName, CL_NS(util)::Reader* reader);
+ };
+ 
+ CL_NS_END2

Added: head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/textproc/clucene/files/patch-src__contribs-lib__CLucene__snowball__libstemmer__libstemmer.c	Tue Apr 16 18:37:03 2013	(r315883)
@@ -0,0 +1,24 @@
+--- src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/contribs-lib/CLucene/snowball/libstemmer/libstemmer.c	2013-03-29 18:54:39.000000000 -0400
+@@ -35,9 +35,8 @@
+ {
+     stemmer_encoding enc;
+     struct stemmer_modules * module;
+-    struct sb_stemmer * stemmer =
+-	    (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
+-    if (stemmer == NULL) return NULL;
++    struct sb_stemmer * stemmer;
++
+     enc = sb_getenc(charenc);
+     if (enc == ENC_UNKNOWN) return NULL;
+ 
+@@ -46,6 +45,9 @@
+     }
+     if (module->name == NULL) return NULL;
+     
++    stemmer = (struct sb_stemmer *) malloc(sizeof(struct sb_stemmer));
++    if (stemmer == NULL) return NULL;
++
+     stemmer->create = module->create;
+     stemmer->close = module->close;
+     stemmer->stem = module->stem;

Added: head/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.cpp	Tue Apr 16 18:37:03 2013	(r315883)
@@ -0,0 +1,67 @@
+--- src/core/CLucene/util/BitSet.cpp.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/core/CLucene/util/BitSet.cpp	2013-03-29 17:57:05.000000000 -0400
+@@ -32,6 +32,25 @@
+     3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
+     4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8};
+ 
++const uint8_t BitSet::BYTE_OFFSETS[256] = {
++    8, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0,
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    7, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    6, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    5, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0, 
++    4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0};
++
++
+ BitSet::BitSet( const BitSet& copy ) :
+ 	_size( copy._size ),
+ 	_count(-1)
+@@ -180,19 +199,32 @@
+     return                            factor * (4 + (8+40)*count()) < size();
+   }
+ 
+-  int32_t BitSet::nextSetBit(int32_t fromIndex) const {
++  int32_t BitSet::nextSetBit(int32_t fromIndex) const 
++  {
+       if (fromIndex < 0)
+           _CLTHROWT(CL_ERR_IndexOutOfBounds, _T("fromIndex < 0"));
+ 
+       if (fromIndex >= _size)
+           return -1;
+ 
+-      while (true) {
+-          if ((bits[fromIndex >> 3] & (1 << (fromIndex & 7))) != 0)
+-              return fromIndex;
+-          if (++fromIndex == _size)
+-              return -1;
++      int _max = ( _size+7 ) >> 3;
++
++      unsigned int i = (int)( fromIndex>>3 );
++      unsigned int subIndex = fromIndex & 0x7; // index within the byte
++      uint8_t byte = bits[i] >> subIndex;  // skip all the bits to the right of index
++
++      if ( byte != 0 ) 
++      {
++          return ( ( i<<3 ) + subIndex + BYTE_OFFSETS[ byte ] );
++      }
++
++      while( ++i < _max ) 
++      {
++          byte = bits[i];
++          if ( byte != 0 ) 
++              return ( ( i<<3 ) + BYTE_OFFSETS[ byte ] );
+       }
++      return -1;
+   }
+ 
+ CL_NS_END

Added: head/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h
==============================================================================
--- /dev/null	00:00:00 1970	(empty, because file is newly added)
+++ head/textproc/clucene/files/patch-src__core__CLucene__util__BitSet.h	Tue Apr 16 18:37:03 2013	(r315883)
@@ -0,0 +1,10 @@
+--- src/core/CLucene/util/BitSet.h.orig	2011-03-16 20:21:07.000000000 -0400
++++ src/core/CLucene/util/BitSet.h	2013-03-29 17:57:05.000000000 -0400
+@@ -39,6 +39,7 @@
+   /** Indicates if the bit vector is sparse and should be saved as a d-gaps list, or dense, and should be saved as a bit set. */
+   bool isSparse();
+   static const uint8_t BYTE_COUNTS[256];
++  static const uint8_t BYTE_OFFSETS[256];
+ protected:
+ 	BitSet( const BitSet& copy );
+ 


More information about the svn-ports-all mailing list