git: 922291e01926 - main - textproc/sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation

From: Yuri Victorovich <yuri_at_FreeBSD.org>
Date: Mon, 16 Jan 2023 09:41:03 UTC
The branch main has been updated by yuri:

URL: https://cgit.FreeBSD.org/ports/commit/?id=922291e019260419b7bf80e0db65caf4563c2174

commit 922291e019260419b7bf80e0db65caf4563c2174
Author:     Yuri Victorovich <yuri@FreeBSD.org>
AuthorDate: 2023-01-16 09:36:02 +0000
Commit:     Yuri Victorovich <yuri@FreeBSD.org>
CommitDate: 2023-01-16 09:41:00 +0000

    textproc/sentencepiece: New port: Unsupervised text tokenizer for Neural Network-based text generation
---
 textproc/Makefile                |  1 +
 textproc/sentencepiece/Makefile  | 21 +++++++++++++++++++++
 textproc/sentencepiece/distinfo  |  3 +++
 textproc/sentencepiece/pkg-descr |  7 +++++++
 textproc/sentencepiece/pkg-plist | 16 ++++++++++++++++
 5 files changed, 48 insertions(+)

diff --git a/textproc/Makefile b/textproc/Makefile
index a85511af2b50..e2d0e0ea9521 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1888,6 +1888,7 @@
     SUBDIR += sdocbook-xml
     SUBDIR += sdom
     SUBDIR += senna
+    SUBDIR += sentencepiece
     SUBDIR += sgmlformat
     SUBDIR += sgmls
     SUBDIR += sgrep
diff --git a/textproc/sentencepiece/Makefile b/textproc/sentencepiece/Makefile
new file mode 100644
index 000000000000..84e7ac9ca43e
--- /dev/null
+++ b/textproc/sentencepiece/Makefile
@@ -0,0 +1,21 @@
+PORTNAME=	sentencepiece
+DISTVERSIONPREFIX=	v # distfile name carries the "v": google-sentencepiece-v0.1.97_GH0.tar.gz
+DISTVERSION=	0.1.97
+CATEGORIES=	textproc # machine-learning
+
+MAINTAINER=	yuri@FreeBSD.org
+COMMENT=	Unsupervised text tokenizer for Neural Network-based text generation
+WWW=		https://github.com/google/sentencepiece
+
+LICENSE=	APACHE20
+LICENSE_FILE=	${WRKSRC}/LICENSE
+
+USES=		cmake:testing compiler:c++17-lang
+USE_LDCONFIG=	yes # the port installs shared libraries (lib/libsentencepiece*.so*, see pkg-plist)
+
+USE_GITHUB=	yes
+GH_ACCOUNT=	google # project lives under github.com/google (see WWW); GH_PROJECT is not set here
+
+CMAKE_TESTING_ON=	SPM_BUILD_TEST # presumably the upstream CMake switch that builds the tests for cmake:testing - TODO confirm
+
+.include <bsd.port.mk>
diff --git a/textproc/sentencepiece/distinfo b/textproc/sentencepiece/distinfo
new file mode 100644
index 000000000000..c29dc9430710
--- /dev/null
+++ b/textproc/sentencepiece/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1673860778
+SHA256 (google-sentencepiece-v0.1.97_GH0.tar.gz) = 41c3a07f315e3ac87605460c8bb8d739955bc8e7f478caec4017ef9b7d78669b
+SIZE (google-sentencepiece-v0.1.97_GH0.tar.gz) = 11945436
diff --git a/textproc/sentencepiece/pkg-descr b/textproc/sentencepiece/pkg-descr
new file mode 100644
index 000000000000..62b7de5f4ece
--- /dev/null
+++ b/textproc/sentencepiece/pkg-descr
@@ -0,0 +1,7 @@
+SentencePiece is an unsupervised text tokenizer and detokenizer mainly for
+Neural Network-based text generation systems where the vocabulary size is
+predetermined prior to the neural model training. SentencePiece implements
+subword units (e.g., byte-pair-encoding (BPE)) and unigram language model
+with the extension of direct training from raw sentences. SentencePiece
+allows us to make a purely end-to-end system that does not depend on
+language-specific pre/postprocessing.
diff --git a/textproc/sentencepiece/pkg-plist b/textproc/sentencepiece/pkg-plist
new file mode 100644
index 000000000000..7640dc4d9c23
--- /dev/null
+++ b/textproc/sentencepiece/pkg-plist
@@ -0,0 +1,16 @@
+bin/spm_decode
+bin/spm_encode
+bin/spm_export_vocab
+bin/spm_normalize
+bin/spm_train
+include/sentencepiece_processor.h
+include/sentencepiece_trainer.h
+lib/libsentencepiece.a
+lib/libsentencepiece.so
+lib/libsentencepiece.so.0
+lib/libsentencepiece.so.0.0.0
+lib/libsentencepiece_train.a
+lib/libsentencepiece_train.so
+lib/libsentencepiece_train.so.0
+lib/libsentencepiece_train.so.0.0.0
+libdata/pkgconfig/sentencepiece.pc