git: b6e6388dab6d - main - Add textproc/py-textract: Extract text from any document

Go to: [ bottom of page ] [ top of archives ] [ this month ]
From: Li-Wen Hsu <lwhsu_at_FreeBSD.org>
Date: Tue, 25 Oct 2022 20:49:33 UTC
The branch main has been updated by lwhsu:

URL: https://cgit.FreeBSD.org/ports/commit/?id=b6e6388dab6dd78e37adebf738e568997db6d15a

commit b6e6388dab6dd78e37adebf738e568997db6d15a
Author:     Jesús Daniel Colmenares Oviedo <DtxdF@disroot.org>
AuthorDate: 2022-09-23 16:18:31 +0000
Commit:     Li-Wen Hsu <lwhsu@FreeBSD.org>
CommitDate: 2022-10-25 20:49:12 +0000

    Add textproc/py-textract: Extract text from any document
    
    textract provides a single interface for extracting content embedded
    from Word documents, PowerPoint presentations, PDFs and much more,
    which can be used for further textual analysis and visualization.
    
    WWW: https://github.com/deanmalmgren/textract
    
    PR:             265768
---
 textproc/Makefile              |  1 +
 textproc/py-textract/Makefile  | 69 ++++++++++++++++++++++++++++++++++++++++++
 textproc/py-textract/distinfo  |  3 ++
 textproc/py-textract/pkg-descr |  3 ++
 4 files changed, 76 insertions(+)

diff --git a/textproc/Makefile b/textproc/Makefile
index 5b5097135fcb..8b52f2176b4f 100644
--- a/textproc/Makefile
+++ b/textproc/Makefile
@@ -1545,6 +1545,7 @@
     SUBDIR += py-tablib
     SUBDIR += py-terminaltables
     SUBDIR += py-textdistance
     SUBDIR += py-textfsm
+    SUBDIR += py-textract
     SUBDIR += py-texttable
     SUBDIR += py-textual
diff --git a/textproc/py-textract/Makefile b/textproc/py-textract/Makefile
new file mode 100644
index 000000000000..a1a57ea56e62
--- /dev/null
+++ b/textproc/py-textract/Makefile
@@ -0,0 +1,69 @@
+PORTNAME=	textract
+PORTVERSION=	1.6.5
+CATEGORIES=	textproc python
+MASTER_SITES=	CHEESESHOP
+PKGNAMEPREFIX=	${PYTHON_PKGNAMEPREFIX}
+
+MAINTAINER=	DtxdF@disroot.org
+COMMENT=	Extract text from any document
+WWW=		https://github.com/deanmalmgren/textract
+
+LICENSE=	MIT
+LICENSE_FILE=	${WRKSRC}/LICENSE
+
+RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}argcomplete>=1.10.0:devel/py-argcomplete@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}chardet>=3:textproc/py-chardet@${PY_FLAVOR} \
+		${PYTHON_PKGNAMEPREFIX}six>=1.12.0:devel/py-six@${PY_FLAVOR}
+
+USES=		python:3.8+
+USE_PYTHON=	autoplist distutils
+
+OPTIONS_DEFINE=	ANTIWORD BEAUTIFULSOUP DOCX2TXT LIBXML2 LIBXSLT \
+		MSG PPTX PS SPREADSHEET UNRTF
+OPTIONS_DEFAULT=	ANTIWORD BEAUTIFULSOUP DOCX2TXT FFMPEG FLAC JPEG_TURBO \
+			LAME LIBXML2 LIBXSLT MSG PDFTOTEXT PPTX PS SOX \
+			SPEECH_RECOGNITION SPREADSHEET TESSERACT UNRTF
+OPTIONS_GROUP=	AUDIO OCR PDF
+OPTIONS_GROUP_AUDIO=	FFMPEG FLAC LAME POCKETSPHINX SOX SPEECH_RECOGNITION
+OPTIONS_GROUP_OCR=	JPEG_TURBO TESSERACT
+OPTIONS_GROUP_PDF=	PDFMINER PDFTOTEXT
+
+ANTIWORD_DESC=			DOC document support
+BEAUTIFULSOUP_DESC=		HTML parsing library
+DOCX2TXT_DESC=			DOCX document support
+JPEG_TURBO_DESC=		SIMD-accelerated JPEG codec
+LIBXML2_DESC=			Python interface for XML parser library
+LIBXSLT_DESC=			XML stylesheet transformation library
+MSG_DESC=			MS Outlook MSG file format support
+PDFMINER_DESC=			PDF parser and analyzer
+PDFTOTEXT_DESC=			Extract text from a PDF document
+POCKETSPHINX_DESC=		Interface to CMU Sphinxbase and Pocketsphinx
+PPTX_DESC=			MS PowerPoint PPTX presentations support
+SOX_DESC=			Command-line audio processing tool
+SPEECH_RECOGNITION_DESC=	Python library for performing speech recognition
+SPREADSHEET_DESC=		XLS and XLSX spreadsheet support
+TESSERACT_DESC=			Commercial quality open source OCR engine
+UNRTF_DESC=			RTF document support
+
+ANTIWORD_RUN_DEPENDS=		antiword>0:textproc/antiword
+BEAUTIFULSOUP_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}beautifulsoup>=4.8.0:www/py-beautifulsoup@${PY_FLAVOR}
+DOCX2TXT_RUN_DEPENDS=		${PYTHON_PKGNAMEPREFIX}docx2txt>=0.8:textproc/py-docx2txt@${PY_FLAVOR}
+FFMPEG_RUN_DEPENDS=		ffmpeg>0:multimedia/ffmpeg
+FLAC_RUN_DEPENDS=		flac>0:audio/flac
+JPEG_TURBO_RUN_DEPENDS=		jpeg-turbo>0:graphics/jpeg-turbo
+LAME_RUN_DEPENDS=		lame>0:audio/lame
+LIBXML2_RUN_DEPENDS=		${PYTHON_PKGNAMEPREFIX}libxml2>0:textproc/py-libxml2@${PY_FLAVOR}
+LIBXSLT_RUN_DEPENDS=		libxslt>=1.1.15:textproc/libxslt
+MSG_RUN_DEPENDS=		${PYTHON_PKGNAMEPREFIX}extract-msg>=0.29:textproc/py-extract-msg@${PY_FLAVOR}
+PDFMINER_RUN_DEPENDS=		${PYTHON_PKGNAMEPREFIX}pdfminer.six>=20191110:textproc/py-pdfminer.six@${PY_FLAVOR}
+PDFTOTEXT_RUN_DEPENDS=		poppler-utils>0:graphics/poppler-utils
+POCKETSPHINX_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}pocketsphinx>0:audio/py-pocketsphinx@${PY_FLAVOR}
+PPTX_RUN_DEPENDS=		${PYTHON_PKGNAMEPREFIX}python-pptx>=0.6.18:textproc/py-python-pptx@${PY_FLAVOR}
+PS_RUN_DEPENDS=			pstotext>0:print/pstotext
+SOX_RUN_DEPENDS=		sox>0:audio/sox
+SPEECH_RECOGNITION_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}SpeechRecognition>=3.8.1:audio/py-speechrecognition@${PY_FLAVOR}
+SPREADSHEET_RUN_DEPENDS=	${PYTHON_PKGNAMEPREFIX}xlrd>=1.2.0:textproc/py-xlrd@${PY_FLAVOR}
+TESSERACT_RUN_DEPENDS=		tesseract>0:graphics/tesseract
+UNRTF_RUN_DEPENDS=		unrtf>0:textproc/unrtf
+
+.include <bsd.port.mk>
diff --git a/textproc/py-textract/distinfo b/textproc/py-textract/distinfo
new file mode 100644
index 000000000000..14f25b8e65e4
--- /dev/null
+++ b/textproc/py-textract/distinfo
@@ -0,0 +1,3 @@
+TIMESTAMP = 1659835075
+SHA256 (textract-1.6.5.tar.gz) = 68f0f09056885821e6c43d8538987518daa94057c306679f2857cc5ee66ad850
+SIZE (textract-1.6.5.tar.gz) = 17871
diff --git a/textproc/py-textract/pkg-descr b/textproc/py-textract/pkg-descr
new file mode 100644
index 000000000000..7d4986c9d8cb
--- /dev/null
+++ b/textproc/py-textract/pkg-descr
@@ -0,0 +1,3 @@
+textract provides a single interface for extracting content from
+Word documents, PowerPoint presentations, PDFs and much more, which
+can be used for further textual analysis and visualization.