Skip to content

Commit

Permalink
nltk security issue and upgrade unstructured (#9558)
Browse files Browse the repository at this point in the history
  • Loading branch information
JohnJyong authored Oct 23, 2024
1 parent ecc8bee commit 3e9d271
Show file tree
Hide file tree
Showing 13 changed files with 1,669 additions and 786 deletions.
4 changes: 3 additions & 1 deletion api/constants/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,9 @@

if dify_config.ETL_TYPE == "Unstructured":
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
if dify_config.UNSTRUCTURED_API_URL:
DOCUMENT_EXTENSIONS.append("ppt")
DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
else:
DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]
Expand Down
15 changes: 8 additions & 7 deletions api/core/rag/extractor/extract_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
from core.rag.extractor.unstructured.unstructured_pdf_extractor import UnstructuredPDFExtractor
from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
Expand Down Expand Up @@ -102,10 +103,10 @@ def extract(
if file_extension in {".xlsx", ".xls"}:
extractor = ExcelExtractor(file_path)
elif file_extension == ".pdf":
extractor = PdfExtractor(file_path)
extractor = UnstructuredPDFExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension in {".md", ".markdown"}:
extractor = (
UnstructuredMarkdownExtractor(file_path, unstructured_api_url)
UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
if is_automatic
else MarkdownExtractor(file_path, autodetect_encoding=True)
)
Expand All @@ -116,17 +117,17 @@ def extract(
elif file_extension == ".csv":
extractor = CSVExtractor(file_path, autodetect_encoding=True)
elif file_extension == ".msg":
extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url)
extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".eml":
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url)
extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".ppt":
extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".pptx":
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".xml":
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url, unstructured_api_key)
elif file_extension == ".epub":
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url)
extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
else:
# txt
extractor = (
Expand Down
18 changes: 10 additions & 8 deletions api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,24 +10,26 @@


class UnstructuredEmailExtractor(BaseExtractor):
"""Load msg files.
"""Load eml files.
Args:
file_path: Path to the file to load.
"""

def __init__(
self,
file_path: str,
api_url: str,
):
def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key

def extract(self) -> list[Document]:
from unstructured.partition.email import partition_email
if self._api_url:
from unstructured.partition.api import partition_via_api

elements = partition_email(filename=self._file_path)
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.email import partition_email

elements = partition_email(filename=self._file_path)

# noinspection PyBroadException
try:
Expand Down
12 changes: 10 additions & 2 deletions api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,15 +19,23 @@ def __init__(
self,
file_path: str,
api_url: Optional[str] = None,
api_key: Optional[str] = None,
):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key

def extract(self) -> list[Document]:
from unstructured.partition.epub import partition_epub
if self._api_url:
from unstructured.partition.api import partition_via_api

elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.epub import partition_epub

elements = partition_epub(filename=self._file_path, xml_keep_tags=True)

elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title

chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,19 +24,21 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
if the specified encoding fails.
"""

def __init__(
self,
file_path: str,
api_url: str,
):
def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key

def extract(self) -> list[Document]:
from unstructured.partition.md import partition_md
if self._api_url:
from unstructured.partition.api import partition_via_api

elements = partition_md(filename=self._file_path)
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.md import partition_md

elements = partition_md(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title

chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,21 @@ class UnstructuredMsgExtractor(BaseExtractor):
file_path: Path to the file to load.
"""

def __init__(self, file_path: str, api_url: str):
def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key

def extract(self) -> list[Document]:
from unstructured.partition.msg import partition_msg
if self._api_url:
from unstructured.partition.api import partition_via_api

elements = partition_msg(filename=self._file_path)
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.msg import partition_msg

elements = partition_msg(filename=self._file_path)
from unstructured.chunking.title import chunk_by_title

chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
Expand Down
47 changes: 47 additions & 0 deletions api/core/rag/extractor/unstructured/unstructured_pdf_extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
import logging

from core.rag.extractor.extractor_base import BaseExtractor
from core.rag.models.document import Document

logger = logging.getLogger(__name__)


class UnstructuredPDFExtractor(BaseExtractor):
    """Extract title-chunked documents from a PDF file via Unstructured.

    Args:
        file_path: Path to the PDF file to load.
        api_url: Unstructured API URL. When it is falsy, ``partition_pdf``
            is used instead of ``partition_via_api``.
        api_key: Unstructured API key, passed to ``partition_via_api``.
    """

    def __init__(self, file_path: str, api_url: str, api_key: str):
        """Store the file path and the Unstructured API settings."""
        self._file_path = file_path
        self._api_url = api_url
        self._api_key = api_key

    def extract(self) -> list[Document]:
        """Partition the PDF, chunk the elements by title, and wrap each chunk.

        Returns:
            One ``Document`` per chunk, whose ``page_content`` is the chunk
            text with surrounding whitespace stripped.
        """
        # Imports stay inside the method, as in the rest of the extractors,
        # so the matching unstructured module is only loaded when needed.
        if not self._api_url:
            from unstructured.partition.pdf import partition_pdf

            elements = partition_pdf(filename=self._file_path, strategy="auto")
        else:
            from unstructured.partition.api import partition_via_api

            elements = partition_via_api(
                filename=self._file_path,
                api_url=self._api_url,
                api_key=self._api_key,
                strategy="auto",
            )

        from unstructured.chunking.title import chunk_by_title

        pieces = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
        return [Document(page_content=piece.text.strip()) for piece in pieces]
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@


class UnstructuredPPTExtractor(BaseExtractor):
"""Load msg files.
"""Load ppt files.
Args:
Expand All @@ -21,9 +21,12 @@ def __init__(self, file_path: str, api_url: str, api_key: str):
self._api_key = api_key

def extract(self) -> list[Document]:
from unstructured.partition.api import partition_via_api
if self._api_url:
from unstructured.partition.api import partition_via_api

elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
raise NotImplementedError("Unstructured API Url is not configured")
text_by_page = {}
for element in elements:
page = element.metadata.page_number
Expand Down
14 changes: 10 additions & 4 deletions api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,28 @@


class UnstructuredPPTXExtractor(BaseExtractor):
"""Load msg files.
"""Load pptx files.
Args:
file_path: Path to the file to load.
"""

def __init__(self, file_path: str, api_url: str):
def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key

def extract(self) -> list[Document]:
from unstructured.partition.pptx import partition_pptx
if self._api_url:
from unstructured.partition.api import partition_via_api

elements = partition_pptx(filename=self._file_path)
elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.pptx import partition_pptx

elements = partition_pptx(filename=self._file_path)
text_by_page = {}
for element in elements:
page = element.metadata.page_number
Expand Down
15 changes: 11 additions & 4 deletions api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,22 +7,29 @@


class UnstructuredXmlExtractor(BaseExtractor):
"""Load msg files.
"""Load xml files.
Args:
file_path: Path to the file to load.
"""

def __init__(self, file_path: str, api_url: str):
def __init__(self, file_path: str, api_url: str, api_key: str):
"""Initialize with file path."""
self._file_path = file_path
self._api_url = api_url
self._api_key = api_key

def extract(self) -> list[Document]:
from unstructured.partition.xml import partition_xml
if self._api_url:
from unstructured.partition.api import partition_via_api

elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
else:
from unstructured.partition.xml import partition_xml

elements = partition_xml(filename=self._file_path, xml_keep_tags=True)

elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
from unstructured.chunking.title import chunk_by_title

chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
Expand Down
Loading

0 comments on commit 3e9d271

Please sign in to comment.