nltk security issue and upgrade unstructured (#9558)

langgenius · Oct 23, 2024 · 3e9d271
1 parent ecc8bee
commit 3e9d271

Showing 13 changed files with 1,669 additions and 786 deletions.
diff --git a/api/constants/__init__.py b/api/constants/__init__.py
@@ -15,7 +15,9 @@
 
 if dify_config.ETL_TYPE == "Unstructured":
     DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls"]
-    DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "ppt", "xml", "epub"))
+    DOCUMENT_EXTENSIONS.extend(("docx", "csv", "eml", "msg", "pptx", "xml", "epub"))
+    if dify_config.UNSTRUCTURED_API_URL:
+        DOCUMENT_EXTENSIONS.append("ppt")
     DOCUMENT_EXTENSIONS.extend([ext.upper() for ext in DOCUMENT_EXTENSIONS])
 else:
     DOCUMENT_EXTENSIONS = ["txt", "markdown", "md", "pdf", "html", "htm", "xlsx", "xls", "docx", "csv"]

diff --git a/api/core/rag/extractor/extract_processor.py b/api/core/rag/extractor/extract_processor.py
@@ -21,6 +21,7 @@
 from core.rag.extractor.unstructured.unstructured_epub_extractor import UnstructuredEpubExtractor
 from core.rag.extractor.unstructured.unstructured_markdown_extractor import UnstructuredMarkdownExtractor
 from core.rag.extractor.unstructured.unstructured_msg_extractor import UnstructuredMsgExtractor
+from core.rag.extractor.unstructured.unstructured_pdf_extractor import UnstructuredPDFExtractor
 from core.rag.extractor.unstructured.unstructured_ppt_extractor import UnstructuredPPTExtractor
 from core.rag.extractor.unstructured.unstructured_pptx_extractor import UnstructuredPPTXExtractor
 from core.rag.extractor.unstructured.unstructured_text_extractor import UnstructuredTextExtractor
@@ -102,10 +103,10 @@ def extract(
                     if file_extension in {".xlsx", ".xls"}:
                         extractor = ExcelExtractor(file_path)
                     elif file_extension == ".pdf":
-                        extractor = PdfExtractor(file_path)
+                        extractor = UnstructuredPDFExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     elif file_extension in {".md", ".markdown"}:
                         extractor = (
-                            UnstructuredMarkdownExtractor(file_path, unstructured_api_url)
+                            UnstructuredMarkdownExtractor(file_path, unstructured_api_url, unstructured_api_key)
                             if is_automatic
                             else MarkdownExtractor(file_path, autodetect_encoding=True)
                         )
@@ -116,17 +117,17 @@ def extract(
                     elif file_extension == ".csv":
                         extractor = CSVExtractor(file_path, autodetect_encoding=True)
                     elif file_extension == ".msg":
-                        extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url)
+                        extractor = UnstructuredMsgExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     elif file_extension == ".eml":
-                        extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url)
+                        extractor = UnstructuredEmailExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     elif file_extension == ".ppt":
                         extractor = UnstructuredPPTExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     elif file_extension == ".pptx":
-                        extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url)
+                        extractor = UnstructuredPPTXExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     elif file_extension == ".xml":
-                        extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url)
+                        extractor = UnstructuredXmlExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     elif file_extension == ".epub":
-                        extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url)
+                        extractor = UnstructuredEpubExtractor(file_path, unstructured_api_url, unstructured_api_key)
                     else:
                         # txt
                         extractor = (

diff --git a/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_eml_extractor.py
@@ -10,24 +10,26 @@
 
 
 class UnstructuredEmailExtractor(BaseExtractor):
-    """Load msg files.
+    """Load eml files.
     Args:
         file_path: Path to the file to load.
     """
 
-    def __init__(
-        self,
-        file_path: str,
-        api_url: str,
-    ):
+    def __init__(self, file_path: str, api_url: str, api_key: str):
         """Initialize with file path."""
         self._file_path = file_path
         self._api_url = api_url
+        self._api_key = api_key
 
     def extract(self) -> list[Document]:
-        from unstructured.partition.email import partition_email
+        if self._api_url:
+            from unstructured.partition.api import partition_via_api
 
-        elements = partition_email(filename=self._file_path)
+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
+        else:
+            from unstructured.partition.email import partition_email
+
+            elements = partition_email(filename=self._file_path)
 
         # noinspection PyBroadException
         try:

diff --git a/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py b/api/core/rag/extractor/unstructured/unstructured_epub_extractor.py
@@ -19,15 +19,23 @@ def __init__(
         self,
         file_path: str,
         api_url: Optional[str] = None,
+        api_key: Optional[str] = None,
     ):
         """Initialize with file path."""
         self._file_path = file_path
         self._api_url = api_url
+        self._api_key = api_key
 
     def extract(self) -> list[Document]:
-        from unstructured.partition.epub import partition_epub
+        if self._api_url:
+            from unstructured.partition.api import partition_via_api
+
+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
+        else:
+            from unstructured.partition.epub import partition_epub
+
+            elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
 
-        elements = partition_epub(filename=self._file_path, xml_keep_tags=True)
         from unstructured.chunking.title import chunk_by_title
 
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)

diff --git a/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py b/api/core/rag/extractor/unstructured/unstructured_markdown_extractor.py
@@ -24,19 +24,21 @@ class UnstructuredMarkdownExtractor(BaseExtractor):
             if the specified encoding fails.
     """
 
-    def __init__(
-        self,
-        file_path: str,
-        api_url: str,
-    ):
+    def __init__(self, file_path: str, api_url: str, api_key: str):
         """Initialize with file path."""
         self._file_path = file_path
         self._api_url = api_url
+        self._api_key = api_key
 
     def extract(self) -> list[Document]:
-        from unstructured.partition.md import partition_md
+        if self._api_url:
+            from unstructured.partition.api import partition_via_api
 
-        elements = partition_md(filename=self._file_path)
+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
+        else:
+            from unstructured.partition.md import partition_md
+
+            elements = partition_md(filename=self._file_path)
         from unstructured.chunking.title import chunk_by_title
 
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)

diff --git a/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py b/api/core/rag/extractor/unstructured/unstructured_msg_extractor.py
@@ -14,15 +14,21 @@ class UnstructuredMsgExtractor(BaseExtractor):
         file_path: Path to the file to load.
     """
 
-    def __init__(self, file_path: str, api_url: str):
+    def __init__(self, file_path: str, api_url: str, api_key: str):
         """Initialize with file path."""
         self._file_path = file_path
         self._api_url = api_url
+        self._api_key = api_key
 
     def extract(self) -> list[Document]:
-        from unstructured.partition.msg import partition_msg
+        if self._api_url:
+            from unstructured.partition.api import partition_via_api
 
-        elements = partition_msg(filename=self._file_path)
+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
+        else:
+            from unstructured.partition.msg import partition_msg
+
+            elements = partition_msg(filename=self._file_path)
         from unstructured.chunking.title import chunk_by_title
 
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)

diff --git a/api/core/rag/extractor/unstructured/unstructured_pdf_extractor.py b/api/core/rag/extractor/unstructured/unstructured_pdf_extractor.py
@@ -0,0 +1,47 @@
+import logging
+
+from core.rag.extractor.extractor_base import BaseExtractor
+from core.rag.models.document import Document
+
+logger = logging.getLogger(__name__)
+
+
+class UnstructuredPDFExtractor(BaseExtractor):
+    """Load pdf files and return them as title-based chunks.
+
+    Partitioning goes through ``partition_via_api`` when ``api_url`` is truthy;
+    otherwise ``unstructured.partition.pdf.partition_pdf`` is called directly.
+
+    Args:
+        file_path: Path to the file to load.
+        api_url: Unstructured API URL; a falsy value selects ``partition_pdf``.
+        api_key: Unstructured API Key, passed to ``partition_via_api``.
+    """
+
+    def __init__(self, file_path: str, api_url: str, api_key: str):
+        """Store the file path and the Unstructured API settings."""
+        self._file_path = file_path
+        self._api_url = api_url
+        self._api_key = api_key
+
+    def extract(self) -> list[Document]:
+        if self._api_url:
+            from unstructured.partition.api import partition_via_api
+
+            elements = partition_via_api(
+                filename=self._file_path, api_url=self._api_url, api_key=self._api_key, strategy="auto"
+            )
+        else:
+            from unstructured.partition.pdf import partition_pdf
+
+            elements = partition_pdf(filename=self._file_path, strategy="auto")
+        # Both branches feed chunk_by_title; each resulting chunk becomes one Document.
+        from unstructured.chunking.title import chunk_by_title
+
+        chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)
+        documents = []
+        for chunk in chunks:
+            text = chunk.text.strip()
+            documents.append(Document(page_content=text))
+
+        return documents
diff --git a/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py b/api/core/rag/extractor/unstructured/unstructured_ppt_extractor.py
@@ -7,7 +7,7 @@
 
 
 class UnstructuredPPTExtractor(BaseExtractor):
-    """Load msg files.
+    """Load ppt files.
 
 
     Args:
@@ -21,9 +21,12 @@ def __init__(self, file_path: str, api_url: str, api_key: str):
         self._api_key = api_key
 
     def extract(self) -> list[Document]:
-        from unstructured.partition.api import partition_via_api
+        if self._api_url:
+            from unstructured.partition.api import partition_via_api
 
-        elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
+        else:
+            raise NotImplementedError("Unstructured API Url is not configured")
         text_by_page = {}
         for element in elements:
             page = element.metadata.page_number

diff --git a/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py b/api/core/rag/extractor/unstructured/unstructured_pptx_extractor.py
@@ -7,22 +7,28 @@
 
 
 class UnstructuredPPTXExtractor(BaseExtractor):
-    """Load msg files.
+    """Load pptx files.
 
 
     Args:
         file_path: Path to the file to load.
     """
 
-    def __init__(self, file_path: str, api_url: str):
+    def __init__(self, file_path: str, api_url: str, api_key: str):
         """Initialize with file path."""
         self._file_path = file_path
         self._api_url = api_url
+        self._api_key = api_key
 
     def extract(self) -> list[Document]:
-        from unstructured.partition.pptx import partition_pptx
+        if self._api_url:
+            from unstructured.partition.api import partition_via_api
 
-        elements = partition_pptx(filename=self._file_path)
+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
+        else:
+            from unstructured.partition.pptx import partition_pptx
+
+            elements = partition_pptx(filename=self._file_path)
         text_by_page = {}
         for element in elements:
             page = element.metadata.page_number

diff --git a/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py b/api/core/rag/extractor/unstructured/unstructured_xml_extractor.py
@@ -7,22 +7,29 @@
 
 
 class UnstructuredXmlExtractor(BaseExtractor):
-    """Load msg files.
+    """Load xml files.
 
 
     Args:
         file_path: Path to the file to load.
     """
 
-    def __init__(self, file_path: str, api_url: str):
+    def __init__(self, file_path: str, api_url: str, api_key: str):
         """Initialize with file path."""
         self._file_path = file_path
         self._api_url = api_url
+        self._api_key = api_key
 
     def extract(self) -> list[Document]:
-        from unstructured.partition.xml import partition_xml
+        if self._api_url:
+            from unstructured.partition.api import partition_via_api
+
+            elements = partition_via_api(filename=self._file_path, api_url=self._api_url, api_key=self._api_key)
+        else:
+            from unstructured.partition.xml import partition_xml
+
+            elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
 
-        elements = partition_xml(filename=self._file_path, xml_keep_tags=True)
         from unstructured.chunking.title import chunk_by_title
 
         chunks = chunk_by_title(elements, max_characters=2000, combine_text_under_n_chars=2000)