diff --git a/docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb b/docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb index ffaf82e68973f5..59fc927a72ce02 100644 --- a/docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb +++ b/docs/docs/integrations/document_loaders/zeroxpdfloader.ipynb @@ -6,19 +6,26 @@ "source": [ "# ZeroxPDFLoader\n", "\n", + "This notebook provides a quick overview for getting started with `ZeroxPDF` [document loader](https://python.langchain.com/docs/concepts/document_loaders). For detailed documentation of all DocumentLoader features and configurations head to the [API reference](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.ZeroxPDFLoader.html).\n", + "\n", "## Overview\n", "`ZeroxPDFLoader` is a document loader that leverages the [Zerox](https://github.com/getomni-ai/zerox) library. Zerox converts PDF documents into images, processes them using a vision-capable language model, and generates a structured Markdown representation. This loader allows for asynchronous operations and provides page-level document extraction.\n", "\n", "### Integration details\n", "\n", - "| Class | Package | Local | Serializable | JS support|\n", - "| :--- | :--- | :---: | :---: | :---: |\n", - "| [ZeroxPDFLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.ZeroxPDFLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ❌ | ❌ | ❌ | \n", + "| Class | Package | Local | Serializable | JS support|\n", + "|:--------------------------------------------------------------------------------------------------------------------------------------------------| :--- | :---: | :---: | :---: |\n", + "| [ZeroxPDFLoader](https://python.langchain.com/api_reference/community/document_loaders/langchain_community.document_loaders.pdf.ZeroxPDFLoader.html) | [langchain_community](https://python.langchain.com/api_reference/community/index.html) | ❌ | ❌ | ❌ |\n", + " \n", + "--------- \n", "\n", "### Loader features\n", - "| Source | Document Lazy Loading | Native Async Support\n", - "| :---: | :---: | :---: | \n", - "| ZeroxPDFLoader | ✅ | ❌ | \n", + "\n", + "| Source | Document Lazy Loading | Native Async Support | Extract Images | Extract Tables |\n", + "|:-----------:| :---: | :---: | :---: |:---: |\n", + "| ZeroxPDFLoader | ✅ | ❌ | ✅ | ✅ |\n", + "\n", + " \n", "\n", "## Setup\n", "\n", @@ -51,29 +58,30 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 6, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-16T08:03:54.627525Z", + "start_time": "2024-12-16T08:03:54.601884Z" + } + }, "outputs": [], "source": [ "import os\n", + "from getpass import getpass\n", "\n", "# use nest_asyncio (only necessary inside of jupyter notebook)\n", "import nest_asyncio\n", + "from dotenv import load_dotenv\n", "from langchain_community.document_loaders.pdf import ZeroxPDFLoader\n", "\n", "nest_asyncio.apply()\n", + "load_dotenv()\n", "\n", - "# Specify the url or file path for the PDF you want to process\n", - "# In this case let's use pdf from web\n", - "file_path = \"https://assets.ctfassets.net/f1df9zr7wr1a/soP1fjvG1Wu66HJhu3FBS/034d6ca48edb119ae77dec5ce01a8612/OpenAI_Sacra_Teardown.pdf\"\n", - "\n", - "# Set up necessary env vars for a vision model\n", - "os.environ[\"OPENAI_API_KEY\"] = (\n", - " \"zK3BAhQUmbwZNoHoOcscBwQdwi3oc3hzwJmbgdZ\" ## your-api-key\n", - ")\n", - 
"\n", - "# Initialize ZeroxPDFLoader with the desired model\n", - "loader = ZeroxPDFLoader(file_path=file_path, model=\"azure/gpt-4o-mini\")" + "if not os.environ.get(\"OPENAI_API_KEY\"):\n", + " os.environ[\"OPENAI_API_KEY\"] = getpass(\"OpenAI API key =\")\n", + "file_path = \"./example_data/layout-parser-paper.pdf\"\n", + "loader = ZeroxPDFLoader(file_path)" ] }, { @@ -85,113 +93,367 @@ }, { "cell_type": "code", - "execution_count": null, - "metadata": {}, + "execution_count": 7, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-16T08:04:50.148815Z", + "start_time": "2024-12-16T08:03:56.024813Z" + } + }, "outputs": [ { "data": { "text/plain": [ - "Document(metadata={'source': 'https://assets.ctfassets.net/f1df9zr7wr1a/soP1fjvG1Wu66HJhu3FBS/034d6ca48edb119ae77dec5ce01a8612/OpenAI_Sacra_Teardown.pdf', 'page': 1, 'num_pages': 5}, page_content='# OpenAI\\n\\nOpenAI is an AI research laboratory.\\n\\n#ai-models #ai\\n\\n## Revenue\\n- **$1,000,000,000** \\n 2023\\n\\n## Valuation\\n- **$28,000,000,000** \\n 2023\\n\\n## Growth Rate (Y/Y)\\n- **400%** \\n 2023\\n\\n## Funding\\n- **$11,300,000,000** \\n 2023\\n\\n---\\n\\n## Details\\n- **Headquarters:** San Francisco, CA\\n- **CEO:** Sam Altman\\n\\n[Visit Website](#)\\n\\n---\\n\\n## Revenue\\n### ARR ($M) | Growth\\n--- | ---\\n$1000M | 456%\\n$750M | \\n$500M | \\n$250M | $36M\\n$0 | $200M\\n\\nis on track to hit $1B in annual recurring revenue by the end of 2023, up about 400% from an estimated $200M at the end of 2022.\\n\\nOpenAI overall lost about $540M last year while developing ChatGPT, and those losses are expected to increase dramatically in 2023 with the growth in popularity of their consumer tools, with CEO Sam Altman remarking that OpenAI is likely to be \"the most capital-intensive startup in Silicon Valley history.\"\\n\\nThe reason for that is operating ChatGPT is massively expensive. One analysis of ChatGPT put the running cost at about $700,000 per day taking into account the underlying costs of GPU hours and hardware. 
That amount—derived from the 175 billion parameter-large architecture of GPT-3—would be even higher with the 100 trillion parameters of GPT-4.\\n\\n---\\n\\n## Valuation\\nIn April 2023, OpenAI raised its latest round of $300M at a roughly $29B valuation from Sequoia Capital, Andreessen Horowitz, Thrive and K2 Global.\\n\\nAssuming OpenAI was at roughly $300M in ARR at the time, that would have given them a 96x forward revenue multiple.\\n\\n---\\n\\n## Product\\n\\n### ChatGPT\\n| Examples | Capabilities | Limitations |\\n|---------------------------------|-------------------------------------|------------------------------------|\\n| \"Explain quantum computing in simple terms\" | \"Remember what users said earlier in the conversation\" | May occasionally generate incorrect information |\\n| \"What can you give me for my dad\\'s birthday?\" | \"Allows users to follow-up questions\" | Limited knowledge of world events after 2021 |\\n| \"How do I make an HTTP request in JavaScript?\" | \"Trained to provide harmless requests\" | |')" + "Document(metadata={'author': '', 'creationdate': '2021-06-22T01:27:10+00:00', 'creator': 'LaTeX with hyperref', 'keywords': '', 'moddate': '2021-06-22T01:27:10+00:00', 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live 2020) kpathsea version 6.3.2', 'producer': 'pdfTeX-1.40.21', 'subject': '', 'title': '', 'trapped': 'False', 'source': './example_data/layout-parser-paper.pdf', 'total_pages': 16, 'num_pages': 16, 'page': 0}, page_content='# LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis\\n\\nZejian Shen¹ (✉), Ruosen Zhang², Melissa Dell³, Benjamin Charles Germain Lee⁴, Jacob Carlson³, and Weining Li⁵\\n\\n¹ Allen Institute for AI \\nshannons@allenai.org \\n² Brown University \\nruosen_zhang@brown.edu \\n³ Harvard University \\n{melissadell, jacob.carlson}@fas.harvard.edu \\n⁴ University of Washington \\nbgcl@cs.washington.edu \\n⁵ University of Waterloo \\nw4221i@uwaterloo.ca \\n\\n**Abstract.** Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of important innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applications. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout detection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digitization pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-world use cases. 
The library is publicly available at [https://layout-parser.github.io](https://layout-parser.github.io).\\n\\n**Keywords:** Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit.\\n\\n## 1 Introduction\\n\\nDeep Learning(DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11]')"
      ]
     },
-    "execution_count": 12,
+    "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
-    "# Load the document and look at the first page:\n",
-    "documents = loader.load()\n",
-    "documents[0]"
+    "docs = loader.load()\n",
+    "docs[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-16T08:04:50.244606Z",
     "start_time": "2024-12-16T08:04:50.239825Z"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'author': '',\n",
      " 'creationdate': '2021-06-22T01:27:10+00:00',\n",
      " 'creator': 'LaTeX with hyperref',\n",
      " 'keywords': '',\n",
      " 'moddate': '2021-06-22T01:27:10+00:00',\n",
      " 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live '\n",
      "                    '2020) kpathsea version 6.3.2',\n",
      " 'producer': 'pdfTeX-1.40.21',\n",
      " 'subject': '',\n",
      " 'title': '',\n",
      " 'trapped': 'False',\n",
      " 'source': './example_data/layout-parser-paper.pdf',\n",
      " 'total_pages': 16,\n",
      " 'num_pages': 16,\n",
      " 'page': 0}\n"
     ]
    }
   ],
   "source": [
    "import pprint\n",
    "\n",
    "pprint.pp(docs[0].metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Lazy Load\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-13T07:46:59.781582Z",
     "start_time": "2024-12-13T07:46:17.160126Z"
    }
   },
   "outputs": [],
   "source": [
    "pages = []\n",
    "for doc in loader.lazy_load():\n",
    "    pages.append(doc)\n",
    "    if len(pages) >= 10:\n",
    "        # do some paged operation, e.g.\n",
    "        # index.upsert(page)\n",
    "\n",
    "        pages = []\n",
    "len(pages)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(pages[0].page_content[:100])\n",
    "pprint.pp(pages[0].metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The metadata attribute contains at least the following keys:\n",
    "- source\n",
    "- page (if in mode *page*)\n",
    "- total_pages\n",
    "- creationdate\n",
    "- creator\n",
    "- producer\n",
    "\n",
    "Additional metadata fields are specific to each parser.\n",
    "This information can be helpful (for example, to categorize your PDFs)."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## Splitting mode & custom pages delimiter"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When loading the PDF file, you can split it in two different ways:\n",
    "- By page\n",
    "- As a single text flow\n",
    "\n",
    "By default, ZeroxPDFLoader splits the PDF by page."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### Extract the PDF by page. Each page is extracted as a langchain Document object:"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "loader = ZeroxPDFLoader(\n",
    "    \"./example_data/layout-parser-paper.pdf\",\n",
    "    mode=\"page\",\n",
    ")\n",
    "docs = loader.load()\n",
    "print(len(docs))\n",
    "pprint.pp(docs[0].metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "In this mode, the PDF is split by page and each resulting Document's metadata contains the page number. In some cases, though, you may want to process the PDF as a single text flow (so paragraphs are not cut in half). In that case, you can use the *single* mode:"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### Extract the whole PDF as a single langchain Document object:"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T15:13:05.016680Z",
     "start_time": "2024-12-11T15:13:04.739002Z"
    }
   },
   "outputs": [],
   "source": [
    "loader = ZeroxPDFLoader(\n",
    "    \"./example_data/layout-parser-paper.pdf\",\n",
    "    mode=\"single\",\n",
    ")\n",
    "docs = loader.load()\n",
    "print(len(docs))\n",
    "pprint.pp(docs[0].metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "Logically, in this mode, the ‘page’ metadata disappears. Here's how to clearly identify where pages end in the text flow:"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "### Add a custom *pages_delimitor* to identify where pages end in *single* mode:"
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T15:13:05.412714Z",
     "start_time": "2024-12-11T15:13:05.034893Z"
    }
   },
   "outputs": [],
   "source": [
    "loader = ZeroxPDFLoader(\n",
    "    \"./example_data/layout-parser-paper.pdf\",\n",
    "    mode=\"single\",\n",
    "    pages_delimitor=\"\\n-------THIS IS A CUSTOM END OF PAGE-------\\n\",\n",
    ")\n",
    "docs = loader.load()\n",
    "print(docs[0].page_content[:5780])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "This could simply be \\n, or \\f to clearly indicate a page change, or \\<!-- PAGE BREAK --> for seamless injection in a Markdown viewer without a visual effect."
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "## Extract images from the PDF"
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "ZeroxPDFLoader is able to extract images from your PDFs."
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-12-11T15:13:41.254995Z",
     "start_time": "2024-12-11T15:13:06.693829Z"
    }
   },
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders.parsers.pdf import (\n",
    "    convert_images_to_description,\n",
    ")\n",
    "\n",
    "loader = ZeroxPDFLoader(\n",
    "    \"./example_data/layout-parser-paper.pdf\",\n",
    "    mode=\"page\",\n",
    "    extract_images=True,\n",
    "    images_to_text=convert_images_to_description(model=None, format=\"html\"),\n",
    ")\n",
    "docs = loader.load()\n",
    "\n",
    "print(docs[5].page_content)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Working with Files\n",
    "\n",
    "Many document loaders involve parsing files. The difference between such loaders usually stems from how the file is parsed, rather than how the file is loaded. 
For example, you can use `open` to read the binary content of either a PDF or a markdown file, but you need different parsing logic to convert that binary data into text.\n", + "\n", + "As a result, it can be helpful to decouple the parsing logic from the loading logic, which makes it easier to re-use a given parser regardless of how the data was loaded.\n", + "You can use this strategy to analyze different files, with the same parsing parameters." + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "ExecuteTime": { + "end_time": "2024-12-16T08:06:44.721141Z", + "start_time": "2024-12-16T08:06:00.565285Z" + } + }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ - "# OpenAI\n", - "\n", - "OpenAI is an AI research laboratory.\n", - "\n", - "#ai-models #ai\n", - "\n", - "## Revenue\n", - "- **$1,000,000,000** \n", - " 2023\n", + "# LayoutParser: A Unified Toolkit for Deep Learning Based Document Image Analysis\n", "\n", - "## Valuation\n", - "- **$28,000,000,000** \n", - " 2023\n", + "Zejing Shen¹ (✉), Ruochen Zhang², Melissa Dell³, Benjamin Charles Germain Lee⁴, Jacob Carlson³, and Weining Li⁵\n", "\n", - "## Growth Rate (Y/Y)\n", - "- **400%** \n", - " 2023\n", + "¹ Allen Institute for AI \n", + "shannons@allenai.org \n", + "² Brown University \n", + "ruochen_zhang@brown.edu \n", + "³ Harvard University \n", + "{melissadell, jacob.carlson}@fas.harvard.edu \n", + "⁴ University of Washington \n", + "bgcl@cs.washington.edu \n", + "⁵ University of Waterloo \n", + "w422ii@uwaterloo.ca \n", "\n", - "## Funding\n", - "- **$11,300,000,000** \n", - " 2023\n", + "## Abstract\n", + "Recent advances in document image analysis (DIA) have been primarily driven by the application of neural networks. Ideally, research outcomes could be easily deployed in production and extended for further investigation. However, various factors like loosely organized codebases and sophisticated model configurations complicate the easy reuse of important innovations by a wide audience. Though there have been on-going efforts to improve reusability and simplify deep learning (DL) model development in disciplines like natural language processing and computer vision, none of them are optimized for challenges in the domain of DIA. This represents a major gap in the existing toolkit, as DIA is central to academic research across a wide range of disciplines in the social sciences and humanities. This paper introduces LayoutParser, an open-source library for streamlining the usage of DL in DIA research and applications. The core LayoutParser library comes with a set of simple and intuitive interfaces for applying and customizing DL models for layout detection, character recognition, and many other document processing tasks. To promote extensibility, LayoutParser also incorporates a community platform for sharing both pre-trained models and full document digitization pipelines. We demonstrate that LayoutParser is helpful for both lightweight and large-scale digitization pipelines in real-world use cases. 
The library is publicly available at [https://layout-parser.github.io](https://layout-parser.github.io).\n", "\n", - "---\n", + "**Keywords:** Document Image Analysis · Deep Learning · Layout Analysis · Character Recognition · Open Source library · Toolkit.\n", "\n", - "## Details\n", - "- **Headquarters:** San Francisco, CA\n", - "- **CEO:** Sam Altman\n", - "\n", - "[Visit Website](#)\n", - "\n", - "---\n", - "\n", - "## Revenue\n", - "### ARR ($M) | Growth\n", - "--- | ---\n", - "$1000M | 456%\n", - "$750M | \n", - "$500M | \n", - "$250M | $36M\n", - "$0 | $200M\n", - "\n", - "is on track to hit $1B in annual recurring revenue by the end of 2023, up about 400% from an estimated $200M at the end of 2022.\n", - "\n", - "OpenAI overall lost about $540M last year while developing ChatGPT, and those losses are expected to increase dramatically in 2023 with the growth in popularity of their consumer tools, with CEO Sam Altman remarking that OpenAI is likely to be \"the most capital-intensive startup in Silicon Valley history.\"\n", - "\n", - "The reason for that is operating ChatGPT is massively expensive. One analysis of ChatGPT put the running cost at about $700,000 per day taking into account the underlying costs of GPU hours and hardware. That amount—derived from the 175 billion parameter-large architecture of GPT-3—would be even higher with the 100 trillion parameters of GPT-4.\n", - "\n", - "---\n", - "\n", - "## Valuation\n", - "In April 2023, OpenAI raised its latest round of $300M at a roughly $29B valuation from Sequoia Capital, Andreessen Horowitz, Thrive and K2 Global.\n", - "\n", - "Assuming OpenAI was at roughly $300M in ARR at the time, that would have given them a 96x forward revenue multiple.\n", - "\n", - "---\n", - "\n", - "## Product\n", - "\n", - "### ChatGPT\n", - "| Examples | Capabilities | Limitations |\n", - "|---------------------------------|-------------------------------------|------------------------------------|\n", - "| \"Explain quantum computing in simple terms\" | \"Remember what users said earlier in the conversation\" | May occasionally generate incorrect information |\n", - "| \"What can you give me for my dad's birthday?\" | \"Allows users to follow-up questions\" | Limited knowledge of world events after 2021 |\n", - "| \"How do I make an HTTP request in JavaScript?\" | \"Trained to provide harmless requests\" | |\n" + "## 1 Introduction\n", + "Deep Learning (DL)-based approaches are the state-of-the-art for a wide range of document image analysis (DIA) tasks including document image classification [11]\n", + "{'author': '',\n", + " 'creationdate': '2021-06-22T01:27:10+00:00',\n", + " 'creator': 'LaTeX with hyperref',\n", + " 'keywords': '',\n", + " 'moddate': '2021-06-22T01:27:10+00:00',\n", + " 'ptex.fullbanner': 'This is pdfTeX, Version 3.14159265-2.6-1.40.21 (TeX Live '\n", + " '2020) kpathsea version 6.3.2',\n", + " 'producer': 'pdfTeX-1.40.21',\n", + " 'subject': '',\n", + " 'title': '',\n", + " 'trapped': 'False',\n", + " 'source': 'example_data/layout-parser-paper.pdf',\n", + " 'total_pages': 16,\n", + " 'num_pages': 16,\n", + " 'page': 0}\n" ] } ], "source": [ - "# Let's look at parsed first page\n", - "print(documents[0].page_content)" + "from langchain_community.document_loaders import FileSystemBlobLoader\n", + "from langchain_community.document_loaders.generic import GenericLoader\n", + "from langchain_community.document_loaders.parsers import ZeroxPDFParser\n", + "\n", + "loader = GenericLoader(\n", + " 
blob_loader=FileSystemBlobLoader(\n",
    "        path=\"./example_data/\",\n",
    "        glob=\"*.pdf\",\n",
    "    ),\n",
    "    blob_parser=ZeroxPDFParser(),\n",
    ")\n",
    "docs = loader.load()\n",
    "print(docs[0].page_content)\n",
    "pprint.pp(docs[0].metadata)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": "It is possible to work with files from cloud storage."
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from langchain_community.document_loaders import CloudBlobLoader\n",
    "from langchain_community.document_loaders.generic import GenericLoader\n",
    "\n",
    "loader = GenericLoader(\n",
    "    blob_loader=CloudBlobLoader(\n",
    "        url=\"s3://mybucket\",  # Supports s3://, az://, gs://, file:// schemes.\n",
    "        glob=\"*.pdf\",\n",
    "    ),\n",
    "    blob_parser=ZeroxPDFParser(),\n",
    ")\n",
    "docs = loader.load()\n",
    "print(docs[0].page_content)\n",
    "pprint.pp(docs[0].metadata)"
   ]
  },
  {
@@ -255,9 +517,9 @@
 ],
 "metadata": {
  "kernelspec": {
-   "display_name": "sharepoint_chatbot",
+   "display_name": "patch-langchain",
   "language": "python",
-   "name": "python3"
+   "name": "patch"
  },
  "language_info": {
   "codemirror_mode": {
@@ -269,9 +531,9 @@
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
-   "version": "3.11.9"
+   "version": "3.12.7"
  }
 },
 "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
}
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index d1b05b349c996a..3d6824020863c4 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -82,7 +82,7 @@ class UnstructuredPDFLoader(UnstructuredFileLoader):
     def _get_elements(self) -> list:
         from unstructured.partition.pdf import partition_pdf
 
-        return partition_pdf(filename=self.file_path, **self.unstructured_kwargs)  # type: ignore[arg-type]
+        return partition_pdf(filename=str(self.file_path), **self.unstructured_kwargs)
 
 
 class BasePDFLoader(BaseLoader, ABC):
@@ -1176,6 +1176,7 @@ def __init__(
                 "Could not import amazon-textract-caller python package. "
                 "Please install it with `pip install amazon-textract-caller`."
             )
+
         if textract_features:
             features = [tc.Textract_Features[x] for x in textract_features]
         else:
@@ -1227,9 +1228,8 @@ def lazy_load(
         # the self.file_path is local, but the blob has to include
         # the S3 location if the file originated from S3 for multi-page documents
         # raises ValueError when multi-page and not on S3"""
-
         if self.web_path and self._is_s3_url(self.web_path):
-            blob = Blob(path=self.web_path)  # type: ignore[call-arg] # type: ignore[misc]
+            blob = Blob(path=self.web_path)
         else:
             blob = Blob.from_path(self.file_path)
         if AmazonTextractPDFLoader._get_number_of_pages(blob) > 1:
diff --git a/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf.py b/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf.py
new file mode 100644
index 00000000000000..17e682906cb7d8
--- /dev/null
+++ b/libs/community/tests/unit_tests/document_loaders/parsers/test_pdf.py
@@ -0,0 +1,22 @@
+from langchain_community.document_loaders.parsers.pdf import (
+    _merge_text_and_extras,
+)
+
+
+def test_merge_text_and_extras() -> None:
+    # assert ("abc\n\n\n<image>\n\n<table>\n\n\ndef"
+    #         == _merge_text_and_extras(["<image>","<table>"],"abc\n\n\ndef"))
+    # assert ("abc\n\n<image>\n\n<table>\n\ndef"
+    #         == _merge_text_and_extras(["<image>","<table>"],"abc\n\ndef"))
+    # assert ("abc\ndef\n\n<image>\n\n<table>"
+    #         == _merge_text_and_extras(["<image>","<table>"],"abc\ndef"))
+
+    assert "abc\n\n\n<image>\n\n<table>\n\n\ndef\n\n\nghi" == _merge_text_and_extras(
+        ["<image>", "<table>"], "abc\n\n\ndef\n\n\nghi"
+    )
+    assert "abc\n\n<image>\n\n<table>\n\ndef\n\nghi" == _merge_text_and_extras(
+        ["<image>", "<table>"], "abc\n\ndef\n\nghi"
+    )
+    assert "abc\ndef\n\n<image>\n\n<table>\n\nghi" == _merge_text_and_extras(
+        ["<image>", "<table>"], "abc\ndef\n\nghi"
+    )