From 616f17a80793d52b215355ae5c94642cde6f9c35 Mon Sep 17 00:00:00 2001 From: Eric Pinzur <2641606+epinzur@users.noreply.github.com> Date: Mon, 4 Mar 2024 23:08:25 +0100 Subject: [PATCH] updated unstructured items to use custom API url (#310) * updated unstructured items to use custom API url * lint * fix param --- .github/workflows/_run_e2e_tests.yml | 2 ++ docs/modules/examples/pages/langchain-unstructured-astra.adoc | 4 ++++ examples/notebooks/langchain-unstructured-astra.ipynb | 3 +++ ragstack-e2e-tests/.env.template | 1 + ragstack-e2e-tests/e2e_tests/langchain/test_unstructured.py | 1 + ragstack-e2e-tests/tox.ini | 1 + tox.ini | 1 + 7 files changed, 13 insertions(+) diff --git a/.github/workflows/_run_e2e_tests.yml b/.github/workflows/_run_e2e_tests.yml index efe495cc5..1f66ffd2d 100644 --- a/.github/workflows/_run_e2e_tests.yml +++ b/.github/workflows/_run_e2e_tests.yml @@ -91,6 +91,7 @@ jobs: GCLOUD_ACCOUNT_KEY_JSON: "${{ secrets.E2E_TESTS_GCLOUD_ACCOUNT_KEY_JSON }}" NVIDIA_API_KEY: "${{ secrets.E2E_TESTS_NVIDIA_API_KEY }}" UNSTRUCTURED_API_KEY: "${{ secrets.E2E_TESTS_UNSTRUCTURED_API_KEY }}" + UNSTRUCTURED_API_URL: "${{ secrets.E2E_TESTS_UNSTRUCTURED_API_URL }}" run: | source scripts/ci-common-env.sh tox -e notebooks @@ -118,6 +119,7 @@ jobs: LANGCHAIN_API_KEY: "${{ secrets.E2E_TESTS_LANGCHAIN_API_KEY }}" LLAMA_CLOUD_API_KEY: "${{ secrets.E2E_TESTS_LLAMA_CLOUD_API_KEY }}" UNSTRUCTURED_API_KEY: "${{ secrets.E2E_TESTS_UNSTRUCTURED_API_KEY }}" + UNSTRUCTURED_API_URL: "${{ secrets.E2E_TESTS_UNSTRUCTURED_API_URL }}" run: | source scripts/ci-common-env.sh if [ "${{ inputs.suite-name == 'ragstack' }}" == "true" ]; then diff --git a/docs/modules/examples/pages/langchain-unstructured-astra.adoc b/docs/modules/examples/pages/langchain-unstructured-astra.adoc index 153094403..ac6002497 100644 --- a/docs/modules/examples/pages/langchain-unstructured-astra.adoc +++ b/docs/modules/examples/pages/langchain-unstructured-astra.adoc @@ -38,6 +38,7 @@ Create a `.env` file in your application with the following environment variable [source,bash] ---- UNSTRUCTURED_API_KEY=... +UNSTRUCTURED_API_URL=https://api.unstructured.io/general/v0/general ASTRA_DB_API_ENDPOINT=https://-.apps.astra.datastax.com ASTRA_DB_APPLICATION_TOKEN=AstraCS:... OPENAI_API_KEY=sk-... @@ -103,6 +104,7 @@ This works well if your document doesn't contain any complex formatting or table loader = UnstructuredAPIFileLoader( file_path="./attention_pages_9_10.pdf", api_key=os.getenv("UNSTRUCTURED_API_KEY"), + url = os.getenv("UNSTRUCTURED_API_URL"), ) simple_docs = loader.load() @@ -125,6 +127,7 @@ A list of all the different element types can be found here: https://unstructure elements = unstructured.get_elements_from_api( file_path="./attention_pages_9_10.pdf", api_key=os.getenv("UNSTRUCTURED_API_KEY"), + api_url=os.getenv("UNSTRUCTURED_API_URL"), strategy="hi_res", # default "auto" pdf_infer_table_structure=True, ) @@ -262,6 +265,7 @@ else: elements = unstructured.get_elements_from_api( file_path="./attention_pages_9_10.pdf", api_key=os.getenv("UNSTRUCTURED_API_KEY"), + api_url=os.getenv("UNSTRUCTURED_API_URL"), strategy="hi_res", # default "auto" pdf_infer_table_structure=True, ) diff --git a/examples/notebooks/langchain-unstructured-astra.ipynb b/examples/notebooks/langchain-unstructured-astra.ipynb index 29c3c296f..b53674183 100644 --- a/examples/notebooks/langchain-unstructured-astra.ipynb +++ b/examples/notebooks/langchain-unstructured-astra.ipynb @@ -72,6 +72,7 @@ "from getpass import getpass\n", "\n", "os.environ[\"UNSTRUCTURED_API_KEY\"] = getpass(\"Enter your Unstructured API Key:\")\n", + "os.environ[\"UNSTRUCTURED_API_URL\"] = getpass(\"Enter your Unstructured API URL:\")\n", "os.environ[\"ASTRA_DB_ENDPOINT\"] = input(\"Enter you Astra DB API Endpoint: \")\n", "os.environ[\"ASTRA_DB_TOKEN\"] = getpass(\"Enter you Astra DB Token: \")\n", "os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API Key: \")" @@ -123,6 +124,7 @@ "loader = UnstructuredAPIFileLoader(\n", " file_path=\"./resources/attention_pages_9_10.pdf\",\n", " api_key=os.getenv(\"UNSTRUCTURED_API_KEY\"),\n", + " url = os.getenv(\"UNSTRUCTURED_API_URL\"),\n", ")\n", "simple_docs = loader.load()\n", "len(simple_docs)" @@ -222,6 +224,7 @@ "elements = unstructured.get_elements_from_api(\n", " file_path=\"./resources/attention_pages_9_10.pdf\",\n", " api_key=os.getenv(\"UNSTRUCTURED_API_KEY\"),\n", + " api_url = os.getenv(\"UNSTRUCTURED_API_URL\"),\n", " strategy=\"hi_res\", # default \"auto\"\n", " pdf_infer_table_structure=True,\n", ")\n", diff --git a/ragstack-e2e-tests/.env.template b/ragstack-e2e-tests/.env.template index 8f01a15d6..cdb16af30 100644 --- a/ragstack-e2e-tests/.env.template +++ b/ragstack-e2e-tests/.env.template @@ -36,3 +36,4 @@ VECTOR_DATABASE_TYPE=astradb # Unstructured.io # UNSTRUCTURED_API_KEY= +# UNSTRUCTURED_API_URL= diff --git a/ragstack-e2e-tests/e2e_tests/langchain/test_unstructured.py b/ragstack-e2e-tests/e2e_tests/langchain/test_unstructured.py index d29366a61..884a62830 100644 --- a/ragstack-e2e-tests/e2e_tests/langchain/test_unstructured.py +++ b/ragstack-e2e-tests/e2e_tests/langchain/test_unstructured.py @@ -59,6 +59,7 @@ def test_unstructured_api(vector_store, unstructured_mode, request): mode=unstructured_mode, strategy="auto", api_key=get_required_env("UNSTRUCTURED_API_KEY"), + url=get_required_env("UNSTRUCTURED_API_URL"), ) splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=0) diff --git a/ragstack-e2e-tests/tox.ini b/ragstack-e2e-tests/tox.ini index 3c85417ae..1ae0830d2 100644 --- a/ragstack-e2e-tests/tox.ini +++ b/ragstack-e2e-tests/tox.ini @@ -28,6 +28,7 @@ pass_env = LANGCHAIN_PROJECT LLAMA_CLOUD_API_KEY UNSTRUCTURED_API_KEY + UNSTRUCTURED_API_URL deps = poetry commands = diff --git a/tox.ini b/tox.ini index fc3863781..42a07cf31 100644 --- a/tox.ini +++ b/tox.ini @@ -22,6 +22,7 @@ pass_env = LLAMA_CLOUD_API_KEY NVIDIA_API_KEY UNSTRUCTURED_API_KEY + UNSTRUCTURED_API_URL deps = pytest nbmake