Skip to content

Commit

Permalink
updated unstructured items to use custom API URL (#310)
Browse files Browse the repository at this point in the history
* updated unstructured items to use custom API URL

* lint

* fix param
  • Loading branch information
epinzur authored Mar 4, 2024
1 parent a646b3e commit 616f17a
Show file tree
Hide file tree
Showing 7 changed files with 13 additions and 0 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/_run_e2e_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ jobs:
GCLOUD_ACCOUNT_KEY_JSON: "${{ secrets.E2E_TESTS_GCLOUD_ACCOUNT_KEY_JSON }}"
NVIDIA_API_KEY: "${{ secrets.E2E_TESTS_NVIDIA_API_KEY }}"
UNSTRUCTURED_API_KEY: "${{ secrets.E2E_TESTS_UNSTRUCTURED_API_KEY }}"
UNSTRUCTURED_API_URL: "${{ secrets.E2E_TESTS_UNSTRUCTURED_API_URL }}"
run: |
source scripts/ci-common-env.sh
tox -e notebooks
Expand Down Expand Up @@ -118,6 +119,7 @@ jobs:
LANGCHAIN_API_KEY: "${{ secrets.E2E_TESTS_LANGCHAIN_API_KEY }}"
LLAMA_CLOUD_API_KEY: "${{ secrets.E2E_TESTS_LLAMA_CLOUD_API_KEY }}"
UNSTRUCTURED_API_KEY: "${{ secrets.E2E_TESTS_UNSTRUCTURED_API_KEY }}"
UNSTRUCTURED_API_URL: "${{ secrets.E2E_TESTS_UNSTRUCTURED_API_URL }}"
run: |
source scripts/ci-common-env.sh
if [ "${{ inputs.suite-name == 'ragstack' }}" == "true" ]; then
Expand Down
4 changes: 4 additions & 0 deletions docs/modules/examples/pages/langchain-unstructured-astra.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ Create a `.env` file in your application with the following environment variable
[source,bash]
----
UNSTRUCTURED_API_KEY=...
UNSTRUCTURED_API_URL=https://api.unstructured.io/general/v0/general
ASTRA_DB_API_ENDPOINT=https://<ASTRA_DB_ID>-<ASTRA_DB_REGION>.apps.astra.datastax.com
ASTRA_DB_APPLICATION_TOKEN=AstraCS:...
OPENAI_API_KEY=sk-...
Expand Down Expand Up @@ -103,6 +104,7 @@ This works well if your document doesn't contain any complex formatting or table
loader = UnstructuredAPIFileLoader(
file_path="./attention_pages_9_10.pdf",
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
url = os.getenv("UNSTRUCTURED_API_URL"),
)
simple_docs = loader.load()
Expand All @@ -125,6 +127,7 @@ A list of all the different element types can be found here: https://unstructure
elements = unstructured.get_elements_from_api(
file_path="./attention_pages_9_10.pdf",
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
api_url=os.getenv("UNSTRUCTURED_API_URL"),
strategy="hi_res", # default "auto"
pdf_infer_table_structure=True,
)
Expand Down Expand Up @@ -262,6 +265,7 @@ else:
elements = unstructured.get_elements_from_api(
file_path="./attention_pages_9_10.pdf",
api_key=os.getenv("UNSTRUCTURED_API_KEY"),
api_url=os.getenv("UNSTRUCTURED_API_URL"),
strategy="hi_res", # default "auto"
pdf_infer_table_structure=True,
)
Expand Down
3 changes: 3 additions & 0 deletions examples/notebooks/langchain-unstructured-astra.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@
"from getpass import getpass\n",
"\n",
"os.environ[\"UNSTRUCTURED_API_KEY\"] = getpass(\"Enter your Unstructured API Key:\")\n",
"os.environ[\"UNSTRUCTURED_API_URL\"] = getpass(\"Enter your Unstructured API URL:\")\n",
"os.environ[\"ASTRA_DB_ENDPOINT\"] = input(\"Enter you Astra DB API Endpoint: \")\n",
"os.environ[\"ASTRA_DB_TOKEN\"] = getpass(\"Enter you Astra DB Token: \")\n",
"os.environ[\"OPENAI_API_KEY\"] = getpass(\"Enter your OpenAI API Key: \")"
Expand Down Expand Up @@ -123,6 +124,7 @@
"loader = UnstructuredAPIFileLoader(\n",
" file_path=\"./resources/attention_pages_9_10.pdf\",\n",
" api_key=os.getenv(\"UNSTRUCTURED_API_KEY\"),\n",
" url = os.getenv(\"UNSTRUCTURED_API_URL\"),\n",
")\n",
"simple_docs = loader.load()\n",
"len(simple_docs)"
Expand Down Expand Up @@ -222,6 +224,7 @@
"elements = unstructured.get_elements_from_api(\n",
" file_path=\"./resources/attention_pages_9_10.pdf\",\n",
" api_key=os.getenv(\"UNSTRUCTURED_API_KEY\"),\n",
" api_url = os.getenv(\"UNSTRUCTURED_API_URL\"),\n",
" strategy=\"hi_res\", # default \"auto\"\n",
" pdf_infer_table_structure=True,\n",
")\n",
Expand Down
1 change: 1 addition & 0 deletions ragstack-e2e-tests/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -36,3 +36,4 @@ VECTOR_DATABASE_TYPE=astradb

# Unstructured.io
# UNSTRUCTURED_API_KEY=
# UNSTRUCTURED_API_URL=
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ def test_unstructured_api(vector_store, unstructured_mode, request):
mode=unstructured_mode,
strategy="auto",
api_key=get_required_env("UNSTRUCTURED_API_KEY"),
url=get_required_env("UNSTRUCTURED_API_URL"),
)

splitter = TokenTextSplitter(chunk_size=512, chunk_overlap=0)
Expand Down
1 change: 1 addition & 0 deletions ragstack-e2e-tests/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ pass_env =
LANGCHAIN_PROJECT
LLAMA_CLOUD_API_KEY
UNSTRUCTURED_API_KEY
UNSTRUCTURED_API_URL
deps =
poetry
commands =
Expand Down
1 change: 1 addition & 0 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ pass_env =
LLAMA_CLOUD_API_KEY
NVIDIA_API_KEY
UNSTRUCTURED_API_KEY
UNSTRUCTURED_API_URL
deps =
pytest
nbmake
Expand Down

0 comments on commit 616f17a

Please sign in to comment.