-
Notifications
You must be signed in to change notification settings - Fork 16
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[RAG-198] - Docs - LlamaParse (#281)
* initial-content * ci-security-scans-badge * cleanup * print-source-node * description * bump-nav
- Loading branch information
Showing
4 changed files
with
273 additions
and
16 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,230 @@ | ||
= RAG with LlamaParse and AstraDB | ||
|
||
image::https://colab.research.google.com/assets/colab-badge.svg[align="left",link="https://colab.research.google.com/github/datastax/ragstack-ai/blob/main/examples/notebooks/llama-parse-astra.ipynb"] | ||
|
||
Build a RAG pipeline with RAGStack, AstraDB, and LlamaIndex. | ||
|
||
This example demonstrates loading and parsing a PDF document with LlamaParse into an Astra DB vector store, then querying the index with LlamaIndex. | ||
|
||
== Prerequisites | ||
|
||
You will need a vector-enabled Astra database. | ||
|
||
* Create an https://docs.datastax.com/en/astra-serverless/docs/getting-started/create-db-choices.html[Astra | ||
vector database]. | ||
* Within your database, create an https://docs.datastax.com/en/astra-serverless/docs/manage/org/manage-tokens.html[Astra | ||
DB Access Token] with Database Administrator permissions. | ||
* Get your Astra DB Endpoint: | ||
** `+https://<ASTRA_DB_ID>-<ASTRA_DB_REGION>.apps.astra.datastax.com+` | ||
* Create an API key at https://cloud.llamaindex.ai/[LlamaIndex.ai]. | ||
Install the following dependencies: | ||
[source,bash] | ||
---- | ||
pip install ragstack-ai llama-parse python-dotenv | ||
---- | ||
See the https://docs.datastax.com/en/ragstack/docs/prerequisites.html[Prerequisites] page for more details. | ||
|
||
== Export database connection details | ||
|
||
Create a `.env` file in your application with the following environment variables: | ||
[source,bash] | ||
---- | ||
LLAMA_CLOUD_API_KEY=llx-... | ||
ASTRA_DB_API_ENDPOINT=https://bbe07f45-8ab4-4d81-aa7d-7f58dbed3ead-us-east-1.apps.astra.datastax.com | ||
ASTRA_DB_APPLICATION_TOKEN=AstraCS:... | ||
OPENAI_API_KEY=sk-... | ||
---- | ||
|
||
If you're using Google Colab, you'll be prompted for these values in the Colab environment. | ||
|
||
See the https://docs.datastax.com/en/ragstack/docs/prerequisites.html[Prerequisites] page for more details. | ||
|
||
== Create RAG pipeline | ||
|
||
. Import dependencies and load environment variables. | ||
+ | ||
[source,python] | ||
---- | ||
# Imports: stdlib first, then the third-party LlamaParse / LlamaIndex stack.
import os

import requests
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.vector_stores import AstraDBVectorStore
from llama_index.node_parser import SimpleNodeParser
from llama_index import OpenAIEmbedding, VectorStoreIndex, StorageContext, ServiceContext
from llama_index.llms import OpenAI

# Pull credentials from the local .env file into the process environment.
load_dotenv()

# Connection settings used by the snippets below (None if a variable is unset).
llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
api_endpoint = os.getenv("ASTRA_DB_API_ENDPOINT")
token = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
openai_api_key = os.getenv("OPENAI_API_KEY")
---- | ||
+ | ||
. Download a PDF about attention mechanisms in transformer model architectures. | ||
+ | ||
[source,python] | ||
---- | ||
# Fetch "Attention Is All You Need" (arXiv 1706.03762) as the sample PDF.
url = "https://arxiv.org/pdf/1706.03762.pdf"
file_path = "./attention.pdf"

response = requests.get(url)
if response.status_code != 200:
    # Best-effort example: report the failure instead of raising.
    print("Error downloading the file.")
else:
    with open(file_path, "wb") as file:
        file.write(response.content)
    print("Download complete.")
---- | ||
+ | ||
. Load the downloaded PDF with LlamaParse as a text Document for indexing. | ||
LlamaParse also supports Markdown-type Documents with `(result_type="markdown")`. | ||
+ | ||
[source,python] | ||
---- | ||
# Parse the PDF into LlamaIndex Documents as plain text
# (result_type="markdown" would yield Markdown instead).
parser = LlamaParse(result_type="text")
documents = parser.load_data(file_path)
# Print a mid-document slice to verify parsing produced content.
print(documents[0].get_content()[10000:11000])
---- | ||
+ | ||
. Create an AstraDB vector store instance. | ||
+ | ||
[source,python] | ||
---- | ||
# Vector store backed by Astra DB; 1536 matches the dimension of
# OpenAI's default embedding model used below.
astra_db_store = AstraDBVectorStore(
    collection_name="astra_v_table_llamaparse",
    api_endpoint=api_endpoint,
    token=token,
    embedding_dimension=1536,
)
---- | ||
+ | ||
. Parse Documents into nodes and set up storage and service contexts to use AstraDB and OpenAI. | ||
+ | ||
[source,python] | ||
---- | ||
# Split the parsed Documents into retrievable nodes.
nodes = SimpleNodeParser().get_nodes_from_documents(documents)
print(nodes[0].get_content())  # sanity-check the first chunk

# Wire storage (Astra DB) and models (OpenAI) into LlamaIndex contexts.
storage_context = StorageContext.from_defaults(vector_store=astra_db_store)
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-4"),
    embed_model=OpenAIEmbedding(),
    chunk_size=512,
)
---- | ||
+ | ||
. Create a vector store index and query engine from your nodes and contexts. | ||
+ | ||
[source,python] | ||
---- | ||
# Build the index over the nodes (embeddings are stored via storage_context),
# then expose it as a query engine retrieving the 15 most similar nodes.
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)
query_engine = index.as_query_engine(
    similarity_top_k=15,
    service_context=service_context,
)
---- | ||
|
||
== Execute a query | ||
|
||
. Query the Astra vector store for an example with expected context - this query should return a relevant response. | ||
+ | ||
[source,python] | ||
---- | ||
# A question the paper does answer — expect a grounded response.
query = "What is Multi-Head Attention also known as?"
response_1 = query_engine.query(query)

print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_1)
---- | ||
+ | ||
. Query the Astra vector store for an example with expected lack of context. | ||
This query should return `The context does not provide information about the color of the sky` because your document does not contain information about the color of the sky. | ||
+ | ||
[source,python] | ||
---- | ||
# A question the paper cannot answer — the engine should respond that the
# context lacks this information.
query = "What is the color of the sky?"
# Bind to a distinct name so the first query's response_1 is not clobbered.
response_2 = query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_2)
---- | ||
|
||
== Complete code | ||
|
||
.Python | ||
[%collapsible%open] | ||
==== | ||
[source,python] | ||
---- | ||
# End-to-end example: parse a PDF with LlamaParse, index it in Astra DB,
# and query it with LlamaIndex + OpenAI.
import os

import requests
from dotenv import load_dotenv
from llama_parse import LlamaParse
from llama_index.vector_stores import AstraDBVectorStore
from llama_index.node_parser import SimpleNodeParser
from llama_index import OpenAIEmbedding, VectorStoreIndex, StorageContext, ServiceContext
from llama_index.llms import OpenAI

# Load environment variables from the local .env file.
load_dotenv()

# Required API keys and connection parameters (None if unset).
llama_cloud_api_key = os.getenv("LLAMA_CLOUD_API_KEY")
api_endpoint = os.getenv("ASTRA_DB_API_ENDPOINT")
token = os.getenv("ASTRA_DB_APPLICATION_TOKEN")
openai_api_key = os.getenv("OPENAI_API_KEY")

# Download the "Attention Is All You Need" paper for indexing.
url = "https://arxiv.org/pdf/1706.03762.pdf"
file_path = "./attention.pdf"
response = requests.get(url)
if response.status_code == 200:
    with open(file_path, "wb") as file:
        file.write(response.content)
    print("Download complete.")
else:
    print("Error downloading the file.")

# Load and parse the document (result_type="markdown" is also supported).
documents = LlamaParse(result_type="text").load_data(file_path)
# Output a snippet from the parsed document for verification.
print(documents[0].get_content()[10000:11000])

# Vector store backed by Astra DB; 1536 matches OpenAI's default embedding size.
astra_db_store = AstraDBVectorStore(
    token=token,
    api_endpoint=api_endpoint,
    collection_name="astra_v_table_llamaparse",
    embedding_dimension=1536,
)

# Parse nodes from the documents and output a snippet for verification.
node_parser = SimpleNodeParser()
nodes = node_parser.get_nodes_from_documents(documents)
print(nodes[0].get_content())

# Storage (Astra DB) and service (OpenAI LLM + embeddings) contexts.
storage_context = StorageContext.from_defaults(vector_store=astra_db_store)
service_context = ServiceContext.from_defaults(
    llm=OpenAI(model="gpt-4"),
    embed_model=OpenAIEmbedding(),
    chunk_size=512,
)

# Index the nodes and build a query engine over the top-15 similar chunks.
index = VectorStoreIndex(nodes=nodes, storage_context=storage_context)
query_engine = index.as_query_engine(similarity_top_k=15, service_context=service_context)

# Query with expected context — should return a grounded answer.
query = "What is Multi-Head Attention also known as?"
response_1 = query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_1)

# Query with expected lack of context — the engine should report that the
# document doesn't cover this. Use response_2 so response_1 is preserved.
query = "What is the color of the sky?"
response_2 = query_engine.query(query)
print("\n***********New LlamaParse+ Basic Query Engine***********")
print(response_2)
---- | ||
==== | ||
|