Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add llama-parse as dependency and bump llama-index to 0.9.48 #279

Merged
merged 26 commits
Feb 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/_run_e2e_tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -88,6 +88,7 @@ jobs:
ASTRA_DB_ID: "${{ steps.astra-db.outputs.db_id }}"
OPENAI_API_KEY: "${{ secrets.E2E_TESTS_OPEN_AI_KEY }}"
LANGCHAIN_API_KEY: "${{ secrets.E2E_TESTS_LANGCHAIN_API_KEY }}"
LLAMA_CLOUD_API_KEY: "${{ secrets.E2E_TESTS_LLAMA_CLOUD_API_KEY }}"
GCLOUD_ACCOUNT_KEY_JSON: "${{ secrets.E2E_TESTS_GCLOUD_ACCOUNT_KEY_JSON }}"
run: |
source scripts/ci-common-env.sh
Expand All @@ -114,6 +115,7 @@ jobs:
HUGGINGFACE_HUB_KEY: "${{ secrets.E2E_TESTS_HUGGINGFACE_HUB_KEY }}"
NVIDIA_API_KEY: "${{ secrets.E2E_TESTS_NVIDIA_API_KEY }}"
LANGCHAIN_API_KEY: "${{ secrets.E2E_TESTS_LANGCHAIN_API_KEY }}"
LLAMA_CLOUD_API_KEY: "${{ secrets.E2E_TESTS_LLAMA_CLOUD_API_KEY }}"
run: |
source scripts/ci-common-env.sh
if [ "${{ inputs.suite-name == 'ragstack' }}" == "true" ]; then
Expand Down
20 changes: 9 additions & 11 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,22 @@ authors = ["DataStax"]
readme = "PACKAGE_README.md"
repository = "https://github.com/datastax/ragstack-ai"
documentation = "https://docs.datastax.com/en/ragstack"
packages = [
{include = "ragstack"}
]
packages = [{ include = "ragstack" }]

[tool.poetry.dependencies]
python = ">=3.9,<4.0"
astrapy = "~0.7.0"
cassio = "~0.1.3"
unstructured = "^0.10"
llama-index = { version = "0.9.34", extras = ["langchain"] }
langchain = {version = "0.1.4"}
llama-index = { version = "0.9.48", extras = ["langchain"] }
llama-parse = { version = "0.1.4" }
langchain = { version = "0.1.4" }
langchain-core = "0.1.16"
langchain-community = "0.0.16"
langchain-openai = {version = "0.0.3"}
langchain-google-genai = {version = "0.0.6", optional = true}
langchain-google-vertexai = {version = "0.0.3", optional = true}
langchain-nvidia-ai-endpoints = {version = "0.0.1", optional = true}
langchain-openai = { version = "0.0.3" }
langchain-google-genai = { version = "0.0.6", optional = true }
langchain-google-vertexai = { version = "0.0.3", optional = true }
langchain-nvidia-ai-endpoints = { version = "0.0.1", optional = true }

[tool.poetry.extras]
langchain-google = ["langchain-google-genai", "langchain-google-vertexai"]
Expand All @@ -39,5 +38,4 @@ requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.poetry.dev-dependencies]
yamllint = "^1.34.0"

yamllint = "^1.34.0"
5 changes: 4 additions & 1 deletion ragstack-e2e-tests/.env.template
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,7 @@ VECTOR_DATABASE_TYPE=astradb
# HUGGINGFACE_HUB_KEY=

# Nvidia
# NVIDIA_API_KEY=
# NVIDIA_API_KEY=

# LlamaIndex
# LLAMA_CLOUD_API_KEY=
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import os
from typing import List

import pytest
Expand All @@ -15,6 +14,7 @@
run_conversational_rag,
)
from e2e_tests.langchain.trulens import run_trulens_evaluation
from e2e_tests.test_utils import get_local_resource_path

from langchain.chat_models import ChatOpenAI, AzureChatOpenAI, ChatVertexAI, BedrockChat
from langchain.embeddings import (
Expand Down Expand Up @@ -341,12 +341,6 @@ def embed_query(self, text: str) -> List[float]:
assert "Coffee Machine Ultra Cool" in response.content


def get_local_resource_path(filename: str):
dirname = os.path.dirname(__file__)
e2e_tests_dir = os.path.dirname(dirname)
return os.path.join(e2e_tests_dir, "resources", filename)


@pytest.mark.parametrize("chat", ["vertex_gemini_pro_llm", "gemini_pro_llm"])
def test_chat(chat, request, record_property):
set_current_test_info(
Expand Down
25 changes: 25 additions & 0 deletions ragstack-e2e-tests/e2e_tests/llama_index/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import pytest

from e2e_tests.conftest import (
get_vector_store_handler,
)

from e2e_tests.test_utils.vector_store_handler import (
VectorStoreImplementation,
)


@pytest.fixture
def astra_db():
    """Yield an Astra DB vector-store test context; tear it down after the test."""
    store_handler = get_vector_store_handler(VectorStoreImplementation.ASTRADB)
    test_context = store_handler.before_test()
    yield test_context
    store_handler.after_test()


@pytest.fixture
def cassandra():
    """Yield a Cassandra vector-store test context; tear it down after the test."""
    store_handler = get_vector_store_handler(VectorStoreImplementation.CASSANDRA)
    test_context = store_handler.before_test()
    yield test_context
    store_handler.after_test()
40 changes: 10 additions & 30 deletions ragstack-e2e-tests/e2e_tests/llama_index/test_compatibility_rag.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
import logging
import os

import pytest
from langchain.embeddings import VertexAIEmbeddings, HuggingFaceInferenceAPIEmbeddings
Expand Down Expand Up @@ -29,32 +28,15 @@
from e2e_tests.conftest import (
set_current_test_info,
get_required_env,
get_vector_store_handler,
)
from vertexai.vision_models import MultiModalEmbeddingModel, Image

from e2e_tests.test_utils import get_local_resource_path
from e2e_tests.test_utils.vector_store_handler import (
VectorStoreImplementation,
VectorStoreTestContext,
)


@pytest.fixture
def astra_db():
handler = get_vector_store_handler(VectorStoreImplementation.ASTRADB)
context = handler.before_test()
yield context
handler.after_test()


@pytest.fixture
def cassandra():
handler = get_vector_store_handler(VectorStoreImplementation.CASSANDRA)
context = handler.before_test()
yield context
handler.after_test()


@pytest.fixture
def openai_llm():
return "openai", OpenAI(api_key=get_required_env("OPEN_AI_KEY"))
Expand Down Expand Up @@ -110,7 +92,7 @@ def bedrock_anthropic_llm():
model="anthropic.claude-v2",
aws_access_key_id=get_required_env("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=get_required_env("AWS_SECRET_ACCESS_KEY"),
aws_region_name=get_required_env("BEDROCK_AWS_REGION"),
region_name=get_required_env("BEDROCK_AWS_REGION"),
)


Expand All @@ -120,7 +102,7 @@ def bedrock_meta_llm():
model="meta.llama2-13b-chat-v1",
aws_access_key_id=get_required_env("AWS_ACCESS_KEY_ID"),
aws_secret_access_key=get_required_env("AWS_SECRET_ACCESS_KEY"),
aws_region_name=get_required_env("BEDROCK_AWS_REGION"),
region_name=get_required_env("BEDROCK_AWS_REGION"),
)


Expand All @@ -138,12 +120,16 @@ def bedrock_titan_embedding():

@pytest.fixture
def bedrock_cohere_embedding():
import boto3

return (
"bedrock-cohere",
1024,
BedrockEmbedding.from_credentials(
model_name="cohere.embed-english-v3",
aws_region=get_required_env("BEDROCK_AWS_REGION"),
BedrockEmbedding(
client=boto3.Session(
region_name=get_required_env("BEDROCK_AWS_REGION")
).client("bedrock-runtime"),
model="cohere.embed-english-v3",
),
)

Expand Down Expand Up @@ -339,12 +325,6 @@ def test_multimodal(vector_store, embedding, llm, request):
assert "Coffee Machine Ultra Cool" in response


def get_local_resource_path(filename: str):
dirname = os.path.dirname(__file__)
e2e_tests_dir = os.path.dirname(dirname)
return os.path.join(e2e_tests_dir, "resources", filename)


@pytest.mark.parametrize(
"chat",
["gemini_pro_llm", "vertex_gemini_pro_llm"],
Expand Down
66 changes: 66 additions & 0 deletions ragstack-e2e-tests/e2e_tests/llama_index/test_llama_parse.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import pytest

try:
from llama_parse import LlamaParse
except ImportError:
pytest.skip("llama_parse is not supported, skipping tests", allow_module_level=True)

from llama_index import (
VectorStoreIndex,
StorageContext,
ServiceContext,
)

from llama_index.embeddings import OpenAIEmbedding
from llama_index.llms import OpenAI

from e2e_tests.conftest import (
set_current_test_info,
get_required_env,
)
from e2e_tests.test_utils import get_local_resource_path
from e2e_tests.test_utils.vector_store_handler import (
VectorStoreTestContext,
)


@pytest.fixture
def llama_parse_text():
    """Provide a (label, parser) pair for LlamaParse plain-text output."""
    parser = LlamaParse(result_type="text")
    return "text", parser


@pytest.fixture
def llama_parse_markdown():
    """Provide a (label, parser) pair for LlamaParse markdown output."""
    parser = LlamaParse(result_type="markdown")
    return "markdown", parser


@pytest.mark.parametrize("vector_store", ["cassandra", "astra_db"])
@pytest.mark.parametrize(
    "llama_parse_instance",
    ["llama_parse_text", "llama_parse_markdown"],
)
def test_llama_parse(vector_store, llama_parse_instance, request):
    """End-to-end check: parse a PDF with LlamaParse, index the resulting
    documents into the given vector store, and verify retrieval finds them."""
    store_context: VectorStoreTestContext = request.getfixturevalue(vector_store)
    parser_label, parser = request.getfixturevalue(llama_parse_instance)
    llm = OpenAI(api_key=get_required_env("OPEN_AI_KEY"))
    embedding = OpenAIEmbedding(api_key=get_required_env("OPEN_AI_KEY"))

    # Record test identity before the store name is shadowed below.
    set_current_test_info(
        "llama_index::llama_parse",
        f"{parser_label},{vector_store}",
    )
    vector_store = store_context.new_llamaindex_vector_store(
        embedding_dimension=1536
    )
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    service_context = ServiceContext.from_defaults(llm=llm, embed_model=embedding)

    # Parse the bundled sample PDF into llama-index documents.
    documents = parser.load_data(get_local_resource_path("tree.pdf"))

    index = VectorStoreIndex.from_documents(
        documents, storage_context=storage_context, service_context=service_context
    )

    # At least one node should come back for a question about the document.
    assert len(index.as_retriever().retrieve("What was Eldenroot?")) > 0
Binary file added ragstack-e2e-tests/e2e_tests/resources/tree.pdf
Binary file not shown.
6 changes: 6 additions & 0 deletions ragstack-e2e-tests/e2e_tests/test_utils/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,12 @@ def get_required_env(name) -> str:
return value


def get_local_resource_path(filename: str):
    """Return the path of *filename* inside the e2e-tests ``resources`` directory.

    The resources directory lives one level above this package (the parent of
    this file's directory), so the path is ``<e2e_tests>/resources/<filename>``.
    """
    e2e_tests_dir = os.path.dirname(os.path.dirname(__file__))
    return os.path.join(e2e_tests_dir, "resources", filename)


def random_string() -> str:
return str(uuid.uuid4()).split("-")[0]

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -114,18 +114,8 @@ def search_documents(self, vector: List[float], limit: int) -> List[str]:
return docs


def metaclass_resolver(*classes):
metaclass = tuple(set(type(cls) for cls in classes))
metaclass = (
metaclass[0]
if len(metaclass) == 1
else type("_".join(mcls.__name__ for mcls in metaclass), metaclass, {})
) # class M_C
return metaclass("_".join(cls.__name__ for cls in classes), classes, {})


class EnhancedAstraDBLlamaIndexVectorStore(
metaclass_resolver(EnhancedLlamaIndexVectorStore, AstraDBVectorStore)
AstraDBVectorStore, EnhancedLlamaIndexVectorStore
):

def put_document(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@

from langchain_core.chat_history import BaseChatMessageHistory
from langchain_core.vectorstores import VectorStore as LangChainVectorStore
from llama_index.vector_stores.types import VectorStore as LLamaIndexVectorStore

from e2e_tests.test_utils import skip_test_due_to_implementation_not_supported

Expand All @@ -30,7 +29,8 @@ class EnhancedLangChainVectorStore(LangChainVectorStore, EnhancedVectorStore, AB
"""Enhanced LangChain vector store"""


class EnhancedLlamaIndexVectorStore(LLamaIndexVectorStore, EnhancedVectorStore, ABC):
# we can't use the VectorStore type here from llama_index.vector_stores.types because AstraDBVectorStore is based on BasePydanticVectorStore
class EnhancedLlamaIndexVectorStore(EnhancedVectorStore, ABC):
"""Enhanced Llama-Index vector store"""


Expand Down
3 changes: 2 additions & 1 deletion ragstack-e2e-tests/pyproject.langchain.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@ langchain-openai = { git = "https://github.com/langchain-ai/langchain.git", bran
langchain-google-genai = { git = "https://github.com/langchain-ai/langchain.git", branch = "master", subdirectory = "libs/partners/google-genai" }
langchain-google-vertexai = { git = "https://github.com/langchain-ai/langchain.git", branch = "master", subdirectory = "libs/partners/google-vertexai" }
langchain-nvidia-ai-endpoints = { git = "https://github.com/langchain-ai/langchain.git", branch = "master", subdirectory = "libs/partners/nvidia-ai-endpoints" }
llama-index = { version = "0.9.34", extras = ["langchain"] }
llama-index = { version = "0.9.48", extras = ["langchain"] }
llama-parse = { version = "0.1.4" }
astrapy = "~0.7.0"
# we need this specific feature from cassio: https://github.com/CassioML/cassio/pull/128
cassio = "~0.1.4"
Expand Down
11 changes: 6 additions & 5 deletions ragstack-e2e-tests/pyproject.llamaindex.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@ trulens-eval = "^0.21.0"
beautifulsoup4 = "^4"

llama-index = { git = "https://github.com/run-llama/llama_index.git", branch = "main" }
llama-parse = { version = "0.2.0" }

langchain = {version = "0.1.2"}
langchain = { version = "0.1.2" }
langchain-core = "0.1.15"
langchain-community = "0.0.15"
langchain-openai = {version = "0.0.3"}
langchain-google-genai = {version = "0.0.6"}
langchain-google-vertexai = {version = "0.0.3"}
langchain-nvidia-ai-endpoints = {version = "0.0.1"}
langchain-openai = { version = "0.0.3" }
langchain-google-genai = { version = "0.0.6" }
langchain-google-vertexai = { version = "0.0.3" }
langchain-nvidia-ai-endpoints = { version = "0.0.1" }
astrapy = "~0.7.0"
# we need this specific feature from cassio: https://github.com/CassioML/cassio/pull/128
cassio = "~0.1.4"
Expand Down
2 changes: 1 addition & 1 deletion ragstack-e2e-tests/pyproject.ragstack-ai.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,4 +39,4 @@ line-length = 250
log_cli = true
log_cli_level = "INFO"
log_cli_format = "%(asctime)s [%(levelname)8s] %(message)s (%(filename)s:%(lineno)s)"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
log_cli_date_format = "%Y-%m-%d %H:%M:%S"
1 change: 1 addition & 0 deletions ragstack-e2e-tests/tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ pass_env =
LANGCHAIN_TRACING_V2
LANGCHAIN_ENDPOINT
LANGCHAIN_PROJECT
LLAMA_CLOUD_API_KEY
deps =
poetry
commands =
Expand Down
5 changes: 3 additions & 2 deletions tox.ini
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ pass_env =
LANGCHAIN_TRACING_V2
LANGCHAIN_ENDPOINT
LANGCHAIN_PROJECT
LLAMA_CLOUD_API_KEY
deps =
pytest
nbmake
Expand All @@ -27,7 +28,7 @@ commands =


[testenv:lint-yaml]
deps =
deps =
yamllint
commands =
commands =
yamllint -c .github/.yamllint .github/
Loading