Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(Dataframe): pull method to fetch dataset from remote server #1446

Merged
merged 10 commits into from
Dec 5, 2024
22 changes: 22 additions & 0 deletions pandasai/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,16 @@
PandasAI is a wrapper around a LLM to make dataframes conversational
"""

from io import BytesIO
import os
from typing import List
from zipfile import ZipFile

import pandas as pd
import requests

from pandasai.exceptions import DatasetNotFound
from pandasai.helpers.path import find_project_root
from .agent import Agent
from .helpers.cache import Cache
from .dataframe.base import DataFrame
Expand Down Expand Up @@ -74,6 +81,21 @@ def load(dataset_path: str, virtualized=False) -> DataFrame:
DataFrame: A new PandasAI DataFrame instance with loaded data.
"""
global _dataset_loader
dataset_full_path = os.path.join(find_project_root(), "datasets", dataset_path)
if not os.path.exists(dataset_full_path):
api_key = os.environ.get("PANDAAI_API_KEY", None)
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Check if api_key and api_url are None before using them, and raise a PandasAIApiKeyError if they are not set. This prevents potential TypeError when constructing headers or making requests.

api_url = os.environ.get("PANDAAI_API_URL", None)
headers = {"accept": "application/json", "x-authorization": f"Bearer {api_key}"}

file_data = requests.get(
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Consider using the get_pandaai_session() function to obtain a session and make the request instead of using requests.get directly. This ensures consistent request handling across the codebase.

f"{api_url}/datasets/pull", headers=headers, params={"path": dataset_path}
)
if file_data.status_code != 200:
raise DatasetNotFound("Dataset not found!")

with ZipFile(BytesIO(file_data.content)) as zip_file:
zip_file.extractall(dataset_full_path)

return _dataset_loader.load(dataset_path, virtualized)


Expand Down
4 changes: 2 additions & 2 deletions pandasai/data_loader/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,9 +89,9 @@ def _is_cache_valid(self, cache_file: str) -> bool:
def _read_cache(self, cache_file: str) -> DataFrame:
    """
    Read a cached dataset file and wrap it in a PandasAI DataFrame.

    The file format is taken from ``self.schema["destination"]["format"]``.
    The returned frame is tagged with ``self.dataset_path`` so that it keeps
    a reference to the dataset it was loaded from.

    Args:
        cache_file (str): Path of the cached file to read.

    Returns:
        DataFrame: The cached data, with ``path`` set to ``self.dataset_path``.

    Raises:
        ValueError: If the schema declares a format other than
            "parquet" or "csv".
    """
    cache_format = self.schema["destination"]["format"]
    if cache_format == "parquet":
        return DataFrame(pd.read_parquet(cache_file), path=self.dataset_path)
    elif cache_format == "csv":
        return DataFrame(pd.read_csv(cache_file), path=self.dataset_path)
    else:
        raise ValueError(f"Unsupported cache format: {cache_format}")

Expand Down
45 changes: 37 additions & 8 deletions pandasai/dataframe/base.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,24 @@
from __future__ import annotations
from io import BytesIO
import os
import re
from zipfile import ZipFile
import pandas as pd
from typing import TYPE_CHECKING, List, Optional, Union, Dict, ClassVar

import requests
import yaml


from pandasai.config import Config
import hashlib
from pandasai.exceptions import PandasAIApiKeyError
from pandasai.exceptions import DatasetNotFound
from pandasai.helpers.dataframe_serializer import (
DataframeSerializer,
DataframeSerializerType,
)
from pandasai.helpers.path import find_project_root
from pandasai.helpers.request import Session
from pandasai.helpers.request import get_pandaai_session


if TYPE_CHECKING:
Expand Down Expand Up @@ -220,14 +223,9 @@ def save(
print(f"Dataset saved successfully to path: {dataset_directory}")

def push(self):
api_url = os.environ.get("PANDAAI_API_URL", None)
api_key = os.environ.get("PANDAAI_API_KEY", None)
if not api_url or not api_key:
raise PandasAIApiKeyError(
"Set PANDAAI_API_URL and PANDAAI_API_KEY in environment to push dataset to the remote server"
)

request_session = Session(endpoint_url=api_url, api_key=api_key)
request_session = get_pandaai_session()

params = {
"path": self.path,
Expand Down Expand Up @@ -255,3 +253,34 @@ def push(self):
params=params,
headers=headers,
)

def pull(self):
    """
    Download this dataset from the remote server and unpack it locally.

    Reads PANDAAI_API_URL and PANDAAI_API_KEY from the environment, requests
    ``{api_url}/datasets/pull`` for ``self.path`` and extracts the returned
    zip archive into ``<project root>/datasets/<self.path>``, the same
    directory ``pandasai.load`` checks for a local copy. Files that already
    exist there are overwritten.

    Returns:
        The dataset reloaded from disk through ``pandasai.load``. The
        instance the method was called on is NOT refreshed in place, so
        callers that need the pulled data must use the returned frame.

    Raises:
        PandasAIApiKeyError: If PANDAAI_API_URL or PANDAAI_API_KEY is unset.
        DatasetNotFound: If the server answers with a non-200 status code.
        ValueError: If an archive entry would be written outside the
            dataset directory.
    """
    # Imported here so the method does not rely on the module import block.
    from pandasai.exceptions import PandasAIApiKeyError

    api_key = os.environ.get("PANDAAI_API_KEY", None)
    api_url = os.environ.get("PANDAAI_API_URL", None)
    # Without this guard the request would be sent to "None/datasets/pull"
    # with a "Bearer None" token.
    if not api_url or not api_key:
        raise PandasAIApiKeyError(
            "Set PANDAAI_API_URL and PANDAAI_API_KEY in environment to pull dataset from the remote server"
        )

    headers = {"accept": "application/json", "x-authorization": f"Bearer {api_key}"}

    file_data = requests.get(
        f"{api_url}/datasets/pull", headers=headers, params={"path": self.path}
    )
    if file_data.status_code != 200:
        raise DatasetNotFound("Remote dataset not found to pull!")

    # Extract where `load` looks for datasets, not relative to the cwd.
    dataset_directory = os.path.realpath(
        os.path.join(find_project_root(), "datasets", self.path)
    )

    with ZipFile(BytesIO(file_data.content)) as zip_file:
        for member in zip_file.infolist():
            target_path = os.path.realpath(
                os.path.join(dataset_directory, member.filename)
            )

            # The archive comes from the network: refuse absolute names and
            # "../" components that would escape the dataset directory.
            if (
                os.path.commonpath([dataset_directory, target_path])
                != dataset_directory
            ):
                raise ValueError(
                    f"Unsafe path in dataset archive: {member.filename}"
                )

            # Directory entries carry no data; create them and move on.
            if member.is_dir():
                os.makedirs(target_path, exist_ok=True)
                continue

            # Check if the file already exists
            if os.path.exists(target_path):
                print(f"Replacing existing file: {target_path}")

            # Ensure target directory exists
            os.makedirs(os.path.dirname(target_path), exist_ok=True)

            # Extract the file
            with open(target_path, "wb") as f:
                f.write(zip_file.read(member))

    # Reload from disk. Rebinding `self` would only change a local name, so
    # the fresh frame is returned to the caller instead.
    from pandasai import load

    return load(self.path, virtualized=not isinstance(self, DataFrame))
13 changes: 11 additions & 2 deletions pandasai/exceptions.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,8 +209,9 @@ class PandasAIApiKeyError(Exception):
Exception (Exception): PandasAIApiKeyError
"""

def __init__(self, message: str = None):
    """
    Create the error with a custom or default message.

    Args:
        message (str, optional): Text of the error. When omitted or empty,
            PANDASBI_SETUP_MESSAGE is used instead.
    """
    if not message:
        message = PANDASBI_SETUP_MESSAGE
    super().__init__(message)


Expand Down Expand Up @@ -264,3 +265,11 @@ class MaliciousCodeGenerated(Exception):
Args:
Exception (Exception): MaliciousCodeGenerated
"""


class DatasetNotFound(Exception):
    """
    Raised when a requested dataset cannot be found.

    Args:
        Exception (Exception): DatasetNotFound
    """
11 changes: 11 additions & 0 deletions pandasai/helpers/request.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,3 +91,14 @@ def make_request(
except requests.exceptions.RequestException as e:
self._logger.log(f"Request failed: {traceback.format_exc()}", logging.ERROR)
raise PandasAIApiCallError(f"Request failed: {e}") from e


def get_pandaai_session():
    """
    Build a Session from the PANDAAI_API_URL and PANDAAI_API_KEY variables.

    Returns:
        Session: A session created with the endpoint URL and API key read
        from the environment.

    Raises:
        PandasAIApiKeyError: If either environment variable is unset or empty.
    """
    endpoint = os.getenv("PANDAAI_API_URL")
    token = os.getenv("PANDAAI_API_KEY")

    # Both values are required; an empty string counts as missing.
    if not (endpoint and token):
        raise PandasAIApiKeyError(
            "Set PANDAAI_API_URL and PANDAAI_API_KEY in environment to push dataset to the remote server"
        )

    return Session(endpoint_url=endpoint, api_key=token)
Loading