Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add pypdf as a documentloader #12

Merged
merged 2 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added examples/files/CV_Candidate.pdf
Binary file not shown.
Binary file added examples/files/Job_Offer.pdf
Binary file not shown.
157 changes: 157 additions & 0 deletions examples/resume_processor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,157 @@
import json
import os
from typing import List, Optional

from dotenv import load_dotenv
from pydantic import Field
import yaml

from extract_thinker import Extractor, Contract, DocumentLoaderPyPdf
from litellm import Router

from extract_thinker.llm import LLM


def json_to_yaml(json_dict):
    """Serialize a Python dictionary to a YAML string.

    Args:
        json_dict: the dictionary to serialize (typically parsed JSON).

    Returns:
        str: the YAML representation of *json_dict*.

    Raises:
        ValueError: if *json_dict* is not a dict.
    """
    if isinstance(json_dict, dict):
        return yaml.dump(json_dict)
    raise ValueError("json_dict must be a dictionary")


class RoleContract(Contract):
    """Structured fields extracted from a job-offer document.

    NOTE: pydantic's ``Field()`` takes the *default value* as its first
    positional argument, not the description.  The original code passed the
    prompt text positionally, which silently set string defaults (even on
    int/bool fields).  Passing ``description=`` puts the text into the JSON
    schema (where the LLM sees it) and keeps the fields required.
    """

    company_name: str = Field(description="Company name")
    years_of_experience: int = Field(description=(
        "Years of experience required. If not mention, "
        "calculate with start date and end date"
    ))
    is_remote: bool = Field(description="Is the role remote?")
    country: str = Field(description="Country of the role")
    city: Optional[str] = Field(None, description="City of the role")
    list_of_skills: List[str] = Field(description="""
    list of strings, e.g ["5 years experience", "3 years in React", "Typescript"]
    Make the lists of skills to be a yes/no list, so it can be used in the LLM model as a list of true/false
    """)


class ResumeContract(Contract):
    """Structured fields extracted from a candidate's resume.

    ``Field()`` takes the default as its first positional argument; the
    original passed prompt text positionally, silently making it the default
    value.  Use ``description=`` instead, with explicit ``None`` defaults for
    the Optional fields so they stay optional.
    """

    name: str = Field(description="First and Last Name")
    age: Optional[str] = Field(None, description="Age with format DD/MM/YYYY. Empty if not available")
    email: str = Field(description="Email address")
    phone: Optional[str] = Field(None, description="Phone number")
    address: Optional[str] = Field(None, description="Address")
    city: Optional[str] = Field(None, description="City")
    total_experience: int = Field(description="Total experience in years")
    can_go_to_office: Optional[bool] = Field(None, description="Can go to office. If city/location is not provided, is false. If is the same city, is true")
    list_of_skills: List[bool] = Field(description="Takes the list of skills and returns a list of true/false, if the candidate has that skill. E.g. ['Python', 'JavaScript', 'React', 'Node.js'] -> [True, True, False, True]")


class Person(Contract):
    """Minimal person record (defined for reuse; not used by the script below)."""

    # description= (not a positional default) so the text reaches the schema.
    name: str = Field(description="First and Last Name")
    list_of_skills: List[str]

load_dotenv()  # pull provider API keys (DEEPINFRA/GROQ/CLAUDE/AZURE) from a .env file
cwd = os.getcwd()  # base directory used to resolve the example PDF paths below


def config_router():
    """Build a litellm Router over the primary models plus fallbacks.

    Returns:
        Router: configured with per-model rate limits, default fallbacks,
        and context-window fallbacks keyed by ``model_name`` alias.
    """
    rpm = 5000  # Rate limit in requests per minute

    model_list = [
        {
            "model_name": "Meta-Llama-3-8B-Instruct",
            "litellm_params": {
                "model": "deepinfra/meta-llama/Meta-Llama-3-8B-Instruct",
                "api_key": os.getenv("DEEPINFRA_API_KEY"),
                "rpm": rpm,
            },
        },
        {
            "model_name": "Mistral-7B-Instruct-v0.2",
            "litellm_params": {
                "model": "deepinfra/mistralai/Mistral-7B-Instruct-v0.2",
                "api_key": os.getenv("DEEPINFRA_API_KEY"),
                "rpm": rpm,
            }
        },
        {
            "model_name": "groq-llama3-8b-8192",
            "litellm_params": {
                "model": "groq/llama3-8b-8192",
                "api_key": os.getenv("GROQ_API_KEY"),
                "rpm": rpm,
            }
        },
    ]

    # Adding fallback models
    fallback_models = [
        {
            "model_name": "claude-3-haiku-20240307",
            "litellm_params": {
                "model": "claude-3-haiku-20240307",
                "api_key": os.getenv("CLAUDE_API_KEY"),
            }
        },
        {
            "model_name": "azure-deployment",
            "litellm_params": {
                "model": "azure/<your-deployment-name>",
                "api_base": os.getenv("AZURE_API_BASE"),
                "api_key": os.getenv("AZURE_API_KEY"),
                "rpm": 1440,
            }
        }
    ]

    # Combine the lists
    model_list.extend(fallback_models)

    # Define the router configuration.
    # FIX: fallbacks must reference the registered model_name aliases from
    # model_list; the original passed the raw litellm_params model string
    # "azure/<your-deployment-name>", which is not a known alias.
    router = Router(
        model_list=model_list,
        default_fallbacks=["claude-3-haiku-20240307", "azure-deployment"],
        context_window_fallbacks=[
            {"Meta-Llama-3-8B-Instruct": ["claude-3-haiku-20240307"]},
            {"groq-llama3-8b-8192": ["claude-3-haiku-20240307"]},
            {"Mistral-7B-Instruct-v0.2": ["claude-3-haiku-20240307"]}
        ],
        set_verbose=True
    )

    return router


# --- Pass 1: extract a structured role description from the job offer ---
job_role_path = os.path.join(cwd, "examples", "files", "Job_Offer.pdf")

extractor_job_role = Extractor()
extractor_job_role.load_document_loader(
    DocumentLoaderPyPdf()
)

extractor_job_role.load_llm("gpt-4o")
role_result = extractor_job_role.extract(job_role_path, RoleContract)

print(role_result.json())

# --- Pass 2: extract the resume, matched against the role extracted above ---
extractor_candidate = Extractor()
extractor_candidate.load_document_loader(
    DocumentLoaderPyPdf()
)

llm = LLM("groq/llama3-8b-8192")  # default model
#llm.load_router(config_router())  # optionally: route/fallback across providers

extractor_candidate.load_llm(llm)

resume_content_path = os.path.join(cwd, "examples", "files", "CV_Candidate.pdf")

# Feed the role (as YAML) into the candidate extraction so the skill list can
# be answered relative to this specific job offer.
# FIX: corrected typo in the prompt string ("cotent" -> "content").
job_role_content = "This is the job content to be mapped: \n" + json_to_yaml(json.loads(role_result.json()))

result = extractor_candidate.extract(resume_content_path,
                                     ResumeContract,
                                     content=job_role_content)

print(result.json())
2 changes: 2 additions & 0 deletions extract_thinker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from .document_loader.cached_document_loader import CachedDocumentLoader
from .document_loader.document_loader_tesseract import DocumentLoaderTesseract
from .document_loader.document_loader_spreadsheet import DocumentLoaderSpreadSheet
from .document_loader.document_loader_pypdf import DocumentLoaderPyPdf
from .document_loader.document_loader_text import DocumentLoaderText
from .models import classification, classification_response
from .process import Process
Expand All @@ -17,6 +18,7 @@
'DocumentLoader',
'CachedDocumentLoader',
'DocumentLoaderTesseract',
'DocumentLoaderPyPdf',
'DocumentLoaderText',
'classification',
'classification_response',
Expand Down
54 changes: 54 additions & 0 deletions extract_thinker/document_loader/document_loader_llm_image.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
from abc import ABC
from io import BytesIO
from PIL import Image
from extract_thinker.document_loader.cached_document_loader import CachedDocumentLoader
from extract_thinker.utils import extract_json


class DocumentLoaderLLMImage(CachedDocumentLoader, ABC):
    """Abstract document loader that delegates image understanding to an LLM.

    Subclasses inherit :meth:`extract_image_content`, which sends a base64
    JPEG data URL to the configured LLM and returns the JSON it extracts.
    """

    def __init__(self, content=None, cache_ttl=300, llm=None):
        super().__init__(content, cache_ttl)
        # LLM client used for image-to-JSON extraction; may be injected later.
        self.llm = llm

    def extract_image_content(self, image_stream: BytesIO) -> str:
        """
        Extracts text or data from an image using an LLM.
        The actual implementation uses an LLM to process the image content.
        """
        # Decode the stream into a PIL image and base64-encode it for the API.
        base64_image = self.encode_image(Image.open(image_stream))

        system_message = {
            "role": "system",
            "content": 'You are a worldclass Image data extractor. You receive an image and extract useful information from it. You output a JSON with the extracted information.',
        }
        user_message = {
            "role": "user",
            "content": [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "data:image/jpeg;base64," + base64_image
                    },
                },
                {"type": "text", "text": "###JSON Output\n"},
            ],
        }

        # Ask the LLM to describe the image as JSON.
        resp = self.llm.completion(
            model="claude-3-sonnet-20240229",
            messages=[system_message, user_message],
        )

        # Pull the raw text out of the response, then isolate the JSON payload.
        raw_text = resp.choices[0].message.content
        return extract_json(raw_text)
41 changes: 41 additions & 0 deletions extract_thinker/document_loader/document_loader_pypdf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import io
from typing import Any, Dict, List, Union
from PyPDF2 import PdfReader
from extract_thinker.document_loader.document_loader_llm_image import DocumentLoaderLLMImage


class DocumentLoaderPyPdf(DocumentLoaderLLMImage):
    """Document loader that extracts text from PDFs via PyPDF2's PdfReader."""

    def __init__(self, content: Any = None, cache_ttl: int = 300):
        super().__init__(content, cache_ttl)

    def load_content_from_file(self, file_path: str) -> Union[str, Dict[str, Any]]:
        """Read the PDF at *file_path* and return its extracted text lines."""
        reader = PdfReader(file_path)
        return self.extract_data_from_pdf(reader)

    def load_content_from_stream(self, stream: io.BytesIO) -> Union[str, Dict[str, Any]]:
        """Read a PDF from an in-memory *stream* and return its extracted text lines."""
        reader = PdfReader(stream)
        return self.extract_data_from_pdf(reader)

    def load_content_from_file_list(self, file_paths: List[str]) -> List[Any]:
        """Load each path in *file_paths*; returns one result per file."""
        return [self.load_content_from_file(file_path) for file_path in file_paths]

    def load_content_from_stream_list(self, streams: List[io.BytesIO]) -> List[Any]:
        """Load each stream in *streams*; returns one result per stream."""
        return [self.load_content_from_stream(stream) for stream in streams]

    def extract_data_from_pdf(self, reader: PdfReader) -> Union[str, Dict[str, Any]]:
        """Collect text from every page of *reader*.

        Returns:
            dict: a single ``"text"`` key holding a flat list of lines
            (pages concatenated, split on newlines).
        """
        document_data = {
            "text": []
        }

        for page in reader.pages:
            # FIX: extract_text() can return None/empty for pages without a
            # text layer (e.g. scanned images); guard before splitting.
            page_text = page.extract_text() or ""
            document_data["text"].extend(page_text.split('\n'))

        # Image extraction skipped for now.  TODO: when enabling this, add an
        # "images" key to document_data first — the commented code below
        # appends to a key that does not exist yet.
        # for img_index, image in enumerate(page.images):
        #     image_data = self.extract_image_content(io.BytesIO(image["data"]))
        #     if image_data:
        #         document_data["images"].append(image_data)

        return document_data
43 changes: 32 additions & 11 deletions extract_thinker/extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@
from extract_thinker.document_loader.loader_interceptor import LoaderInterceptor
from extract_thinker.document_loader.llm_interceptor import LlmInterceptor

from extract_thinker.utils import get_file_extension
from extract_thinker.utils import get_file_extension, encode_image
import yaml


SUPPORTED_IMAGE_FORMATS = ["jpeg", "png", "bmp", "tiff"]
Expand All @@ -30,6 +31,7 @@ def __init__(
self.document_loaders_by_file_type: Dict[str, DocumentLoader] = {}
self.loader_interceptors: List[LoaderInterceptor] = []
self.llm_interceptors: List[LlmInterceptor] = []
self.extra_content: Optional[str] = None

def add_interceptor(
self, interceptor: Union[LoaderInterceptor, LlmInterceptor]
Expand All @@ -55,10 +57,17 @@ def get_document_loader_for_file(self, file: str) -> DocumentLoader:
def load_document_loader(self, document_loader: DocumentLoader) -> None:
self.document_loader = document_loader

def load_llm(self, model: str) -> None:
self.llm = LLM(model)
def load_llm(self, model: Optional[str] = None) -> None:
if isinstance(model, LLM):
self.llm = model
elif model is not None:
self.llm = LLM(model)
else:
raise ValueError("Either a model string or an LLM object must be provided.")

def extract(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False, content: Optional[str] = None) -> Any:
self.extra_content = content

def extract(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> str:
if not issubclass(response_model, BaseModel):
raise ValueError("response_model must be a subclass of Pydantic's BaseModel.")

Expand All @@ -71,7 +80,7 @@ def extract(self, source: Union[str, IO, list], response_model: type[BaseModel],
else:
raise ValueError("Source must be a file path, a stream, or a list of dictionaries")

async def extract_async(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> str:
async def extract_async(self, source: Union[str, IO, list], response_model: type[BaseModel], vision: bool = False) -> Any:
return await asyncio.to_thread(self.extract, source, response_model, vision)

def extract_from_list(self, data: List[Dict[Any, Any]], response_model: type[BaseModel], vision: bool) -> str:
Expand Down Expand Up @@ -162,9 +171,13 @@ def classify(self, input: Union[str, IO], classifications: List[Classification])
async def classify_async(self, input: Union[str, IO], classifications: List[Classification]):
return await asyncio.to_thread(self.classify, input, classifications)

def _extract(
self, content, file_or_stream, response_model, vision=False, is_stream=False
):
def _extract(self,
content,
file_or_stream,
response_model,
vision=False,
is_stream=False
):
# call all the llm interceptors before calling the llm
for interceptor in self.llm_interceptors:
interceptor.intercept(self.llm)
Expand All @@ -177,8 +190,18 @@ def _extract(
},
]

if self.extra_content is not None:
if isinstance(self.extra_content, dict):
self.extra_content = yaml.dump(self.extra_content)
messages.append({"role": "user", "content": "##Extra Content\n\n" + self.extra_content})

if content is not None:
if isinstance(content, dict):
content = yaml.dump(content)
messages.append({"role": "user", "content": "##Content\n\n" + content})

if vision:
base64_encoded_image = self._encode_image_to_base64(
base64_encoded_image = encode_image(
file_or_stream, is_stream
)

Expand All @@ -196,8 +219,6 @@ def _extract(
],
}
]
else:
messages.append({"role": "user", "content": "##Content\n\n" + content})

response = self.llm.request(messages, response_model)
return response
Expand Down
Loading
Loading