Skip to content

Commit

Permalink
Handle pdf extraction error
Browse files Browse the repository at this point in the history
  • Loading branch information
shrir committed Jan 10, 2025
1 parent 31e23e1 commit 7177c24
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 19 deletions.
15 changes: 13 additions & 2 deletions src/app/domain/jobs/controllers/job_posts.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,8 +165,19 @@ async def create_job_post_from_url(
# Generate pdf and save to s3
if db_obj.url:
pdf_content = await get_pdf(db_obj.url)
s3_client = boto3.client("s3")
s3_client.put_object(Bucket=app_s3_bucket_name, Key=f"job_posts/{db_obj.id}.pdf", Body=pdf_content)
if pdf_content:
s3_client = boto3.client("s3")
s3_client.put_object(Bucket=app_s3_bucket_name, Key=f"job_posts/{db_obj.id}.pdf", Body=pdf_content)
else:
error_msg = "Couldn't get pdf for a job post"
await logger.awarn(
error_msg,
job_id=db_obj.id,
job_title=db_obj.title,
company_id=company_db_obj.id,
company_name=company_db_obj.name,
company_url=company_db_obj.url,
)

return job_posts_service.to_schema(schema_type=JobPost, data=db_obj)

Expand Down
40 changes: 23 additions & 17 deletions src/app/lib/pdfshift.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,29 +7,35 @@
# PDFShift API key, sent as the basic-auth password by get_pdf below.
# Uses os.environ[...] rather than .get(), so a missing PDFSHIFT_API_KEY
# raises KeyError as soon as this statement runs.
pdfshift_api_key = os.environ["PDFSHIFT_API_KEY"]


async def get_pdf(url: str, encode: bool = False) -> bytes | str:
async def get_pdf(url: str, encode: bool = False) -> bytes | str | None:
"""Get pdf."""
if not url:
error_msg = "URL is required"
raise ValueError(error_msg)

body = {"source": url, "encode": encode, "timeout": 30}

async with httpx.AsyncClient() as client:
response = await client.post(
"https://api.pdfshift.io/v3/convert/pdf",
auth=("api", pdfshift_api_key),
json=body,
)

if response.status_code != 200 and not response.content:
error_msg = "Failed to extract pdf."
await logger.awarn(
error_msg,
status=response.status_code,
response=response.content,
url=url,
try:
async with httpx.AsyncClient() as client:
response = await client.post(
"https://api.pdfshift.io/v3/convert/pdf",
auth=("api", pdfshift_api_key),
json=body,
)
raise LookupError(error_msg)

return response.content
if response.status_code != 200 and not response.content:
error_msg = "Failed to extract pdf."
await logger.awarn(
error_msg,
status=response.status_code,
response=response.content,
url=url,
)
raise LookupError(error_msg)

return response.content
except httpx.ReadTimeout as e:
error_msg = "Failed to get pdf"
await logger.awarn(error_msg, exc_info=e)

return None

0 comments on commit 7177c24

Please sign in to comment.