Code for the article: https://medium.com/@enoch3712/unlocking-rapid-data-extraction-groq-ocr-and-claude-vision-technologies-dd5a2607665f
enoch3712 · Apr 16, 2024 · cb5d01d · cb5d01d
1 parent 6a9a1ce
commit cb5d01d
Show file tree

Hide file tree

Showing 6 changed files with 132 additions and 7 deletions.
diff --git a/config.py b/config.py
@@ -1,3 +1,4 @@
 API_KEY = 'XXXX'
 API_KEY_ANTROPIC = 'XXXX'
-API_KEY_OPENAI = 'XXXX'
+API_KEY_OPENAI = 'XXXX'
+API_KEY_GROQ = 'XXXX'
diff --git a/docTR/Dockerfile b/docTR/Dockerfile
@@ -0,0 +1,29 @@
+# Use the official TensorFlow image (which ships with Python) as the parent image.
+# NOTE(review): `latest` is a moving tag; consider pinning a version for reproducible builds.
+FROM tensorflow/tensorflow:latest
+
+# Set the working directory in the container
+WORKDIR /app
+
+# Install system dependencies required for OpenCV and WeasyPrint, then drop
+# the apt package lists so they do not bloat the image layer
+RUN apt-get update && apt-get install -y \
+    libgl1-mesa-glx \
+    libpango-1.0-0 \
+    libpangocairo-1.0-0 \
+    libgdk-pixbuf2.0-0 \
+    libffi-dev \
+    shared-mime-info \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install FastAPI, Uvicorn and the upload/image helpers
+RUN pip install --no-cache-dir fastapi uvicorn python-multipart aiofiles Pillow
+
+# Install `doctr` with TensorFlow support
+# (quoted so the shell never treats the brackets as a glob pattern)
+RUN pip install --no-cache-dir "python-doctr[tf]"
+
+# Copy the local directory contents into the container.
+# Done after the dependency installs so that editing the application source
+# does not invalidate the cached dependency layers above.
+COPY . /app
+
+# Expose the port FastAPI will run on
+EXPOSE 8001
+
+# Command to run the FastAPI server on container start
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8001", "--workers", "4"]
diff --git a/docTR/README.md b/docTR/README.md
@@ -0,0 +1,4 @@
+# Simple docTR container to use as an example
+
+```bash
+docker build -t myapp .
+docker run -p 8001:8001 myapp
+```
diff --git a/docTR/app.py b/docTR/app.py
@@ -0,0 +1,32 @@
+from fastapi import FastAPI, File, UploadFile
+from fastapi.responses import JSONResponse
+from doctr.io import DocumentFile
+from doctr.models import ocr_predictor
+from PIL import Image
+import io
+import os
+
+app = FastAPI(title="OCR Service using docTR")
+
+# Lazily-created docTR predictor shared by every request this process handles.
+_ocr_model = None
+
+
+def _get_ocr_model():
+    """Return the process-wide OCR predictor, building it on first use."""
+    global _ocr_model
+    if _ocr_model is None:
+        # Build the pretrained predictor once per process rather than once
+        # per request.
+        _ocr_model = ocr_predictor(pretrained=True)
+    return _ocr_model
+
+
+@app.post("/ocr/")
+async def perform_ocr(file: UploadFile = File(...)):
+    """Run docTR OCR on an uploaded image and return its text lines.
+
+    Args:
+        file: The uploaded image.
+
+    Returns:
+        A JSON response ``{"ExtractedText": [...]}`` holding one string per
+        detected line (the line's words joined by single spaces), or a 400
+        response when the upload contains no bytes.
+    """
+    image_data = await file.read()
+    if not image_data:
+        # Reject empty uploads up front instead of passing them to docTR
+        return JSONResponse(status_code=400, content={"detail": "Uploaded file is empty"})
+
+    # Load the image directly from the raw uploaded bytes
+    doc = DocumentFile.from_images(image_data)
+
+    result = _get_ocr_model()(doc)
+
+    # Flatten pages -> blocks -> lines into a single list of line strings
+    extracted_texts = [
+        ' '.join(word.value for word in line.words)
+        for page in result.pages
+        for block in page.blocks
+        for line in block.lines
+    ]
+
+    return JSONResponse(content={"ExtractedText": extracted_texts})
+
+# Allow starting the service directly with `python app.py`
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run(app, host="0.0.0.0", port=8001)
diff --git a/main.py b/main.py
@@ -26,6 +26,7 @@
 from easyocr import Reader
 from fastapi import FastAPI, File, Form, HTTPException, UploadFile
 from fastapi.responses import JSONResponse
+from groq import Groq
 
 from Antropic.AnthropicsApiRequest import AnthropicsApiRequest
 from Antropic.AnthropicsApiService import AnthropicsApiService
@@ -37,7 +38,7 @@
 from Classification.ModelDecorator import ModelDecorator
 from CustomException import CustomException
 from Payload import Message, Payload
-from config import API_KEY, API_KEY_ANTROPIC
+from config import API_KEY, API_KEY_ANTROPIC, API_KEY_GROQ
 from utils import remove_json_format
 
 # local path to tesseract
@@ -159,12 +160,10 @@ async def extract_text(file: UploadFile = File(...), extraction_contract: str =
     # Send the extracted text and extraction contract to the Mistral API
     content = send_request_to_mistral(extracted_text)
 
-    # 
-
     # Close and remove the temporary file
     temp_file.close()
 
-    return content
+    return {"Content": json.loads(content)}
 
 @app.post("/extractClaude")
 async def process_image_with_claude(file: UploadFile = File(...), extraction_contract: str = Form(...)):
@@ -202,7 +201,7 @@ async def process_image_with_claude(file: UploadFile = File(...), extraction_con
     response = api_service.send_image_message(api_request, base64_encoded_image, "", addOcr=False)
 
     # Return the response
-    return {"Content": response}
+    return {"Content": json.loads(response)}
 
 @app.post("/extractClaudeWithOcr")
 async def process_image_with_claude(file: UploadFile = File(...), extraction_contract: str = Form(...)):
@@ -304,7 +303,43 @@ async def classify(file: UploadFile = File(...), classifications: str = Form(...
 
     content = remove_json_format(result)
 
-    return json.loads(content)
+    return {"Content": json.loads(content)}
+
+@app.post("/extract_fast")
+async def extract_text(file: UploadFile = File(...), extraction_contract: str = Form(...)):
+    """Extract structured JSON from an uploaded PDF using Tesseract OCR and Groq.
+
+    The upload is saved to a temporary file, converted to page images, OCR'd
+    with pytesseract, wrapped in the extraction prompt and sent to Groq.
+
+    Args:
+        file: The uploaded PDF document.
+        extraction_contract: Description of the JSON structure the model
+            should produce.
+
+    Returns:
+        ``{"Content": <parsed JSON>}`` built from the model's reply.
+
+    Raises:
+        json.JSONDecodeError: If the cleaned model reply is not valid JSON.
+    """
+    # NOTE(review): an earlier handler in this module is also named
+    # ``extract_text``; the name is kept here for compatibility, but giving
+    # this handler its own name would avoid the module-level shadowing.
+    import os
+
+    # Save the upload to a temporary file. Leaving the ``with`` block closes
+    # (and therefore flushes) the file before it is re-read from disk by path.
+    with tempfile.NamedTemporaryFile(delete=False) as temp_file:
+        shutil.copyfileobj(file.file, temp_file)
+        file_path = temp_file.name
+
+    try:
+        # Convert PDF to images
+        images = convert_pdf_to_images(file_path)
+
+        # Extract the text of every page image with pytesseract
+        page_texts = extract_text_with_pytesseract(images)
+    finally:
+        # delete=False means the file is never removed automatically
+        os.remove(file_path)
+
+    # Join the extracted text into a single string
+    extracted_text = "\n new page --- \n".join(page_texts)
+
+    # add system message to the extracted text
+    extracted_text = systemMessage + "\n####Content\n\n" + extracted_text
+
+    # add contract to the extracted text
+    extracted_text = extracted_text + "\n####Structure of the JSON output file\n\n" + extraction_contract
+
+    # add response section
+    extracted_text = extracted_text + "\n#### JSON Response\n\n" + jsonContentStarter
+
+    # Send the assembled prompt (text + extraction contract) to the Groq API
+    content = send_request_to_groq(extracted_text)
+
+    content = remove_json_format(content)
+
+    return {"Content": json.loads(content)}
 
 def process_file(file):
     # Create a temporary file and save the uploaded file to it
@@ -374,6 +409,30 @@ def send_request_to_mistral(content: str) -> str:
 
     return json_content
 
+def send_request_to_groq(content: str) -> str:
+    """Send ``content`` to Groq as the user message and return the reply text.
+
+    A non-streaming chat completion is requested from the ``gemma-7b-it``
+    model with a JSON-object response format.
+
+    Args:
+        content: The full prompt to send as the user message.
+
+    Returns:
+        The message content of the first completion choice.
+    """
+    system_prompt = "You are an API server that receives content from a document and returns a JSON with the defined protocol"
+    conversation = [
+        {"role": "system", "content": system_prompt},
+        {"role": "user", "content": content},
+    ]
+
+    groq_client = Groq(api_key=API_KEY_GROQ)
+    response = groq_client.chat.completions.create(
+        model="gemma-7b-it",
+        messages=conversation,
+        temperature=1,
+        max_tokens=1024,
+        top_p=1,
+        stream=False,
+        response_format={"type": "json_object"},
+        stop=None,
+    )
+
+    first_choice = response.choices[0]
+    return first_choice.message.content
+
 def extract_json(text):
     # Find the JSON string in the text
     match = re.search(r'\{.*?\}', text, re.DOTALL)

diff --git a/requirements.txt b/requirements.txt