Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

AP-1692: Annotate Documents through Marie with specific tag #110

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
212 changes: 121 additions & 91 deletions marie/ocr/ocr_engine.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import os
from abc import ABC, abstractmethod
from itertools import chain
from typing import Any, Dict, List, Optional, Union

import cv2
Expand Down Expand Up @@ -253,114 +254,143 @@ def __process_extract_regions(
raise Exception(f"Required key missing in region : {region}")

# Additional fields are allowed (e.g. mode)

# TODO : Introduce mini-batching by region to improve inference
bbox_results_batch = []
pages = {}
for region in regions:
try:
self.logger.debug(f"Extracting box : {region}")
rid = region["id"]
page_index = region["pageIndex"]
x = region["x"]
y = region["y"]
w = region["w"]
h = region["h"]

img = frames[page_index]

if w == 0 or h == 0:
self.logger.warning(f"Region has zero width or height : {region}")
output.append({"id": rid, "text": "", "confidence": 0.0})
continue

if y + h > img.shape[0] or x + w > img.shape[1]:
self.logger.warning(f"Region out of bounds : {region}")
output.append({"id": rid, "text": "", "confidence": 0.0})
continue

img = img[y : y + h, x : x + w].copy()
# allow for small padding around the component
padding = 4
if crop_to_content_enabled:
img = crop_to_content(img)
h = img.shape[0]
w = img.shape[1]
padding = 4

if padding != 0:
overlay = (
np.ones((h + padding * 2, w + padding * 2, 3), dtype=np.uint8)
* 255
)
overlay[padding : h + padding, padding : w + padding] = img
else:
overlay = img

# cv2.imwrite(f"/tmp/marie/overlay_image_{page_index}_{rid}.png", overlay)
# each region can have its own segmentation mode
if "mode" in region:
mode = PSMode.from_value(region["mode"])
else:
mode = pms_mode
pages.setdefault(region["pageIndex"], []).append(region)

# create cache key from region id and overlay hash
overlay_hash = hash_frames_fast(overlay)
cache_key = f"{id(region)}_{overlay_hash}"
bbox_results = None
# Batch region by page
for page_index, regions in pages.items():
img = frames[page_index]
x_batch, y_batch, w_batch, h_batch = img.shape[1], img.shape[0], 0, 0
region_ids = []
try:
for region in regions:
self.logger.debug(f"Extracting box : {region}")
rid = region["id"]
region_ids.append(rid)
x = region["x"]
y = region["y"]
w = region["w"]
h = region["h"]

if x < 0 or y < 0:
self.logger.warning(
f"Region has negative coordinates : {region}"
)
# normalize region to start from 0. ex: (0, y) | (x, 0) | (0, 0)
x = max(x, 0)
y = max(y, 0)

if w == 0 or h == 0:
self.logger.warning(
f"Region has zero width or height : {region}"
)
output.append({"id": rid, "text": "", "confidence": 0.0})
continue

if cache_key in bbox_cache:
bbox_results = bbox_cache[cache_key]
if y + h > img.shape[0] or x + w > img.shape[1]:
self.logger.warning(f"Region out of bounds : {region}")
output.append({"id": rid, "text": "", "confidence": 0.0})
continue

if bbox_results is None:
bbox_results = box_processor.extract_bounding_boxes(
queue_id, checksum, overlay, psm=mode
)
bbox_cache[cache_key] = bbox_results
# Update the size of the batch overlay
x_batch = min(x, x_batch)
y_batch = min(y, y_batch)
w_batch = max(x + w, x_batch + w_batch) - x_batch
h_batch = max(y + h, h_batch + y_batch) - y_batch

(
boxes,
img_fragments,
lines,
_,
lines_bboxes,
) = bbox_results
region_fragment = img[y : y + h, x : x + w].copy()
# allow for small padding around the component
padding = 4
if crop_to_content_enabled:
region_fragment = crop_to_content(region_fragment)
h = region_fragment.shape[0]
w = region_fragment.shape[1]
padding = 4

if padding != 0:
region_overlay = (
np.ones(
(h + padding * 2, w + padding * 2, 3), dtype=np.uint8
)
* 255
)
region_overlay[
padding : h + padding, padding : w + padding
] = region_fragment
else:
region_overlay = region_fragment

result, overlay_image = icr_processor.recognize(
queue_id, checksum, overlay, boxes, img_fragments, lines
# cv2.imwrite(f"/tmp/marie/region_overlay_{page_index}_{rid}.png", region_overlay)
# each region can have its own segmentation mode
if "mode" in region:
mode = PSMode.from_value(region["mode"])
else:
mode = pms_mode

# create cache key from region id and overlay hash
region_overlay_hash = hash_frames_fast(region_overlay)
cache_key = f"{id(region)}_{region_overlay_hash}"
bbox_results = None

if cache_key in bbox_cache:
bbox_results = bbox_cache[cache_key]

if bbox_results is None:
bbox_results = box_processor.extract_bounding_boxes(
queue_id, checksum, region_overlay, psm=mode
)
bbox_cache[cache_key] = bbox_results

bbox_results_batch.append(bbox_results)

# use a crop of the image related to the batch
batch_crop = img[
y_batch : y_batch + h_batch, x_batch : x_batch + w_batch
]
(boxes, img_fragments, lines, _, lines_bboxes,) = (
list(chain.from_iterable(x))
for i, x in enumerate(zip(*bbox_results_batch))
)
batch_result, batch_overlay_image = icr_processor.recognize(
queue_id, checksum, batch_crop, boxes, img_fragments, lines
)

del boxes
del img_fragments
del lines
del lines_bboxes

if not filter_snippets:
result["overlay_b64"] = encodeToBase64(overlay_image)

result["id"] = rid
extended.append(result)

# TODO : Implement rendering modes
# 1 - Simple
# 2 - Full
# 3 - HOCR
self.logger.debug(result)
rendering_mode = "simple"
region_result = {}
if rendering_mode == "simple":
if "lines" in result and len(result["lines"]) > 0:
lines = result["lines"]
line = lines[0]
region_result["id"] = rid
region_result["text"] = line["text"]
region_result["confidence"] = line["confidence"]
output.append(region_result)
else:
output.append({"id": rid, "text": "", "confidence": 0.0})

except Exception as ex:
self.logger.error(ex)
raise ex

if not filter_snippets:
batch_result["overlay_b64"] = encodeToBase64(batch_overlay_image)

extended.append(batch_result)

# TODO : Implement rendering modes
# 1 - Simple
# 2 - Full
# 3 - HOCR
rendering_mode = "simple"
if rendering_mode == "simple":
# unpack result from batch icr
if "words" in batch_result and len(batch_result["words"]) == len(
region_ids
):
for words, rid in zip(batch_result["words"], region_ids):
region_result = {
"id": rid,
"text": words["text"],
"confidence": words["confidence"],
}
output.append(region_result)
else:
for rid in region_ids:
output.append({"id": rid, "text": "", "confidence": 0.0})
# Filter out base64-encoded fragments (fragment_b64, overlay_b64)
# These are useful when we want to display or process the image in the output, but they add significant payload overhead

Expand Down
23 changes: 22 additions & 1 deletion marie/pipe/extract_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@
split_filename,
store_assets,
)
from marie.renderer import PdfRenderer, TextRenderer
from marie.renderer import PdfRenderer, PngRenderer, TextRenderer
from marie.renderer.adlib_renderer import AdlibRenderer
from marie.renderer.blob_renderer import BlobRenderer
from marie.utils.docs import docs_from_image, frames_from_file
Expand Down Expand Up @@ -258,6 +258,7 @@ def execute_frames_pipeline(

# TODO : Convert to execution pipeline
self.render_pdf(ref_id, frames, ocr_results, root_asset_dir)
# self.render_png(ref_id, frames, ocr_results, root_asset_dir)
self.render_blobs(ref_id, frames, ocr_results, root_asset_dir)
self.render_adlib(ref_id, frames, ocr_results, root_asset_dir)

Expand Down Expand Up @@ -410,6 +411,26 @@ def render_text(self, frames, results, root_asset_dir) -> None:
output_file_or_dir=os.path.join(root_asset_dir, "results.txt"),
)

def render_png(self, ref_id: str, frames, results, root_asset_dir) -> None:
    """Run the PNG renderer twice: once with the overlay on, once with it off.

    Both passes target the ``png`` sub-directory of ``root_asset_dir``:
    ``results.png`` is requested with ``overlay=True`` and
    ``results_clean.png`` with ``overlay=False``.

    ``ref_id`` is accepted but not used by this method.
    """
    png_dir = ensure_exists(os.path.join(root_asset_dir, "png"))
    png_renderer = PngRenderer(config={})

    # (file name, overlay flag) for each pass, in the order they are rendered
    passes = (
        ("results.png", True),
        ("results_clean.png", False),
    )
    for file_name, with_overlay in passes:
        target = os.path.join(png_dir, file_name)
        png_renderer.render(
            frames,
            results,
            output_filename=target,
            overlay=with_overlay,
        )

def render_pdf(self, ref_id: str, frames, results, root_asset_dir) -> None:
output_dir = ensure_exists(os.path.join(root_asset_dir, "pdf"))
renderer = PdfRenderer(config={})
Expand Down
1 change: 1 addition & 0 deletions marie/renderer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import os
import sys

from .png_renderer import PngRenderer
from .renderer import ResultRenderer

from .text_renderer import TextRenderer # isort:skip depends on ResultRenderer
Expand Down
2 changes: 1 addition & 1 deletion marie/renderer/adlib_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ def __render_page(self, image: np.ndarray, result: Dict[str, Any], page_index: i
root.set("NUMBER", str(pagenumber))
root.set("OCREndTime", "0")
root.set("OCRStartTime", "0")
root.set("Producer", "marie")
root.set("Producer", "MARIE-AI")
root.set("XRESOLUTION", str(dpi_x))
root.set("YRESOLUTION", str(dpi_y))

Expand Down
1 change: 1 addition & 0 deletions marie/renderer/blob_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@ def __render_page(self, image: np.ndarray, result: Dict[str, Any], page_index: i
root.set("yres", "300")
root.set("xres", "300")
root.set("page", str(page_index))
root.set("producer", "MARIE-AI")

try:
meta = result["meta"]
Expand Down
16 changes: 16 additions & 0 deletions marie/renderer/pdf_renderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -156,14 +156,30 @@ def render(

self.logger.info(f"Render PDF [{image_overlay}]: {output_filename}")

num_pages = 0
# The underlying ByteIO buffer will be closed when we write the file out
writer = PdfFileWriter()
for page_index, (image, result) in enumerate(zip(frames, results)):
try:
page = self.__render_page(image, result, page_index, image_overlay)
writer.addPage(page)
num_pages += 1
except Exception as e:
logger.error(e, stack_info=True, exc_info=True)

# add specific tag
metadata = {'/Producer': "MARIE-AI"}
writer.addMetadata(metadata)
with open(output_filename, "wb") as output:
writer.write(output)

with open(output_filename, "rb") as f:
pdf = PdfFileReader(f)
information = pdf.getDocumentInfo()
number_of_pages = pdf.getNumPages()

txt = f"""
Information about {output_filename}:
Producer: {information.producer}
Number of pages: {number_of_pages} """
self.logger.info(txt)
Loading