Commit: Fix section bugs

BenConstable9 committed Sep 10, 2024
1 parent 23ec029 commit bde3d6d
Showing 4 changed files with 37 additions and 24 deletions.
19 changes: 15 additions & 4 deletions ai_search_with_adi/adi_function_app/adi_2_ai_search.py
@@ -69,7 +69,7 @@ def clean_adi_markdown(
comment_patterns = r"<!-- PageNumber=\"\d+\" -->|<!-- PageHeader=\".*?\" -->|<!-- PageFooter=\".*?\" -->|<!-- PageBreak -->"
cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL)

combined_pattern = r"(.*?)\n===|\n# (.*?)\n|\n## ?(.*?)\n|\n### ?(.*?)\n|\n#### ?(.*?)\n|\n##### ?(.*?)\n|\n###### ?(.*?)\n"
combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
doc_metadata = [match for group in doc_metadata for match in group if match]

@@ -170,6 +170,8 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3):
If the image is a diagram, you should describe the components, relationships, and any other relevant information that can be inferred from the diagram.
Include any data points, labels, and other relevant information that can be inferred from the image.
IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'."""

user_input = "Describe this image with technical analysis. Provide a well-structured description."
@@ -255,6 +257,10 @@ def pil_image_to_base64(image, image_format="JPEG"):
return base64.b64encode(buffered.getvalue()).decode("utf-8")


async def mark_image_as_irrelevant():
return "Irrelevant Image"


async def process_figures_from_extracted_content(
file_path: str, markdown_content: str, figures: list, page_number: None | int = None
) -> str:
@@ -270,6 +276,8 @@
Returns:
--------
str: The updated Markdown content with the figure descriptions."""

image_understanding_tasks = []
for idx, figure in enumerate(figures):
img_description = ""
logging.debug(f"Figure #{idx} has the following spans: {figure.spans}")
@@ -293,16 +301,19 @@
) # page_number is 1-indexed

if cropped_image is None:
img_description += "Irrelevant Image"
image_understanding_tasks.append(mark_image_as_irrelevant())
else:
image_base64 = pil_image_to_base64(cropped_image)

img_description = await understand_image_with_gptv(
image_base64, figure.caption.content
image_understanding_tasks.append(
understand_image_with_gptv(image_base64, figure.caption.content)
)
logging.info(f"\tDescription of figure {idx}: {img_description}")
break

image_descriptions = await asyncio.gather(*image_understanding_tasks)

for idx, img_description in enumerate(image_descriptions):
markdown_content = update_figure_description(
markdown_content, img_description, idx
)
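For context, the refactor to process_figures_from_extracted_content above stops awaiting each figure description in turn (and breaking after the first span) and instead queues one coroutine per figure, resolving them all at once with asyncio.gather; mark_image_as_irrelevant keeps non-croppable figures in the same result order. A minimal, self-contained sketch of that pattern, with a hypothetical stand-in for the GPT-4V helper and simplified figure data:

import asyncio


async def understand_image_with_gptv(image_base64, caption):
    # Hypothetical stand-in: the real helper in adi_2_ai_search.py calls GPT-4V.
    await asyncio.sleep(0.1)  # simulate the model round-trip
    return f"Description of figure captioned '{caption}'"


async def mark_image_as_irrelevant():
    # Coroutine so it can sit in the same task list as the GPT-4V calls.
    return "Irrelevant Image"


async def describe_figures(figures):
    # Queue one coroutine per figure instead of awaiting each call in turn,
    # then resolve them concurrently; results keep the figures' order.
    tasks = []
    for image_base64, caption in figures:
        if image_base64 is None:  # nothing could be cropped for this figure
            tasks.append(mark_image_as_irrelevant())
        else:
            tasks.append(understand_image_with_gptv(image_base64, caption))
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    figures = [("base64-image-data", "Architecture diagram"), (None, "Company logo")]
    print(asyncio.run(describe_figures(figures)))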
22 changes: 15 additions & 7 deletions ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py
@@ -21,10 +21,17 @@ def get_section(cleaned_text: str) -> list:
list: The sections related to text
"""
combined_pattern = r"(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n"
combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
doc_metadata = [match for group in doc_metadata for match in group if match]
return doc_metadata
return clean_sections(doc_metadata)


def clean_sections(sections: list) -> list:
"""Cleans the sections by removing special characters and extra white spaces."""
cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]

return cleaned_sections


def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
@@ -120,16 +127,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
record["data"]["chunk"]["content"]
)
cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
cleaned_record["data"]["section"] = record["data"]["chunk"]["section"]
cleaned_record["data"]["page_number"] = record["data"]["chunk"][
"page_number"
]
cleaned_record["data"]["sections"] = clean_sections(
record["data"]["chunk"]["sections"]
)
else:
cleaned_record["data"]["cleaned_chunk"] = clean_text(
record["data"]["chunk"]
)
cleaned_record["data"]["chunk"] = record["data"]["chunk"]
cleaned_record["data"]["section"] = get_section(record["data"]["chunk"])
cleaned_record["data"]["cleaned_sections"] = get_section(
record["data"]["chunk"]
)

except Exception as e:
logging.error("string cleanup Error: %s", e)
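To see the effect of the widened heading pattern and the new clean_sections helper, here is a small self-contained sketch; the two functions mirror the updated pre_embedding_cleaner.py, while the sample markdown and the printed result are illustrative:

import re


def get_section(cleaned_text: str) -> list:
    # One alternation now covers setext ("Title\n===") and ATX headings of any
    # depth ("#" through "######"), replacing the level-specific patterns.
    combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
    doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
    doc_metadata = [match for group in doc_metadata for match in group if match]
    return clean_sections(doc_metadata)


def clean_sections(sections: list) -> list:
    """Cleans the sections by removing special characters and extra white spaces."""
    return [re.sub(r"[=#]", "", match).strip() for match in sections]


sample = "Report\n===\nIntro text.\n\n## Method ##\nDetails here.\n\n#### Results\nNumbers.\n"
print(get_section(sample))  # ['Report', 'Method', 'Results']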
18 changes: 6 additions & 12 deletions ai_search_with_adi/ai_search/ai_search.py
@@ -175,7 +175,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
return data_source_connection

def get_pre_embedding_cleaner_skill(
self, context, source, chunk_by_page=False, target_name="cleaned_chunk"
self, context, source, target_name="cleaned_chunk"
) -> WebApiSkill:
"""Get the custom skill for data cleanup.
@@ -203,18 +203,11 @@ def get_pre_embedding_cleaner_skill(
pre_embedding_cleaner_skill_outputs = [
OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name),
OutputFieldMappingEntry(name="chunk", target_name="chunk"),
OutputFieldMappingEntry(name="section", target_name="section"),
OutputFieldMappingEntry(
name="cleaned_sections", target_name="cleaned_sections"
),
]

if chunk_by_page:
pre_embedding_cleaner_skill_outputs.extend(
[
OutputFieldMappingEntry(
name="page_number", target_name="page_number"
),
]
)

pre_embedding_cleaner_skill = WebApiSkill(
name="Pre Embedding Cleaner Skill",
description="Skill to clean the data before sending to embedding",
@@ -277,8 +270,9 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
batch_size = 1
degree_of_parallelism = 4
else:
# Depending on your GPT Token limit, you may need to adjust the batch size and degree of parallelism
batch_size = 1
degree_of_parallelism = 16
degree_of_parallelism = 8

if chunk_by_page:
output = [
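To make the reshaped skill definition concrete, here is a hedged sketch of how the trimmed pre-embedding cleaner skill might be assembled with the azure-search-documents models named in this file; the endpoint URI, context, input source, and the batch/parallelism values are illustrative placeholders rather than values taken from this repository:

from azure.search.documents.indexes.models import (
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    WebApiSkill,
)

pre_embedding_cleaner_skill = WebApiSkill(
    name="Pre Embedding Cleaner Skill",
    description="Skill to clean the data before sending to embedding",
    context="/document/pages/*",  # placeholder context
    uri="https://<function-app>.azurewebsites.net/api/pre_embedding_cleaner",  # placeholder URI
    http_method="POST",
    batch_size=1,  # illustrative; tune to your token limits
    degree_of_parallelism=8,  # illustrative; tune to your token limits
    inputs=[
        InputFieldMappingEntry(name="chunk", source="/document/pages/*"),  # placeholder source
    ],
    outputs=[
        # The page_number output is no longer emitted here; sections are now
        # exposed as cleaned_sections.
        OutputFieldMappingEntry(name="cleaned_chunk", target_name="cleaned_chunk"),
        OutputFieldMappingEntry(name="chunk", target_name="chunk"),
        OutputFieldMappingEntry(name="cleaned_sections", target_name="cleaned_sections"),
    ],
)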
2 changes: 1 addition & 1 deletion ai_search_with_adi/ai_search/rag_documents.py
@@ -191,7 +191,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjections:
name="Keywords", source="/document/pages/*/keywords"
),
InputFieldMappingEntry(
name="Sections", source="/document/pages/*/sections"
name="Sections", source="/document/pages/*/cleaned_sections"
),
]

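The one-line change above keeps the index projection in step with the skillset: the Sections index field is now fed from the cleaned_sections output emitted by the pre-embedding cleaner skill. A minimal sketch of the resulting mapping (only the InputFieldMappingEntry shown in the diff is assumed; the surrounding projection selector fields are omitted):

from azure.search.documents.indexes.models import InputFieldMappingEntry

# "Sections" is now populated from the skillset's cleaned_sections output
# rather than the raw sections value.
sections_mapping = InputFieldMappingEntry(
    name="Sections", source="/document/pages/*/cleaned_sections"
)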
