Commit: Fix section bugs

BenConstable9 committed Sep 10, 2024
1 parent 23ec029 commit bde3d6d
Showing 4 changed files with 37 additions and 24 deletions.
19 changes: 15 additions & 4 deletions ai_search_with_adi/adi_function_app/adi_2_ai_search.py
@@ -69,7 +69,7 @@ def clean_adi_markdown(
comment_patterns = r"<!-- PageNumber=\"\d+\" -->|<!-- PageHeader=\".*?\" -->|<!-- PageFooter=\".*?\" -->|<!-- PageBreak -->"
cleaned_text = re.sub(comment_patterns, "", markdown_text, flags=re.DOTALL)

combined_pattern = r"(.*?)\n===|\n# (.*?)\n|\n## ?(.*?)\n|\n### ?(.*?)\n|\n#### ?(.*?)\n|\n##### ?(.*?)\n|\n###### ?(.*?)\n"
combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
doc_metadata = [match for group in doc_metadata for match in group if match]

@@ -170,6 +170,8 @@ async def understand_image_with_gptv(image_base64, caption, tries_left=3):
If the image is a diagram, you should describe the components, relationships, and any other relevant information that can be inferred from the diagram.
Include any data points, labels, and other relevant information that can be inferred from the image.
IMPORTANT: If the provided image is a logo or photograph, simply return 'Irrelevant Image'."""

user_input = "Describe this image with technical analysis. Provide a well-structured description."
@@ -255,6 +257,10 @@ def pil_image_to_base64(image, image_format="JPEG"):
return base64.b64encode(buffered.getvalue()).decode("utf-8")


async def mark_image_as_irrelevant():
return "Irrelevant Image"


async def process_figures_from_extracted_content(
file_path: str, markdown_content: str, figures: list, page_number: None | int = None
) -> str:
@@ -270,6 +276,8 @@
Returns:
--------
str: The updated Markdown content with the figure descriptions."""

image_understanding_tasks = []
for idx, figure in enumerate(figures):
img_description = ""
logging.debug(f"Figure #{idx} has the following spans: {figure.spans}")
@@ -293,16 +301,19 @@
) # page_number is 1-indexed

if cropped_image is None:
img_description += "Irrelevant Image"
image_understanding_tasks.append(mark_image_as_irrelevant())
else:
image_base64 = pil_image_to_base64(cropped_image)

img_description = await understand_image_with_gptv(
image_base64, figure.caption.content
image_understanding_tasks.append(
understand_image_with_gptv(image_base64, figure.caption.content)
)
logging.info(f"\tDescription of figure {idx}: {img_description}")
break

image_descriptions = await asyncio.gather(*image_understanding_tasks)

for idx, img_description in enumerate(image_descriptions):
markdown_content = update_figure_description(
markdown_content, img_description, idx
)
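For context, the refactor to process_figures_from_extracted_content above stops awaiting each figure description in turn (and breaking after the first span) and instead queues one coroutine per figure, resolving them all at once with asyncio.gather; mark_image_as_irrelevant keeps non-croppable figures in the same result order. A minimal, self-contained sketch of that pattern, with a hypothetical stand-in for the GPT-4V helper and simplified figure data:

import asyncio


async def understand_image_with_gptv(image_base64, caption):
    # Hypothetical stand-in: the real helper in adi_2_ai_search.py calls GPT-4V.
    await asyncio.sleep(0.1)  # simulate the model round-trip
    return f"Description of figure captioned '{caption}'"


async def mark_image_as_irrelevant():
    # Coroutine so it can sit in the same task list as the GPT-4V calls.
    return "Irrelevant Image"


async def describe_figures(figures):
    # Queue one coroutine per figure instead of awaiting each call in turn,
    # then resolve them concurrently; results keep the figures' order.
    tasks = []
    for image_base64, caption in figures:
        if image_base64 is None:  # nothing could be cropped for this figure
            tasks.append(mark_image_as_irrelevant())
        else:
            tasks.append(understand_image_with_gptv(image_base64, caption))
    return await asyncio.gather(*tasks)


if __name__ == "__main__":
    figures = [("base64-image-data", "Architecture diagram"), (None, "Company logo")]
    print(asyncio.run(describe_figures(figures)))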
22 changes: 15 additions & 7 deletions ai_search_with_adi/adi_function_app/pre_embedding_cleaner.py
@@ -21,10 +21,17 @@ def get_section(cleaned_text: str) -> list:
list: The sections related to text
"""
combined_pattern = r"(.*?)\n===|\n## ?(.*?)\n|\n### ?(.*?)\n"
combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
doc_metadata = [match for group in doc_metadata for match in group if match]
return doc_metadata
return clean_sections(doc_metadata)


def clean_sections(sections: list) -> list:
"""Cleans the sections by removing special characters and extra white spaces."""
cleaned_sections = [re.sub(r"[=#]", "", match).strip() for match in sections]

return cleaned_sections


def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
@@ -120,16 +127,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:
record["data"]["chunk"]["content"]
)
cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
cleaned_record["data"]["section"] = record["data"]["chunk"]["section"]
cleaned_record["data"]["page_number"] = record["data"]["chunk"][
"page_number"
]
cleaned_record["data"]["sections"] = clean_sections(
record["data"]["chunk"]["sections"]
)
else:
cleaned_record["data"]["cleaned_chunk"] = clean_text(
record["data"]["chunk"]
)
cleaned_record["data"]["chunk"] = record["data"]["chunk"]
cleaned_record["data"]["section"] = get_section(record["data"]["chunk"])
cleaned_record["data"]["cleaned_sections"] = get_section(
record["data"]["chunk"]
)

except Exception as e:
logging.error("string cleanup Error: %s", e)
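To see the effect of the widened heading pattern and the new clean_sections helper, here is a small self-contained sketch; the two functions mirror the updated pre_embedding_cleaner.py, while the sample markdown and the printed result are illustrative:

import re


def get_section(cleaned_text: str) -> list:
    # One alternation now covers setext ("Title\n===") and ATX headings of any
    # depth ("#" through "######"), replacing the level-specific patterns.
    combined_pattern = r"(.*?)\n===|\n#+\s*(.*?)\n"
    doc_metadata = re.findall(combined_pattern, cleaned_text, re.DOTALL)
    doc_metadata = [match for group in doc_metadata for match in group if match]
    return clean_sections(doc_metadata)


def clean_sections(sections: list) -> list:
    """Cleans the sections by removing special characters and extra white spaces."""
    return [re.sub(r"[=#]", "", match).strip() for match in sections]


sample = "Report\n===\nIntro text.\n\n## Method ##\nDetails here.\n\n#### Results\nNumbers.\n"
print(get_section(sample))  # ['Report', 'Method', 'Results']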
18 changes: 6 additions & 12 deletions ai_search_with_adi/ai_search/ai_search.py
@@ -175,7 +175,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
return data_source_connection

def get_pre_embedding_cleaner_skill(
self, context, source, chunk_by_page=False, target_name="cleaned_chunk"
self, context, source, target_name="cleaned_chunk"
) -> WebApiSkill:
"""Get the custom skill for data cleanup.
@@ -203,18 +203,11 @@ def get_pre_embedding_cleaner_skill(
pre_embedding_cleaner_skill_outputs = [
OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name),
OutputFieldMappingEntry(name="chunk", target_name="chunk"),
OutputFieldMappingEntry(name="section", target_name="section"),
OutputFieldMappingEntry(
name="cleaned_sections", target_name="cleaned_sections"
),
]

if chunk_by_page:
pre_embedding_cleaner_skill_outputs.extend(
[
OutputFieldMappingEntry(
name="page_number", target_name="page_number"
),
]
)

pre_embedding_cleaner_skill = WebApiSkill(
name="Pre Embedding Cleaner Skill",
description="Skill to clean the data before sending to embedding",
@@ -277,8 +270,9 @@ def get_adi_skill(self, chunk_by_page=False) -> WebApiSkill:
batch_size = 1
degree_of_parallelism = 4
else:
# Depending on your GPT Token limit, you may need to adjust the batch size and degree of parallelism
batch_size = 1
degree_of_parallelism = 16
degree_of_parallelism = 8

if chunk_by_page:
output = [
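To make the reshaped skill definition concrete, here is a hedged sketch of how the trimmed pre-embedding cleaner skill might be assembled with the azure-search-documents models named in this file; the endpoint URI, context, input source, and the batch/parallelism values are illustrative placeholders rather than values taken from this repository:

from azure.search.documents.indexes.models import (
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    WebApiSkill,
)

pre_embedding_cleaner_skill = WebApiSkill(
    name="Pre Embedding Cleaner Skill",
    description="Skill to clean the data before sending to embedding",
    context="/document/pages/*",  # placeholder context
    uri="https://<function-app>.azurewebsites.net/api/pre_embedding_cleaner",  # placeholder URI
    http_method="POST",
    batch_size=1,  # illustrative; tune to your token limits
    degree_of_parallelism=8,  # illustrative; tune to your token limits
    inputs=[
        InputFieldMappingEntry(name="chunk", source="/document/pages/*"),  # placeholder source
    ],
    outputs=[
        # The page_number output is no longer emitted here; sections are now
        # exposed as cleaned_sections.
        OutputFieldMappingEntry(name="cleaned_chunk", target_name="cleaned_chunk"),
        OutputFieldMappingEntry(name="chunk", target_name="chunk"),
        OutputFieldMappingEntry(name="cleaned_sections", target_name="cleaned_sections"),
    ],
)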
2 changes: 1 addition & 1 deletion ai_search_with_adi/ai_search/rag_documents.py
@@ -191,7 +191,7 @@ def get_index_projections(self) -> SearchIndexerIndexProjections:
name="Keywords", source="/document/pages/*/keywords"
),
InputFieldMappingEntry(
name="Sections", source="/document/pages/*/sections"
name="Sections", source="/document/pages/*/cleaned_sections"
),
]

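The one-line change above keeps the index projection in step with the skillset: the Sections index field is now fed from the cleaned_sections output emitted by the pre-embedding cleaner skill. A minimal sketch of the resulting mapping (only the InputFieldMappingEntry shown in the diff is assumed; the surrounding projection selector fields are omitted):

from azure.search.documents.indexes.models import InputFieldMappingEntry

# "Sections" is now populated from the skillset's cleaned_sections output
# rather than the raw sections value.
sections_mapping = InputFieldMappingEntry(
    name="Sections", source="/document/pages/*/cleaned_sections"
)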
