Skip to content

Commit

Permalink
Update pipeline
Browse files Browse the repository at this point in the history
  • Loading branch information
BenConstable9 committed Sep 19, 2024
1 parent 6834517 commit 9c3d28c
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 17 deletions.
14 changes: 6 additions & 8 deletions adi_function_app/pre_embedding_cleaner.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@ def get_section(cleaned_text: str) -> list:

def clean_sections(sections: list) -> list:
    """Clean section headings by stripping markup characters and whitespace.

    Removes every '=' and '#' character (Markdown/Setext heading markers)
    from each section string, then trims leading/trailing whitespace.

    Args:
        sections (list): Section heading strings, possibly containing
            '=' / '#' markup characters.

    Returns:
        list: The cleaned section strings, in the same order as the input.
    """
    # NOTE(review): the scraped diff span contained both the old and new
    # assignment/return lines; this is the single reconstructed version.
    # The local is kept snake_case per PEP 8 — only the skill's JSON output
    # keys (e.g. "cleanedSections") need camelCase, not Python locals.
    cleaned_sections = [re.sub(r"[=#]", "", section).strip() for section in sections]

    return cleaned_sections


def remove_markdown_tags(text: str, tag_patterns: dict) -> str:
Expand Down Expand Up @@ -123,19 +123,17 @@ async def process_pre_embedding_cleaner(record: dict) -> dict:

# scenarios when page by chunking is enabled
if isinstance(record["data"]["chunk"], dict):
cleaned_record["data"]["cleaned_chunk"] = clean_text(
cleaned_record["data"]["cleanedChunk"] = clean_text(
record["data"]["chunk"]["content"]
)
cleaned_record["data"]["chunk"] = record["data"]["chunk"]["content"]
cleaned_record["data"]["cleaned_sections"] = clean_sections(
cleaned_record["data"]["cleanedSections"] = clean_sections(
record["data"]["chunk"]["sections"]
)
else:
cleaned_record["data"]["cleaned_chunk"] = clean_text(
record["data"]["chunk"]
)
cleaned_record["data"]["cleanedChunk"] = clean_text(record["data"]["chunk"])
cleaned_record["data"]["chunk"] = record["data"]["chunk"]
cleaned_record["data"]["cleaned_sections"] = get_section(
cleaned_record["data"]["cleanedSections"] = get_section(
record["data"]["chunk"]
)

Expand Down
6 changes: 3 additions & 3 deletions deploy_ai_search/ai_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def get_data_source(self) -> SearchIndexerDataSourceConnection:
return data_source_connection

def get_pre_embedding_cleaner_skill(
self, context, source, target_name="cleaned_chunk"
self, context, source, target_name="cleanedChunk"
) -> WebApiSkill:
"""Get the custom skill for data cleanup.
Expand All @@ -221,10 +221,10 @@ def get_pre_embedding_cleaner_skill(
]

pre_embedding_cleaner_skill_outputs = [
OutputFieldMappingEntry(name="cleaned_chunk", target_name=target_name),
OutputFieldMappingEntry(name="cleanedChunk", target_name=target_name),
OutputFieldMappingEntry(name="chunk", target_name="chunk"),
OutputFieldMappingEntry(
name="cleaned_sections", target_name="cleaned_sections"
name="cleanedSections", target_name="cleanedSections"
),
]

Expand Down
19 changes: 13 additions & 6 deletions deploy_ai_search/rag_documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -174,11 +174,11 @@ def get_skills(self) -> list:
)

key_phrase_extraction_skill = self.get_key_phrase_extraction_skill(
"/document/pages/*", "/document/pages/*/cleaned_chunk"
"/document/pages/*", "/document/pages/*/cleanedChunk"
)

embedding_skill = self.get_vector_skill(
"/document/pages/*", "/document/pages/*/cleaned_chunk"
"/document/pages/*", "/document/pages/*/cleanedChunk"
)

if self.enable_page_by_chunking:
Expand Down Expand Up @@ -213,22 +213,29 @@ def get_index_projections(self) -> SearchIndexerIndexProjections:
name="Keywords", source="/document/pages/*/keywords"
),
InputFieldMappingEntry(
name="Sections", source="/document/pages/*/cleaned_sections"
name="Sections", source="/document/pages/*/cleanedSections"
),
InputFieldMappingEntry(
name="Figures", source="/document/pages/*/cleaned_sections"
name="Figures",
inputs=[
InputFieldMappingEntry(
name="FigureID", source="/document/pages/*/figures/figureId"
),
InputFieldMappingEntry(
name="FigureUri", source="/document/pages/*/figures/figureUri"
),
],
),
InputFieldMappingEntry(
name="DateLastModified", source="/document/DateLastModified"
),
InputFieldMappingEntry(name="Figures", source="/document/pages/*/figures"),
]

if self.enable_page_by_chunking:
mappings.extend(
[
InputFieldMappingEntry(
name="PageNumber", source="/document/pages/*/page_number"
name="PageNumber", source="/document/pages/*/pageNumber"
)
]
)
Expand Down

0 comments on commit 9c3d28c

Please sign in to comment.