From c8cabb3cf6a7d98f90a0a216620caf766c9d5afa Mon Sep 17 00:00:00 2001 From: myhloli Date: Sat, 30 Nov 2024 01:14:12 +0800 Subject: [PATCH 1/3] feat(ocr_mkcontent): add language detection for line spacing - Introduce language detection to determine line spacing based on language context - Implement different spacing rules for Chinese/Japanese/Korean and Western texts - Adjust span content handling based on detected language and span type --- magic_pdf/dict2md/ocr_mkcontent.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index 90ebcfbc..70f8138f 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -5,6 +5,7 @@ from magic_pdf.config.make_content_config import DropMode, MakeMode from magic_pdf.config.ocr_content_type import BlockType, ContentType from magic_pdf.libs.commons import join_path +from magic_pdf.libs.language import detect_lang from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char from magic_pdf.para.para_split_v3 import ListLineTag @@ -142,11 +143,14 @@ def merge_para_with_text(para_block): para_text += ' \n' line_text = '' + line_lang = '' for span in line['spans']: span_type = span['type'] if span_type == ContentType.Text: line_text += span['content'].strip() + if line_text != '': + line_lang = detect_lang(line_text) for j, span in enumerate(line['spans']): span_type = span['type'] @@ -159,15 +163,20 @@ def merge_para_with_text(para_block): content = f"\n$$\n{span['content']}\n$$\n" content = content.strip() - if content != '': - if span_type in [ContentType.Text, ContentType.InlineEquation]: - # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除 - if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content): - para_text += content[:-1] - else: # content间需要空格分隔 - para_text += f'{content} ' - elif span_type == ContentType.InterlineEquation: - para_text += content + + if content: + langs = ['zh', 'ja', 'ko'] + if line_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔 + para_text += content if j == len(line['spans']) - 1 else f'{content} ' + else: + if span_type in [ContentType.Text, ContentType.InlineEquation]: + # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除 + if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content): + para_text += content[:-1] + else: # 西方文本语境下 content间需要空格分隔 + para_text += f'{content} ' + elif span_type == ContentType.InterlineEquation: + para_text += content else: continue # 连写字符拆分 From b80befe9cf4603648fccfcaf7b2dd0739de035e6 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sat, 30 Nov 2024 02:16:38 +0800 Subject: [PATCH 2/3] refactor(mkcontent): optimize paragraph text merging and language detection - Extract language detection to block level instead of line level - Improve logic for handling Chinese, Japanese, and Korean languages - Refactor code for better readability and performance - Optimize handling of hyphenated words at line ends --- magic_pdf/dict2md/ocr_mkcontent.py | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py index 70f8138f..9c83fdb0 100644 --- a/magic_pdf/dict2md/ocr_mkcontent.py +++ b/magic_pdf/dict2md/ocr_mkcontent.py @@ -136,21 +136,19 @@ def __replace_ligatures(text: str): def merge_para_with_text(para_block): + block_text = '' + for line in para_block['lines']: + for span in line['spans']: + if span['type'] in [ContentType.Text]: + block_text += span['content'] + block_lang = detect_lang(block_text) + para_text = '' for i, line in enumerate(para_block['lines']): if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False): para_text += ' \n' - line_text = '' - line_lang = '' - for span in line['spans']: - span_type = span['type'] - if span_type == ContentType.Text: - line_text += span['content'].strip() - - if line_text != '': - line_lang = detect_lang(line_text) for j, span in enumerate(line['spans']): span_type = span['type'] @@ -166,12 +164,16 @@ def merge_para_with_text(para_block): if content: langs = ['zh', 'ja', 'ko'] - if line_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔 - para_text += content if j == len(line['spans']) - 1 else f'{content} ' + # logger.info(f'block_lang: {block_lang}, content: {content}') + if block_lang in langs: # 中文/日语/韩文语境下,换行不需要空格分隔 + if j == len(line['spans']) - 1: + para_text += content + else: + para_text += f'{content} ' else: if span_type in [ContentType.Text, ContentType.InlineEquation]: # 如果span是line的最后一个且末尾带有-连字符,那么末尾不应该加空格,同时应该把-删除 - if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content): + if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content): para_text += content[:-1] else: # 西方文本语境下 content间需要空格分隔 para_text += f'{content} ' From b3127233f0332c912d61c10fb27bf2d8761d1a51 Mon Sep 17 00:00:00 2001 From: myhloli Date: Sat, 30 Nov 2024 02:33:26 +0800 Subject: [PATCH 3/3] refactor: modify bbox processing for layout separation - Remove overlap between bboxes for block separation - Sort bboxes by combined x and y coordinates for better layout handling - Comment out previous overlap removal function --- magic_pdf/pre_proc/ocr_detect_all_bboxes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py index 2f4f058c..27008b53 100644 --- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py +++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py @@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2( all_bboxes = remove_overlaps_min_blocks(all_bboxes) all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks) """将剩余的bbox做分离处理,防止后面分layout时出错""" - all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) - + # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes) + all_bboxes.sort(key=lambda x: x[0]+x[1]) return all_bboxes, all_discarded_blocks