From c8cabb3cf6a7d98f90a0a216620caf766c9d5afa Mon Sep 17 00:00:00 2001
From: myhloli <moe@myhloli.com>
Date: Sat, 30 Nov 2024 01:14:12 +0800
Subject: [PATCH 1/3] feat(ocr_mkcontent): add language detection for line
 spacing

- Detect the language of each line's text to decide whether a space is inserted when spans and lines are joined
- Implement different spacing rules for Chinese/Japanese/Korean and Western texts
- Adjust span content handling based on detected language and span type
---
 magic_pdf/dict2md/ocr_mkcontent.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
index 90ebcfbc..70f8138f 100644
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -5,6 +5,7 @@
 from magic_pdf.config.make_content_config import DropMode, MakeMode
 from magic_pdf.config.ocr_content_type import BlockType, ContentType
 from magic_pdf.libs.commons import join_path
+from magic_pdf.libs.language import detect_lang
 from magic_pdf.libs.markdown_utils import ocr_escape_special_markdown_char
 from magic_pdf.para.para_split_v3 import ListLineTag
 
@@ -142,11 +143,14 @@ def merge_para_with_text(para_block):
             para_text += '  \n'
 
         line_text = ''
+        line_lang = ''
         for span in line['spans']:
             span_type = span['type']
             if span_type == ContentType.Text:
                 line_text += span['content'].strip()
 
+        if line_text != '':
+            line_lang = detect_lang(line_text)
         for j, span in enumerate(line['spans']):
 
             span_type = span['type']
@@ -159,15 +163,20 @@ def merge_para_with_text(para_block):
                 content = f"\n$$\n{span['content']}\n$$\n"
 
             content = content.strip()
-            if content != '':
-                if span_type in [ContentType.Text, ContentType.InlineEquation]:
-                    # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
-                    if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
-                        para_text += content[:-1]
-                    else:  # content间需要空格分隔
-                        para_text += f'{content} '
-                elif span_type == ContentType.InterlineEquation:
-                    para_text += content
+
+            if content:
+                langs = ['zh', 'ja', 'ko']
+                if line_lang in langs: # 中文/日语/韩文语境下，换行不需要空格分隔
+                    para_text += content if j == len(line['spans']) - 1 else f'{content} '
+                else:
+                    if span_type in [ContentType.Text, ContentType.InlineEquation]:
+                        # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
+                        if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
+                            para_text += content[:-1]
+                        else:  # 西方文本语境下 content间需要空格分隔
+                            para_text += f'{content} '
+                    elif span_type == ContentType.InterlineEquation:
+                        para_text += content
             else:
                 continue
     # 连写字符拆分

From b80befe9cf4603648fccfcaf7b2dd0739de035e6 Mon Sep 17 00:00:00 2001
From: myhloli <moe@myhloli.com>
Date: Sat, 30 Nov 2024 02:16:38 +0800
Subject: [PATCH 2/3] refactor(mkcontent): optimize paragraph text merging and
 language detection

- Move language detection from line level to block level (detect once per paragraph block)
- Improve logic for handling Chinese, Japanese, and Korean languages
- Refactor code for better readability and performance
- Strip a trailing line-end hyphen only from text spans, so inline equations are no longer truncated
---
 magic_pdf/dict2md/ocr_mkcontent.py | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

diff --git a/magic_pdf/dict2md/ocr_mkcontent.py b/magic_pdf/dict2md/ocr_mkcontent.py
index 70f8138f..9c83fdb0 100644
--- a/magic_pdf/dict2md/ocr_mkcontent.py
+++ b/magic_pdf/dict2md/ocr_mkcontent.py
@@ -136,21 +136,19 @@ def __replace_ligatures(text: str):
 
 
 def merge_para_with_text(para_block):
+    block_text = ''
+    for line in para_block['lines']:
+        for span in line['spans']:
+            if span['type'] in [ContentType.Text]:
+                block_text += span['content']
+    block_lang = detect_lang(block_text)
+
     para_text = ''
     for i, line in enumerate(para_block['lines']):
 
         if i >= 1 and line.get(ListLineTag.IS_LIST_START_LINE, False):
             para_text += '  \n'
 
-        line_text = ''
-        line_lang = ''
-        for span in line['spans']:
-            span_type = span['type']
-            if span_type == ContentType.Text:
-                line_text += span['content'].strip()
-
-        if line_text != '':
-            line_lang = detect_lang(line_text)
         for j, span in enumerate(line['spans']):
 
             span_type = span['type']
@@ -166,12 +164,16 @@ def merge_para_with_text(para_block):
 
             if content:
                 langs = ['zh', 'ja', 'ko']
-                if line_lang in langs: # 中文/日语/韩文语境下，换行不需要空格分隔
-                    para_text += content if j == len(line['spans']) - 1 else f'{content} '
+                # logger.info(f'block_lang: {block_lang}, content: {content}')
+                if block_lang in langs: # 中文/日语/韩文语境下，换行不需要空格分隔
+                    if j == len(line['spans']) - 1:
+                        para_text += content
+                    else:
+                        para_text += f'{content} '
                 else:
                     if span_type in [ContentType.Text, ContentType.InlineEquation]:
                         # 如果span是line的最后一个且末尾带有-连字符，那么末尾不应该加空格,同时应该把-删除
-                        if j == len(line['spans'])-1 and __is_hyphen_at_line_end(content):
+                        if j == len(line['spans'])-1 and span_type == ContentType.Text and __is_hyphen_at_line_end(content):
                             para_text += content[:-1]
                         else:  # 西方文本语境下 content间需要空格分隔
                             para_text += f'{content} '

From b3127233f0332c912d61c10fb27bf2d8761d1a51 Mon Sep 17 00:00:00 2001
From: myhloli <moe@myhloli.com>
Date: Sat, 30 Nov 2024 02:33:26 +0800
Subject: [PATCH 3/3] refactor: modify bbox processing for layout separation

- Stop separating overlapping bboxes during block preparation
- Sort bboxes by the sum of their x and y coordinates (x[0] + x[1]) for better layout handling
- Comment out the previous call to remove_overlap_between_bbox_for_block
---
 magic_pdf/pre_proc/ocr_detect_all_bboxes.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
index 2f4f058c..27008b53 100644
--- a/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
+++ b/magic_pdf/pre_proc/ocr_detect_all_bboxes.py
@@ -117,8 +117,8 @@ def ocr_prepare_bboxes_for_layout_split_v2(
     all_bboxes = remove_overlaps_min_blocks(all_bboxes)
     all_discarded_blocks = remove_overlaps_min_blocks(all_discarded_blocks)
     """将剩余的bbox做分离处理，防止后面分layout时出错"""
-    all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
-
+    # all_bboxes, drop_reasons = remove_overlap_between_bbox_for_block(all_bboxes)
+    all_bboxes.sort(key=lambda x: x[0]+x[1])
     return all_bboxes, all_discarded_blocks