From cd6058e72982a754d331873aee3802fcb32d4661 Mon Sep 17 00:00:00 2001 From: Raduan77 Date: Wed, 18 Dec 2024 10:58:18 +0100 Subject: [PATCH 1/3] feat(epub): Add EPUB support --- src/markitdown/_markitdown.py | 61 ++++++++++++++++++++++++++++++++++ tests/test_files/test.epub | Bin 0 -> 2677 bytes tests/test_markitdown.py | 19 +++++++++++ 3 files changed, 80 insertions(+) create mode 100644 tests/test_files/test.epub diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index 2e7e5ff..d55c14b 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -24,6 +24,9 @@ import pdfminer import pdfminer.high_level import pptx +from ebooklib import epub, ITEM_DOCUMENT +import html2text + # File-format detection import puremagic @@ -690,6 +693,63 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]: ) +class EpubConverter(DocumentConverter): + """Converts EPUB files to Markdown. Preserves chapter structure and metadata.""" + + def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult: + """Convert an EPUB file to markdown. 
+ + Args: + local_path: Path to the EPUB file + **kwargs: Additional arguments (unused) + + Returns: + DocumentConverterResult containing the converted markdown + + Raises: + FileConversionException: If the file is not an EPUB file + """ + # Check if this is an EPUB file + file_ext = kwargs.get("file_extension", "").lower() + if not file_ext.endswith(".epub"): + return None + + book = epub.read_epub(local_path) + + # Initialize result with book title + result = DocumentConverterResult( + title=( + book.get_metadata("DC", "title")[0][0] + if book.get_metadata("DC", "title") + else None + ) + ) + + # Start with metadata + metadata_md = [] + if book.get_metadata("DC", "creator"): + metadata_md.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}") + if book.get_metadata("DC", "description"): + metadata_md.append(f"\n{book.get_metadata('DC', 'description')[0][0]}") + + # Convert content + content_md = [] + h = html2text.HTML2Text() + h.body_width = 0 # Don't wrap lines + + for item in book.get_items(): + if item.get_type() == ITEM_DOCUMENT: + content = item.get_content().decode("utf-8") + # Convert HTML content to markdown + markdown_content = h.handle(content) + content_md.append(markdown_content) + + # Combine all parts + result.text_content = "\n\n".join(metadata_md + content_md) + + return result + + class DocxConverter(HtmlConverter): """ Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible. 
@@ -1273,6 +1333,7 @@ def __init__( self.register_page_converter(IpynbConverter()) self.register_page_converter(PdfConverter()) self.register_page_converter(ZipConverter()) + self.register_page_converter(EpubConverter()) def convert( self, source: Union[str, requests.Response], **kwargs: Any diff --git a/tests/test_files/test.epub b/tests/test_files/test.epub new file mode 100644 index 0000000000000000000000000000000000000000..25c77b57ecd19d84e747738f63330238f68bc2bc GIT binary patch literal 2677 zcmZ{mc{tSDAIAq1MvWQE#WiRwS+bXuwVSQMjWJ`ySgr}fXpEsDaVHv5M3xZ6*h6wr zmaNIvwUi}f%NEI&HH7$$mg|?_{hjagd7ktA<8{t+-p~7d&KqgW0^|kkjx394D|cM3 z-0t6Qu`|kz;O0cRLUzKD$wYz!jzVxJDLRq8F39>4$jI*``&V+B8pD>$AT}Ar$Eb7q z=4a#$O$-zr+({H1f#l?=;NwQbywc*VRXTKFq$=xXWtwV1YW8VvZjFO%>^atAC=DYN zuvBUe8QZbyw_$TRxhdpiuuFpADa=_YBG}K^f#s55^Kpus8SFuZJPxX6s+`O$)6-#~ z?^~N4Z8jU_tGXN%(n@ty!0o(XmQfK1UXcLcdCIr79I;?jtd z!)w1cJw{0&I>IC78!A|C4LA%$YOR}CqZNGa$+HU(VpZ>ZcHB_6k|*Wn)`H4-VDOu& zwuYsmPZJC4Mk8G(rn@%4*7&~jc-wP^?kn>m+<7%X+lvcFxRNvJ)(@AZBSA zQY2~rFk1Uj-rFPLc6a0^a%0e{OEImA4dU2_4GdFxq+Y1S0l({I3p7r!c?A0r$Z3D7 z_?pFPmY`lOSq%rnscLx_GxOwUeF=&bF-dR;@;Jw~SfNeKizr$gt3|aN=D*GPqgxn+ z<~xBGJz=yBy}G6~I1m`Zo>(faI!PCN`6@}IUm@%-E-l|FfPCK=99N!4Zqw8v=?=7u zD$?y=&^>yvNUX$Y15P~XQh1ygCtA)N5_7GU2cI@StH>X@6|IodNjjar2Owupy?7b9@* zq|mlsn0_|QHE`C54if+%$uJGqKZbF^k^fLu@NuEI5#6E+Np)~&@S0{rx3Fe+eN%gH z?d^v}x>??TcXAE%!g0TmZ{N|T7QR*q$AUoH74ug=29P4EXe^iQ1a0dHkHfCb^sAsQ z>ucI?7KD^e(1x_0&8;4uSA-XwKVu3DLe1Sm!21&;J%sLfBniBUQfWEnnHc}!(8M5T zH&_NX@M_kz>`#ft4R%S%(%Yp=mU_uEas%TFxoNo0`Vn`)bjI9Rh+=PDzzXui$1eX(|P@2+asRVKxY1uAEXUYW#Faxza}`q^i(QKeF6H zw~l)CKCBDV%5NL7SQlIF9p{9w?}vV-|aByAm=Zb)`+MX9S_6{sLqgF%A?$ zO_4s$4H4?!yr8nNsaQxST=!6cFgazqie9$rMK2d{y!jk9tX1V!a(KS2y|emK)#)0m z%$Kw{L{=EM4hn8TBnaQSBY`{H26UFze|vWaVK#$zy-~y-!*(} zG}>&&UrRS=@tF);qzRj2Vp^l8WT(N8Hv#3fc_Jx7XS4imwOK~w`HB^73~B1EC*4v( zV7XHo2k94{2`FNy%~xQiHXQlQaAJ%^%y|2H;z|7U)&X!ZA4h?ujFqm}BAABF@*JEj zb`d;#<;hG0d@A>{yA`xHIb@9$@&f-&~rg{pYwV!aYO%U$?qx 
zZ;$Wmf#qYtY6=X6pD`3Z^uy8=cLxQMgHKZQb5fwvq2PIHs5HL`)V{w9bIoYRd*Fdc z`7$qezFr{k8ljJVmC4BZ92zsnH?ZTpQ&M}^)+$oCE4)Nu(WoIbcsbK z+~PKcu?brnmP`5q_}RI&u441t3|*NSx^n)|m4x&DW?O31bJT91U?>}FYyvjabU7aR zs&|zL`83jG$U=x0t&zwe4BN3AOF9Yu0Z$&%(;BgN95in`Xb!z7*BGS~V@Gc}BLh51!S7C@ zl+dQcI0=a5eUWQ&wx;#n<4vbAg#b8MF#%%Zay_>4)WT3LX=e#&Z3~t^A2&jN=b^bg zp56DUF1z{ox|*loiA1qnSs=|!LV#vseD^{*c7vj{UecEzZsRLil&C9}vCeH0*G{Q? zknL);9M^~wZD^Ef(LjI{+$-06w%jvkoik_KjDT&+T(k7U-aHm#cFNQV4Wuy>GcU{k z4=oG{zm5Qgrh5@)kNWq(@{Iui0+`ep(|?aI``6f?3ij6cm9h8!jIf{H|3vK3Ml3(* z|9UX?10mz*r3c QU}Job7@9oUO_hNE03xtGod5s; literal 0 HcmV?d00001 diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py index 316e670..d38355c 100644 --- a/tests/test_markitdown.py +++ b/tests/test_markitdown.py @@ -130,6 +130,18 @@ "5bda1dd6", ] +EPUB_TEST_STRINGS = [ + "Author: Test Author", + "A test EPUB document for MarkItDown testing", + "# Chapter 1: Test Content", + "This is a **test** paragraph with some formatting", + "* A bullet point", + "* Another point", + "# Chapter 2: More Content", + "_different_ style", + "> This is a blockquote for testing", +] + @pytest.mark.skipif( skip_remote, @@ -161,6 +173,13 @@ def test_markitdown_remote() -> None: def test_markitdown_local() -> None: markitdown = MarkItDown() + # Test EPUB processing + result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub")) + assert result.title == "Test EPUB Document" + for test_string in EPUB_TEST_STRINGS: + text_content = result.text_content.replace("\\", "") + assert test_string in text_content + # Test XLSX processing result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx")) for test_string in XLSX_TEST_STRINGS: From 98f1cdbb1c2a3416f64aecec815d5d98a11d9ad7 Mon Sep 17 00:00:00 2001 From: Raduan77 Date: Wed, 18 Dec 2024 10:58:33 +0100 Subject: [PATCH 2/3] add new dependencies --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index c5bd58b..8682024 100644 --- a/pyproject.toml 
+++ b/pyproject.toml @@ -40,6 +40,8 @@ dependencies = [ "pathvalidate", "charset-normalizer", "openai", + "ebooklib", + "html2text>=2020.1.16", ] [project.urls] From 48f12167286e916737853bc62a8420a9e546754b Mon Sep 17 00:00:00 2001 From: Raduan77 Date: Thu, 9 Jan 2025 20:23:54 +0100 Subject: [PATCH 3/3] migrate to use HTML converter + add convert_em method to it --- src/markitdown/_markitdown.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py index abf51bd..b51780d 100644 --- a/src/markitdown/_markitdown.py +++ b/src/markitdown/_markitdown.py @@ -78,6 +78,10 @@ def __init__(self, **options: Any): # Explicitly cast options to the expected type if necessary super().__init__(**options) + def convert_em(self, el: Any, text: str, convert_as_inline: bool) -> str: + """Convert emphasis tags (<em>) to markdown style (_text_)""" + return f"_{text}_" if text.strip() else "" + def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str: """Same as usual, but be sure to start with a new line""" if not convert_as_inline: @@ -740,15 +744,12 @@ def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult: # Convert content content_md = [] - h = html2text.HTML2Text() - h.body_width = 0 # Don't wrap lines - for item in book.get_items(): if item.get_type() == ITEM_DOCUMENT: content = item.get_content().decode("utf-8") - # Convert HTML content to markdown - markdown_content = h.handle(content) - content_md.append(markdown_content) + html_result = HtmlConverter()._convert(content) + if html_result and html_result.text_content: + content_md.append(html_result.text_content) # Combine all parts result.text_content = "\n\n".join(metadata_md + content_md)