From cd6058e72982a754d331873aee3802fcb32d4661 Mon Sep 17 00:00:00 2001
From: Raduan77 <rfalshedivat@edu.hse.ru>
Date: Wed, 18 Dec 2024 10:58:18 +0100
Subject: [PATCH 1/3] feat(epub): Add EPUB support

---
 src/markitdown/_markitdown.py |  61 ++++++++++++++++++++++++++++++++++
 tests/test_files/test.epub    | Bin 0 -> 2677 bytes
 tests/test_markitdown.py      |  19 +++++++++++
 3 files changed, 80 insertions(+)
 create mode 100644 tests/test_files/test.epub

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index 2e7e5ff..d55c14b 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -24,6 +24,9 @@
 import pdfminer
 import pdfminer.high_level
 import pptx
+from ebooklib import epub, ITEM_DOCUMENT
+import html2text
+
 
 # File-format detection
 import puremagic
@@ -690,6 +693,63 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         )
 
 
+class EpubConverter(DocumentConverter):
+    """Converts EPUB files to Markdown. Preserves chapter structure and metadata."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
+        """Convert an EPUB file to markdown.
+
+        Args:
+            local_path: Path to the EPUB file
+            **kwargs: Only "file_extension" is read; it must end with ".epub"
+
+        Returns:
+            DocumentConverterResult with the book title and converted markdown,
+            or None when "file_extension" is missing or is not ".epub"
+
+        Note: a non-EPUB extension yields None; no exception is raised for it.
+        """
+        # Check if this is an EPUB file
+        file_ext = kwargs.get("file_extension", "").lower()
+        if not file_ext.endswith(".epub"):
+            return None
+
+        book = epub.read_epub(local_path)
+
+        # Initialize result with book title
+        result = DocumentConverterResult(
+            title=(
+                book.get_metadata("DC", "title")[0][0]
+                if book.get_metadata("DC", "title")
+                else None
+            )
+        )
+
+        # Start with metadata
+        metadata_md = []
+        if book.get_metadata("DC", "creator"):
+            metadata_md.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}")
+        if book.get_metadata("DC", "description"):
+            metadata_md.append(f"\n{book.get_metadata('DC', 'description')[0][0]}")
+
+        # Convert content
+        content_md = []
+        h = html2text.HTML2Text()
+        h.body_width = 0  # Don't wrap lines
+
+        for item in book.get_items():
+            if item.get_type() == ITEM_DOCUMENT:
+                content = item.get_content().decode("utf-8")
+                # Convert HTML content to markdown
+                markdown_content = h.handle(content)
+                content_md.append(markdown_content)
+
+        # Combine all parts
+        result.text_content = "\n\n".join(metadata_md + content_md)
+
+        return result
+
+
 class DocxConverter(HtmlConverter):
     """
     Converts DOCX files to Markdown. Style information (e.g.m headings) and tables are preserved where possible.
@@ -1273,6 +1333,7 @@ def __init__(
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(EpubConverter())
 
     def convert(
         self, source: Union[str, requests.Response], **kwargs: Any
diff --git a/tests/test_files/test.epub b/tests/test_files/test.epub
new file mode 100644
index 0000000000000000000000000000000000000000..25c77b57ecd19d84e747738f63330238f68bc2bc
GIT binary patch
literal 2677
zcmZ{mc{tSDAIAq1MvWQE#WiRwS+bXuwVSQMjWJ`ySgr}fXpEsDaVHv5M3xZ6*h6wr
zmaNIvwUi}f%NEI&HH7$$mg|?_{hjagd7ktA<8{t+-p~7d&KqgW0^|kkjx394D|cM3
z-0t6Qu`|kz;O0cRLUzKD$wYz!jzVxJDLRq8F39>4$jI*``&V+B8pD>$AT}Ar$Eb7q
z=4a#$O$-zr+({H1f#l?=;NwQbywc*VRXTKFq$=xXWtwV1YW8VvZjFO%>^atAC=DYN
zuvBUe8QZbyw_$TRxhdpiuuFpADa=_YBG}K^f#s55^Kpus8SFuZJPxX6s+`O$)6-#~
z?^~N4Z8jU_tGXN%(<O+u25*O#K+`x|2?d<*j?LkfaqeCT)grL=A&rZaW=zxiLg7k(
z<;u+)asdMP$t6iG*wdkp{AB?N`MnVMdHs*^SQ4bT0rK<lZpZb#VPOUU;(!1ElyN+L
zBu3}!(VR#W1$Qz&$y~#IKnXguEX(e_fJE>n@ty!0o(XmQfK1UXcLcdCIr79I;?jtd
z!)w1cJw{0&I>IC78!A|C4LA%$YOR}CqZNGa$+HU(VpZ>ZcHB_6k|*Wn)`H4-VDOu&
zwuYsmPZJC4Mk8G(rn@%4*7&~jc-wP^?kn>m+<7%X+lvcFx<gGAR@7fgdx*GF<#O24
zDK*Q9m6_$Gs7HOk3&}2C`iKr4s1Z;Z*c_X9tZe9HRE|U-E4H;=neJ(DR?c8+gh}c~
z2a5xiO}jG2ya`Y}x5RfVFalMC#kbrKQsk-j9k<$^P@g<$dO)M&J%>RNvJ)(@AZBSA
zQY2~rFk1Uj-rFPLc6a0^a%0e{OEImA4dU2_4GdFxq+Y1S0l({I3p7r!c?A0r$Z3D7
z_?pFPmY`lOSq%rnscLx_GxOwUeF=&bF-dR;@;Jw~SfNeKizr$gt3|aN=D*GPqgxn+
z<~xBGJz=yBy}G6~I1m`Zo>(faI!PCN`6@}IUm@%-E-l|FfPCK=99N!4Zqw8v=?=7u
zD$?y=&^>yvNUX$Y15P~XQh1ygCtA<Hu`vGVPU+Z^y#97_==0{Tzu&?6_~)A?jy~dA
zq(^gYUQrs{HqKa9y7+;W6K1TL>)N5_7GU2cI@StH>X@6|IodNjjar2Owupy?7b9@*
zq|mlsn0_|QHE`C54if+%$uJGqKZbF^k^fLu@NuEI5#6E+Np)~&@S0{rx3Fe+eN%gH
z?d^v}x>??TcXAE%!g0TmZ{N|T7QR*q$AUoH74ug=29P4EXe^iQ1a0dHkHfCb^sAsQ
z>ucI?7KD^e(1x_0&8;4uSA-XwKVu3DLe1Sm!21&;J%sLfBniBUQfWEnnHc}!(8M5T
zH&_NX@M_kz>`#ft4R%S%(%Yp=mU_uEas%TFxoNo0<oD5P{2Z@;QH#HasZWQw3JFQA
zaY^%GBc9UpU-_Gt0a9pB@ajS0M&^}+AbgYtLPm{EGSo&1oABNo6=hc@v^pq08)opA
za>`Vn`)bjI9Rh+=PDzzXui$1eX(|P@2+asRVKxY1uAEXUYW#Faxza}`q^i(QKeF6H
zw~l)CKCBDV%5<bfDw{R)6zzoPw0zL^vHleHz^y=}r*fvF$e&z|_n&PZ{8{j&C6eKB
zhTu&9Blxjzg4;$_q3V>NL7SQlIF9p{9w?}vV-|aByAm=Zb)`+MX9S_6{sLqgF%A?$
zO_4s$4H4?!yr8nNsaQxST=!6cFgazqie9$rMK2d{y!jk9tX1V!a(KS2y|emK)#)0m
z%$Kw{L{=EM4hn8TBnaQSBY`{H26UFze|vWaVK#$z<X`?Qj_zk2Nic^I>y-~y-!*(}
zG}>&&UrRS=@tF);qzRj2Vp^l8WT(N8Hv#3fc_Jx7XS4imwOK~w`HB^73~B1EC*4v(
zV7XHo2k94{2`FNy%~xQiHXQlQaAJ%^%y|2H;z|7U)&X!ZA4h?ujFqm}BAABF@*JEj
zb`d;#<;hG<XHMPC<J4HR_rr~?>0d@A>{yA`xHIb@9$@&f-&~rg{pYwV!aYO%U$?qx
zZ;$Wmf#qYtY6=X6pD`3Z^uy8=cLxQMgHKZQb5fwvq2PIHs5HL`)V{w9bIoYRd*Fdc
z`7$qezFr{k8ljJVmC4BZ92zsnH?ZTpQ&M}^)+$oCE4)N<F%7=*wf7CwUtFw9_G_Uu
zhR@za-;+#LyHIRTd^XHx^x?g1!KXOt@L=1#rBHBbtKqL&B^lIL5iaWDL6dOv9da8t
z5=sEmo(GAkd~oi$x`8#jpa%kB;^ndYk57jz*gQz7-4YbV@-|?JI4A4CH_*%vzF#g{
z<B<^n=hq@OR%IQQHK`Dt8&i#QI`kfQ$aqI{2}NEF#=+uiLKgQ{OjEB>u(WoIbcsbK
z+~P<TnOBWX)@N;qiEqp^I1*xl4i?)4FVxX8awblVXmOMawT$_&sa|<%FH=2)+JYD_
z9x>Kcu?brnmP`5q_}RI&u441t3|*NSx^n)|m4x&DW?O31bJT91U?>}FYyvjabU7aR
zs&|zL`83jG$U=x0t&zwe4BN3A<fhYm{XcKNbrPL1mK|-auyfH*S1}PAdD!`THcvRK
zwc_D)uR{3cfytMnUe5v>OF9Yu0Z$&%(;BgN95in`Xb!z7*BGS~V@Gc}BLh51!S7C@
zl+dQcI0=a5eUWQ&wx;#n<4vbAg#b8MF#%%Zay_>4)WT3LX=e#&Z3~t^A2&jN=b^bg
zp56DUF1z{ox|*loiA1qnSs=|!LV#vseD^{*c7vj{UecEzZsRLil&C9}vCeH0*G{Q?
zknL);9M^~wZD^Ef(LjI{+$-06w%jvkoik_KjDT&+T(k7U-aHm#cFNQV4Wuy>GcU{k
z4=oG{zm5Qgrh5@)kNWq(@{Iui0+`ep(|?aI``6f?3ij6cm9h8!jIf{H|3vK3Ml3(*
z|9UX?<NF)l9$vu+Dc?5V#lN?|{rLXdyoU?3e#gJx?fv9_zwVJ<Y~RU$+>10mz*r3c
QU}Job7@9oUO_hNE03xtGod5s;

literal 0
HcmV?d00001

diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
index 316e670..d38355c 100644
--- a/tests/test_markitdown.py
+++ b/tests/test_markitdown.py
@@ -130,6 +130,18 @@
     "5bda1dd6",
 ]
 
+EPUB_TEST_STRINGS = [
+    "Author: Test Author",
+    "A test EPUB document for MarkItDown testing",
+    "# Chapter 1: Test Content",
+    "This is a **test** paragraph with some formatting",
+    "* A bullet point",
+    "* Another point",
+    "# Chapter 2: More Content",
+    "_different_ style",
+    "> This is a blockquote for testing",
+]
+
 
 @pytest.mark.skipif(
     skip_remote,
@@ -161,6 +173,13 @@ def test_markitdown_remote() -> None:
 def test_markitdown_local() -> None:
     markitdown = MarkItDown()
 
+    # Test EPUB processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
+    assert result.title == "Test EPUB Document"
+    for test_string in EPUB_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
     for test_string in XLSX_TEST_STRINGS:

From 98f1cdbb1c2a3416f64aecec815d5d98a11d9ad7 Mon Sep 17 00:00:00 2001
From: Raduan77 <rfalshedivat@edu.hse.ru>
Date: Wed, 18 Dec 2024 10:58:33 +0100
Subject: [PATCH 2/3] add new dependencies

---
 pyproject.toml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index c5bd58b..8682024 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -40,6 +40,8 @@ dependencies = [
   "pathvalidate",
   "charset-normalizer",
   "openai",
+  "ebooklib",
+  "html2text>=2020.1.16",
 ]
 
 [project.urls]

From 48f12167286e916737853bc62a8420a9e546754b Mon Sep 17 00:00:00 2001
From: Raduan77 <rfalshedivat@edu.hse.ru>
Date: Thu, 9 Jan 2025 20:23:54 +0100
Subject: [PATCH 3/3] migrate to use HTML converter + add convert_em method to
 it

---
 src/markitdown/_markitdown.py | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
index abf51bd..b51780d 100644
--- a/src/markitdown/_markitdown.py
+++ b/src/markitdown/_markitdown.py
@@ -78,6 +78,10 @@ def __init__(self, **options: Any):
         # Explicitly cast options to the expected type if necessary
         super().__init__(**options)
 
+    def convert_em(self, el: Any, text: str, convert_as_inline: bool) -> str:
+        """Convert <em> to _text_; whitespace-only text becomes an empty string."""
+        return f"_{text}_" if text.strip() else ""
+
     def convert_hn(self, n: int, el: Any, text: str, convert_as_inline: bool) -> str:
         """Same as usual, but be sure to start with a new line"""
         if not convert_as_inline:
@@ -740,15 +744,12 @@ def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
 
         # Convert content
         content_md = []
-        h = html2text.HTML2Text()
-        h.body_width = 0  # Don't wrap lines
-
         for item in book.get_items():
             if item.get_type() == ITEM_DOCUMENT:
                 content = item.get_content().decode("utf-8")
-                # Convert HTML content to markdown
-                markdown_content = h.handle(content)
-                content_md.append(markdown_content)
+                html_result = HtmlConverter()._convert(content)
+                if html_result and html_result.text_content:
+                    content_md.append(html_result.text_content)
 
         # Combine all parts
         result.text_content = "\n\n".join(metadata_md + content_md)