microsoft · 0xRaduan · Dec 18, 2024 · Dec 18, 2024 · Dec 19, 2024 · Jan 9, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,6 +40,8 @@ dependencies = [
   "pathvalidate",
   "charset-normalizer",
   "openai",
+  "ebooklib",
+  "html2text>=2020.1.16",
 ]
 
 [project.urls]

diff --git a/src/markitdown/_markitdown.py b/src/markitdown/_markitdown.py
@@ -24,6 +24,9 @@
 import pdfminer
 import pdfminer.high_level
 import pptx
+from ebooklib import epub, ITEM_DOCUMENT
+import html2text
+
 
 # File-format detection
 import puremagic
@@ -690,6 +693,63 @@ def convert(self, local_path, **kwargs) -> Union[None, DocumentConverterResult]:
         )
 
 
+class EpubConverter(DocumentConverter):
+    """Converts EPUB files to Markdown. Preserves chapter structure and metadata."""
+
+    def convert(self, local_path: str, **kwargs: Any) -> DocumentConverterResult:
+        """Convert an EPUB file to markdown.
+
+        Args:
+            local_path: Path to the EPUB file
+            **kwargs: Additional arguments; "file_extension" is read to detect EPUB input
+
+        Returns:
+            DocumentConverterResult containing the converted markdown,
+            or None when the "file_extension" kwarg does not end with
+            ".epub". No exception is raised for non-EPUB input; the
+            method returns None instead.
+        """
+        # Check if this is an EPUB file
+        file_ext = kwargs.get("file_extension", "").lower()
+        if not file_ext.endswith(".epub"):
+            return None
+
+        book = epub.read_epub(local_path)
+
+        # Initialize result with book title
+        result = DocumentConverterResult(
+            title=(
+                book.get_metadata("DC", "title")[0][0]
+                if book.get_metadata("DC", "title")
+                else None
+            )
+        )
+
+        # Start with metadata
+        metadata_md = []
+        if book.get_metadata("DC", "creator"):
+            metadata_md.append(f"Author: {book.get_metadata('DC', 'creator')[0][0]}")
+        if book.get_metadata("DC", "description"):
+            metadata_md.append(f"\n{book.get_metadata('DC', 'description')[0][0]}")
+
+        # Convert content
+        content_md = []
+        h = html2text.HTML2Text()
+        h.body_width = 0  # Don't wrap lines
 class HtmlConverter(DocumentConverter):
     """Converter for HTML input (anything with content type text/html)."""
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         """Convert the HTML file at local_path; return None for other extensions."""
         # Guard clause: only .html / .htm files are handled here
         suffix = kwargs.get("file_extension", "").lower()
         if suffix not in (".html", ".htm"):
             return None
         # The file is read as UTF-8 text and handed to the string helper
         with open(local_path, "rt", encoding="utf-8") as html_file:
             return self._convert(html_file.read())
     def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
         """Helper function that converts an HTML string."""
         soup = BeautifulSoup(html_content, "html.parser")
         # Strip javascript and style blocks before conversion
         for unwanted in soup(["script", "style"]):
             unwanted.extract()
         # Convert only <body> when present; otherwise the whole document
         body = soup.find("body")
         root = body if body else soup
         markdown = _CustomMarkdownify().convert_soup(root)
         assert isinstance(markdown, str)
         # The result title comes from the <title> element, when there is one
         page_title = soup.title.string if soup.title is not None else None
         return DocumentConverterResult(title=page_title, text_content=markdown)
 class HtmlConverter(DocumentConverter):
     """Converter for HTML input (anything with content type text/html)."""
 
     def convert(
         self, local_path: str, **kwargs: Any
     ) -> Union[None, DocumentConverterResult]:
         """Convert the HTML file at ``local_path`` to Markdown.
 
         Returns None when the ``file_extension`` kwarg is not ".html" or
         ".htm"; otherwise the file is read as UTF-8 text and passed to
         ``_convert``.
         """
         # Guard clause: only .html / .htm files are handled here
         suffix = kwargs.get("file_extension", "").lower()
         if suffix not in (".html", ".htm"):
             return None
 
         with open(local_path, "rt", encoding="utf-8") as html_file:
             return self._convert(html_file.read())
 
     def _convert(self, html_content: str) -> Union[None, DocumentConverterResult]:
         """Helper function that converts an HTML string.
 
         Script and style elements are removed, then the ``<body>`` element
         (or the whole document when there is no body) is converted with
         ``_CustomMarkdownify``. The result title is the string of the
         ``<title>`` element, or None when the document has no title.
         """
         soup = BeautifulSoup(html_content, "html.parser")
 
         # Strip javascript and style blocks before conversion
         for unwanted in soup(["script", "style"]):
             unwanted.extract()
 
         # Convert only the main content when a <body> exists
         body = soup.find("body")
         root = body if body else soup
         markdown = _CustomMarkdownify().convert_soup(root)
         assert isinstance(markdown, str)
 
         page_title = soup.title.string if soup.title is not None else None
         return DocumentConverterResult(title=page_title, text_content=markdown)
+
+        for item in book.get_items():
+            if item.get_type() == ITEM_DOCUMENT:
+                content = item.get_content().decode("utf-8")
+                # Convert HTML content to markdown
+                markdown_content = h.handle(content)
+                content_md.append(markdown_content)
+
+        # Combine all parts
+        result.text_content = "\n\n".join(metadata_md + content_md)
+
+        return result
+
+
 class DocxConverter(HtmlConverter):
     """
     Converts DOCX files to Markdown. Style information (e.g. headings) and tables are preserved where possible.
@@ -1284,6 +1344,7 @@ def __init__(
         self.register_page_converter(IpynbConverter())
         self.register_page_converter(PdfConverter())
         self.register_page_converter(ZipConverter())
+        self.register_page_converter(EpubConverter())
 
     def convert(
         self, source: Union[str, requests.Response], **kwargs: Any

diff --git a/tests/test_files/test.epub b/tests/test_files/test.epub
diff --git a/tests/test_markitdown.py b/tests/test_markitdown.py
@@ -130,6 +130,18 @@
     "5bda1dd6",
 ]
 
+EPUB_TEST_STRINGS = [
+    "Author: Test Author",
+    "A test EPUB document for MarkItDown testing",
+    "# Chapter 1: Test Content",
+    "This is a **test** paragraph with some formatting",
+    "* A bullet point",
+    "* Another point",
+    "# Chapter 2: More Content",
+    "_different_ style",
+    "> This is a blockquote for testing",
+]
+
 
 @pytest.mark.skipif(
     skip_remote,
@@ -161,6 +173,13 @@ def test_markitdown_remote() -> None:
 def test_markitdown_local() -> None:
     markitdown = MarkItDown()
 
+    # Test EPUB processing
+    result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.epub"))
+    assert result.title == "Test EPUB Document"
+    for test_string in EPUB_TEST_STRINGS:
+        text_content = result.text_content.replace("\\", "")
+        assert test_string in text_content
+
     # Test XLSX processing
     result = markitdown.convert(os.path.join(TEST_FILES_DIR, "test.xlsx"))
     for test_string in XLSX_TEST_STRINGS: