diff --git a/src/Html2Markdown/Replacement/CodeTagReplacer.cs b/src/Html2Markdown/Replacement/CodeTagReplacer.cs index f2770ffd..bad7e74f 100644 --- a/src/Html2Markdown/Replacement/CodeTagReplacer.cs +++ b/src/Html2Markdown/Replacement/CodeTagReplacer.cs @@ -6,7 +6,7 @@ public class CodeTagReplacer : CustomReplacer { public CodeTagReplacer() { - CustomAction = HtmlParser.ReplaceCode; + CustomAction = html => HtmlParser.ReplaceCode(html, false); } public CodeTagReplacer(bool supportSyntaxHighlighting) diff --git a/src/Html2Markdown/Replacement/CommonMark/CommonMarkTextFormattingReplacementGroup.cs b/src/Html2Markdown/Replacement/CommonMark/CommonMarkTextFormattingReplacementGroup.cs new file mode 100644 index 00000000..385fdcdf --- /dev/null +++ b/src/Html2Markdown/Replacement/CommonMark/CommonMarkTextFormattingReplacementGroup.cs @@ -0,0 +1,29 @@ +namespace Html2Markdown.Replacement.CommonMark; + +/// +/// A group of IReplacer to deal with converting HTML for +/// formatting text +/// +public class CommonMarkTextFormattingReplacementGroup : IReplacementGroup +{ + private readonly IList _replacements = new List { + new StrongTagReplacer(), + new EmphasisTagReplacer(), + new ImageTagReplacer(), + new CustomReplacer + { + CustomAction = ReplaceLists + }, + new AnchorTagReplacer() + }; + + private static string ReplaceLists(string html) + { + return HtmlParser.ReplaceLists(html); + } + + public IEnumerable Replacers() + { + return _replacements; + } +} \ No newline at end of file diff --git a/src/Html2Markdown/Replacement/CustomReplacer.cs b/src/Html2Markdown/Replacement/CustomReplacer.cs index 3a4153a9..a01fc0b0 100644 --- a/src/Html2Markdown/Replacement/CustomReplacer.cs +++ b/src/Html2Markdown/Replacement/CustomReplacer.cs @@ -9,5 +9,5 @@ public string Replace(string html) return CustomAction.Invoke(html); } - protected Func CustomAction { get; init; } + public Func CustomAction { get; init; } } \ No newline at end of file diff --git a/src/Html2Markdown/Replacement/HtmlParser.cs b/src/Html2Markdown/Replacement/HtmlParser.cs index b92c042c..93b1f45a 100644 --- a/src/Html2Markdown/Replacement/HtmlParser.cs +++ b/src/Html2Markdown/Replacement/HtmlParser.cs @@ -38,7 +38,10 @@ private static string ReplaceList(string html) listItems.ToList().ForEach(listItem => { var listPrefix = listType.Equals("ol") ? $"{++counter}. " : "* "; - var finalList = listItem.Replace(@"", string.Empty); + //In case of multiline Html, a line can end with a new line. In this case we want to remove the closing tag as well as the new line + //otherwise we may only keep the line breaks between tags and create a double line break in the markdown + var closingTag = listItem.EndsWith($"{Environment.NewLine}") ? $"{Environment.NewLine}" : ""; + var finalList = listItem.Replace(closingTag, string.Empty); if (finalList.Trim().Length == 0) { return; @@ -48,10 +51,16 @@ private static string ReplaceList(string html) finalList = TwoNewLines().Replace(finalList, $"{Environment.NewLine}{Environment.NewLine} "); // indent nested lists finalList = NestedList().Replace(finalList, "\n$1 $2"); + // remove the indent from the first line + if (listItem.StartsWith("

")) + { + finalList = ReplaceParagraph(finalList, true); + } markdownList.Add($"{listPrefix}{finalList}"); }); - return Environment.NewLine + Environment.NewLine + markdownList.Aggregate((current, item) => current + Environment.NewLine + item) + Environment.NewLine + Environment.NewLine; + //If a new line is already ending the markdown item, then we don't need to add another one + return Environment.NewLine + Environment.NewLine + markdownList.Aggregate((current, item) => current.EndsWith(Environment.NewLine) ? current + item : current + Environment.NewLine + item) + Environment.NewLine + Environment.NewLine; } private static bool ListIsEmpty(IReadOnlyCollection listItems) @@ -128,7 +137,7 @@ internal static string ReplaceImg(string html) return doc.DocumentNode.OuterHtml; } - public static string ReplaceAnchor(string html) + internal static string ReplaceAnchor(string html) { var doc = GetHtmlDocument(html); var nodes = doc.DocumentNode.SelectNodes("//a"); @@ -155,9 +164,7 @@ public static string ReplaceAnchor(string html) return doc.DocumentNode.OuterHtml; } - public static string ReplaceCode(string html) => ReplaceCode(html, false); - - public static string ReplaceCode(string html, bool supportSyntaxHighlighting) + internal static string ReplaceCode(string html, bool supportSyntaxHighlighting) { var doc = GetHtmlDocument(html); var nodes = doc.DocumentNode.SelectNodes("//code"); @@ -219,7 +226,7 @@ private static string GetSyntaxHighlightLanguage(HtmlNode node) : classAttributeValue; } - public static string ReplaceBlockquote(string html) + internal static string ReplaceBlockquote(string html) { var doc = GetHtmlDocument(html); var nodes = doc.DocumentNode.SelectNodes("//blockquote"); @@ -248,12 +255,14 @@ public static string ReplaceBlockquote(string html) return doc.DocumentNode.OuterHtml; } - public static string ReplaceEntities(string html) + internal static string ReplaceEntities(string html) { return WebUtility.HtmlDecode(html); } - public static string ReplaceParagraph(string html) + internal static string ReplaceParagraph(string html) => ReplaceParagraph(html, false); + + private static string ReplaceParagraph(string html, bool nestedIntoList) { var doc = GetHtmlDocument(html); var nodes = doc.DocumentNode.SelectNodes("//p"); @@ -266,7 +275,12 @@ public static string ReplaceParagraph(string html) var text = node.InnerHtml; var markdown = Spaces().Replace(text, " "); markdown = markdown.Replace(Environment.NewLine, " "); - markdown = Environment.NewLine + Environment.NewLine + markdown + Environment.NewLine; + + //If a paragraph is contained in a list, we don't want to add new line characters + var openingTag = nestedIntoList ? "" : Environment.NewLine + Environment.NewLine; + var closingTag = nestedIntoList ? "" : Environment.NewLine; + + markdown = openingTag + markdown + closingTag; ReplaceNode(node, markdown); }); diff --git a/src/Html2Markdown/Scheme/CommonMark.cs b/src/Html2Markdown/Scheme/CommonMark.cs index 8e0cc6cc..75f7374c 100644 --- a/src/Html2Markdown/Scheme/CommonMark.cs +++ b/src/Html2Markdown/Scheme/CommonMark.cs @@ -14,7 +14,7 @@ public class CommonMark : AbstractScheme { public CommonMark() { - AddReplacementGroup(ReplacerCollection, new TextFormattingReplacementGroup()); + AddReplacementGroup(ReplacerCollection, new CommonMarkTextFormattingReplacementGroup()); AddReplacementGroup(ReplacerCollection, new HeadingReplacementGroup()); AddReplacementGroup(ReplacerCollection, new IllegalHtmlReplacementGroup()); AddReplacementGroup(ReplacerCollection, new CommonMarkLayoutReplacementGroup()); diff --git a/test/Html2Markdown.Test/CommonMarkSchemeConverterTest.Convert_WhenThereIsAMultilineOrderedListWithNestedParagraphsAndCodeElement_ThenReplaceWithMarkdownLists.verified.txt b/test/Html2Markdown.Test/CommonMarkSchemeConverterTest.Convert_WhenThereIsAMultilineOrderedListWithNestedParagraphsAndCodeElement_ThenReplaceWithMarkdownLists.verified.txt new file mode 100644 index 00000000..2771a2a4 --- /dev/null +++ b/test/Html2Markdown.Test/CommonMarkSchemeConverterTest.Convert_WhenThereIsAMultilineOrderedListWithNestedParagraphsAndCodeElement_ThenReplaceWithMarkdownLists.verified.txt @@ -0,0 +1,6 @@ +This code is with an ordered list and paragraphs. + +1. Yes, this is a `code` element +2. No : + + * `Some code we are looking at` \ No newline at end of file diff --git a/test/Html2Markdown.Test/CommonMarkSchemeConverterTest.Convert_WhenThereIsAnOrderedListWithNestedParagraphs_ThenReplaceWithMarkdownLists.verified.txt b/test/Html2Markdown.Test/CommonMarkSchemeConverterTest.Convert_WhenThereIsAnOrderedListWithNestedParagraphs_ThenReplaceWithMarkdownLists.verified.txt new file mode 100644 index 00000000..2771a2a4 --- /dev/null +++ b/test/Html2Markdown.Test/CommonMarkSchemeConverterTest.Convert_WhenThereIsAnOrderedListWithNestedParagraphs_ThenReplaceWithMarkdownLists.verified.txt @@ -0,0 +1,6 @@ +This code is with an ordered list and paragraphs. + +1. Yes, this is a `code` element +2. No : + + * `Some code we are looking at` \ No newline at end of file diff --git a/test/Html2Markdown.Test/MarkdownSchemeConverterTest.Convert_WhenThereIsAMultilineOrderedListWithNestedParagraphsAndCodeElement_ThenReplaceWithMarkdownLists.verified.txt b/test/Html2Markdown.Test/MarkdownSchemeConverterTest.Convert_WhenThereIsAMultilineOrderedListWithNestedParagraphsAndCodeElement_ThenReplaceWithMarkdownLists.verified.txt new file mode 100644 index 00000000..2771a2a4 --- /dev/null +++ b/test/Html2Markdown.Test/MarkdownSchemeConverterTest.Convert_WhenThereIsAMultilineOrderedListWithNestedParagraphsAndCodeElement_ThenReplaceWithMarkdownLists.verified.txt @@ -0,0 +1,6 @@ +This code is with an ordered list and paragraphs. + +1. Yes, this is a `code` element +2. No : + + * `Some code we are looking at` \ No newline at end of file diff --git a/test/Html2Markdown.Test/MarkdownSchemeConverterTest.Convert_WhenThereIsAnOrderedListWithNestedParagraphs_ThenReplaceWithMarkdownLists.verified.txt b/test/Html2Markdown.Test/MarkdownSchemeConverterTest.Convert_WhenThereIsAnOrderedListWithNestedParagraphs_ThenReplaceWithMarkdownLists.verified.txt new file mode 100644 index 00000000..2771a2a4 --- /dev/null +++ b/test/Html2Markdown.Test/MarkdownSchemeConverterTest.Convert_WhenThereIsAnOrderedListWithNestedParagraphs_ThenReplaceWithMarkdownLists.verified.txt @@ -0,0 +1,6 @@ +This code is with an ordered list and paragraphs. + +1. Yes, this is a `code` element +2. No : + + * `Some code we are looking at` \ No newline at end of file diff --git a/test/Html2Markdown.Test/MarkdownSchemeConverterTest.cs b/test/Html2Markdown.Test/MarkdownSchemeConverterTest.cs index ba6630c5..5128aaf6 100644 --- a/test/Html2Markdown.Test/MarkdownSchemeConverterTest.cs +++ b/test/Html2Markdown.Test/MarkdownSchemeConverterTest.cs @@ -608,6 +608,32 @@ public Task Convert_WhenThereIsAnOrderedListWithANestedUnorderedList_ThenReplace return CheckConversion(html); } + [Test] + public Task Convert_WhenThereIsAnOrderedListWithNestedParagraphs_ThenReplaceWithMarkdownLists() + { + const string html = @"

This code is with an ordered list and paragraphs.

  1. Yes, this is a code element

  2. No :

    • Some code we are looking at
"; + + return CheckConversion(html); + } + + [Test] + public Task Convert_WhenThereIsAMultilineOrderedListWithNestedParagraphsAndCodeElement_ThenReplaceWithMarkdownLists() + { + const string html = @"

This code is with an ordered list and paragraphs.

+
    +
  1. Yes, this is a code element

    +
  2. +
  3. No :

    +
      +
    • Some code we are looking at
    • +
    +
  4. +
+"; + + return CheckConversion(html); + } + #endregion #region Extra HTML Removal