Skip to content

Commit

Permalink
Remove extra whitespace on lists containing p tags
Browse files Browse the repository at this point in the history
Closes #359

May relate to #202

Massive thanks to https://github.com/YoussefAzaroual for most of the
work in #359
  • Loading branch information
baynezy committed Dec 21, 2023
1 parent f68630e commit 731b2c4
Show file tree
Hide file tree
Showing 10 changed files with 106 additions and 13 deletions.
2 changes: 1 addition & 1 deletion src/Html2Markdown/Replacement/CodeTagReplacer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ public class CodeTagReplacer : CustomReplacer
{
public CodeTagReplacer()
{
CustomAction = HtmlParser.ReplaceCode;
CustomAction = html => HtmlParser.ReplaceCode(html, false);
}

public CodeTagReplacer(bool supportSyntaxHighlighting)
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
namespace Html2Markdown.Replacement.CommonMark;

/// <summary>
/// An <see cref="IReplacementGroup"/> bundling the replacers that convert
/// HTML text-formatting markup: strong, emphasis, image, list and anchor tags.
/// </summary>
public class CommonMarkTextFormattingReplacementGroup : IReplacementGroup
{
    // The replacers are listed in a fixed order: strong, emphasis, image,
    // lists, then anchors. Callers receive them in exactly this sequence.
    private readonly IList<IReplacer> _replacements = new List<IReplacer>
    {
        new StrongTagReplacer(),
        new EmphasisTagReplacer(),
        new ImageTagReplacer(),
        // List handling is delegated straight to the shared HTML parser
        // through a generic CustomReplacer.
        new CustomReplacer { CustomAction = html => HtmlParser.ReplaceLists(html) },
        new AnchorTagReplacer()
    };

    /// <summary>
    /// Gets the replacers that make up this group.
    /// </summary>
    /// <returns>The group's replacers, in their declared order.</returns>
    public IEnumerable<IReplacer> Replacers() => _replacements;
}
2 changes: 1 addition & 1 deletion src/Html2Markdown/Replacement/CustomReplacer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,5 @@ public string Replace(string html)
return CustomAction.Invoke(html);
}

protected Func<string, string> CustomAction { get; init; }
public Func<string, string> CustomAction { get; init; }
}
34 changes: 24 additions & 10 deletions src/Html2Markdown/Replacement/HtmlParser.cs
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,10 @@ private static string ReplaceList(string html)
listItems.ToList().ForEach(listItem =>
{
var listPrefix = listType.Equals("ol") ? $"{++counter}. " : "* ";
var finalList = listItem.Replace(@"</li>", string.Empty);
//In case of multiline Html, a line can end with a new line. In this case we want to remove the closing tag as well as the new line
//otherwise we may only keep the line breaks between tags and create a double line break in the markdown
var closingTag = listItem.EndsWith($"</li>{Environment.NewLine}") ? $"</li>{Environment.NewLine}" : "</li>";
var finalList = listItem.Replace(closingTag, string.Empty);

if (finalList.Trim().Length == 0) {
return;
Expand All @@ -48,10 +51,16 @@ private static string ReplaceList(string html)
finalList = TwoNewLines().Replace(finalList, $"{Environment.NewLine}{Environment.NewLine} ");
// indent nested lists
finalList = NestedList().Replace(finalList, "\n$1 $2");
// remove the indent from the first line
if (listItem.StartsWith("<p>"))
{
finalList = ReplaceParagraph(finalList, true);
}
markdownList.Add($"{listPrefix}{finalList}");
});

return Environment.NewLine + Environment.NewLine + markdownList.Aggregate((current, item) => current + Environment.NewLine + item) + Environment.NewLine + Environment.NewLine;
//If a new line is already ending the markdown item, then we don't need to add another one
return Environment.NewLine + Environment.NewLine + markdownList.Aggregate((current, item) => current.EndsWith(Environment.NewLine) ? current + item : current + Environment.NewLine + item) + Environment.NewLine + Environment.NewLine;
}

private static bool ListIsEmpty(IReadOnlyCollection<string> listItems)
Expand Down Expand Up @@ -128,7 +137,7 @@ internal static string ReplaceImg(string html)
return doc.DocumentNode.OuterHtml;
}

public static string ReplaceAnchor(string html)
internal static string ReplaceAnchor(string html)
{
var doc = GetHtmlDocument(html);
var nodes = doc.DocumentNode.SelectNodes("//a");
Expand All @@ -155,9 +164,7 @@ public static string ReplaceAnchor(string html)
return doc.DocumentNode.OuterHtml;
}

public static string ReplaceCode(string html) => ReplaceCode(html, false);

public static string ReplaceCode(string html, bool supportSyntaxHighlighting)
internal static string ReplaceCode(string html, bool supportSyntaxHighlighting)
{
var doc = GetHtmlDocument(html);
var nodes = doc.DocumentNode.SelectNodes("//code");
Expand Down Expand Up @@ -219,7 +226,7 @@ private static string GetSyntaxHighlightLanguage(HtmlNode node)
: classAttributeValue;
}

public static string ReplaceBlockquote(string html)
internal static string ReplaceBlockquote(string html)
{
var doc = GetHtmlDocument(html);
var nodes = doc.DocumentNode.SelectNodes("//blockquote");
Expand Down Expand Up @@ -248,12 +255,14 @@ public static string ReplaceBlockquote(string html)
return doc.DocumentNode.OuterHtml;
}

public static string ReplaceEntities(string html)
internal static string ReplaceEntities(string html)
{
return WebUtility.HtmlDecode(html);
}

public static string ReplaceParagraph(string html)
internal static string ReplaceParagraph(string html) => ReplaceParagraph(html, false);

private static string ReplaceParagraph(string html, bool nestedIntoList)
{
var doc = GetHtmlDocument(html);
var nodes = doc.DocumentNode.SelectNodes("//p");
Expand All @@ -266,7 +275,12 @@ public static string ReplaceParagraph(string html)
var text = node.InnerHtml;
var markdown = Spaces().Replace(text, " ");
markdown = markdown.Replace(Environment.NewLine, " ");
markdown = Environment.NewLine + Environment.NewLine + markdown + Environment.NewLine;

//If a paragraph is contained in a list, we don't want to add new line characters
var openingTag = nestedIntoList ? "" : Environment.NewLine + Environment.NewLine;
var closingTag = nestedIntoList ? "" : Environment.NewLine;

markdown = openingTag + markdown + closingTag;
ReplaceNode(node, markdown);
});

Expand Down
2 changes: 1 addition & 1 deletion src/Html2Markdown/Scheme/CommonMark.cs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public class CommonMark : AbstractScheme
{
public CommonMark()
{
AddReplacementGroup(ReplacerCollection, new TextFormattingReplacementGroup());
AddReplacementGroup(ReplacerCollection, new CommonMarkTextFormattingReplacementGroup());
AddReplacementGroup(ReplacerCollection, new HeadingReplacementGroup());
AddReplacementGroup(ReplacerCollection, new IllegalHtmlReplacementGroup());
AddReplacementGroup(ReplacerCollection, new CommonMarkLayoutReplacementGroup());
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
This code is with an ordered list and paragraphs.

1. Yes, this is a `code` element
2. No :

* `Some code we are looking at`
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
This code is with an ordered list and paragraphs.

1. Yes, this is a `code` element
2. No :

* `Some code we are looking at`
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
This code is with an ordered list and paragraphs.

1. Yes, this is a `code` element
2. No :

* `Some code we are looking at`
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
This code is with an ordered list and paragraphs.

1. Yes, this is a `code` element
2. No :

* `Some code we are looking at`
26 changes: 26 additions & 0 deletions test/Html2Markdown.Test/MarkdownSchemeConverterTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -608,6 +608,32 @@ public Task Convert_WhenThereIsAnOrderedListWithANestedUnorderedList_ThenReplace
return CheckConversion(html);
}

/// <summary>
/// Converts a single-line HTML fragment in which every ordered-list item wraps
/// its text in a paragraph tag, and the second item also nests an unordered list.
/// </summary>
[Test]
public Task Convert_WhenThereIsAnOrderedListWithNestedParagraphs_ThenReplaceWithMarkdownLists()
{
    // Input is deliberately on one line (no line breaks between tags).
    const string orderedListWithParagraphs = @"<p>This code is with an ordered list and paragraphs.</p><ol><li><p>Yes, this is a <code>code</code> element</p></li><li><p>No :</p><ul><li><code>Some code we are looking at</code></li></ul></li></ol>";

    // NOTE(review): CheckConversion is defined elsewhere in this test class;
    // presumably it verifies the converted markdown against a stored expectation - confirm.
    return CheckConversion(orderedListWithParagraphs);
}

/// <summary>
/// Converts a multi-line HTML fragment: an ordered list whose items wrap their
/// text in paragraph tags, with closing tags on their own lines and an
/// unordered list nested inside the second item.
/// </summary>
[Test]
public Task Convert_WhenThereIsAMultilineOrderedListWithNestedParagraphsAndCodeElement_ThenReplaceWithMarkdownLists()
{
// The line breaks inside this verbatim string are part of the input under test
// (including the trailing one before the closing quote); do not reformat it.
const string html = @"<p>This code is with an ordered list and paragraphs.</p>
<ol>
<li><p>Yes, this is a <code>code</code> element</p>
</li>
<li><p>No :</p>
<ul>
<li><code>Some code we are looking at</code></li>
</ul>
</li>
</ol>
";

// NOTE(review): CheckConversion is defined elsewhere in this test class;
// presumably it verifies the converted markdown against a stored expectation - confirm.
return CheckConversion(html);
}

#endregion

#region Extra HTML Removal
Expand Down

0 comments on commit 731b2c4

Please sign in to comment.