Skip to content

Commit

Permalink
[Data Liberation] Add HTML to Blocks converter (#2095)
Browse files Browse the repository at this point in the history
Adds a basic `WP_HTML_To_Blocks` class that accepts HTML and outputs
block markup.

It's a very basic converter. It only considers the markup and won't
consider any visual changes introduced via CSS or JavaScript. Only a few
core blocks are supported in this initial PR. The API can easily support
more HTML elements and blocks.

To preserve visual fidelity between the original HTML page and the
produced block markup, we'll need an annotated HTML input produced by
the [Try WordPress](https://github.com/WordPress/try-wordpress/) browser
extension. It would contain each element's colors, sizes, etc. We cannot
possibly get all of that from just analyzing the HTML on the server without
building a full-blown, browser-like HTML renderer in PHP, and I know I'm
not building one.

A part of #1894

 ## Example

```php
$html = <<<HTML
<meta name="post_title" content="My first post">
<p>Hello <b>world</b>!</p>
HTML;

$converter = new WP_HTML_To_Blocks( $html );
$converter->convert();

var_dump( $converter->get_all_metadata() );
/*
 * array( 'post_title' => array( 'My first post' ) )
 */

var_dump( $converter->get_block_markup() );
/*
 * <!-- wp:paragraph -->
 * <p>Hello <b>world</b>!</p>
 * <!-- /wp:paragraph -->
 */
```

 ## Caveats

I had to patch WP_HTML_Processor to stop bailing out on `<meta>` tags
referencing the document charset. Ideally we'd patch WordPress core to
stop bailing out when the charset is UTF-8.

 ## Testing instructions

This PR mostly adds new code. Just confirm the unit tests pass in CI.

cc @brandonpayton @zaerl @sirreal @dmsnell @ellatrix
  • Loading branch information
adamziel authored Dec 19, 2024
1 parent d84c326 commit ee3ce32
Show file tree
Hide file tree
Showing 16 changed files with 4,143 additions and 11 deletions.
1 change: 1 addition & 0 deletions .eslintignore
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ __pycache__
packages/playground/wordpress-builds/src/wordpress
packages/playground/wordpress-builds/public
packages/playground/sync/src/test/wp-*
packages/playground/data-liberation/tests/fixtures
packages/php-wasm/node/src/test/__test*
*.timestamp-1678999213403.mjs
.local
Expand Down
1 change: 1 addition & 0 deletions .prettierignore
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
/packages/playground/wordpress-builds/build/build-assets
/packages/playground/wordpress-builds/src/wordpress
/packages/playground/wordpress-builds/public/
/packages/playground/data-liberation/tests/fixtures
/packages/php-wasm/node/src/test/__test*
__pycache__
*.timestamp-1678999213403.mjs
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@ public function get_all_metadata() {
return $this->frontmatter;
}

public function get_meta_value( $key ) {
public function get_first_meta_value( $key ) {
if ( ! array_key_exists( $key, $this->frontmatter ) ) {
return null;
}
Expand Down
4 changes: 4 additions & 0 deletions packages/playground/data-liberation/bootstrap.php
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,13 @@
require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Processor.php';
require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Url_Processor.php';
require_once __DIR__ . '/src/block-markup/WP_URL_In_Text_Processor.php';
require_once __DIR__ . '/src/block-markup/WP_HTML_To_Blocks.php';
require_once __DIR__ . '/src/block-markup/WP_URL.php';

require_once __DIR__ . '/src/xml-api/WP_XML_Decoder.php';
require_once __DIR__ . '/src/xml-api/WP_XML_Processor.php';
require_once __DIR__ . '/src/wxr/WP_WXR_Reader.php';
require_once __DIR__ . '/src/import/WP_Import_Utils.php';
require_once __DIR__ . '/src/import/WP_Block_Object.php';
require_once __DIR__ . '/src/import/WP_Entity_Importer.php';
require_once __DIR__ . '/src/import/WP_File_Visitor.php';
Expand All @@ -64,6 +66,8 @@
require_once __DIR__ . '/src/import/WP_Stream_Importer.php';
require_once __DIR__ . '/src/import/WP_Entity_Iterator_Chain.php';
require_once __DIR__ . '/src/import/WP_Retry_Frontloading_Iterator.php';
require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php';
require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php';

require_once __DIR__ . '/src/utf8_decoder.php';

Expand Down
2 changes: 2 additions & 0 deletions packages/playground/data-liberation/phpunit.xml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
<phpunit xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" bootstrap="bootstrap.php" colors="true" xsi:noNamespaceSchemaLocation="https://schema.phpunit.de/10.0/phpunit.xsd" cacheDirectory=".phpunit.cache">
<testsuites>
<testsuite name="Application Test Suite">
<file>tests/WPHTMLEntityReaderTests.php</file>
<file>tests/WPHTMLToBlocksTests.php</file>
<file>tests/WPWXRReaderTests.php</file>
<file>tests/WPRewriteUrlsTests.php</file>
<file>tests/WPURLInTextProcessorTests.php</file>
Expand Down
Original file line number Diff line number Diff line change
@@ -1,8 +1,57 @@
<?php

/**
* Represents a {Data Format} -> Block Markup + Metadata converter.
*
* Used by the Data Liberation importers to accept data formatted as HTML, Markdown, etc.
* and convert them to WordPress posts.
*
* The input document is supplied to the implementing class' constructor;
* convert() must be called before the getters below return meaningful data.
*/
interface WP_Block_Markup_Converter {
/**
* Converts the input document specified in the constructor to block markup.
*
* @return bool Whether the conversion was successful.
*/
public function convert();

/**
* Gets the block markup generated by the convert() method.
*
* @return string The block markup.
*/
public function get_block_markup();

/**
* Gets all the metadata sourced from the input document by the convert() method.
* The data format is:
*
* array(
* 'post_title' => array( 'The Name of the Wind' ),
* 'post_author' => array( 'Patrick Rothfuss', 'Betsy Wollheim' )
* )
*
* Note each meta key may have multiple values. The consumer of this interface
* must account for this.
*
* @return array The metadata sourced from the input document, keyed by meta key,
*               with a list of values under each key.
*/
public function get_all_metadata();
// NOTE(review): this looks like the older name of get_first_meta_value()
// declared below — confirm whether both signatures are meant to coexist.
public function get_meta_value( $key );

/**
* Gets the first metadata value for a given key.
*
* Example:
*
* Metadata:
* array(
* 'post_title' => array( 'The Name of the Wind' ),
* 'post_author' => array( 'Patrick Rothfuss', 'Betsy Wollheim' )
* )
*
* get_first_meta_value( 'post_author' ) returns 'Patrick Rothfuss'.
*
* NOTE(review): at least one implementer returns null when the key is
* absent — confirm that this is the contract expected of every implementer.
*
* @param string $key The metadata key.
* @return mixed The first value stored under the given metadata key.
*/
public function get_first_meta_value( $key );
}
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,25 @@ public function get_block_attributes() {
return $this->block_attributes;
}

/**
 * Overwrites all the block attributes of the currently matched block
 * opener.
 *
 * The call is rejected (returns false, nothing is changed) when the
 * current token is not a block comment or when it is a block closer.
 *
 * @param array $attributes The attributes to set.
 * @return bool Whether the attributes were set.
 */
public function set_block_attributes( $attributes ) {
	if ( '#block-comment' !== $this->get_token_type() ) {
		return false;
	}
	if ( $this->is_block_closer() ) {
		return false;
	}
	$this->block_attributes = $attributes;
	// When an attribute iterator exists, the serialization step encodes the
	// iterator's array copy instead of $this->block_attributes. A leftover
	// iterator would therefore write the OLD attributes back and silently
	// discard the ones set here, so drop it.
	// NOTE(review): assumes the iterator is lazily re-created from
	// $this->block_attributes when it is null — confirm against the
	// attribute-iteration code.
	$this->block_attributes_iterator = null;
	// Flag the block so the new attributes are flushed to the block comment.
	$this->block_attributes_updated = true;
	return true;
}

/**
 * Tells whether the currently matched token is a block closer.
 *
 * @return bool True only when a block name is set and the block closer
 *              flag is strictly true.
 */
public function is_block_closer() {
	// Without a block name there is no block, hence no closer.
	if ( null === $this->block_name ) {
		return false;
	}
	return true === $this->block_closer;
}
Expand Down Expand Up @@ -165,17 +184,23 @@ private function block_attribute_updates_to_modifiable_text_updates() {
if ( ! $this->block_attributes_updated ) {
return false;
}
$encoded_attributes = json_encode(
$this->block_attributes_iterator
? $this->block_attributes_iterator->getSubIterator( 0 )->getArrayCopy()
: $this->block_attributes,
JSON_HEX_TAG | // Convert < and > to \u003C and \u003E
JSON_HEX_AMP // Convert & to \u0026
);
if ( $encoded_attributes === '[]' ) {
$encoded_attributes = '';
} else {
$encoded_attributes .= ' ';
}
$this->set_modifiable_text(
' ' .
$this->block_name . ' ' .
json_encode(
$this->block_attributes_iterator
? $this->block_attributes_iterator->getSubIterator( 0 )->getArrayCopy()
: $this->block_attributes,
JSON_HEX_TAG | // Convert < and > to \u003C and \u003E
JSON_HEX_AMP // Convert & to \u0026
)
. ' '
$this->block_name .
' ' .
$encoded_attributes
);

return true;
Expand Down
Loading

0 comments on commit ee3ce32

Please sign in to comment.