From ee3ce32f9911c6e7f48eb3d9c1907b8a9a0a3f28 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Adam=20Zieli=C5=84ski?= Date: Thu, 19 Dec 2024 14:00:52 +0100 Subject: [PATCH] [Data Liberation] Add HTML to Blocks converter (#2095) Adds a basic `WP_HTML_To_Blocks` class that accepts HTML and outputs block markup. It's a very basic converter. It only considers the markup and won't consider any visual changes introduced via CSS or JavaScript. Only a few core blocks are supported in this initial PR. The API can easily support more HTML elements and blocks. To preserve visual fidelity between the original HTML page and the produced block markup, we'll need an annotated HTML input produced by the [Try WordPress](https://github.com/WordPress/try-wordpress/) browser extension. It would contain each element's colors, sizes, etc. We cannot possibly get all of that from just analyzing the HTML on the server without building a full-blown, browser-like HTML renderer in PHP, and I know I'm not building one. A part of #1894 ## Example ```php $html = <<<HTML
<meta name="post_title" content="My first post">
<p>Hello world!</p>
HTML; $converter = new WP_HTML_To_Blocks( $html ); $converter->convert(); var_dump( $converter->get_all_metadata() ); /* * array( 'post_title' => array( 'My first post' ) ) */ var_dump( $converter->get_block_markup() ); /*
 * <!-- wp:paragraph -->
 * <p>Hello world!</p>
 * <!-- /wp:paragraph -->
 */ ``` ## Caveats I had to patch WP_HTML_Processor to stop bailing out on `<meta>` tags referencing the document charset. Ideally we'd patch WordPress core to stop bailing out when the charset is UTF-8. ## Testing instructions This PR mostly adds new code. Just confirm the unit tests pass in CI. cc @brandonpayton @zaerl @sirreal @dmsnell @ellatrix --- .eslintignore | 1 + .prettierignore | 1 + .../src/WP_Markdown_To_Blocks.php | 2 +- .../playground/data-liberation/bootstrap.php | 4 + .../playground/data-liberation/phpunit.xml | 2 + .../WP_Block_Markup_Converter.php | 51 +- .../WP_Block_Markup_Processor.php | 43 +- .../src/block-markup/WP_HTML_To_Blocks.php | 428 +++ .../src/entity-readers/WP_Entity_Reader.php | 95 + .../entity-readers/WP_HTML_Entity_Reader.php | 140 + .../src/import/WP_Import_Utils.php | 60 + .../src/import/WP_Imported_Entity.php | 30 + .../tests/WPHTMLEntityReaderTests.php | 70 + .../tests/WPHTMLToBlocksTests.php | 151 + .../html-to-blocks/excerpt.input.html | 189 ++ .../html-to-blocks/excerpt.output.html | 2887 +++++++++++++++++ 16 files changed, 4143 insertions(+), 11 deletions(-) create mode 100644 packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php create mode 100644 packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php create mode 100644 packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php create mode 100644 packages/playground/data-liberation/src/import/WP_Import_Utils.php create mode 100644 packages/playground/data-liberation/tests/WPHTMLEntityReaderTests.php create mode 100644 packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php create mode 100644 packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.input.html create mode 100644 packages/playground/data-liberation/tests/fixtures/html-to-blocks/excerpt.output.html diff --git a/.eslintignore b/.eslintignore index b3ac4e5866..2f55011606 100644 --- a/.eslintignore +++ b/.eslintignore @@ -7,6 
+7,7 @@ __pycache__ packages/playground/wordpress-builds/src/wordpress packages/playground/wordpress-builds/public packages/playground/sync/src/test/wp-* +packages/playground/data-liberation/tests/fixtures packages/php-wasm/node/src/test/__test* *.timestamp-1678999213403.mjs .local diff --git a/.prettierignore b/.prettierignore index 9162807152..de4d6784be 100644 --- a/.prettierignore +++ b/.prettierignore @@ -8,6 +8,7 @@ /packages/playground/wordpress-builds/build/build-assets /packages/playground/wordpress-builds/src/wordpress /packages/playground/wordpress-builds/public/ +/packages/playground/data-liberation/tests/fixtures /packages/php-wasm/node/src/test/__test* __pycache__ *.timestamp-1678999213403.mjs diff --git a/packages/playground/data-liberation-markdown/src/WP_Markdown_To_Blocks.php b/packages/playground/data-liberation-markdown/src/WP_Markdown_To_Blocks.php index f63fb20c52..78918f5b4f 100644 --- a/packages/playground/data-liberation-markdown/src/WP_Markdown_To_Blocks.php +++ b/packages/playground/data-liberation-markdown/src/WP_Markdown_To_Blocks.php @@ -52,7 +52,7 @@ public function get_all_metadata() { return $this->frontmatter; } - public function get_meta_value( $key ) { + public function get_first_meta_value( $key ) { if ( ! array_key_exists( $key, $this->frontmatter ) ) { return null; } diff --git a/packages/playground/data-liberation/bootstrap.php b/packages/playground/data-liberation/bootstrap.php index 56d10ac191..91038c1ae3 100644 --- a/packages/playground/data-liberation/bootstrap.php +++ b/packages/playground/data-liberation/bootstrap.php @@ -48,11 +48,13 @@ require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Processor.php'; require_once __DIR__ . '/src/block-markup/WP_Block_Markup_Url_Processor.php'; require_once __DIR__ . '/src/block-markup/WP_URL_In_Text_Processor.php'; +require_once __DIR__ . '/src/block-markup/WP_HTML_To_Blocks.php'; require_once __DIR__ . '/src/block-markup/WP_URL.php'; require_once __DIR__ . 
'/src/xml-api/WP_XML_Decoder.php'; require_once __DIR__ . '/src/xml-api/WP_XML_Processor.php'; require_once __DIR__ . '/src/wxr/WP_WXR_Reader.php'; +require_once __DIR__ . '/src/import/WP_Import_Utils.php'; require_once __DIR__ . '/src/import/WP_Block_Object.php'; require_once __DIR__ . '/src/import/WP_Entity_Importer.php'; require_once __DIR__ . '/src/import/WP_File_Visitor.php'; @@ -64,6 +66,8 @@ require_once __DIR__ . '/src/import/WP_Stream_Importer.php'; require_once __DIR__ . '/src/import/WP_Entity_Iterator_Chain.php'; require_once __DIR__ . '/src/import/WP_Retry_Frontloading_Iterator.php'; +require_once __DIR__ . '/src/entity-readers/WP_Entity_Reader.php'; +require_once __DIR__ . '/src/entity-readers/WP_HTML_Entity_Reader.php'; require_once __DIR__ . '/src/utf8_decoder.php'; diff --git a/packages/playground/data-liberation/phpunit.xml b/packages/playground/data-liberation/phpunit.xml index 800b55f189..9646f33205 100644 --- a/packages/playground/data-liberation/phpunit.xml +++ b/packages/playground/data-liberation/phpunit.xml @@ -2,6 +2,8 @@ + tests/WPHTMLEntityReaderTests.php + tests/WPHTMLToBlocksTests.php tests/WPWXRReaderTests.php tests/WPRewriteUrlsTests.php tests/WPURLInTextProcessorTests.php diff --git a/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Converter.php b/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Converter.php index e3cd04b6de..1133293296 100644 --- a/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Converter.php +++ b/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Converter.php @@ -1,8 +1,57 @@ Block Markup + Metadata converter. + * + * Used by the Data Liberation importers to accept data formatted as HTML, Markdown, etc. + * and convert them to WordPress posts. + */ interface WP_Block_Markup_Converter { + /** + * Converts the input document specified in the constructor to block markup. + * + * @return bool Whether the conversion was successful. 
+ */ public function convert(); + + /** + * Gets the block markup generated by the convert() method. + * + * @return string The block markup. + */ public function get_block_markup(); + + /** + * Gets all the metadata sourced from the input document by the convert() method. + * The data format is: + * + * array( + * 'post_title' => array( 'The Name of the Wind' ), + * 'post_author' => array( 'Patrick Rothfuss', 'Betsy Wollheim' ) + * ) + * + * Note each meta key may have multiple values. The consumer of this interface + * must account for this. + * + * @return array The metadata sourced from the input document. + */ public function get_all_metadata(); - public function get_meta_value( $key ); + + /** + * Gets the first metadata value for a given key. + * + * Example: + * + * Metadata: + * array( + * 'post_title' => array( 'The Name of the Wind' ), + * 'post_author' => array( 'Patrick Rothfuss', 'Betsy Wollheim' ) + * ) + * + * get_first_meta_value( 'post_author' ) returns 'Patrick Rothfuss'. + * + * @param string $key The metadata key. + * @return mixed The metadata value. + */ + public function get_first_meta_value( $key ); } diff --git a/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Processor.php b/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Processor.php index 101cc63484..04785fe138 100644 --- a/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Processor.php +++ b/packages/playground/data-liberation/src/block-markup/WP_Block_Markup_Processor.php @@ -58,6 +58,25 @@ public function get_block_attributes() { return $this->block_attributes; } + /** + * Overwrites all the block attributes of the currently matched block + * opener. + * + * @param array $attributes The attributes to set. + * @return bool Whether the attributes were set. 
+ */ + public function set_block_attributes( $attributes ) { + if ( '#block-comment' !== $this->get_token_type() ) { + return false; + } + if ( $this->is_block_closer() ) { + return false; + } + $this->block_attributes = $attributes; + $this->block_attributes_updated = true; + return true; + } + public function is_block_closer() { return $this->block_name !== null && $this->block_closer === true; } @@ -165,17 +184,23 @@ private function block_attribute_updates_to_modifiable_text_updates() { if ( ! $this->block_attributes_updated ) { return false; } + $encoded_attributes = json_encode( + $this->block_attributes_iterator + ? $this->block_attributes_iterator->getSubIterator( 0 )->getArrayCopy() + : $this->block_attributes, + JSON_HEX_TAG | // Convert < and > to \u003C and \u003E + JSON_HEX_AMP // Convert & to \u0026 + ); + if ( $encoded_attributes === '[]' ) { + $encoded_attributes = ''; + } else { + $encoded_attributes .= ' '; + } $this->set_modifiable_text( ' ' . - $this->block_name . ' ' . - json_encode( - $this->block_attributes_iterator - ? $this->block_attributes_iterator->getSubIterator( 0 )->getArrayCopy() - : $this->block_attributes, - JSON_HEX_TAG | // Convert < and > to \u003C and \u003E - JSON_HEX_AMP // Convert & to \u0026 - ) - . ' ' + $this->block_name . + ' ' . + $encoded_attributes ); return true; diff --git a/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php new file mode 100644 index 0000000000..329e75bc39 --- /dev/null +++ b/packages/playground/data-liberation/src/block-markup/WP_HTML_To_Blocks.php @@ -0,0 +1,428 @@ + + *

Hello world!

+ * + * Becomes: + * + * + *

Hello world!

+ * + * + * With the following metadata: + * + * array( + * 'post_title' => array( 'My first post' ), + * ) + */ +class WP_HTML_To_Blocks implements WP_Block_Markup_Converter { + const STATE_READY = 'STATE_READY'; + const STATE_COMPLETE = 'STATE_COMPLETE'; + + private $state = self::STATE_READY; + private $block_stack = array(); + private $html; + private $ignore_text = false; + private $in_ephemeral_paragraph = false; + private $block_markup = ''; + private $metadata = array(); + + public function __construct( $html ) { + $this->html = WP_HTML_Processor::create_fragment( $html ); + } + + /** + * @inheritDoc + */ + public function convert() { + if ( self::STATE_READY !== $this->state ) { + return false; + } + + while ( $this->html->next_token() ) { + switch ( $this->html->get_token_type() ) { + case '#text': + if ( $this->ignore_text ) { + break; + } + $this->append_html( htmlspecialchars( $this->html->get_modifiable_text() ) ); + break; + case '#tag': + $this->handle_tag(); + break; + } + } + + $this->close_ephemeral_paragraph(); + return true; + } + + /** + * @inheritDoc + */ + public function get_first_meta_value( $key ) { + if ( ! array_key_exists( $key, $this->metadata ) ) { + return null; + } + return $this->metadata[ $key ][0]; + } + + /** + * @inheritDoc + */ + public function get_all_metadata() { + return $this->metadata; + } + + /** + * @inheritDoc + */ + public function get_block_markup() { + return $this->block_markup; + } + + /** + * Converts the currently matched HTML tag to block markup + * or metadata. + */ + private function handle_tag() { + $html = $this->html; + $tag = $html->get_tag(); + $tag_lowercase = strtolower( $tag ); + + $is_opener = ! $html->is_tag_closer() && $html->expects_closer(); + $is_closer = $html->is_tag_closer(); + $is_void_tag = ! $html->expects_closer(); + $prefix = ( + $is_void_tag ? '' : ( + $is_closer ? '-' : '+' + ) + ); + $event = $prefix . 
$tag; + switch ( $event ) { + case 'META': + $key = $html->get_attribute( 'name' ); + $value = $html->get_attribute( 'content' ); + if ( ! array_key_exists( $key, $this->metadata ) ) { + $this->metadata[ $key ] = array(); + } + $this->metadata[ $key ][] = $value; + break; + case 'IMG': + $template = new \WP_HTML_Tag_Processor( '' ); + $template->next_tag(); + foreach ( array( 'alt', 'title', 'src' ) as $attr ) { + if ( $html->get_attribute( $attr ) ) { + $template->set_attribute( $attr, $html->get_attribute( $attr ) ); + } + } + $this->append_html( $template->get_updated_html() ); + break; + case 'INPUT': + // Insert the input tag as HTML blocks. + $this->push_block( 'html' ); + $template = new \WP_HTML_Tag_Processor( '' ); + $template->next_tag(); + $attrs = $this->html->get_attribute_names_with_prefix( '' ); + foreach ( $attrs as $attr ) { + $template->set_attribute( $attr, $this->html->get_attribute( $attr ) ); + } + $this->append_html( htmlspecialchars( $template->get_updated_html() ) ); + $this->pop_block(); + break; + case 'HR': + $this->push_block( 'separator' ); + $this->block_markup .= '
'; + $this->pop_block(); + break; + + // Block elements + case '+SCRIPT': + $this->ignore_text = true; + break; + case '-SCRIPT': + $this->ignore_text = false; + break; + + case '+UL': + case '+OL': + $this->push_block( 'list', array( 'ordered' => $tag === 'ol' ) ); + $this->block_markup .= '
    '; + break; + case '-UL': + case '-OL': + $this->block_markup .= '
'; + $this->pop_block(); + break; + + case '+LI': + $this->push_block( 'list-item' ); + $this->block_markup .= '<' . $tag_lowercase . '>'; + break; + case '-LI': + $this->block_markup .= ''; + $this->pop_block(); + break; + + case '+TABLE': + $this->push_block( 'table' ); + $this->block_markup .= '
'; + $this->block_markup .= ''; + break; + case '-TABLE': + $this->block_markup .= '
'; + $this->block_markup .= '
'; + $this->pop_block(); + break; + + case '+THEAD': + case '+TBODY': + case '+TFOOT': + case '+TR': + case '+TD': + case '+TH': + $this->block_markup .= '<' . $tag_lowercase . '>'; + break; + case '-THEAD': + case '-TBODY': + case '-TFOOT': + case '-TR': + case '-TD': + case '-TH': + $this->block_markup .= ''; + break; + + case '+BLOCKQUOTE': + $this->push_block( 'quote' ); + $this->block_markup .= '<' . $tag_lowercase . '>'; + break; + case '-BLOCKQUOTE': + $this->block_markup .= ''; + $this->pop_block(); + break; + + case '+PRE': + $this->push_block( 'code' ); + $this->block_markup .= '<' . $tag_lowercase . ' class="wp-block-code">'; + break; + case '-PRE': + $this->block_markup .= ''; + $this->pop_block(); + break; + + case '+CODE': + /* + * Guess whether this is: + * - An inline element? Let's convert it into a formatting element. + * - A block element? Let's convert it into a block. + */ + if ( $this->is_at_inline_code_element() ) { + $this->append_html( '<' . $tag_lowercase . '>' ); + } else { + $this->push_block( 'code' ); + $this->block_markup .= '<' . $tag_lowercase . ' class="wp-block-code">'; + } + break; + case '-CODE': + $this->block_markup .= ''; + if ( ! $this->is_at_inline_code_element() ) { + $this->pop_block(); + } + break; + + case '+P': + $this->push_block( 'paragraph' ); + $this->block_markup .= '

'; + break; + case '-P': + $this->block_markup .= '

'; + $this->pop_block(); + break; + + case '+H1': + case '+H2': + case '+H3': + case '+H4': + case '+H5': + case '+H6': + $this->push_block( + 'heading', + array( + 'level' => (int) $tag[1] ? (int) $tag[1] : 1, + ) + ); + $this->block_markup .= ''; + break; + case '-H1': + case '-H2': + case '-H3': + case '-H4': + case '-H5': + case '-H6': + $this->block_markup .= ''; + $this->pop_block(); + break; + + // Inline elements + case '+A': + $template = new \WP_HTML_Tag_Processor( '' ); + $template->next_tag(); + if ( $html->get_attribute( 'href' ) ) { + $template->set_attribute( 'href', $html->get_attribute( 'href' ) ); + } + $this->append_html( $template->get_updated_html() ); + break; + case '-A': + $this->block_markup .= ''; + break; + + // Formats – just pass through (minus the HTML attributes) + default: + if ( $this->should_preserve_tag_in_rich_text( $tag ) ) { + if ( $is_opener ) { + $this->append_html( '<' . $tag_lowercase . '>' ); + } elseif ( $is_closer ) { + $this->append_html( '' ); + } + } else { + /* + * Ignore all the other tags. We've included all the meaningful + * handlers in the switch statement above and there's not much + * we can do with generic tags such as
, ,
, etc. + */ + } + break; + } + } + + /** + * Checks whether the given tag is an inline formatting element + * that we want to preserve when parsing rich text. For example, + * tags are meaningful from the rich text perspective, but + *
tags are not. + * + * @param string $tag The tag to check. + * @return bool Whether the tag should be preserved in rich text. + */ + private function should_preserve_tag_in_rich_text( $tag ) { + return in_array( + $tag, + array( + 'B', + 'STRONG', + 'I', + 'U', + 'S', + 'SMALL', + 'SUP', + 'SUB', + 'MARK', + 'EM', + 'CITE', + 'DFN', + 'CODE', + 'KBD', + 'SAMP', + 'VAR', + ), + true + ); + } + + private function is_at_inline_code_element() { + $breadcrumbs = $this->html->get_breadcrumbs(); + foreach ( $breadcrumbs as $tag ) { + switch ( $tag ) { + case 'A': + case 'P': + case 'LI': + case 'TABLE': + case 'H1': + case 'H2': + case 'H3': + case 'H4': + case 'H5': + case 'H6': + return true; + } + } + return false; + } + + /** + * Appends a snippet of HTML to the block markup. + * Ensures given $html is a part of a block. If no block is + * currently open, it appends a new paragraph block. + * + * @param string $html The HTML snippet to append. + */ + private function append_html( $html ) { + $html = trim( $html ); + if ( empty( $html ) ) { + return; + } + // Make sure two subsequent append_html() calls don't merge the text. + $html .= ' '; + $this->ensure_open_block(); + $this->block_markup .= $html; + } + + /** + * Pushes a new block onto the stack of open blocks and appends the block + * opener to the block markup. + * + * @param string $name The name of the block to push. + * @param array $attributes The attributes of the block to push. + */ + private function push_block( $name, $attributes = array() ) { + $this->close_ephemeral_paragraph(); + $block = new \WP_Block_Object( $name, $attributes ); + array_push( $this->block_stack, $block ); + $this->block_markup .= WP_Import_Utils::block_opener( $block->block_name, $block->attrs ) . "\n"; + } + + /** + * Pops the last block from the stack of open blocks and appends the block + * closer to the block markup. + * + * @return \WP_Block_Object The last block that was popped. + */ + private function pop_block() { + if ( ! 
empty( $this->block_stack ) ) { + $popped = array_pop( $this->block_stack ); + $this->block_markup .= WP_Import_Utils::block_closer( $popped->block_name ) . "\n"; + return $popped; + } + } + + /** + * Ensures that a block is open. If no block is currently open, it appends + * a new, ephemeral paragraph block that will be automatically closed + * when the next block opens OR when the HTML ends. + */ + private function ensure_open_block() { + if ( empty( $this->block_stack ) && ! $this->in_ephemeral_paragraph ) { + $this->block_markup .= WP_Import_Utils::block_opener( 'paragraph' ) . "\n"; + $this->block_markup .= '

'; + $this->in_ephemeral_paragraph = true; + } + } + + /** + * Closes the ephemeral paragraph if it is currently open. + */ + private function close_ephemeral_paragraph() { + if ( $this->in_ephemeral_paragraph ) { + $this->block_markup .= '

'; + $this->block_markup .= WP_Import_Utils::block_closer( 'paragraph' ); + $this->in_ephemeral_paragraph = false; + } + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php new file mode 100644 index 0000000000..a45017fd0f --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_Entity_Reader.php @@ -0,0 +1,95 @@ +get_entity() && ! $this->is_finished() && ! $this->get_last_error() ) { + $this->next(); + } + return $this->get_entity(); + } + + private $last_next_result = null; + public function next(): void { + // @TODO: Don't keep track of this. Just make sure the next_entity() + // call will make the is_finished() true. + $this->last_next_result = $this->next_entity(); + } + + public function key(): string { + return $this->get_reentrancy_cursor(); + } + + public function valid(): bool { + return false !== $this->last_next_result && ! $this->is_finished() && ! $this->get_last_error(); + } + + public function rewind(): void { + // Haven't started yet. + if ( null === $this->last_next_result ) { + return; + } + _doing_it_wrong( + __METHOD__, + 'WP_WXR_Entity_Reader does not support rewinding.', + null + ); + } +} diff --git a/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php new file mode 100644 index 0000000000..95923ef390 --- /dev/null +++ b/packages/playground/data-liberation/src/entity-readers/WP_HTML_Entity_Reader.php @@ -0,0 +1,140 @@ +html = $html; + $this->post_id = $post_id; + } + + /** + * Advances to the next entity. + * + * @return bool Whether the next entity was found. + */ + public function next_entity() { + // If we're finished, we're finished. + if ( $this->finished ) { + return false; + } + + // If we've already read some entities, skip to the next one. 
+ if ( null !== $this->entities ) { + if ( count( $this->entities ) <= 1 ) { + $this->finished = true; + return false; + } + array_shift( $this->entities ); + return true; + } + + // We did not read any entities yet. Let's convert the HTML document into entities. + $converter = new WP_HTML_To_Blocks( $this->html ); + if ( false === $converter->convert() ) { + return false; + } + + $all_metadata = $converter->get_all_metadata(); + $post_fields = array(); + $other_metadata = array(); + foreach ( $all_metadata as $key => $values ) { + if ( in_array( $key, WP_Imported_Entity::POST_FIELDS, true ) ) { + $post_fields[ $key ] = $values[0]; + } else { + $other_metadata[ $key ] = $values[0]; + } + } + + // Emit the post entity. + $this->entities[] = new WP_Imported_Entity( + 'post', + array_merge( + $post_fields, + array( + 'post_id' => $this->post_id, + 'content' => $converter->get_block_markup(), + ) + ) + ); + + // Emit all the metadata that don't belong to the post entity. + foreach ( $other_metadata as $key => $value ) { + $this->entities[] = new WP_Imported_Entity( + 'post_meta', + array( + 'post_id' => $this->post_id, + 'meta_key' => $key, + 'meta_value' => $value, + ) + ); + } + return true; + } + + /** + * Returns the current entity. + * + * @return WP_Imported_Entity|false The current entity, or false if there are no entities left. + */ + public function get_entity() { + if ( $this->is_finished() ) { + return false; + } + return $this->entities[0]; + } + + /** + * Checks if this reader has finished yet. + * + * @return bool Whether the reader has finished. + */ + public function is_finished(): bool { + return $this->finished; + } + + /** + * Returns the last error that occurred when processing the HTML. + * + * @return string|null The last error, or null if there was no error. 
+ */ + public function get_last_error(): ?string { + return null; + } +} diff --git a/packages/playground/data-liberation/src/import/WP_Import_Utils.php b/packages/playground/data-liberation/src/import/WP_Import_Utils.php new file mode 100644 index 0000000000..61dbf97a82 --- /dev/null +++ b/packages/playground/data-liberation/src/import/WP_Import_Utils.php @@ -0,0 +1,60 @@ +"; + $processor = new WP_Block_Markup_Processor( $template ); + $processor->next_token(); + $processor->set_block_attributes( $attrs ); + return $processor->get_updated_html(); + } + + /** + * Generates a block closer comment. + * + * @param string $block_name The name of the block. + * @return string The block closer. + */ + public static function block_closer( $block_name ) { + return ""; + } + + /** + * Convert an array of WP_Block_Object objects to HTML markup. + * + * @param array $blocks The blocks to convert to markup. + * @return string The HTML markup. + */ + public static function convert_blocks_to_markup( $blocks ) { + $block_markup = ''; + + foreach ( $blocks as $block ) { + // Allow mixing of inner blocks and content strings. + if ( is_string( $block ) ) { + $block_markup .= $block; + continue; + } + // Start of block comment + $block_markup .= self::block_opener( $block->block_name, $block->attrs ); + $block_markup .= $block->attrs['content'] ?? ''; + $block_markup .= self::convert_blocks_to_markup( $block->inner_blocks ); + $block_markup .= self::block_closer( $block->block_name ); + } + + return $block_markup; + } +} diff --git a/packages/playground/data-liberation/src/import/WP_Imported_Entity.php b/packages/playground/data-liberation/src/import/WP_Imported_Entity.php index 96c3dd3dd2..41a11e8491 100644 --- a/packages/playground/data-liberation/src/import/WP_Imported_Entity.php +++ b/packages/playground/data-liberation/src/import/WP_Imported_Entity.php @@ -1,5 +1,9 @@ + + + +

It is our pleasure to announce that WordPress 6.8 was released

+

Last week, WordPress 6.8 was released.

+HTML; + $reader = new WP_HTML_Entity_Reader( $html, 1 ); + $entities = []; + while ( $reader->next_entity() ) { + $data = $reader->get_entity()->get_data(); + if(isset($data['content'])) { + $data['content'] = $this->normalize_markup( $data['content'] ); + } + $entities[] = [ + 'type' => $reader->get_entity()->get_type(), + 'data' => $data, + ]; + } + $expected_entities = [ + [ + 'type' => 'post', + 'data' => [ + 'post_title' => 'WordPress 6.8 was released', + 'post_date' => '2024-12-16', + 'post_id' => 1, + 'content' => $this->normalize_markup(<< +

It is our pleasure to announce that WordPress 6.8 was released

+ + + +

Last week, WordPress 6.8 was released.

+ +HTML) + ] + ], + [ + 'type' => 'post_meta', + 'data' => [ + 'post_id' => 1, + 'meta_key' => 'custom_post_meta', + 'meta_value' => 'custom_post_meta_value', + ] + ], + [ + 'type' => 'post_meta', + 'data' => [ + 'post_id' => 1, + 'meta_key' => 'color_palette', + 'meta_value' => 'use_that_pretty_one', + ] + ], + ]; + $this->assertEquals( $expected_entities, $entities ); + } + + private function normalize_markup( $markup ) { + return str_replace( "\n", '', WP_HTML_Processor::create_fragment( $markup )->serialize() ); + } + +} diff --git a/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php new file mode 100644 index 0000000000..41d6ba8ae8 --- /dev/null +++ b/packages/playground/data-liberation/tests/WPHTMLToBlocksTests.php @@ -0,0 +1,151 @@ + + + + + + + +

WordPress 6.8 was released

+

Last week, WordPress 6.8 was released. This release includes a new default theme, a new block editor experience, and a new block library. It also includes a new block editor experience, and a new block library.

+HTML; + $converter = new WP_HTML_To_Blocks( $html ); + $converter->convert( $html ); + $metadata = $converter->get_all_metadata(); + $expected_metadata = [ + 'post_title' => ['WordPress 6.8 was released'], + 'post_date' => ['2024-12-16'], + 'post_modified' => ['2024-12-16'], + 'post_author' => ['1'], + 'post_author_name' => ['The WordPress Team'], + 'post_author_url' => ['https://wordpress.org'], + 'post_author_avatar' => ['https://wordpress.org/wp-content/uploads/2024/04/wordpress-logo-2024.png'], + ]; + $this->assertEquals( $expected_metadata, $metadata ); + } + + /** + * @dataProvider provider_test_conversion + */ + public function test_html_to_blocks_conversion( $html, $expected ) { + $converter = new WP_HTML_To_Blocks( $html ); + $converter->convert( $html ); + $blocks = $converter->get_block_markup(); + + $this->assertEquals( $this->normalize_markup($expected), $this->normalize_markup($blocks) ); + } + + private function normalize_markup( $markup ) { + $processor = WP_HTML_Processor::create_fragment( $markup ); + $serialized = $processor->serialize(); + $serialized = trim( + str_replace( + // Naively remove all the newlines to prevent minor formatting differences + // from causing false negatives in $expected === $actual. + "\n", + '', + $serialized + ) + ); + return $serialized; + } + + public function provider_test_conversion() { + return [ + 'A simple paragraph' => [ + 'html' => '

A simple paragraph

', + 'expected' => "

A simple paragraph

" + ], + 'A simple list' => [ + 'html' => '
  • Item 1
  • Item 2
', + 'expected' => <<
    \n
  • Item 1
  • Item 2
+HTML + ], + 'A non-normative list' => [ + 'html' => '