Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
d8ac361
Add spawn_fragment_parser method
sirreal Sep 12, 2024
ad8f8db
Fix the processor context_node
sirreal Sep 12, 2024
e2efee4
Make it public
sirreal Sep 13, 2024
4f5249c
Fix spawn_fragment_parser method
sirreal Sep 13, 2024
eaed863
Process non-body context tests
sirreal Sep 13, 2024
25b18fa
Handle all the different document context in html5lib tests
sirreal Sep 13, 2024
9ac142f
lints
sirreal Sep 13, 2024
9ede14f
Merge branch 'trunk' into html-api/add-spawn-fragment-parser-method
sirreal Nov 6, 2024
3f35886
Make spawned fragment parse have HTML > [context-node-tag] in breadcr…
sirreal Nov 6, 2024
ba9e218
Fallback to context node when checking namespace
sirreal Nov 6, 2024
fe48fa5
Add tests
sirreal Nov 6, 2024
fa4c5cb
Set the form element pointer on the fragment parser
sirreal Nov 6, 2024
fbb5c2f
Merge branch 'trunk' into html-api/add-spawn-fragment-parser-method
sirreal Nov 6, 2024
336050d
Merge branch 'trunk' into html-api/add-spawn-fragment-parser-method
sirreal Nov 6, 2024
943bbdd
Revert "Fallback to context node when checking namespace"
sirreal Nov 6, 2024
e3a0a86
Fix initial namespace on integration nodes
sirreal Nov 6, 2024
9d3b318
Merge branch 'trunk' into html-api/add-spawn-fragment-parser-method
sirreal Nov 7, 2024
27a9781
Rename method, use static constructor, add comments
sirreal Nov 7, 2024
0789538
Update method name in tests
sirreal Nov 8, 2024
5e8b82e
Add ticket to tests
sirreal Nov 8, 2024
7eeec27
Merge branch 'trunk' into html-api/add-spawn-fragment-parser-method
sirreal Nov 11, 2024
37f9ff4
Update method name in html5lib tests
sirreal Nov 12, 2024
80ae6f2
Handle null return from create_fragment
sirreal Nov 12, 2024
9866402
Use a cloned copy of the FORM element from the parent processor
sirreal Nov 12, 2024
bcebeba
Remove stale comment
sirreal Nov 12, 2024
9e11f19
Improve method documentation with examples
sirreal Nov 12, 2024
05801bb
Merge branch 'trunk' into html-api/add-spawn-fragment-parser-method
sirreal Nov 12, 2024
4618b90
Merge branch 'trunk' into html-api/add-spawn-fragment-parser-method
sirreal Nov 13, 2024
a02e238
Improve docblock language and formatting
sirreal Nov 20, 2024
9440890
Improve comment, add PLAINTEXT
sirreal Nov 20, 2024
25ae695
Pull in relevant fixes/improvements from #7777
sirreal Nov 20, 2024
f2b4121
Merge branch 'trunk' into html-api/add-spawn-fragment-parser-method
sirreal Nov 20, 2024
0662156
Fix context node attributes
sirreal Nov 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
114 changes: 114 additions & 0 deletions src/wp-includes/html-api/class-wp-html-processor.php
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,120 @@ function ( WP_HTML_Token $token ): void {
};
}

/**
 * Creates a fragment processor at the current node.
 *
 * HTML Fragment parsing always happens with a context node. HTML Fragment Processors can be
 * instantiated with a `BODY` context node via `WP_HTML_Processor::create_fragment( $html )`.
 *
 * The context node may impact how a fragment of HTML is parsed. For example, consider the HTML
 * fragment `<td />Inside TD?</td>`.
 *
 * A BODY context node will produce the following tree:
 *
 *     └─#text Inside TD?
 *
 * Notice that the `<td>` tags are completely ignored.
 *
 * Compare that with an SVG context node that produces the following tree:
 *
 *     ├─svg:td
 *     └─#text Inside TD?
 *
 * Here, a `td` node in the `svg` namespace is created, and its self-closing flag is respected.
 * This is a peculiarity of parsing HTML in foreign content like SVG.
 *
 * Finally, consider the tree produced with a TABLE context node:
 *
 *     └─TBODY
 *       └─TR
 *         └─TD
 *           └─#text Inside TD?
 *
 * These examples demonstrate how important the context node may be when processing an HTML
 * fragment. Special care must be taken when processing fragments that are expected to appear
 * in specific contexts. SVG and TABLE are good examples, but there are others.
 *
 * No fragment processor is created, and null is returned, when:
 *
 *  - the processor is not currently paused on a tag,
 *  - the processor is paused on a tag closer,
 *  - the current node is an HTML element that requires a special tokenizer state
 *    (IFRAME, NOEMBED, NOFRAMES, SCRIPT, STYLE, TEXTAREA, TITLE, XMP, PLAINTEXT), or
 *  - the underlying call to `create_fragment()` fails.
 *
 * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
 *
 * @param string $html Input HTML fragment to process.
 * @return static|null The created processor if successful, otherwise null.
 */
public function create_fragment_at_current_node( string $html ) {
	/*
	 * Only a tag opener may serve as the context node. Fragment creation does
	 * not behave as expected when the processor is paused on a tag closer.
	 */
	if ( '#tag' !== $this->get_token_type() || $this->is_tag_closer() ) {
		return null;
	}

	/*
	 * Every decision below (namespace, tokenizer-state check, context node,
	 * integration-point check) is made against this one token so that they
	 * all describe the same node.
	 */
	$context_token = $this->current_element->token;
	$namespace     = $context_token->namespace;

	/*
	 * Prevent creating fragments at nodes that require a special tokenizer state.
	 * This is unsupported by the HTML Processor.
	 */
	$special_tokenizer_state_tags = array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' );
	if ( 'html' === $namespace && in_array( $context_token->node_name, $special_tokenizer_state_tags, true ) ) {
		return null;
	}

	$fragment_processor = static::create_fragment( $html );
	if ( null === $fragment_processor ) {
		return null;
	}

	$fragment_processor->compat_mode = $this->compat_mode;

	// The context node is a detached copy: it gets its own bookmark name and no destroy callback.
	$fragment_processor->context_node                = clone $context_token;
	$fragment_processor->context_node->bookmark_name = 'context-node';
	$fragment_processor->context_node->on_destroy    = null;

	/*
	 * Copy the attributes of the current tag onto the context node description.
	 *
	 * `get_attribute_names_with_prefix()` is documented to return null when no tag
	 * opener is matched. The null fallback is kept so that such a result simply
	 * yields a context node without attributes.
	 */
	$context_attributes = array();
	foreach ( $this->get_attribute_names_with_prefix( '' ) ?? array() as $name ) {
		$context_attributes[ $name ] = $this->get_attribute( $name );
	}

	$fragment_processor->state->context_node = array( $fragment_processor->context_node->node_name, $context_attributes );

	// The fragment's breadcrumbs start at HTML > [context node].
	$fragment_processor->breadcrumbs = array( 'HTML', $fragment_processor->context_node->node_name );

	if ( 'TEMPLATE' === $fragment_processor->context_node->node_name ) {
		$fragment_processor->state->stack_of_template_insertion_modes[] = WP_HTML_Processor_State::INSERTION_MODE_IN_TEMPLATE;
	}

	$fragment_processor->reset_insertion_mode_appropriately();

	/*
	 * > Set the parser's form element pointer to the nearest node to the context element that
	 * > is a form element (going straight up the ancestor chain, and including the element
	 * > itself, if it is a form element), if any. (If there is no such form element, the
	 * > form element pointer keeps its initial value, null.)
	 *
	 * A clone is stored so the fragment processor does not share the token object,
	 * its bookmark, or its destroy callback with this processor.
	 */
	foreach ( $this->state->stack_of_open_elements->walk_up() as $element ) {
		if ( 'FORM' === $element->node_name && 'html' === $element->namespace ) {
			$fragment_processor->state->form_element                = clone $element;
			$fragment_processor->state->form_element->bookmark_name = null;
			$fragment_processor->state->form_element->on_destroy    = null;
			break;
		}
	}

	$fragment_processor->state->encoding_confidence = 'irrelevant';

	/*
	 * Update the parsing namespace near the end of the process.
	 * This is important so that any push/pop from the stack of open
	 * elements does not change the parsing namespace.
	 *
	 * Content inside an integration node is parsed in the HTML namespace
	 * even though the node itself lives in a foreign namespace.
	 */
	$fragment_processor->change_parsing_namespace(
		$context_token->integration_node_type ? 'html' : $namespace
	);

	return $fragment_processor;
}

/**
* Stops the parser and terminates its execution when encountering unsupported markup.
*
Expand Down
60 changes: 60 additions & 0 deletions tests/phpunit/tests/html-api/wpHtmlProcessor.php
Original file line number Diff line number Diff line change
Expand Up @@ -1043,6 +1043,66 @@ public function test_ensure_next_token_method_extensibility( $html, $expected_to
$this->assertEquals( $expected_token_counts, $processor->token_seen_count, 'Snapshot: ' . var_export( $processor->token_seen_count, true ) );
}

/**
 * Ensures that a fragment created at an SVG element is parsed in the `svg` namespace.
 *
 * @ticket 62357
 */
public function test_create_fragment_at_current_node_in_foreign_content() {
	$processor = WP_HTML_Processor::create_full_parser( '<svg>' );
	$this->assertTrue( $processor->next_tag( 'SVG' ), 'Failed to find the SVG context element.' );

	$fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte<rect /><circle></circle><foreignobject><div></div></foreignobject><g>" );

	/*
	 * The method may return null. Fail with a clear assertion instead of
	 * a fatal error from calling a method on null below.
	 */
	$this->assertNotNull( $fragment, 'Failed to create a fragment processor at the SVG element.' );

	$this->assertSame( 'svg', $fragment->get_namespace(), 'The fragment should start in the svg namespace.' );
	$this->assertTrue( $fragment->next_token(), 'Failed to find the first token in the fragment.' );

	/*
	 * In HTML parsing, a nul byte would be ignored.
	 * In SVG it should be replaced with a replacement character.
	 */
	$this->assertSame( '#text', $fragment->get_token_type(), 'The first token should be a text node.' );
	$this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text(), 'The nul byte should have been replaced.' );

	$this->assertTrue( $fragment->next_tag( 'RECT' ), 'Failed to find the RECT element.' );
	$this->assertSame( 'svg', $fragment->get_namespace(), 'RECT should be in the svg namespace.' );

	$this->assertTrue( $fragment->next_tag( 'CIRCLE' ), 'Failed to find the CIRCLE element.' );
	$this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs(), 'Unexpected breadcrumbs at the CIRCLE element.' );
	$this->assertTrue( $fragment->next_tag( 'foreignObject' ), 'Failed to find the foreignObject element.' );
	$this->assertSame( 'svg', $fragment->get_namespace(), 'foreignObject should be in the svg namespace.' );
}

/**
 * Ensures that a fragment created at an HTML integration point (svg:foreignObject)
 * is parsed in the `html` namespace.
 *
 * @ticket 62357
 */
public function test_create_fragment_at_current_node_in_foreign_content_integration_point() {
	$processor = WP_HTML_Processor::create_full_parser( '<svg><foreignObject>' );
	$this->assertTrue( $processor->next_tag( 'foreignObject' ), 'Failed to find the foreignObject context element.' );

	$fragment = $processor->create_fragment_at_current_node( "<image>\0not-preceded-by-nul-byte<rect />" );

	/*
	 * The method may return null. Fail with a clear assertion instead of
	 * a fatal error from calling a method on null below.
	 */
	$this->assertNotNull( $fragment, 'Failed to create a fragment processor at the foreignObject element.' );

	// Nothing has been processed, the html namespace should be used for parsing as an integration point.
	$this->assertSame( 'html', $fragment->get_namespace(), 'The fragment should start in the html namespace.' );

	// HTML parsing transforms IMAGE into IMG.
	$this->assertTrue( $fragment->next_tag( 'IMG' ), 'Failed to find the IMG element.' );

	$this->assertTrue( $fragment->next_token(), 'Failed to find the token following the IMG element.' );

	// In HTML parsing, the nul byte is ignored and the text is reached.
	$this->assertSame( '#text', $fragment->get_token_type(), 'The token following IMG should be a text node.' );
	$this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text(), 'The nul byte should have been ignored.' );

	/*
	 * svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace.
	 * RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close.
	 */
	$this->assertTrue( $fragment->next_tag( 'RECT' ), 'Failed to find the RECT element.' );
	$this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs(), 'Unexpected breadcrumbs at the RECT element.' );
	$this->assertSame( 'html', $fragment->get_namespace(), 'RECT should be in the html namespace.' );
	$this->assertTrue( $fragment->has_self_closing_flag(), 'RECT should have the self-closing flag.' );
	$this->assertTrue( $fragment->expects_closer(), 'RECT should expect a closer despite the self-closing flag.' );
}

/**
* Ensure that lowercased tag_name query matches tags case-insensitively.
*
Expand Down
85 changes: 71 additions & 14 deletions tests/phpunit/tests/html-api/wpHtmlProcessorHtml5lib.php
Original file line number Diff line number Diff line change
Expand Up @@ -138,10 +138,6 @@ public function data_external_html5lib_tests() {
* @return bool True if the test case should be skipped. False otherwise.
*/
private static function should_skip_test( ?string $test_context_element, string $test_name ): bool {
if ( null !== $test_context_element && 'body' !== $test_context_element ) {
return true;
}

if ( array_key_exists( $test_name, self::SKIP_TESTS ) ) {
return true;
}
Expand All @@ -157,18 +153,79 @@ private static function should_skip_test( ?string $test_context_element, string
* @return string|null Tree structure of parsed HTML, if supported, else null.
*/
private static function build_tree_representation( ?string $fragment_context, string $html ) {
$processor = $fragment_context
? WP_HTML_Processor::create_fragment( $html, "<{$fragment_context}>" )
: WP_HTML_Processor::create_full_parser( $html );
if ( null === $processor ) {
throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
$processor = null;
if ( $fragment_context ) {
if ( 'body' === $fragment_context ) {
$processor = WP_HTML_Processor::create_fragment( $html );
} else {

/*
* If the string of characters starts with "svg ", the context
* element is in the SVG namespace and the substring after
* "svg " is the local name. If the string of characters starts
* with "math ", the context element is in the MathML namespace
* and the substring after "math " is the local name.
* Otherwise, the context element is in the HTML namespace and
* the string is the local name.
*/
if ( str_starts_with( $fragment_context, 'svg ' ) ) {
$tag_name = substr( $fragment_context, 4 );
if ( 'svg' === $tag_name ) {
$parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><svg>' );
} else {
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><svg><{$tag_name}>" );
}
$parent_processor->next_tag( $tag_name );
} elseif ( str_starts_with( $fragment_context, 'math ' ) ) {
$tag_name = substr( $fragment_context, 5 );
if ( 'math' === $tag_name ) {
$parent_processor = WP_HTML_Processor::create_full_parser( '<!DOCTYPE html><math>' );
} else {
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><math><{$tag_name}>" );
}
$parent_processor->next_tag( $tag_name );
} else {
if ( in_array(
$fragment_context,
array(
'caption',
'col',
'colgroup',
'tbody',
'td',
'tfoot',
'th',
'thead',
'tr',
),
true
) ) {
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><table><{$fragment_context}>" );
$parent_processor->next_tag();
} else {
$parent_processor = WP_HTML_Processor::create_full_parser( "<!DOCTYPE html><{$fragment_context}>" );
}
$parent_processor->next_tag( $fragment_context );
}
if ( null !== $parent_processor->get_unsupported_exception() ) {
throw $parent_processor->get_unsupported_exception();
}
if ( null !== $parent_processor->get_last_error() ) {
throw new Exception( $parent_processor->get_last_error() );
}
$processor = $parent_processor->create_fragment_at_current_node( $html );
}

if ( null === $processor ) {
throw new WP_HTML_Unsupported_Exception( "Could not create a parser with the given fragment context: {$fragment_context}.", '', 0, '', array(), array() );
}
} else {
$processor = WP_HTML_Processor::create_full_parser( $html );
if ( null === $processor ) {
throw new Exception( 'Could not create a full parser.' );
}
}

/*
* The fragment parser will start in 2 levels deep at: html > body > [position]
* and requires adjustment to initial parameters.
* The full parser will not.
*/
$output = '';
$indent_level = 0;
$was_text = null;
Expand Down
Loading