Skip to content

Commit 4b34369

Browse files
committed
HTML API: Allow more contexts in create_fragment.
This changeset modifies `WP_HTML_Processor::create_fragment( $html, $context )` to use a full processor and `create_fragment_at_current_node` instead of the other way around. This makes more sense and makes the main factory methods more clear, where the state required for fragments is set up in `create_fragment_at_current_node` instead of in both `create_fragment` and `create_fragment_at_current_node`. This allows for more HTML contexts to be provided to the basic `create_fragment` where the provided context HTML is appended to `<!DOCTYPE html>`, a full processor is created, the last tag opener is found, and a fragment parser is created at that node via `create_fragment_at_current_node`. The HTML5lib tests are updated accordingly to use this new method to create fragments. Props jonsurrell, dmsnell, bernhard-reiter. Fixes #62584. git-svn-id: https://develop.svn.wordpress.org/trunk@59467 602fd350-edb4-49c9-b593-d223f7449a82
1 parent e904637 commit 4b34369

File tree

4 files changed

+307
-172
lines changed

4 files changed

+307
-172
lines changed

src/wp-includes/html-api/class-wp-html-processor.php

Lines changed: 87 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -279,51 +279,62 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
279279
* form is provided because a context element may have attributes that
280280
* impact the parse, such as with a SCRIPT tag and its `type` attribute.
281281
*
282-
* ## Current HTML Support
282+
* Example:
283+
*
284+
* // Usually, snippets of HTML ought to be processed in the default `<body>` context:
285+
* $processor = WP_HTML_Processor::create_fragment( '<p>Hi</p>' );
286+
*
287+
* // Some fragments should be processed in the correct context like this SVG:
288+
* $processor = WP_HTML_Processor::create_fragment( '<rect width="10" height="10" />', '<svg>' );
289+
*
290+
* // This fragment with TD tags should be processed in a TR context:
291+
* $processor = WP_HTML_Processor::create_fragment(
292+
* '<td>1<td>2<td>3',
293+
* '<table><tbody><tr>'
294+
* );
283295
*
284-
* - The only supported context is `<body>`, which is the default value.
285-
* - The only supported document encoding is `UTF-8`, which is the default value.
296+
* In order to create a fragment processor at the correct location, the
297+
* provided context will be processed as part of a full HTML document.
298+
* The processor will search for the last opener tag in the document and
299+
* create a fragment processor at that location. The document will be
300+
* forced into "no-quirks" mode by including the HTML5 doctype.
301+
*
302+
* For advanced usage and precise control over the context element, use
303+
* `WP_HTML_Processor::create_full_parser()` and
304+
* `WP_HTML_Processor::create_fragment_at_current_node()`.
305+
*
306+
* UTF-8 is the only allowed encoding. If working with a document that
307+
* isn't UTF-8, first convert the document to UTF-8, then pass in the
308+
* converted HTML.
286309
*
287310
* @since 6.4.0
288311
* @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances.
312+
* @since 6.8.0 Can create fragments with any context element.
289313
*
290314
* @param string $html Input HTML fragment to process.
291-
* @param string $context Context element for the fragment, must be default of `<body>`.
315+
* @param string $context Context element for the fragment. Defaults to `<body>`.
292316
* @param string $encoding Text encoding of the document; must be default of 'UTF-8'.
293317
* @return static|null The created processor if successful, otherwise null.
294318
*/
295319
public static function create_fragment( $html, $context = '<body>', $encoding = 'UTF-8' ) {
296-
if ( '<body>' !== $context || 'UTF-8' !== $encoding ) {
320+
$context_processor = static::create_full_parser( "<!DOCTYPE html>{$context}", $encoding );
321+
if ( null === $context_processor ) {
297322
return null;
298323
}
299324

300-
$processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
301-
$processor->state->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY;
302-
$processor->state->encoding = $encoding;
303-
$processor->state->encoding_confidence = 'certain';
304-
305-
// @todo Create "fake" bookmarks for non-existent but implied nodes.
306-
$processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
307-
$processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
308-
309-
$root_node = new WP_HTML_Token(
310-
'root-node',
311-
'HTML',
312-
false
313-
);
314-
315-
$processor->state->stack_of_open_elements->push( $root_node );
316-
317-
$context_node = new WP_HTML_Token(
318-
'context-node',
319-
'BODY',
320-
false
321-
);
325+
while ( $context_processor->next_tag() ) {
326+
$context_processor->set_bookmark( 'final_node' );
327+
}
322328

323-
$processor->context_node = $context_node;
324-
$processor->breadcrumbs = array( 'HTML', $context_node->node_name );
329+
if (
330+
! $context_processor->has_bookmark( 'final_node' ) ||
331+
! $context_processor->seek( 'final_node' )
332+
) {
333+
_doing_it_wrong( __METHOD__, __( 'No valid context element was detected.' ), '6.8.0' );
334+
return null;
335+
}
325336

326-
return $processor;
337+
return $context_processor->create_fragment_at_current_node( $html );
327338
}
328339

329340
/**
@@ -333,9 +344,9 @@ public static function create_fragment( $html, $context = '<body>', $encoding =
333344
* entire HTML document from start to finish. Consider a fragment parser with
334345
* a context node of `<body>`.
335346
*
336-
* Since UTF-8 is the only currently-accepted charset, if working with a
337-
* document that isn't UTF-8, it's important to convert the document before
338-
* creating the processor: pass in the converted HTML.
347+
* UTF-8 is the only allowed encoding. If working with a document that
348+
* isn't UTF-8, first convert the document to UTF-8, then pass in the
349+
* converted HTML.
339350
*
340351
* @param string $html Input HTML document to process.
341352
* @param string|null $known_definite_encoding Optional. If provided, specifies the charset used
@@ -459,35 +470,72 @@ function ( WP_HTML_Token $token ): void {
459470
*
460471
* @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
461472
*
473+
* @since 6.8.0
474+
*
462475
* @param string $html Input HTML fragment to process.
463476
* @return static|null The created processor if successful, otherwise null.
464477
*/
465478
public function create_fragment_at_current_node( string $html ) {
466479
if ( $this->get_token_type() !== '#tag' || $this->is_tag_closer() ) {
480+
_doing_it_wrong(
481+
__METHOD__,
482+
__( 'The context element must be a start tag.' ),
483+
'6.8.0'
484+
);
467485
return null;
468486
}
469487

488+
$tag_name = $this->current_element->token->node_name;
470489
$namespace = $this->current_element->token->namespace;
471490

491+
if ( 'html' === $namespace && self::is_void( $tag_name ) ) {
492+
_doing_it_wrong(
493+
__METHOD__,
494+
sprintf(
495+
// translators: %s: A tag name like INPUT or BR.
496+
__( 'The context element cannot be a void element, found "%s".' ),
497+
$tag_name
498+
),
499+
'6.8.0'
500+
);
501+
return null;
502+
}
503+
472504
/*
473505
* Prevent creating fragments at nodes that require a special tokenizer state.
474506
* This is unsupported by the HTML Processor.
475507
*/
476508
if (
477509
'html' === $namespace &&
478-
in_array( $this->current_element->token->node_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
510+
in_array( $tag_name, array( 'IFRAME', 'NOEMBED', 'NOFRAMES', 'SCRIPT', 'STYLE', 'TEXTAREA', 'TITLE', 'XMP', 'PLAINTEXT' ), true )
479511
) {
512+
_doing_it_wrong(
513+
__METHOD__,
514+
sprintf(
515+
// translators: %s: A tag name like IFRAME or TEXTAREA.
516+
__( 'The context element "%s" is not supported.' ),
517+
$tag_name
518+
),
519+
'6.8.0'
520+
);
480521
return null;
481522
}
482523

483-
$fragment_processor = static::create_fragment( $html );
484-
if ( null === $fragment_processor ) {
485-
return null;
486-
}
524+
$fragment_processor = new static( $html, self::CONSTRUCTOR_UNLOCK_CODE );
487525

488526
$fragment_processor->compat_mode = $this->compat_mode;
489527

490-
$fragment_processor->context_node = clone $this->state->current_token;
528+
// @todo Create "fake" bookmarks for non-existent but implied nodes.
529+
$fragment_processor->bookmarks['root-node'] = new WP_HTML_Span( 0, 0 );
530+
$root_node = new WP_HTML_Token(
531+
'root-node',
532+
'HTML',
533+
false
534+
);
535+
$fragment_processor->state->stack_of_open_elements->push( $root_node );
536+
537+
$fragment_processor->bookmarks['context-node'] = new WP_HTML_Span( 0, 0 );
538+
$fragment_processor->context_node = clone $this->current_element->token;
491539
$fragment_processor->context_node->bookmark_name = 'context-node';
492540
$fragment_processor->context_node->on_destroy = null;
493541

tests/phpunit/tests/html-api/wpHtmlProcessor.php

Lines changed: 0 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -1043,83 +1043,6 @@ public function test_ensure_next_token_method_extensibility( $html, $expected_to
10431043
$this->assertEquals( $expected_token_counts, $processor->token_seen_count, 'Snapshot: ' . var_export( $processor->token_seen_count, true ) );
10441044
}
10451045

1046-
/**
1047-
* @ticket 62357
1048-
*/
1049-
public function test_create_fragment_at_current_node_in_foreign_content() {
1050-
$processor = WP_HTML_Processor::create_full_parser( '<svg>' );
1051-
$this->assertTrue( $processor->next_tag( 'SVG' ) );
1052-
1053-
$fragment = $processor->create_fragment_at_current_node( "\0preceded-by-nul-byte<rect /><circle></circle><foreignobject><div></div></foreignobject><g>" );
1054-
1055-
$this->assertSame( 'svg', $fragment->get_namespace() );
1056-
$this->assertTrue( $fragment->next_token() );
1057-
1058-
/*
1059-
* In HTML parsing, a nul byte would be ignored.
1060-
* In SVG it should be replaced with a replacement character.
1061-
*/
1062-
$this->assertSame( '#text', $fragment->get_token_type() );
1063-
$this->assertSame( "\u{FFFD}", $fragment->get_modifiable_text() );
1064-
1065-
$this->assertTrue( $fragment->next_tag( 'RECT' ) );
1066-
$this->assertSame( 'svg', $fragment->get_namespace() );
1067-
1068-
$this->assertTrue( $fragment->next_tag( 'CIRCLE' ) );
1069-
$this->assertSame( array( 'HTML', 'SVG', 'CIRCLE' ), $fragment->get_breadcrumbs() );
1070-
$this->assertTrue( $fragment->next_tag( 'foreignObject' ) );
1071-
$this->assertSame( 'svg', $fragment->get_namespace() );
1072-
}
1073-
1074-
/**
1075-
* @ticket 62357
1076-
*/
1077-
public function test_create_fragment_at_current_node_in_foreign_content_integration_point() {
1078-
$processor = WP_HTML_Processor::create_full_parser( '<svg><foreignObject>' );
1079-
$this->assertTrue( $processor->next_tag( 'foreignObject' ) );
1080-
1081-
$fragment = $processor->create_fragment_at_current_node( "<image>\0not-preceded-by-nul-byte<rect />" );
1082-
1083-
// Nothing has been processed, the html namespace should be used for parsing as an integration point.
1084-
$this->assertSame( 'html', $fragment->get_namespace() );
1085-
1086-
// HTML parsing transforms IMAGE into IMG.
1087-
$this->assertTrue( $fragment->next_tag( 'IMG' ) );
1088-
1089-
$this->assertTrue( $fragment->next_token() );
1090-
1091-
// In HTML parsing, the nul byte is ignored and the text is reached.
1092-
$this->assertSame( '#text', $fragment->get_token_type() );
1093-
$this->assertSame( 'not-preceded-by-nul-byte', $fragment->get_modifiable_text() );
1094-
1095-
/*
1096-
* svg:foreignObject is an HTML integration point, so the processor should be in the HTML namespace.
1097-
* RECT is an HTML element here, meaning it may have the self-closing flag but does not self-close.
1098-
*/
1099-
$this->assertTrue( $fragment->next_tag( 'RECT' ) );
1100-
$this->assertSame( array( 'HTML', 'FOREIGNOBJECT', 'RECT' ), $fragment->get_breadcrumbs() );
1101-
$this->assertSame( 'html', $fragment->get_namespace() );
1102-
$this->assertTrue( $fragment->has_self_closing_flag() );
1103-
$this->assertTrue( $fragment->expects_closer() );
1104-
}
1105-
1106-
/**
1107-
* @ticket 62357
1108-
*/
1109-
public function test_prevent_fragment_creation_on_closers() {
1110-
$processor = WP_HTML_Processor::create_full_parser( '<p></p>' );
1111-
$processor->next_tag( 'P' );
1112-
$processor->next_tag(
1113-
array(
1114-
'tag_name' => 'P',
1115-
'tag_closers' => 'visit',
1116-
)
1117-
);
1118-
$this->assertSame( 'P', $processor->get_tag() );
1119-
$this->assertTrue( $processor->is_tag_closer() );
1120-
$this->assertNull( $processor->create_fragment_at_current_node( '<i>fragment HTML</i>' ) );
1121-
}
1122-
11231046
/**
11241047
* Ensure that lowercased tag_name query matches tags case-insensitively.
11251048
*

0 commit comments

Comments
 (0)