@@ -279,51 +279,62 @@ class WP_HTML_Processor extends WP_HTML_Tag_Processor {
279279 * form is provided because a context element may have attributes that
280280 * impact the parse, such as with a SCRIPT tag and its `type` attribute.
281281 *
282- * ## Current HTML Support
282+ * Example:
283+ *
284+ * // Usually, snippets of HTML ought to be processed in the default `<body>` context:
285+ * $processor = WP_HTML_Processor::create_fragment( '<p>Hi</p>' );
286+ *
287+ * // Some fragments need a specific context element to parse correctly, like this SVG content:
288+ * $processor = WP_HTML_Processor::create_fragment( '<rect width="10" height="10" />', '<svg>' );
289+ *
290+ * // This fragment with TD tags should be processed in a TR context:
291+ * $processor = WP_HTML_Processor::create_fragment(
292+ * '<td>1<td>2<td>3',
293+ * '<table><tbody><tr>'
294+ * );
283295 *
284- * - The only supported context is `<body>`, which is the default value.
285- * - The only supported document encoding is `UTF-8`, which is the default value.
296+ * In order to create a fragment processor at the correct location, the
297+ * provided context will be processed as part of a full HTML document.
298+ * The processor will search for the last opener tag in the document and
299+ * create a fragment processor at that location. The document will be
300+ * forced into "no-quirks" mode by including the HTML5 doctype.
301+ *
302+ * For advanced usage and precise control over the context element, use
303+ * `WP_HTML_Processor::create_full_parser()` and
304+ * `WP_HTML_Processor::create_fragment_at_current_node()`.
305+ *
306+ * UTF-8 is the only allowed encoding. If working with a document that
307+ * isn't UTF-8, first convert the document to UTF-8, then pass in the
308+ * converted HTML.
286309 *
287310 * @since 6.4.0
288311 * @since 6.6.0 Returns `static` instead of `self` so it can create subclass instances.
312+ * @since 6.8.0 Can create fragments with any context element.
289313 *
290314 * @param string $html Input HTML fragment to process.
291- * @param string $context Context element for the fragment, must be default of `<body>`.
315+ * @param string $context Context element for the fragment. Defaults to `<body>`.
292316 * @param string $encoding Text encoding of the document; must be default of 'UTF-8'.
293317 * @return static|null The created processor if successful, otherwise null.
294318 */
295319 public static function create_fragment ( $ html , $ context = '<body> ' , $ encoding = 'UTF-8 ' ) {
296- if ( '<body> ' !== $ context || 'UTF-8 ' !== $ encoding ) {
320+ $ context_processor = static ::create_full_parser ( "<!DOCTYPE html> {$ context }" , $ encoding );
321+ if ( null === $ context_processor ) {
297322 return null ;
298323 }
299324
300- $ processor = new static ( $ html , self ::CONSTRUCTOR_UNLOCK_CODE );
301- $ processor ->state ->insertion_mode = WP_HTML_Processor_State::INSERTION_MODE_IN_BODY ;
302- $ processor ->state ->encoding = $ encoding ;
303- $ processor ->state ->encoding_confidence = 'certain ' ;
304-
305- // @todo Create "fake" bookmarks for non-existent but implied nodes.
306- $ processor ->bookmarks ['root-node ' ] = new WP_HTML_Span ( 0 , 0 );
307- $ processor ->bookmarks ['context-node ' ] = new WP_HTML_Span ( 0 , 0 );
308-
309- $ root_node = new WP_HTML_Token (
310- 'root-node ' ,
311- 'HTML ' ,
312- false
313- );
314-
315- $ processor ->state ->stack_of_open_elements ->push ( $ root_node );
316-
317- $ context_node = new WP_HTML_Token (
318- 'context-node ' ,
319- 'BODY ' ,
320- false
321- );
325+ while ( $ context_processor ->next_tag () ) {
326+ $ context_processor ->set_bookmark ( 'final_node ' );
327+ }
322328
323- $ processor ->context_node = $ context_node ;
324- $ processor ->breadcrumbs = array ( 'HTML ' , $ context_node ->node_name );
329+ if (
330+ ! $ context_processor ->has_bookmark ( 'final_node ' ) ||
331+ ! $ context_processor ->seek ( 'final_node ' )
332+ ) {
333+ _doing_it_wrong ( __METHOD__ , __ ( 'No valid context element was detected. ' ), '6.8.0 ' );
334+ return null ;
335+ }
325336
326- return $ processor ;
337+ return $ context_processor -> create_fragment_at_current_node ( $ html ) ;
327338 }
328339
329340 /**
@@ -333,9 +344,9 @@ public static function create_fragment( $html, $context = '<body>', $encoding =
333344 * entire HTML document from start to finish. Consider a fragment parser with
334345 * a context node of `<body>`.
335346 *
336- * Since UTF-8 is the only currently-accepted charset, if working with a
337- * document that isn't UTF-8, it's important to convert the document before
338- * creating the processor: pass in the converted HTML.
347+ * UTF-8 is the only allowed encoding. If working with a document that
348+ * isn't UTF-8, first convert the document to UTF-8, then pass in the
349+ * converted HTML.
339350 *
340351 * @param string $html Input HTML document to process.
341352 * @param string|null $known_definite_encoding Optional. If provided, specifies the charset used
@@ -459,35 +470,72 @@ function ( WP_HTML_Token $token ): void {
459470 *
460471 * @see https://html.spec.whatwg.org/multipage/parsing.html#html-fragment-parsing-algorithm
461472 *
473+ * @since 6.8.0
474+ *
462475 * @param string $html Input HTML fragment to process.
463476 * @return static|null The created processor if successful, otherwise null.
464477 */
465478 public function create_fragment_at_current_node ( string $ html ) {
466479 if ( $ this ->get_token_type () !== '#tag ' || $ this ->is_tag_closer () ) {
480+ _doing_it_wrong (
481+ __METHOD__ ,
482+ __ ( 'The context element must be a start tag. ' ),
483+ '6.8.0 '
484+ );
467485 return null ;
468486 }
469487
488+ $ tag_name = $ this ->current_element ->token ->node_name ;
470489 $ namespace = $ this ->current_element ->token ->namespace ;
471490
491+ if ( 'html ' === $ namespace && self ::is_void ( $ tag_name ) ) {
492+ _doing_it_wrong (
493+ __METHOD__ ,
494+ sprintf (
495+ // translators: %s: A tag name like INPUT or BR.
496+ __ ( 'The context element cannot be a void element, found "%s". ' ),
497+ $ tag_name
498+ ),
499+ '6.8.0 '
500+ );
501+ return null ;
502+ }
503+
472504 /*
473505 * Prevent creating fragments at nodes that require a special tokenizer state.
474506 * This is unsupported by the HTML Processor.
475507 */
476508 if (
477509 'html ' === $ namespace &&
478- in_array ( $ this -> current_element -> token -> node_name , array ( 'IFRAME ' , 'NOEMBED ' , 'NOFRAMES ' , 'SCRIPT ' , 'STYLE ' , 'TEXTAREA ' , 'TITLE ' , 'XMP ' , 'PLAINTEXT ' ), true )
510+ in_array ( $ tag_name , array ( 'IFRAME ' , 'NOEMBED ' , 'NOFRAMES ' , 'SCRIPT ' , 'STYLE ' , 'TEXTAREA ' , 'TITLE ' , 'XMP ' , 'PLAINTEXT ' ), true )
479511 ) {
512+ _doing_it_wrong (
513+ __METHOD__ ,
514+ sprintf (
515+ // translators: %s: A tag name like IFRAME or TEXTAREA.
516+ __ ( 'The context element "%s" is not supported. ' ),
517+ $ tag_name
518+ ),
519+ '6.8.0 '
520+ );
480521 return null ;
481522 }
482523
483- $ fragment_processor = static ::create_fragment ( $ html );
484- if ( null === $ fragment_processor ) {
485- return null ;
486- }
524+ $ fragment_processor = new static ( $ html , self ::CONSTRUCTOR_UNLOCK_CODE );
487525
488526 $ fragment_processor ->compat_mode = $ this ->compat_mode ;
489527
490- $ fragment_processor ->context_node = clone $ this ->state ->current_token ;
528+ // @todo Create "fake" bookmarks for non-existent but implied nodes.
529+ $ fragment_processor ->bookmarks ['root-node ' ] = new WP_HTML_Span ( 0 , 0 );
530+ $ root_node = new WP_HTML_Token (
531+ 'root-node ' ,
532+ 'HTML ' ,
533+ false
534+ );
535+ $ fragment_processor ->state ->stack_of_open_elements ->push ( $ root_node );
536+
537+ $ fragment_processor ->bookmarks ['context-node ' ] = new WP_HTML_Span ( 0 , 0 );
538+ $ fragment_processor ->context_node = clone $ this ->current_element ->token ;
491539 $ fragment_processor ->context_node ->bookmark_name = 'context-node ' ;
492540 $ fragment_processor ->context_node ->on_destroy = null ;
493541
0 commit comments