[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
1 <?php 2 3 /** 4 * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library. 5 * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts. 6 * 7 * @note 8 * Recent changes to PHP's DOM extension have resulted in some fatal 9 * error conditions with the original version of PH5P. Pending changes, 10 * this lexer will punt to DirectLex if DOM throws an exception. 11 */ 12 13 class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex 14 { 15 /** 16 * @param string $html 17 * @param HTMLPurifier_Config $config 18 * @param HTMLPurifier_Context $context 19 * @return HTMLPurifier_Token[] 20 */ 21 public function tokenizeHTML($html, $config, $context) 22 { 23 $new_html = $this->normalize($html, $config, $context); 24 $new_html = $this->wrapHTML($new_html, $config, $context); 25 try { 26 $parser = new HTML5($new_html); 27 $doc = $parser->save(); 28 } catch (DOMException $e) { 29 // Uh oh, it failed. Punt to DirectLex. 30 $lexer = new HTMLPurifier_Lexer_DirectLex(); 31 $context->register('PH5PError', $e); // save the error, so we can detect it 32 return $lexer->tokenizeHTML($html, $config, $context); // use original HTML 33 } 34 $tokens = array(); 35 $this->tokenizeDOM( 36 $doc->getElementsByTagName('html')->item(0)-> // <html> 37 getElementsByTagName('body')->item(0) // <body> 38 , 39 $tokens 40 ); 41 return $tokens; 42 } 43 } 44 45 /* 46 47 Copyright 2007 Jeroen van der Meer <http://jero.net/> 48 49 Permission is hereby granted, free of charge, to any person obtaining a 50 copy of this software and associated documentation files (the 51 "Software"), to deal in the Software without restriction, including 52 without limitation the rights to use, copy, modify, merge, publish, 53 distribute, sublicense, and/or sell copies of the Software, and to 54 permit persons to whom the Software is furnished to do so, subject to 55 the following conditions: 56 57 The above copyright notice and this permission notice shall be included 58 in all copies or substantial portions of the Software. 59 60 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 61 OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 62 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 63 IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 64 CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 65 TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 66 SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 67 68 */ 69 70 class HTML5 71 { 72 private $data; 73 private $char; 74 private $EOF; 75 private $state; 76 private $tree; 77 private $token; 78 private $content_model; 79 private $escape = false; 80 private $entities = array( 81 'AElig;', 82 'AElig', 83 'AMP;', 84 'AMP', 85 'Aacute;', 86 'Aacute', 87 'Acirc;', 88 'Acirc', 89 'Agrave;', 90 'Agrave', 91 'Alpha;', 92 'Aring;', 93 'Aring', 94 'Atilde;', 95 'Atilde', 96 'Auml;', 97 'Auml', 98 'Beta;', 99 'COPY;', 100 'COPY', 101 'Ccedil;', 102 'Ccedil', 103 'Chi;', 104 'Dagger;', 105 'Delta;', 106 'ETH;', 107 'ETH', 108 'Eacute;', 109 'Eacute', 110 'Ecirc;', 111 'Ecirc', 112 'Egrave;', 113 'Egrave', 114 'Epsilon;', 115 'Eta;', 116 'Euml;', 117 'Euml', 118 'GT;', 119 'GT', 120 'Gamma;', 121 'Iacute;', 122 'Iacute', 123 'Icirc;', 124 'Icirc', 125 'Igrave;', 126 'Igrave', 127 'Iota;', 128 'Iuml;', 129 'Iuml', 130 'Kappa;', 131 'LT;', 132 'LT', 133 'Lambda;', 134 'Mu;', 135 'Ntilde;', 136 'Ntilde', 137 'Nu;', 138 'OElig;', 139 'Oacute;', 140 'Oacute', 141 'Ocirc;', 142 'Ocirc', 143 'Ograve;', 144 'Ograve', 145 'Omega;', 146 'Omicron;', 147 'Oslash;', 148 'Oslash', 149 'Otilde;', 150 'Otilde', 151 'Ouml;', 152 'Ouml', 153 'Phi;', 154 'Pi;', 155 'Prime;', 156 'Psi;', 157 'QUOT;', 158 'QUOT', 159 'REG;', 160 'REG', 161 'Rho;', 162 'Scaron;', 163 'Sigma;', 164 'THORN;', 165 'THORN', 166 'TRADE;', 167 'Tau;', 168 'Theta;', 169 'Uacute;', 170 'Uacute', 171 'Ucirc;', 172 'Ucirc', 173 'Ugrave;', 174 'Ugrave', 175 'Upsilon;', 176 'Uuml;', 177 'Uuml', 178 'Xi;', 179 'Yacute;', 180 'Yacute', 181 'Yuml;', 182 'Zeta;', 183 'aacute;', 184 'aacute', 185 'acirc;', 186 'acirc', 187 'acute;', 188 'acute', 189 'aelig;', 190 'aelig', 191 'agrave;', 192 'agrave', 193 'alefsym;', 194 'alpha;', 195 'amp;', 196 'amp', 197 'and;', 198 'ang;', 199 'apos;', 200 'aring;', 201 'aring', 202 'asymp;', 203 'atilde;', 204 'atilde', 205 'auml;', 206 'auml', 207 'bdquo;', 208 'beta;', 209 'brvbar;', 210 'brvbar', 211 'bull;', 212 'cap;', 213 'ccedil;', 214 'ccedil', 215 'cedil;', 216 'cedil', 217 'cent;', 218 'cent', 219 'chi;', 220 'circ;', 221 'clubs;', 222 'cong;', 223 'copy;', 224 'copy', 225 'crarr;', 226 'cup;', 227 'curren;', 228 'curren', 229 'dArr;', 230 'dagger;', 231 'darr;', 232 'deg;', 233 'deg', 234 'delta;', 235 'diams;', 236 'divide;', 237 'divide', 238 'eacute;', 239 'eacute', 240 'ecirc;', 241 'ecirc', 242 'egrave;', 243 'egrave', 244 'empty;', 245 'emsp;', 246 'ensp;', 247 'epsilon;', 248 'equiv;', 249 'eta;', 250 'eth;', 251 'eth', 252 'euml;', 253 'euml', 254 'euro;', 255 'exist;', 256 'fnof;', 257 'forall;', 258 'frac12;', 259 'frac12', 260 'frac14;', 261 'frac14', 262 'frac34;', 263 'frac34', 264 'frasl;', 265 'gamma;', 266 'ge;', 267 'gt;', 268 'gt', 269 'hArr;', 270 'harr;', 271 'hearts;', 272 'hellip;', 273 'iacute;', 274 'iacute', 275 'icirc;', 276 'icirc', 277 'iexcl;', 278 'iexcl', 279 'igrave;', 280 'igrave', 281 'image;', 282 'infin;', 283 'int;', 284 'iota;', 285 'iquest;', 286 'iquest', 287 'isin;', 288 'iuml;', 289 'iuml', 290 'kappa;', 291 'lArr;', 292 'lambda;', 293 'lang;', 294 'laquo;', 295 'laquo', 296 'larr;', 297 'lceil;', 298 'ldquo;', 299 'le;', 300 'lfloor;', 301 'lowast;', 302 'loz;', 303 'lrm;', 304 'lsaquo;', 305 'lsquo;', 306 'lt;', 307 'lt', 308 'macr;', 309 'macr', 310 'mdash;', 311 'micro;', 312 'micro', 313 'middot;', 314 'middot', 315 'minus;', 316 'mu;', 317 'nabla;', 318 'nbsp;', 319 'nbsp', 320 'ndash;', 321 'ne;', 322 'ni;', 323 'not;', 324 'not', 325 'notin;', 326 'nsub;', 327 'ntilde;', 328 'ntilde', 329 'nu;', 330 'oacute;', 331 'oacute', 332 'ocirc;', 333 'ocirc', 334 'oelig;', 335 'ograve;', 336 'ograve', 337 'oline;', 338 'omega;', 339 'omicron;', 340 'oplus;', 341 'or;', 342 'ordf;', 343 'ordf', 344 'ordm;', 345 'ordm', 346 'oslash;', 347 'oslash', 348 'otilde;', 349 'otilde', 350 'otimes;', 351 'ouml;', 352 'ouml', 353 'para;', 354 'para', 355 'part;', 356 'permil;', 357 'perp;', 358 'phi;', 359 'pi;', 360 'piv;', 361 'plusmn;', 362 'plusmn', 363 'pound;', 364 'pound', 365 'prime;', 366 'prod;', 367 'prop;', 368 'psi;', 369 'quot;', 370 'quot', 371 'rArr;', 372 'radic;', 373 'rang;', 374 'raquo;', 375 'raquo', 376 'rarr;', 377 'rceil;', 378 'rdquo;', 379 'real;', 380 'reg;', 381 'reg', 382 'rfloor;', 383 'rho;', 384 'rlm;', 385 'rsaquo;', 386 'rsquo;', 387 'sbquo;', 388 'scaron;', 389 'sdot;', 390 'sect;', 391 'sect', 392 'shy;', 393 'shy', 394 'sigma;', 395 'sigmaf;', 396 'sim;', 397 'spades;', 398 'sub;', 399 'sube;', 400 'sum;', 401 'sup1;', 402 'sup1', 403 'sup2;', 404 'sup2', 405 'sup3;', 406 'sup3', 407 'sup;', 408 'supe;', 409 'szlig;', 410 'szlig', 411 'tau;', 412 'there4;', 413 'theta;', 414 'thetasym;', 415 'thinsp;', 416 'thorn;', 417 'thorn', 418 'tilde;', 419 'times;', 420 'times', 421 'trade;', 422 'uArr;', 423 'uacute;', 424 'uacute', 425 'uarr;', 426 'ucirc;', 427 'ucirc', 428 'ugrave;', 429 'ugrave', 430 'uml;', 431 'uml', 432 'upsih;', 433 'upsilon;', 434 'uuml;', 435 'uuml', 436 'weierp;', 437 'xi;', 438 'yacute;', 439 'yacute', 440 'yen;', 441 'yen', 442 'yuml;', 443 'yuml', 444 'zeta;', 445 'zwj;', 446 'zwnj;' 447 ); 448 449 const PCDATA = 0; 450 const RCDATA = 1; 451 const CDATA = 2; 452 const PLAINTEXT = 3; 453 454 const DOCTYPE = 0; 455 const STARTTAG = 1; 456 const ENDTAG = 2; 457 const COMMENT = 3; 458 const CHARACTR = 4; 459 const EOF = 5; 460 461 public function __construct($data) 462 { 463 $this->data = $data; 464 $this->char = -1; 465 $this->EOF = strlen($data); 466 $this->tree = new HTML5TreeConstructer; 467 $this->content_model = self::PCDATA; 468 469 $this->state = 'data'; 470 471 while ($this->state !== null) { 472 $this->{$this->state . 'State'}(); 473 } 474 } 475 476 public function save() 477 { 478 return $this->tree->save(); 479 } 480 481 private function char() 482 { 483 return ($this->char < $this->EOF) 484 ? $this->data[$this->char] 485 : false; 486 } 487 488 private function character($s, $l = 0) 489 { 490 if ($s + $l < $this->EOF) { 491 if ($l === 0) { 492 return $this->data[$s]; 493 } else { 494 return substr($this->data, $s, $l); 495 } 496 } 497 } 498 499 private function characters($char_class, $start) 500 { 501 return preg_replace('#^([' . $char_class . ']+).*#s', '\\1', substr($this->data, $start)); 502 } 503 504 private function dataState() 505 { 506 // Consume the next input character 507 $this->char++; 508 $char = $this->char(); 509 510 if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) { 511 /* U+0026 AMPERSAND (&) 512 When the content model flag is set to one of the PCDATA or RCDATA 513 states: switch to the entity data state. Otherwise: treat it as per 514 the "anything else" entry below. */ 515 $this->state = 'entityData'; 516 517 } elseif ($char === '-') { 518 /* If the content model flag is set to either the RCDATA state or 519 the CDATA state, and the escape flag is false, and there are at 520 least three characters before this one in the input stream, and the 521 last four characters in the input stream, including this one, are 522 U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS, 523 and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */ 524 if (($this->content_model === self::RCDATA || $this->content_model === 525 self::CDATA) && $this->escape === false && 526 $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--' 527 ) { 528 $this->escape = true; 529 } 530 531 /* In any case, emit the input character as a character token. Stay 532 in the data state. */ 533 $this->emitToken( 534 array( 535 'type' => self::CHARACTR, 536 'data' => $char 537 ) 538 ); 539 540 /* U+003C LESS-THAN SIGN (<) */ 541 } elseif ($char === '<' && ($this->content_model === self::PCDATA || 542 (($this->content_model === self::RCDATA || 543 $this->content_model === self::CDATA) && $this->escape === false)) 544 ) { 545 /* When the content model flag is set to the PCDATA state: switch 546 to the tag open state. 547 548 When the content model flag is set to either the RCDATA state or 549 the CDATA state and the escape flag is false: switch to the tag 550 open state. 551 552 Otherwise: treat it as per the "anything else" entry below. */ 553 $this->state = 'tagOpen'; 554 555 /* U+003E GREATER-THAN SIGN (>) */ 556 } elseif ($char === '>') { 557 /* If the content model flag is set to either the RCDATA state or 558 the CDATA state, and the escape flag is true, and the last three 559 characters in the input stream including this one are U+002D 560 HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"), 561 set the escape flag to false. */ 562 if (($this->content_model === self::RCDATA || 563 $this->content_model === self::CDATA) && $this->escape === true && 564 $this->character($this->char, 3) === '-->' 565 ) { 566 $this->escape = false; 567 } 568 569 /* In any case, emit the input character as a character token. 570 Stay in the data state. */ 571 $this->emitToken( 572 array( 573 'type' => self::CHARACTR, 574 'data' => $char 575 ) 576 ); 577 578 } elseif ($this->char === $this->EOF) { 579 /* EOF 580 Emit an end-of-file token. */ 581 $this->EOF(); 582 583 } elseif ($this->content_model === self::PLAINTEXT) { 584 /* When the content model flag is set to the PLAINTEXT state 585 THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of 586 the text and emit it as a character token. */ 587 $this->emitToken( 588 array( 589 'type' => self::CHARACTR, 590 'data' => substr($this->data, $this->char) 591 ) 592 ); 593 594 $this->EOF(); 595 596 } else { 597 /* Anything else 598 THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that 599 otherwise would also be treated as a character token and emit it 600 as a single character token. Stay in the data state. */ 601 $len = strcspn($this->data, '<&', $this->char); 602 $char = substr($this->data, $this->char, $len); 603 $this->char += $len - 1; 604 605 $this->emitToken( 606 array( 607 'type' => self::CHARACTR, 608 'data' => $char 609 ) 610 ); 611 612 $this->state = 'data'; 613 } 614 } 615 616 private function entityDataState() 617 { 618 // Attempt to consume an entity. 619 $entity = $this->entity(); 620 621 // If nothing is returned, emit a U+0026 AMPERSAND character token. 622 // Otherwise, emit the character token that was returned. 623 $char = (!$entity) ? '&' : $entity; 624 $this->emitToken( 625 array( 626 'type' => self::CHARACTR, 627 'data' => $char 628 ) 629 ); 630 631 // Finally, switch to the data state. 632 $this->state = 'data'; 633 } 634 635 private function tagOpenState() 636 { 637 switch ($this->content_model) { 638 case self::RCDATA: 639 case self::CDATA: 640 /* If the next input character is a U+002F SOLIDUS (/) character, 641 consume it and switch to the close tag open state. If the next 642 input character is not a U+002F SOLIDUS (/) character, emit a 643 U+003C LESS-THAN SIGN character token and switch to the data 644 state to process the next input character. */ 645 if ($this->character($this->char + 1) === '/') { 646 $this->char++; 647 $this->state = 'closeTagOpen'; 648 649 } else { 650 $this->emitToken( 651 array( 652 'type' => self::CHARACTR, 653 'data' => '<' 654 ) 655 ); 656 657 $this->state = 'data'; 658 } 659 break; 660 661 case self::PCDATA: 662 // If the content model flag is set to the PCDATA state 663 // Consume the next input character: 664 $this->char++; 665 $char = $this->char(); 666 667 if ($char === '!') { 668 /* U+0021 EXCLAMATION MARK (!) 669 Switch to the markup declaration open state. */ 670 $this->state = 'markupDeclarationOpen'; 671 672 } elseif ($char === '/') { 673 /* U+002F SOLIDUS (/) 674 Switch to the close tag open state. */ 675 $this->state = 'closeTagOpen'; 676 677 } elseif (preg_match('/^[A-Za-z]$/', $char)) { 678 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z 679 Create a new start tag token, set its tag name to the lowercase 680 version of the input character (add 0x0020 to the character's code 681 point), then switch to the tag name state. (Don't emit the token 682 yet; further details will be filled in before it is emitted.) */ 683 $this->token = array( 684 'name' => strtolower($char), 685 'type' => self::STARTTAG, 686 'attr' => array() 687 ); 688 689 $this->state = 'tagName'; 690 691 } elseif ($char === '>') { 692 /* U+003E GREATER-THAN SIGN (>) 693 Parse error. Emit a U+003C LESS-THAN SIGN character token and a 694 U+003E GREATER-THAN SIGN character token. Switch to the data state. */ 695 $this->emitToken( 696 array( 697 'type' => self::CHARACTR, 698 'data' => '<>' 699 ) 700 ); 701 702 $this->state = 'data'; 703 704 } elseif ($char === '?') { 705 /* U+003F QUESTION MARK (?) 706 Parse error. Switch to the bogus comment state. */ 707 $this->state = 'bogusComment'; 708 709 } else { 710 /* Anything else 711 Parse error. Emit a U+003C LESS-THAN SIGN character token and 712 reconsume the current input character in the data state. */ 713 $this->emitToken( 714 array( 715 'type' => self::CHARACTR, 716 'data' => '<' 717 ) 718 ); 719 720 $this->char--; 721 $this->state = 'data'; 722 } 723 break; 724 } 725 } 726 727 private function closeTagOpenState() 728 { 729 $next_node = strtolower($this->characters('A-Za-z', $this->char + 1)); 730 $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName; 731 732 if (($this->content_model === self::RCDATA || $this->content_model === self::CDATA) && 733 (!$the_same || ($the_same && (!preg_match( 734 '/[\t\n\x0b\x0c >\/]/', 735 $this->character($this->char + 1 + strlen($next_node)) 736 ) || $this->EOF === $this->char))) 737 ) { 738 /* If the content model flag is set to the RCDATA or CDATA states then 739 examine the next few characters. If they do not match the tag name of 740 the last start tag token emitted (case insensitively), or if they do but 741 they are not immediately followed by one of the following characters: 742 * U+0009 CHARACTER TABULATION 743 * U+000A LINE FEED (LF) 744 * U+000B LINE TABULATION 745 * U+000C FORM FEED (FF) 746 * U+0020 SPACE 747 * U+003E GREATER-THAN SIGN (>) 748 * U+002F SOLIDUS (/) 749 * EOF 750 ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character 751 token, a U+002F SOLIDUS character token, and switch to the data state 752 to process the next input character. */ 753 $this->emitToken( 754 array( 755 'type' => self::CHARACTR, 756 'data' => '</' 757 ) 758 ); 759 760 $this->state = 'data'; 761 762 } else { 763 /* Otherwise, if the content model flag is set to the PCDATA state, 764 or if the next few characters do match that tag name, consume the 765 next input character: */ 766 $this->char++; 767 $char = $this->char(); 768 769 if (preg_match('/^[A-Za-z]$/', $char)) { 770 /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z 771 Create a new end tag token, set its tag name to the lowercase version 772 of the input character (add 0x0020 to the character's code point), then 773 switch to the tag name state. (Don't emit the token yet; further details 774 will be filled in before it is emitted.) */ 775 $this->token = array( 776 'name' => strtolower($char), 777 'type' => self::ENDTAG 778 ); 779 780 $this->state = 'tagName'; 781 782 } elseif ($char === '>') { 783 /* U+003E GREATER-THAN SIGN (>) 784 Parse error. Switch to the data state. */ 785 $this->state = 'data'; 786 787 } elseif ($this->char === $this->EOF) { 788 /* EOF 789 Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F 790 SOLIDUS character token. Reconsume the EOF character in the data state. */ 791 $this->emitToken( 792 array( 793 'type' => self::CHARACTR, 794 'data' => '</' 795 ) 796 ); 797 798 $this->char--; 799 $this->state = 'data'; 800 801 } else { 802 /* Parse error. Switch to the bogus comment state. */ 803 $this->state = 'bogusComment'; 804 } 805 } 806 } 807 808 private function tagNameState() 809 { 810 // Consume the next input character: 811 $this->char++; 812 $char = $this->character($this->char); 813 814 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 815 /* U+0009 CHARACTER TABULATION 816 U+000A LINE FEED (LF) 817 U+000B LINE TABULATION 818 U+000C FORM FEED (FF) 819 U+0020 SPACE 820 Switch to the before attribute name state. */ 821 $this->state = 'beforeAttributeName'; 822 823 } elseif ($char === '>') { 824 /* U+003E GREATER-THAN SIGN (>) 825 Emit the current tag token. Switch to the data state. */ 826 $this->emitToken($this->token); 827 $this->state = 'data'; 828 829 } elseif ($this->char === $this->EOF) { 830 /* EOF 831 Parse error. Emit the current tag token. Reconsume the EOF 832 character in the data state. */ 833 $this->emitToken($this->token); 834 835 $this->char--; 836 $this->state = 'data'; 837 838 } elseif ($char === '/') { 839 /* U+002F SOLIDUS (/) 840 Parse error unless this is a permitted slash. Switch to the before 841 attribute name state. */ 842 $this->state = 'beforeAttributeName'; 843 844 } else { 845 /* Anything else 846 Append the current input character to the current tag token's tag name. 847 Stay in the tag name state. */ 848 $this->token['name'] .= strtolower($char); 849 $this->state = 'tagName'; 850 } 851 } 852 853 private function beforeAttributeNameState() 854 { 855 // Consume the next input character: 856 $this->char++; 857 $char = $this->character($this->char); 858 859 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 860 /* U+0009 CHARACTER TABULATION 861 U+000A LINE FEED (LF) 862 U+000B LINE TABULATION 863 U+000C FORM FEED (FF) 864 U+0020 SPACE 865 Stay in the before attribute name state. */ 866 $this->state = 'beforeAttributeName'; 867 868 } elseif ($char === '>') { 869 /* U+003E GREATER-THAN SIGN (>) 870 Emit the current tag token. Switch to the data state. */ 871 $this->emitToken($this->token); 872 $this->state = 'data'; 873 874 } elseif ($char === '/') { 875 /* U+002F SOLIDUS (/) 876 Parse error unless this is a permitted slash. Stay in the before 877 attribute name state. */ 878 $this->state = 'beforeAttributeName'; 879 880 } elseif ($this->char === $this->EOF) { 881 /* EOF 882 Parse error. Emit the current tag token. Reconsume the EOF 883 character in the data state. */ 884 $this->emitToken($this->token); 885 886 $this->char--; 887 $this->state = 'data'; 888 889 } else { 890 /* Anything else 891 Start a new attribute in the current tag token. Set that attribute's 892 name to the current input character, and its value to the empty string. 893 Switch to the attribute name state. */ 894 $this->token['attr'][] = array( 895 'name' => strtolower($char), 896 'value' => null 897 ); 898 899 $this->state = 'attributeName'; 900 } 901 } 902 903 private function attributeNameState() 904 { 905 // Consume the next input character: 906 $this->char++; 907 $char = $this->character($this->char); 908 909 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 910 /* U+0009 CHARACTER TABULATION 911 U+000A LINE FEED (LF) 912 U+000B LINE TABULATION 913 U+000C FORM FEED (FF) 914 U+0020 SPACE 915 Stay in the before attribute name state. */ 916 $this->state = 'afterAttributeName'; 917 918 } elseif ($char === '=') { 919 /* U+003D EQUALS SIGN (=) 920 Switch to the before attribute value state. */ 921 $this->state = 'beforeAttributeValue'; 922 923 } elseif ($char === '>') { 924 /* U+003E GREATER-THAN SIGN (>) 925 Emit the current tag token. Switch to the data state. */ 926 $this->emitToken($this->token); 927 $this->state = 'data'; 928 929 } elseif ($char === '/' && $this->character($this->char + 1) !== '>') { 930 /* U+002F SOLIDUS (/) 931 Parse error unless this is a permitted slash. Switch to the before 932 attribute name state. */ 933 $this->state = 'beforeAttributeName'; 934 935 } elseif ($this->char === $this->EOF) { 936 /* EOF 937 Parse error. Emit the current tag token. Reconsume the EOF 938 character in the data state. */ 939 $this->emitToken($this->token); 940 941 $this->char--; 942 $this->state = 'data'; 943 944 } else { 945 /* Anything else 946 Append the current input character to the current attribute's name. 947 Stay in the attribute name state. */ 948 $last = count($this->token['attr']) - 1; 949 $this->token['attr'][$last]['name'] .= strtolower($char); 950 951 $this->state = 'attributeName'; 952 } 953 } 954 955 private function afterAttributeNameState() 956 { 957 // Consume the next input character: 958 $this->char++; 959 $char = $this->character($this->char); 960 961 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 962 /* U+0009 CHARACTER TABULATION 963 U+000A LINE FEED (LF) 964 U+000B LINE TABULATION 965 U+000C FORM FEED (FF) 966 U+0020 SPACE 967 Stay in the after attribute name state. */ 968 $this->state = 'afterAttributeName'; 969 970 } elseif ($char === '=') { 971 /* U+003D EQUALS SIGN (=) 972 Switch to the before attribute value state. */ 973 $this->state = 'beforeAttributeValue'; 974 975 } elseif ($char === '>') { 976 /* U+003E GREATER-THAN SIGN (>) 977 Emit the current tag token. Switch to the data state. */ 978 $this->emitToken($this->token); 979 $this->state = 'data'; 980 981 } elseif ($char === '/' && $this->character($this->char + 1) !== '>') { 982 /* U+002F SOLIDUS (/) 983 Parse error unless this is a permitted slash. Switch to the 984 before attribute name state. */ 985 $this->state = 'beforeAttributeName'; 986 987 } elseif ($this->char === $this->EOF) { 988 /* EOF 989 Parse error. Emit the current tag token. Reconsume the EOF 990 character in the data state. */ 991 $this->emitToken($this->token); 992 993 $this->char--; 994 $this->state = 'data'; 995 996 } else { 997 /* Anything else 998 Start a new attribute in the current tag token. Set that attribute's 999 name to the current input character, and its value to the empty string. 1000 Switch to the attribute name state. */ 1001 $this->token['attr'][] = array( 1002 'name' => strtolower($char), 1003 'value' => null 1004 ); 1005 1006 $this->state = 'attributeName'; 1007 } 1008 } 1009 1010 private function beforeAttributeValueState() 1011 { 1012 // Consume the next input character: 1013 $this->char++; 1014 $char = $this->character($this->char); 1015 1016 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1017 /* U+0009 CHARACTER TABULATION 1018 U+000A LINE FEED (LF) 1019 U+000B LINE TABULATION 1020 U+000C FORM FEED (FF) 1021 U+0020 SPACE 1022 Stay in the before attribute value state. */ 1023 $this->state = 'beforeAttributeValue'; 1024 1025 } elseif ($char === '"') { 1026 /* U+0022 QUOTATION MARK (") 1027 Switch to the attribute value (double-quoted) state. */ 1028 $this->state = 'attributeValueDoubleQuoted'; 1029 1030 } elseif ($char === '&') { 1031 /* U+0026 AMPERSAND (&) 1032 Switch to the attribute value (unquoted) state and reconsume 1033 this input character. */ 1034 $this->char--; 1035 $this->state = 'attributeValueUnquoted'; 1036 1037 } elseif ($char === '\'') { 1038 /* U+0027 APOSTROPHE (') 1039 Switch to the attribute value (single-quoted) state. */ 1040 $this->state = 'attributeValueSingleQuoted'; 1041 1042 } elseif ($char === '>') { 1043 /* U+003E GREATER-THAN SIGN (>) 1044 Emit the current tag token. Switch to the data state. */ 1045 $this->emitToken($this->token); 1046 $this->state = 'data'; 1047 1048 } else { 1049 /* Anything else 1050 Append the current input character to the current attribute's value. 1051 Switch to the attribute value (unquoted) state. */ 1052 $last = count($this->token['attr']) - 1; 1053 $this->token['attr'][$last]['value'] .= $char; 1054 1055 $this->state = 'attributeValueUnquoted'; 1056 } 1057 } 1058 1059 private function attributeValueDoubleQuotedState() 1060 { 1061 // Consume the next input character: 1062 $this->char++; 1063 $char = $this->character($this->char); 1064 1065 if ($char === '"') { 1066 /* U+0022 QUOTATION MARK (") 1067 Switch to the before attribute name state. */ 1068 $this->state = 'beforeAttributeName'; 1069 1070 } elseif ($char === '&') { 1071 /* U+0026 AMPERSAND (&) 1072 Switch to the entity in attribute value state. */ 1073 $this->entityInAttributeValueState('double'); 1074 1075 } elseif ($this->char === $this->EOF) { 1076 /* EOF 1077 Parse error. Emit the current tag token. Reconsume the character 1078 in the data state. */ 1079 $this->emitToken($this->token); 1080 1081 $this->char--; 1082 $this->state = 'data'; 1083 1084 } else { 1085 /* Anything else 1086 Append the current input character to the current attribute's value. 1087 Stay in the attribute value (double-quoted) state. */ 1088 $last = count($this->token['attr']) - 1; 1089 $this->token['attr'][$last]['value'] .= $char; 1090 1091 $this->state = 'attributeValueDoubleQuoted'; 1092 } 1093 } 1094 1095 private function attributeValueSingleQuotedState() 1096 { 1097 // Consume the next input character: 1098 $this->char++; 1099 $char = $this->character($this->char); 1100 1101 if ($char === '\'') { 1102 /* U+0022 QUOTATION MARK (') 1103 Switch to the before attribute name state. */ 1104 $this->state = 'beforeAttributeName'; 1105 1106 } elseif ($char === '&') { 1107 /* U+0026 AMPERSAND (&) 1108 Switch to the entity in attribute value state. */ 1109 $this->entityInAttributeValueState('single'); 1110 1111 } elseif ($this->char === $this->EOF) { 1112 /* EOF 1113 Parse error. Emit the current tag token. Reconsume the character 1114 in the data state. */ 1115 $this->emitToken($this->token); 1116 1117 $this->char--; 1118 $this->state = 'data'; 1119 1120 } else { 1121 /* Anything else 1122 Append the current input character to the current attribute's value. 1123 Stay in the attribute value (single-quoted) state. */ 1124 $last = count($this->token['attr']) - 1; 1125 $this->token['attr'][$last]['value'] .= $char; 1126 1127 $this->state = 'attributeValueSingleQuoted'; 1128 } 1129 } 1130 1131 private function attributeValueUnquotedState() 1132 { 1133 // Consume the next input character: 1134 $this->char++; 1135 $char = $this->character($this->char); 1136 1137 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1138 /* U+0009 CHARACTER TABULATION 1139 U+000A LINE FEED (LF) 1140 U+000B LINE TABULATION 1141 U+000C FORM FEED (FF) 1142 U+0020 SPACE 1143 Switch to the before attribute name state. */ 1144 $this->state = 'beforeAttributeName'; 1145 1146 } elseif ($char === '&') { 1147 /* U+0026 AMPERSAND (&) 1148 Switch to the entity in attribute value state. */ 1149 $this->entityInAttributeValueState(); 1150 1151 } elseif ($char === '>') { 1152 /* U+003E GREATER-THAN SIGN (>) 1153 Emit the current tag token. Switch to the data state. */ 1154 $this->emitToken($this->token); 1155 $this->state = 'data'; 1156 1157 } else { 1158 /* Anything else 1159 Append the current input character to the current attribute's value. 1160 Stay in the attribute value (unquoted) state. */ 1161 $last = count($this->token['attr']) - 1; 1162 $this->token['attr'][$last]['value'] .= $char; 1163 1164 $this->state = 'attributeValueUnquoted'; 1165 } 1166 } 1167 1168 private function entityInAttributeValueState() 1169 { 1170 // Attempt to consume an entity. 1171 $entity = $this->entity(); 1172 1173 // If nothing is returned, append a U+0026 AMPERSAND character to the 1174 // current attribute's value. Otherwise, emit the character token that 1175 // was returned. 1176 $char = (!$entity) 1177 ? '&' 1178 : $entity; 1179 1180 $last = count($this->token['attr']) - 1; 1181 $this->token['attr'][$last]['value'] .= $char; 1182 } 1183 1184 private function bogusCommentState() 1185 { 1186 /* Consume every character up to the first U+003E GREATER-THAN SIGN 1187 character (>) or the end of the file (EOF), whichever comes first. Emit 1188 a comment token whose data is the concatenation of all the characters 1189 starting from and including the character that caused the state machine 1190 to switch into the bogus comment state, up to and including the last 1191 consumed character before the U+003E character, if any, or up to the 1192 end of the file otherwise. (If the comment was started by the end of 1193 the file (EOF), the token is empty.) */ 1194 $data = $this->characters('^>', $this->char); 1195 $this->emitToken( 1196 array( 1197 'data' => $data, 1198 'type' => self::COMMENT 1199 ) 1200 ); 1201 1202 $this->char += strlen($data); 1203 1204 /* Switch to the data state. */ 1205 $this->state = 'data'; 1206 1207 /* If the end of the file was reached, reconsume the EOF character. */ 1208 if ($this->char === $this->EOF) { 1209 $this->char = $this->EOF - 1; 1210 } 1211 } 1212 1213 private function markupDeclarationOpenState() 1214 { 1215 /* If the next two characters are both U+002D HYPHEN-MINUS (-) 1216 characters, consume those two characters, create a comment token whose 1217 data is the empty string, and switch to the comment state. */ 1218 if ($this->character($this->char + 1, 2) === '--') { 1219 $this->char += 2; 1220 $this->state = 'comment'; 1221 $this->token = array( 1222 'data' => null, 1223 'type' => self::COMMENT 1224 ); 1225 1226 /* Otherwise if the next seven chacacters are a case-insensitive match 1227 for the word "DOCTYPE", then consume those characters and switch to the 1228 DOCTYPE state. */ 1229 } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') { 1230 $this->char += 7; 1231 $this->state = 'doctype'; 1232 1233 /* Otherwise, is is a parse error. Switch to the bogus comment state. 1234 The next character that is consumed, if any, is the first character 1235 that will be in the comment. */ 1236 } else { 1237 $this->char++; 1238 $this->state = 'bogusComment'; 1239 } 1240 } 1241 1242 private function commentState() 1243 { 1244 /* Consume the next input character: */ 1245 $this->char++; 1246 $char = $this->char(); 1247 1248 /* U+002D HYPHEN-MINUS (-) */ 1249 if ($char === '-') { 1250 /* Switch to the comment dash state */ 1251 $this->state = 'commentDash'; 1252 1253 /* EOF */ 1254 } elseif ($this->char === $this->EOF) { 1255 /* Parse error. Emit the comment token. Reconsume the EOF character 1256 in the data state. */ 1257 $this->emitToken($this->token); 1258 $this->char--; 1259 $this->state = 'data'; 1260 1261 /* Anything else */ 1262 } else { 1263 /* Append the input character to the comment token's data. Stay in 1264 the comment state. */ 1265 $this->token['data'] .= $char; 1266 } 1267 } 1268 1269 private function commentDashState() 1270 { 1271 /* Consume the next input character: */ 1272 $this->char++; 1273 $char = $this->char(); 1274 1275 /* U+002D HYPHEN-MINUS (-) */ 1276 if ($char === '-') { 1277 /* Switch to the comment end state */ 1278 $this->state = 'commentEnd'; 1279 1280 /* EOF */ 1281 } elseif ($this->char === $this->EOF) { 1282 /* Parse error. Emit the comment token. Reconsume the EOF character 1283 in the data state. */ 1284 $this->emitToken($this->token); 1285 $this->char--; 1286 $this->state = 'data'; 1287 1288 /* Anything else */ 1289 } else { 1290 /* Append a U+002D HYPHEN-MINUS (-) character and the input 1291 character to the comment token's data. Switch to the comment state. */ 1292 $this->token['data'] .= '-' . $char; 1293 $this->state = 'comment'; 1294 } 1295 } 1296 1297 private function commentEndState() 1298 { 1299 /* Consume the next input character: */ 1300 $this->char++; 1301 $char = $this->char(); 1302 1303 if ($char === '>') { 1304 $this->emitToken($this->token); 1305 $this->state = 'data'; 1306 1307 } elseif ($char === '-') { 1308 $this->token['data'] .= '-'; 1309 1310 } elseif ($this->char === $this->EOF) { 1311 $this->emitToken($this->token); 1312 $this->char--; 1313 $this->state = 'data'; 1314 1315 } else { 1316 $this->token['data'] .= '--' . $char; 1317 $this->state = 'comment'; 1318 } 1319 } 1320 1321 private function doctypeState() 1322 { 1323 /* Consume the next input character: */ 1324 $this->char++; 1325 $char = $this->char(); 1326 1327 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1328 $this->state = 'beforeDoctypeName'; 1329 1330 } else { 1331 $this->char--; 1332 $this->state = 'beforeDoctypeName'; 1333 } 1334 } 1335 1336 private function beforeDoctypeNameState() 1337 { 1338 /* Consume the next input character: */ 1339 $this->char++; 1340 $char = $this->char(); 1341 1342 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1343 // Stay in the before DOCTYPE name state. 1344 1345 } elseif (preg_match('/^[a-z]$/', $char)) { 1346 $this->token = array( 1347 'name' => strtoupper($char), 1348 'type' => self::DOCTYPE, 1349 'error' => true 1350 ); 1351 1352 $this->state = 'doctypeName'; 1353 1354 } elseif ($char === '>') { 1355 $this->emitToken( 1356 array( 1357 'name' => null, 1358 'type' => self::DOCTYPE, 1359 'error' => true 1360 ) 1361 ); 1362 1363 $this->state = 'data'; 1364 1365 } elseif ($this->char === $this->EOF) { 1366 $this->emitToken( 1367 array( 1368 'name' => null, 1369 'type' => self::DOCTYPE, 1370 'error' => true 1371 ) 1372 ); 1373 1374 $this->char--; 1375 $this->state = 'data'; 1376 1377 } else { 1378 $this->token = array( 1379 'name' => $char, 1380 'type' => self::DOCTYPE, 1381 'error' => true 1382 ); 1383 1384 $this->state = 'doctypeName'; 1385 } 1386 } 1387 1388 private function doctypeNameState() 1389 { 1390 /* Consume the next input character: */ 1391 $this->char++; 1392 $char = $this->char(); 1393 1394 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1395 $this->state = 'AfterDoctypeName'; 1396 1397 } elseif ($char === '>') { 1398 $this->emitToken($this->token); 1399 $this->state = 'data'; 1400 1401 } elseif (preg_match('/^[a-z]$/', $char)) { 1402 $this->token['name'] .= strtoupper($char); 1403 1404 } elseif ($this->char === $this->EOF) { 1405 $this->emitToken($this->token); 1406 $this->char--; 1407 $this->state = 'data'; 1408 1409 } else { 1410 $this->token['name'] .= $char; 1411 } 1412 1413 $this->token['error'] = ($this->token['name'] === 'HTML') 1414 ? false 1415 : true; 1416 } 1417 1418 private function afterDoctypeNameState() 1419 { 1420 /* Consume the next input character: */ 1421 $this->char++; 1422 $char = $this->char(); 1423 1424 if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) { 1425 // Stay in the DOCTYPE name state. 1426 1427 } elseif ($char === '>') { 1428 $this->emitToken($this->token); 1429 $this->state = 'data'; 1430 1431 } elseif ($this->char === $this->EOF) { 1432 $this->emitToken($this->token); 1433 $this->char--; 1434 $this->state = 'data'; 1435 1436 } else { 1437 $this->token['error'] = true; 1438 $this->state = 'bogusDoctype'; 1439 } 1440 } 1441 1442 private function bogusDoctypeState() 1443 { 1444 /* Consume the next input character: */ 1445 $this->char++; 1446 $char = $this->char(); 1447 1448 if ($char === '>') { 1449 $this->emitToken($this->token); 1450 $this->state = 'data'; 1451 1452 } elseif ($this->char === $this->EOF) { 1453 $this->emitToken($this->token); 1454 $this->char--; 1455 $this->state = 'data'; 1456 1457 } else { 1458 // Stay in the bogus DOCTYPE state. 1459 } 1460 } 1461 1462 private function entity() 1463 { 1464 $start = $this->char; 1465 1466 // This section defines how to consume an entity. This definition is 1467 // used when parsing entities in text and in attributes. 1468 1469 // The behaviour depends on the identity of the next character (the 1470 // one immediately after the U+0026 AMPERSAND character): 1471 1472 switch ($this->character($this->char + 1)) { 1473 // U+0023 NUMBER SIGN (#) 1474 case '#': 1475 1476 // The behaviour further depends on the character after the 1477 // U+0023 NUMBER SIGN: 1478 switch ($this->character($this->char + 1)) { 1479 // U+0078 LATIN SMALL LETTER X 1480 // U+0058 LATIN CAPITAL LETTER X 1481 case 'x': 1482 case 'X': 1483 // Follow the steps below, but using the range of 1484 // characters U+0030 DIGIT ZERO through to U+0039 DIGIT 1485 // NINE, U+0061 LATIN SMALL LETTER A through to U+0066 1486 // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER 1487 // A, through to U+0046 LATIN CAPITAL LETTER F (in other 1488 // words, 0-9, A-F, a-f). 1489 $char = 1; 1490 $char_class = '0-9A-Fa-f'; 1491 break; 1492 1493 // Anything else 1494 default: 1495 // Follow the steps below, but using the range of 1496 // characters U+0030 DIGIT ZERO through to U+0039 DIGIT 1497 // NINE (i.e. just 0-9). 1498 $char = 0; 1499 $char_class = '0-9'; 1500 break; 1501 } 1502 1503 // Consume as many characters as match the range of characters 1504 // given above. 1505 $this->char++; 1506 $e_name = $this->characters($char_class, $this->char + $char + 1); 1507 $entity = $this->character($start, $this->char); 1508 $cond = strlen($e_name) > 0; 1509 1510 // The rest of the parsing happens bellow. 1511 break; 1512 1513 // Anything else 1514 default: 1515 // Consume the maximum number of characters possible, with the 1516 // consumed characters case-sensitively matching one of the 1517 // identifiers in the first column of the entities table. 1518 $e_name = $this->characters('0-9A-Za-z;', $this->char + 1); 1519 $len = strlen($e_name); 1520 1521 for ($c = 1; $c <= $len; $c++) { 1522 $id = substr($e_name, 0, $c); 1523 $this->char++; 1524 1525 if (in_array($id, $this->entities)) { 1526 if ($e_name[$c - 1] !== ';') { 1527 if ($c < $len && $e_name[$c] == ';') { 1528 $this->char++; // consume extra semicolon 1529 } 1530 } 1531 $entity = $id; 1532 break; 1533 } 1534 } 1535 1536 $cond = isset($entity); 1537 // The rest of the parsing happens bellow. 1538 break; 1539 } 1540 1541 if (!$cond) { 1542 // If no match can be made, then this is a parse error. No 1543 // characters are consumed, and nothing is returned. 1544 $this->char = $start; 1545 return false; 1546 } 1547 1548 // Return a character token for the character corresponding to the 1549 // entity name (as given by the second column of the entities table). 1550 return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8'); 1551 } 1552 1553 private function emitToken($token) 1554 { 1555 $emit = $this->tree->emitToken($token); 1556 1557 if (is_int($emit)) { 1558 $this->content_model = $emit; 1559 1560 } elseif ($token['type'] === self::ENDTAG) { 1561 $this->content_model = self::PCDATA; 1562 } 1563 } 1564 1565 private function EOF() 1566 { 1567 $this->state = null; 1568 $this->tree->emitToken( 1569 array( 1570 'type' => self::EOF 1571 ) 1572 ); 1573 } 1574 } 1575 1576 class HTML5TreeConstructer 1577 { 1578 public $stack = array(); 1579 1580 private $phase; 1581 private $mode; 1582 private $dom; 1583 private $foster_parent = null; 1584 private $a_formatting = array(); 1585 1586 private $head_pointer = null; 1587 private $form_pointer = null; 1588 1589 private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th'); 1590 private $formatting = array( 1591 'a', 1592 'b', 1593 'big', 1594 'em', 1595 'font', 1596 'i', 1597 'nobr', 1598 's', 1599 'small', 1600 'strike', 1601 'strong', 1602 'tt', 1603 'u' 1604 ); 1605 private $special = array( 1606 'address', 1607 'area', 1608 'base', 1609 'basefont', 1610 'bgsound', 1611 'blockquote', 1612 'body', 1613 'br', 1614 'center', 1615 'col', 1616 'colgroup', 1617 'dd', 1618 'dir', 1619 'div', 1620 'dl', 1621 'dt', 1622 'embed', 1623 'fieldset', 1624 'form', 1625 'frame', 1626 'frameset', 1627 'h1', 1628 'h2', 1629 'h3', 1630 'h4', 1631 'h5', 1632 'h6', 1633 'head', 1634 'hr', 1635 'iframe', 1636 'image', 1637 'img', 1638 'input', 1639 'isindex', 1640 'li', 1641 'link', 1642 'listing', 1643 'menu', 1644 'meta', 1645 'noembed', 1646 'noframes', 1647 'noscript', 1648 'ol', 1649 'optgroup', 1650 'option', 1651 'p', 1652 'param', 1653 'plaintext', 1654 'pre', 1655 'script', 1656 'select', 1657 'spacer', 1658 'style', 1659 'tbody', 1660 'textarea', 1661 'tfoot', 1662 'thead', 1663 'title', 1664 'tr', 1665 'ul', 1666 'wbr' 1667 ); 1668 1669 // The different phases. 1670 const INIT_PHASE = 0; 1671 const ROOT_PHASE = 1; 1672 const MAIN_PHASE = 2; 1673 const END_PHASE = 3; 1674 1675 // The different insertion modes for the main phase. 1676 const BEFOR_HEAD = 0; 1677 const IN_HEAD = 1; 1678 const AFTER_HEAD = 2; 1679 const IN_BODY = 3; 1680 const IN_TABLE = 4; 1681 const IN_CAPTION = 5; 1682 const IN_CGROUP = 6; 1683 const IN_TBODY = 7; 1684 const IN_ROW = 8; 1685 const IN_CELL = 9; 1686 const IN_SELECT = 10; 1687 const AFTER_BODY = 11; 1688 const IN_FRAME = 12; 1689 const AFTR_FRAME = 13; 1690 1691 // The different types of elements. 1692 const SPECIAL = 0; 1693 const SCOPING = 1; 1694 const FORMATTING = 2; 1695 const PHRASING = 3; 1696 1697 const MARKER = 0; 1698 1699 public function __construct() 1700 { 1701 $this->phase = self::INIT_PHASE; 1702 $this->mode = self::BEFOR_HEAD; 1703 $this->dom = new DOMDocument; 1704 1705 $this->dom->encoding = 'UTF-8'; 1706 $this->dom->preserveWhiteSpace = true; 1707 $this->dom->substituteEntities = true; 1708 $this->dom->strictErrorChecking = false; 1709 } 1710 1711 // Process tag tokens 1712 public function emitToken($token) 1713 { 1714 switch ($this->phase) { 1715 case self::INIT_PHASE: 1716 return $this->initPhase($token); 1717 break; 1718 case self::ROOT_PHASE: 1719 return $this->rootElementPhase($token); 1720 break; 1721 case self::MAIN_PHASE: 1722 return $this->mainPhase($token); 1723 break; 1724 case self::END_PHASE : 1725 return $this->trailingEndPhase($token); 1726 break; 1727 } 1728 } 1729 1730 private function initPhase($token) 1731 { 1732 /* Initially, the tree construction stage must handle each token 1733 emitted from the tokenisation stage as follows: */ 1734 1735 /* A DOCTYPE token that is marked as being in error 1736 A comment token 1737 A start tag token 1738 An end tag token 1739 A character token that is not one of one of U+0009 CHARACTER TABULATION, 1740 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 1741 or U+0020 SPACE 1742 An end-of-file token */ 1743 if ((isset($token['error']) && $token['error']) || 1744 $token['type'] === HTML5::COMMENT || 1745 $token['type'] === HTML5::STARTTAG || 1746 $token['type'] === HTML5::ENDTAG || 1747 $token['type'] === HTML5::EOF || 1748 ($token['type'] === HTML5::CHARACTR && isset($token['data']) && 1749 !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) 1750 ) { 1751 /* This specification does not define how to handle this case. In 1752 particular, user agents may ignore the entirety of this specification 1753 altogether for such documents, and instead invoke special parse modes 1754 with a greater emphasis on backwards compatibility. */ 1755 1756 $this->phase = self::ROOT_PHASE; 1757 return $this->rootElementPhase($token); 1758 1759 /* A DOCTYPE token marked as being correct */ 1760 } elseif (isset($token['error']) && !$token['error']) { 1761 /* Append a DocumentType node to the Document node, with the name 1762 attribute set to the name given in the DOCTYPE token (which will be 1763 "HTML"), and the other attributes specific to DocumentType objects 1764 set to null, empty lists, or the empty string as appropriate. */ 1765 $doctype = new DOMDocumentType(null, null, 'HTML'); 1766 1767 /* Then, switch to the root element phase of the tree construction 1768 stage. */ 1769 $this->phase = self::ROOT_PHASE; 1770 1771 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 1772 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 1773 or U+0020 SPACE */ 1774 } elseif (isset($token['data']) && preg_match( 1775 '/^[\t\n\x0b\x0c ]+$/', 1776 $token['data'] 1777 ) 1778 ) { 1779 /* Append that character to the Document node. */ 1780 $text = $this->dom->createTextNode($token['data']); 1781 $this->dom->appendChild($text); 1782 } 1783 } 1784 1785 private function rootElementPhase($token) 1786 { 1787 /* After the initial phase, as each token is emitted from the tokenisation 1788 stage, it must be processed as described in this section. */ 1789 1790 /* A DOCTYPE token */ 1791 if ($token['type'] === HTML5::DOCTYPE) { 1792 // Parse error. Ignore the token. 1793 1794 /* A comment token */ 1795 } elseif ($token['type'] === HTML5::COMMENT) { 1796 /* Append a Comment node to the Document object with the data 1797 attribute set to the data given in the comment token. */ 1798 $comment = $this->dom->createComment($token['data']); 1799 $this->dom->appendChild($comment); 1800 1801 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 1802 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 1803 or U+0020 SPACE */ 1804 } elseif ($token['type'] === HTML5::CHARACTR && 1805 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 1806 ) { 1807 /* Append that character to the Document node. */ 1808 $text = $this->dom->createTextNode($token['data']); 1809 $this->dom->appendChild($text); 1810 1811 /* A character token that is not one of U+0009 CHARACTER TABULATION, 1812 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED 1813 (FF), or U+0020 SPACE 1814 A start tag token 1815 An end tag token 1816 An end-of-file token */ 1817 } elseif (($token['type'] === HTML5::CHARACTR && 1818 !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || 1819 $token['type'] === HTML5::STARTTAG || 1820 $token['type'] === HTML5::ENDTAG || 1821 $token['type'] === HTML5::EOF 1822 ) { 1823 /* Create an HTMLElement node with the tag name html, in the HTML 1824 namespace. Append it to the Document object. Switch to the main 1825 phase and reprocess the current token. */ 1826 $html = $this->dom->createElement('html'); 1827 $this->dom->appendChild($html); 1828 $this->stack[] = $html; 1829 1830 $this->phase = self::MAIN_PHASE; 1831 return $this->mainPhase($token); 1832 } 1833 } 1834 1835 private function mainPhase($token) 1836 { 1837 /* Tokens in the main phase must be handled as follows: */ 1838 1839 /* A DOCTYPE token */ 1840 if ($token['type'] === HTML5::DOCTYPE) { 1841 // Parse error. Ignore the token. 1842 1843 /* A start tag token with the tag name "html" */ 1844 } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') { 1845 /* If this start tag token was not the first start tag token, then 1846 it is a parse error. */ 1847 1848 /* For each attribute on the token, check to see if the attribute 1849 is already present on the top element of the stack of open elements. 1850 If it is not, add the attribute and its corresponding value to that 1851 element. */ 1852 foreach ($token['attr'] as $attr) { 1853 if (!$this->stack[0]->hasAttribute($attr['name'])) { 1854 $this->stack[0]->setAttribute($attr['name'], $attr['value']); 1855 } 1856 } 1857 1858 /* An end-of-file token */ 1859 } elseif ($token['type'] === HTML5::EOF) { 1860 /* Generate implied end tags. */ 1861 $this->generateImpliedEndTags(); 1862 1863 /* Anything else. */ 1864 } else { 1865 /* Depends on the insertion mode: */ 1866 switch ($this->mode) { 1867 case self::BEFOR_HEAD: 1868 return $this->beforeHead($token); 1869 break; 1870 case self::IN_HEAD: 1871 return $this->inHead($token); 1872 break; 1873 case self::AFTER_HEAD: 1874 return $this->afterHead($token); 1875 break; 1876 case self::IN_BODY: 1877 return $this->inBody($token); 1878 break; 1879 case self::IN_TABLE: 1880 return $this->inTable($token); 1881 break; 1882 case self::IN_CAPTION: 1883 return $this->inCaption($token); 1884 break; 1885 case self::IN_CGROUP: 1886 return $this->inColumnGroup($token); 1887 break; 1888 case self::IN_TBODY: 1889 return $this->inTableBody($token); 1890 break; 1891 case self::IN_ROW: 1892 return $this->inRow($token); 1893 break; 1894 case self::IN_CELL: 1895 return $this->inCell($token); 1896 break; 1897 case self::IN_SELECT: 1898 return $this->inSelect($token); 1899 break; 1900 case self::AFTER_BODY: 1901 return $this->afterBody($token); 1902 break; 1903 case self::IN_FRAME: 1904 return $this->inFrameset($token); 1905 break; 1906 case self::AFTR_FRAME: 1907 return $this->afterFrameset($token); 1908 break; 1909 case self::END_PHASE: 1910 return $this->trailingEndPhase($token); 1911 break; 1912 } 1913 } 1914 } 1915 1916 private function beforeHead($token) 1917 { 1918 /* Handle the token as follows: */ 1919 1920 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 1921 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 1922 or U+0020 SPACE */ 1923 if ($token['type'] === HTML5::CHARACTR && 1924 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 1925 ) { 1926 /* Append the character to the current node. */ 1927 $this->insertText($token['data']); 1928 1929 /* A comment token */ 1930 } elseif ($token['type'] === HTML5::COMMENT) { 1931 /* Append a Comment node to the current node with the data attribute 1932 set to the data given in the comment token. */ 1933 $this->insertComment($token['data']); 1934 1935 /* A start tag token with the tag name "head" */ 1936 } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') { 1937 /* Create an element for the token, append the new element to the 1938 current node and push it onto the stack of open elements. */ 1939 $element = $this->insertElement($token); 1940 1941 /* Set the head element pointer to this new element node. */ 1942 $this->head_pointer = $element; 1943 1944 /* Change the insertion mode to "in head". */ 1945 $this->mode = self::IN_HEAD; 1946 1947 /* A start tag token whose tag name is one of: "base", "link", "meta", 1948 "script", "style", "title". Or an end tag with the tag name "html". 1949 Or a character token that is not one of U+0009 CHARACTER TABULATION, 1950 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 1951 or U+0020 SPACE. Or any other start tag token */ 1952 } elseif ($token['type'] === HTML5::STARTTAG || 1953 ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') || 1954 ($token['type'] === HTML5::CHARACTR && !preg_match( 1955 '/^[\t\n\x0b\x0c ]$/', 1956 $token['data'] 1957 )) 1958 ) { 1959 /* Act as if a start tag token with the tag name "head" and no 1960 attributes had been seen, then reprocess the current token. */ 1961 $this->beforeHead( 1962 array( 1963 'name' => 'head', 1964 'type' => HTML5::STARTTAG, 1965 'attr' => array() 1966 ) 1967 ); 1968 1969 return $this->inHead($token); 1970 1971 /* Any other end tag */ 1972 } elseif ($token['type'] === HTML5::ENDTAG) { 1973 /* Parse error. Ignore the token. */ 1974 } 1975 } 1976 1977 private function inHead($token) 1978 { 1979 /* Handle the token as follows: */ 1980 1981 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 1982 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 1983 or U+0020 SPACE. 1984 1985 THIS DIFFERS FROM THE SPEC: If the current node is either a title, style 1986 or script element, append the character to the current node regardless 1987 of its content. */ 1988 if (($token['type'] === HTML5::CHARACTR && 1989 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || ( 1990 $token['type'] === HTML5::CHARACTR && in_array( 1991 end($this->stack)->nodeName, 1992 array('title', 'style', 'script') 1993 )) 1994 ) { 1995 /* Append the character to the current node. */ 1996 $this->insertText($token['data']); 1997 1998 /* A comment token */ 1999 } elseif ($token['type'] === HTML5::COMMENT) { 2000 /* Append a Comment node to the current node with the data attribute 2001 set to the data given in the comment token. */ 2002 $this->insertComment($token['data']); 2003 2004 } elseif ($token['type'] === HTML5::ENDTAG && 2005 in_array($token['name'], array('title', 'style', 'script')) 2006 ) { 2007 array_pop($this->stack); 2008 return HTML5::PCDATA; 2009 2010 /* A start tag with the tag name "title" */ 2011 } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') { 2012 /* Create an element for the token and append the new element to the 2013 node pointed to by the head element pointer, or, if that is null 2014 (innerHTML case), to the current node. */ 2015 if ($this->head_pointer !== null) { 2016 $element = $this->insertElement($token, false); 2017 $this->head_pointer->appendChild($element); 2018 2019 } else { 2020 $element = $this->insertElement($token); 2021 } 2022 2023 /* Switch the tokeniser's content model flag to the RCDATA state. */ 2024 return HTML5::RCDATA; 2025 2026 /* A start tag with the tag name "style" */ 2027 } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') { 2028 /* Create an element for the token and append the new element to the 2029 node pointed to by the head element pointer, or, if that is null 2030 (innerHTML case), to the current node. */ 2031 if ($this->head_pointer !== null) { 2032 $element = $this->insertElement($token, false); 2033 $this->head_pointer->appendChild($element); 2034 2035 } else { 2036 $this->insertElement($token); 2037 } 2038 2039 /* Switch the tokeniser's content model flag to the CDATA state. */ 2040 return HTML5::CDATA; 2041 2042 /* A start tag with the tag name "script" */ 2043 } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') { 2044 /* Create an element for the token. */ 2045 $element = $this->insertElement($token, false); 2046 $this->head_pointer->appendChild($element); 2047 2048 /* Switch the tokeniser's content model flag to the CDATA state. */ 2049 return HTML5::CDATA; 2050 2051 /* A start tag with the tag name "base", "link", or "meta" */ 2052 } elseif ($token['type'] === HTML5::STARTTAG && in_array( 2053 $token['name'], 2054 array('base', 'link', 'meta') 2055 ) 2056 ) { 2057 /* Create an element for the token and append the new element to the 2058 node pointed to by the head element pointer, or, if that is null 2059 (innerHTML case), to the current node. */ 2060 if ($this->head_pointer !== null) { 2061 $element = $this->insertElement($token, false); 2062 $this->head_pointer->appendChild($element); 2063 array_pop($this->stack); 2064 2065 } else { 2066 $this->insertElement($token); 2067 } 2068 2069 /* An end tag with the tag name "head" */ 2070 } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') { 2071 /* If the current node is a head element, pop the current node off 2072 the stack of open elements. */ 2073 if ($this->head_pointer->isSameNode(end($this->stack))) { 2074 array_pop($this->stack); 2075 2076 /* Otherwise, this is a parse error. */ 2077 } else { 2078 // k 2079 } 2080 2081 /* Change the insertion mode to "after head". */ 2082 $this->mode = self::AFTER_HEAD; 2083 2084 /* A start tag with the tag name "head" or an end tag except "html". */ 2085 } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') || 2086 ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html') 2087 ) { 2088 // Parse error. Ignore the token. 2089 2090 /* Anything else */ 2091 } else { 2092 /* If the current node is a head element, act as if an end tag 2093 token with the tag name "head" had been seen. */ 2094 if ($this->head_pointer->isSameNode(end($this->stack))) { 2095 $this->inHead( 2096 array( 2097 'name' => 'head', 2098 'type' => HTML5::ENDTAG 2099 ) 2100 ); 2101 2102 /* Otherwise, change the insertion mode to "after head". */ 2103 } else { 2104 $this->mode = self::AFTER_HEAD; 2105 } 2106 2107 /* Then, reprocess the current token. */ 2108 return $this->afterHead($token); 2109 } 2110 } 2111 2112 private function afterHead($token) 2113 { 2114 /* Handle the token as follows: */ 2115 2116 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 2117 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 2118 or U+0020 SPACE */ 2119 if ($token['type'] === HTML5::CHARACTR && 2120 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 2121 ) { 2122 /* Append the character to the current node. */ 2123 $this->insertText($token['data']); 2124 2125 /* A comment token */ 2126 } elseif ($token['type'] === HTML5::COMMENT) { 2127 /* Append a Comment node to the current node with the data attribute 2128 set to the data given in the comment token. */ 2129 $this->insertComment($token['data']); 2130 2131 /* A start tag token with the tag name "body" */ 2132 } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') { 2133 /* Insert a body element for the token. */ 2134 $this->insertElement($token); 2135 2136 /* Change the insertion mode to "in body". */ 2137 $this->mode = self::IN_BODY; 2138 2139 /* A start tag token with the tag name "frameset" */ 2140 } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') { 2141 /* Insert a frameset element for the token. */ 2142 $this->insertElement($token); 2143 2144 /* Change the insertion mode to "in frameset". */ 2145 $this->mode = self::IN_FRAME; 2146 2147 /* A start tag token whose tag name is one of: "base", "link", "meta", 2148 "script", "style", "title" */ 2149 } elseif ($token['type'] === HTML5::STARTTAG && in_array( 2150 $token['name'], 2151 array('base', 'link', 'meta', 'script', 'style', 'title') 2152 ) 2153 ) { 2154 /* Parse error. Switch the insertion mode back to "in head" and 2155 reprocess the token. */ 2156 $this->mode = self::IN_HEAD; 2157 return $this->inHead($token); 2158 2159 /* Anything else */ 2160 } else { 2161 /* Act as if a start tag token with the tag name "body" and no 2162 attributes had been seen, and then reprocess the current token. */ 2163 $this->afterHead( 2164 array( 2165 'name' => 'body', 2166 'type' => HTML5::STARTTAG, 2167 'attr' => array() 2168 ) 2169 ); 2170 2171 return $this->inBody($token); 2172 } 2173 } 2174 2175 private function inBody($token) 2176 { 2177 /* Handle the token as follows: */ 2178 2179 switch ($token['type']) { 2180 /* A character token */ 2181 case HTML5::CHARACTR: 2182 /* Reconstruct the active formatting elements, if any. */ 2183 $this->reconstructActiveFormattingElements(); 2184 2185 /* Append the token's character to the current node. */ 2186 $this->insertText($token['data']); 2187 break; 2188 2189 /* A comment token */ 2190 case HTML5::COMMENT: 2191 /* Append a Comment node to the current node with the data 2192 attribute set to the data given in the comment token. */ 2193 $this->insertComment($token['data']); 2194 break; 2195 2196 case HTML5::STARTTAG: 2197 switch ($token['name']) { 2198 /* A start tag token whose tag name is one of: "script", 2199 "style" */ 2200 case 'script': 2201 case 'style': 2202 /* Process the token as if the insertion mode had been "in 2203 head". */ 2204 return $this->inHead($token); 2205 break; 2206 2207 /* A start tag token whose tag name is one of: "base", "link", 2208 "meta", "title" */ 2209 case 'base': 2210 case 'link': 2211 case 'meta': 2212 case 'title': 2213 /* Parse error. Process the token as if the insertion mode 2214 had been "in head". */ 2215 return $this->inHead($token); 2216 break; 2217 2218 /* A start tag token with the tag name "body" */ 2219 case 'body': 2220 /* Parse error. If the second element on the stack of open 2221 elements is not a body element, or, if the stack of open 2222 elements has only one node on it, then ignore the token. 2223 (innerHTML case) */ 2224 if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') { 2225 // Ignore 2226 2227 /* Otherwise, for each attribute on the token, check to see 2228 if the attribute is already present on the body element (the 2229 second element) on the stack of open elements. If it is not, 2230 add the attribute and its corresponding value to that 2231 element. */ 2232 } else { 2233 foreach ($token['attr'] as $attr) { 2234 if (!$this->stack[1]->hasAttribute($attr['name'])) { 2235 $this->stack[1]->setAttribute($attr['name'], $attr['value']); 2236 } 2237 } 2238 } 2239 break; 2240 2241 /* A start tag whose tag name is one of: "address", 2242 "blockquote", "center", "dir", "div", "dl", "fieldset", 2243 "listing", "menu", "ol", "p", "ul" */ 2244 case 'address': 2245 case 'blockquote': 2246 case 'center': 2247 case 'dir': 2248 case 'div': 2249 case 'dl': 2250 case 'fieldset': 2251 case 'listing': 2252 case 'menu': 2253 case 'ol': 2254 case 'p': 2255 case 'ul': 2256 /* If the stack of open elements has a p element in scope, 2257 then act as if an end tag with the tag name p had been 2258 seen. */ 2259 if ($this->elementInScope('p')) { 2260 $this->emitToken( 2261 array( 2262 'name' => 'p', 2263 'type' => HTML5::ENDTAG 2264 ) 2265 ); 2266 } 2267 2268 /* Insert an HTML element for the token. */ 2269 $this->insertElement($token); 2270 break; 2271 2272 /* A start tag whose tag name is "form" */ 2273 case 'form': 2274 /* If the form element pointer is not null, ignore the 2275 token with a parse error. */ 2276 if ($this->form_pointer !== null) { 2277 // Ignore. 2278 2279 /* Otherwise: */ 2280 } else { 2281 /* If the stack of open elements has a p element in 2282 scope, then act as if an end tag with the tag name p 2283 had been seen. */ 2284 if ($this->elementInScope('p')) { 2285 $this->emitToken( 2286 array( 2287 'name' => 'p', 2288 'type' => HTML5::ENDTAG 2289 ) 2290 ); 2291 } 2292 2293 /* Insert an HTML element for the token, and set the 2294 form element pointer to point to the element created. */ 2295 $element = $this->insertElement($token); 2296 $this->form_pointer = $element; 2297 } 2298 break; 2299 2300 /* A start tag whose tag name is "li", "dd" or "dt" */ 2301 case 'li': 2302 case 'dd': 2303 case 'dt': 2304 /* If the stack of open elements has a p element in scope, 2305 then act as if an end tag with the tag name p had been 2306 seen. */ 2307 if ($this->elementInScope('p')) { 2308 $this->emitToken( 2309 array( 2310 'name' => 'p', 2311 'type' => HTML5::ENDTAG 2312 ) 2313 ); 2314 } 2315 2316 $stack_length = count($this->stack) - 1; 2317 2318 for ($n = $stack_length; 0 <= $n; $n--) { 2319 /* 1. Initialise node to be the current node (the 2320 bottommost node of the stack). */ 2321 $stop = false; 2322 $node = $this->stack[$n]; 2323 $cat = $this->getElementCategory($node->tagName); 2324 2325 /* 2. If node is an li, dd or dt element, then pop all 2326 the nodes from the current node up to node, including 2327 node, then stop this algorithm. */ 2328 if ($token['name'] === $node->tagName || ($token['name'] !== 'li' 2329 && ($node->tagName === 'dd' || $node->tagName === 'dt')) 2330 ) { 2331 for ($x = $stack_length; $x >= $n; $x--) { 2332 array_pop($this->stack); 2333 } 2334 2335 break; 2336 } 2337 2338 /* 3. If node is not in the formatting category, and is 2339 not in the phrasing category, and is not an address or 2340 div element, then stop this algorithm. */ 2341 if ($cat !== self::FORMATTING && $cat !== self::PHRASING && 2342 $node->tagName !== 'address' && $node->tagName !== 'div' 2343 ) { 2344 break; 2345 } 2346 } 2347 2348 /* Finally, insert an HTML element with the same tag 2349 name as the token's. */ 2350 $this->insertElement($token); 2351 break; 2352 2353 /* A start tag token whose tag name is "plaintext" */ 2354 case 'plaintext': 2355 /* If the stack of open elements has a p element in scope, 2356 then act as if an end tag with the tag name p had been 2357 seen. */ 2358 if ($this->elementInScope('p')) { 2359 $this->emitToken( 2360 array( 2361 'name' => 'p', 2362 'type' => HTML5::ENDTAG 2363 ) 2364 ); 2365 } 2366 2367 /* Insert an HTML element for the token. */ 2368 $this->insertElement($token); 2369 2370 return HTML5::PLAINTEXT; 2371 break; 2372 2373 /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4", 2374 "h5", "h6" */ 2375 case 'h1': 2376 case 'h2': 2377 case 'h3': 2378 case 'h4': 2379 case 'h5': 2380 case 'h6': 2381 /* If the stack of open elements has a p element in scope, 2382 then act as if an end tag with the tag name p had been seen. */ 2383 if ($this->elementInScope('p')) { 2384 $this->emitToken( 2385 array( 2386 'name' => 'p', 2387 'type' => HTML5::ENDTAG 2388 ) 2389 ); 2390 } 2391 2392 /* If the stack of open elements has in scope an element whose 2393 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 2394 this is a parse error; pop elements from the stack until an 2395 element with one of those tag names has been popped from the 2396 stack. */ 2397 while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) { 2398 array_pop($this->stack); 2399 } 2400 2401 /* Insert an HTML element for the token. */ 2402 $this->insertElement($token); 2403 break; 2404 2405 /* A start tag whose tag name is "a" */ 2406 case 'a': 2407 /* If the list of active formatting elements contains 2408 an element whose tag name is "a" between the end of the 2409 list and the last marker on the list (or the start of 2410 the list if there is no marker on the list), then this 2411 is a parse error; act as if an end tag with the tag name 2412 "a" had been seen, then remove that element from the list 2413 of active formatting elements and the stack of open 2414 elements if the end tag didn't already remove it (it 2415 might not have if the element is not in table scope). */ 2416 $leng = count($this->a_formatting); 2417 2418 for ($n = $leng - 1; $n >= 0; $n--) { 2419 if ($this->a_formatting[$n] === self::MARKER) { 2420 break; 2421 2422 } elseif ($this->a_formatting[$n]->nodeName === 'a') { 2423 $this->emitToken( 2424 array( 2425 'name' => 'a', 2426 'type' => HTML5::ENDTAG 2427 ) 2428 ); 2429 break; 2430 } 2431 } 2432 2433 /* Reconstruct the active formatting elements, if any. */ 2434 $this->reconstructActiveFormattingElements(); 2435 2436 /* Insert an HTML element for the token. */ 2437 $el = $this->insertElement($token); 2438 2439 /* Add that element to the list of active formatting 2440 elements. */ 2441 $this->a_formatting[] = $el; 2442 break; 2443 2444 /* A start tag whose tag name is one of: "b", "big", "em", "font", 2445 "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 2446 case 'b': 2447 case 'big': 2448 case 'em': 2449 case 'font': 2450 case 'i': 2451 case 'nobr': 2452 case 's': 2453 case 'small': 2454 case 'strike': 2455 case 'strong': 2456 case 'tt': 2457 case 'u': 2458 /* Reconstruct the active formatting elements, if any. */ 2459 $this->reconstructActiveFormattingElements(); 2460 2461 /* Insert an HTML element for the token. */ 2462 $el = $this->insertElement($token); 2463 2464 /* Add that element to the list of active formatting 2465 elements. */ 2466 $this->a_formatting[] = $el; 2467 break; 2468 2469 /* A start tag token whose tag name is "button" */ 2470 case 'button': 2471 /* If the stack of open elements has a button element in scope, 2472 then this is a parse error; act as if an end tag with the tag 2473 name "button" had been seen, then reprocess the token. (We don't 2474 do that. Unnecessary.) */ 2475 if ($this->elementInScope('button')) { 2476 $this->inBody( 2477 array( 2478 'name' => 'button', 2479 'type' => HTML5::ENDTAG 2480 ) 2481 ); 2482 } 2483 2484 /* Reconstruct the active formatting elements, if any. */ 2485 $this->reconstructActiveFormattingElements(); 2486 2487 /* Insert an HTML element for the token. */ 2488 $this->insertElement($token); 2489 2490 /* Insert a marker at the end of the list of active 2491 formatting elements. */ 2492 $this->a_formatting[] = self::MARKER; 2493 break; 2494 2495 /* A start tag token whose tag name is one of: "marquee", "object" */ 2496 case 'marquee': 2497 case 'object': 2498 /* Reconstruct the active formatting elements, if any. */ 2499 $this->reconstructActiveFormattingElements(); 2500 2501 /* Insert an HTML element for the token. */ 2502 $this->insertElement($token); 2503 2504 /* Insert a marker at the end of the list of active 2505 formatting elements. */ 2506 $this->a_formatting[] = self::MARKER; 2507 break; 2508 2509 /* A start tag token whose tag name is "xmp" */ 2510 case 'xmp': 2511 /* Reconstruct the active formatting elements, if any. */ 2512 $this->reconstructActiveFormattingElements(); 2513 2514 /* Insert an HTML element for the token. */ 2515 $this->insertElement($token); 2516 2517 /* Switch the content model flag to the CDATA state. */ 2518 return HTML5::CDATA; 2519 break; 2520 2521 /* A start tag whose tag name is "table" */ 2522 case 'table': 2523 /* If the stack of open elements has a p element in scope, 2524 then act as if an end tag with the tag name p had been seen. */ 2525 if ($this->elementInScope('p')) { 2526 $this->emitToken( 2527 array( 2528 'name' => 'p', 2529 'type' => HTML5::ENDTAG 2530 ) 2531 ); 2532 } 2533 2534 /* Insert an HTML element for the token. */ 2535 $this->insertElement($token); 2536 2537 /* Change the insertion mode to "in table". */ 2538 $this->mode = self::IN_TABLE; 2539 break; 2540 2541 /* A start tag whose tag name is one of: "area", "basefont", 2542 "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */ 2543 case 'area': 2544 case 'basefont': 2545 case 'bgsound': 2546 case 'br': 2547 case 'embed': 2548 case 'img': 2549 case 'param': 2550 case 'spacer': 2551 case 'wbr': 2552 /* Reconstruct the active formatting elements, if any. */ 2553 $this->reconstructActiveFormattingElements(); 2554 2555 /* Insert an HTML element for the token. */ 2556 $this->insertElement($token); 2557 2558 /* Immediately pop the current node off the stack of open elements. */ 2559 array_pop($this->stack); 2560 break; 2561 2562 /* A start tag whose tag name is "hr" */ 2563 case 'hr': 2564 /* If the stack of open elements has a p element in scope, 2565 then act as if an end tag with the tag name p had been seen. */ 2566 if ($this->elementInScope('p')) { 2567 $this->emitToken( 2568 array( 2569 'name' => 'p', 2570 'type' => HTML5::ENDTAG 2571 ) 2572 ); 2573 } 2574 2575 /* Insert an HTML element for the token. */ 2576 $this->insertElement($token); 2577 2578 /* Immediately pop the current node off the stack of open elements. */ 2579 array_pop($this->stack); 2580 break; 2581 2582 /* A start tag whose tag name is "image" */ 2583 case 'image': 2584 /* Parse error. Change the token's tag name to "img" and 2585 reprocess it. (Don't ask.) */ 2586 $token['name'] = 'img'; 2587 return $this->inBody($token); 2588 break; 2589 2590 /* A start tag whose tag name is "input" */ 2591 case 'input': 2592 /* Reconstruct the active formatting elements, if any. */ 2593 $this->reconstructActiveFormattingElements(); 2594 2595 /* Insert an input element for the token. */ 2596 $element = $this->insertElement($token, false); 2597 2598 /* If the form element pointer is not null, then associate the 2599 input element with the form element pointed to by the form 2600 element pointer. */ 2601 $this->form_pointer !== null 2602 ? $this->form_pointer->appendChild($element) 2603 : end($this->stack)->appendChild($element); 2604 2605 /* Pop that input element off the stack of open elements. */ 2606 array_pop($this->stack); 2607 break; 2608 2609 /* A start tag whose tag name is "isindex" */ 2610 case 'isindex': 2611 /* Parse error. */ 2612 // w/e 2613 2614 /* If the form element pointer is not null, 2615 then ignore the token. */ 2616 if ($this->form_pointer === null) { 2617 /* Act as if a start tag token with the tag name "form" had 2618 been seen. */ 2619 $this->inBody( 2620 array( 2621 'name' => 'body', 2622 'type' => HTML5::STARTTAG, 2623 'attr' => array() 2624 ) 2625 ); 2626 2627 /* Act as if a start tag token with the tag name "hr" had 2628 been seen. */ 2629 $this->inBody( 2630 array( 2631 'name' => 'hr', 2632 'type' => HTML5::STARTTAG, 2633 'attr' => array() 2634 ) 2635 ); 2636 2637 /* Act as if a start tag token with the tag name "p" had 2638 been seen. */ 2639 $this->inBody( 2640 array( 2641 'name' => 'p', 2642 'type' => HTML5::STARTTAG, 2643 'attr' => array() 2644 ) 2645 ); 2646 2647 /* Act as if a start tag token with the tag name "label" 2648 had been seen. */ 2649 $this->inBody( 2650 array( 2651 'name' => 'label', 2652 'type' => HTML5::STARTTAG, 2653 'attr' => array() 2654 ) 2655 ); 2656 2657 /* Act as if a stream of character tokens had been seen. */ 2658 $this->insertText( 2659 'This is a searchable index. ' . 2660 'Insert your search keywords here: ' 2661 ); 2662 2663 /* Act as if a start tag token with the tag name "input" 2664 had been seen, with all the attributes from the "isindex" 2665 token, except with the "name" attribute set to the value 2666 "isindex" (ignoring any explicit "name" attribute). */ 2667 $attr = $token['attr']; 2668 $attr[] = array('name' => 'name', 'value' => 'isindex'); 2669 2670 $this->inBody( 2671 array( 2672 'name' => 'input', 2673 'type' => HTML5::STARTTAG, 2674 'attr' => $attr 2675 ) 2676 ); 2677 2678 /* Act as if a stream of character tokens had been seen 2679 (see below for what they should say). */ 2680 $this->insertText( 2681 'This is a searchable index. ' . 2682 'Insert your search keywords here: ' 2683 ); 2684 2685 /* Act as if an end tag token with the tag name "label" 2686 had been seen. */ 2687 $this->inBody( 2688 array( 2689 'name' => 'label', 2690 'type' => HTML5::ENDTAG 2691 ) 2692 ); 2693 2694 /* Act as if an end tag token with the tag name "p" had 2695 been seen. */ 2696 $this->inBody( 2697 array( 2698 'name' => 'p', 2699 'type' => HTML5::ENDTAG 2700 ) 2701 ); 2702 2703 /* Act as if a start tag token with the tag name "hr" had 2704 been seen. */ 2705 $this->inBody( 2706 array( 2707 'name' => 'hr', 2708 'type' => HTML5::ENDTAG 2709 ) 2710 ); 2711 2712 /* Act as if an end tag token with the tag name "form" had 2713 been seen. */ 2714 $this->inBody( 2715 array( 2716 'name' => 'form', 2717 'type' => HTML5::ENDTAG 2718 ) 2719 ); 2720 } 2721 break; 2722 2723 /* A start tag whose tag name is "textarea" */ 2724 case 'textarea': 2725 $this->insertElement($token); 2726 2727 /* Switch the tokeniser's content model flag to the 2728 RCDATA state. */ 2729 return HTML5::RCDATA; 2730 break; 2731 2732 /* A start tag whose tag name is one of: "iframe", "noembed", 2733 "noframes" */ 2734 case 'iframe': 2735 case 'noembed': 2736 case 'noframes': 2737 $this->insertElement($token); 2738 2739 /* Switch the tokeniser's content model flag to the CDATA state. */ 2740 return HTML5::CDATA; 2741 break; 2742 2743 /* A start tag whose tag name is "select" */ 2744 case 'select': 2745 /* Reconstruct the active formatting elements, if any. */ 2746 $this->reconstructActiveFormattingElements(); 2747 2748 /* Insert an HTML element for the token. */ 2749 $this->insertElement($token); 2750 2751 /* Change the insertion mode to "in select". */ 2752 $this->mode = self::IN_SELECT; 2753 break; 2754 2755 /* A start or end tag whose tag name is one of: "caption", "col", 2756 "colgroup", "frame", "frameset", "head", "option", "optgroup", 2757 "tbody", "td", "tfoot", "th", "thead", "tr". */ 2758 case 'caption': 2759 case 'col': 2760 case 'colgroup': 2761 case 'frame': 2762 case 'frameset': 2763 case 'head': 2764 case 'option': 2765 case 'optgroup': 2766 case 'tbody': 2767 case 'td': 2768 case 'tfoot': 2769 case 'th': 2770 case 'thead': 2771 case 'tr': 2772 // Parse error. Ignore the token. 2773 break; 2774 2775 /* A start or end tag whose tag name is one of: "event-source", 2776 "section", "nav", "article", "aside", "header", "footer", 2777 "datagrid", "command" */ 2778 case 'event-source': 2779 case 'section': 2780 case 'nav': 2781 case 'article': 2782 case 'aside': 2783 case 'header': 2784 case 'footer': 2785 case 'datagrid': 2786 case 'command': 2787 // Work in progress! 2788 break; 2789 2790 /* A start tag token not covered by the previous entries */ 2791 default: 2792 /* Reconstruct the active formatting elements, if any. */ 2793 $this->reconstructActiveFormattingElements(); 2794 2795 $this->insertElement($token, true, true); 2796 break; 2797 } 2798 break; 2799 2800 case HTML5::ENDTAG: 2801 switch ($token['name']) { 2802 /* An end tag with the tag name "body" */ 2803 case 'body': 2804 /* If the second element in the stack of open elements is 2805 not a body element, this is a parse error. Ignore the token. 2806 (innerHTML case) */ 2807 if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') { 2808 // Ignore. 2809 2810 /* If the current node is not the body element, then this 2811 is a parse error. */ 2812 } elseif (end($this->stack)->nodeName !== 'body') { 2813 // Parse error. 2814 } 2815 2816 /* Change the insertion mode to "after body". */ 2817 $this->mode = self::AFTER_BODY; 2818 break; 2819 2820 /* An end tag with the tag name "html" */ 2821 case 'html': 2822 /* Act as if an end tag with tag name "body" had been seen, 2823 then, if that token wasn't ignored, reprocess the current 2824 token. */ 2825 $this->inBody( 2826 array( 2827 'name' => 'body', 2828 'type' => HTML5::ENDTAG 2829 ) 2830 ); 2831 2832 return $this->afterBody($token); 2833 break; 2834 2835 /* An end tag whose tag name is one of: "address", "blockquote", 2836 "center", "dir", "div", "dl", "fieldset", "listing", "menu", 2837 "ol", "pre", "ul" */ 2838 case 'address': 2839 case 'blockquote': 2840 case 'center': 2841 case 'dir': 2842 case 'div': 2843 case 'dl': 2844 case 'fieldset': 2845 case 'listing': 2846 case 'menu': 2847 case 'ol': 2848 case 'pre': 2849 case 'ul': 2850 /* If the stack of open elements has an element in scope 2851 with the same tag name as that of the token, then generate 2852 implied end tags. */ 2853 if ($this->elementInScope($token['name'])) { 2854 $this->generateImpliedEndTags(); 2855 2856 /* Now, if the current node is not an element with 2857 the same tag name as that of the token, then this 2858 is a parse error. */ 2859 // w/e 2860 2861 /* If the stack of open elements has an element in 2862 scope with the same tag name as that of the token, 2863 then pop elements from this stack until an element 2864 with that tag name has been popped from the stack. */ 2865 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 2866 if ($this->stack[$n]->nodeName === $token['name']) { 2867 $n = -1; 2868 } 2869 2870 array_pop($this->stack); 2871 } 2872 } 2873 break; 2874 2875 /* An end tag whose tag name is "form" */ 2876 case 'form': 2877 /* If the stack of open elements has an element in scope 2878 with the same tag name as that of the token, then generate 2879 implied end tags. */ 2880 if ($this->elementInScope($token['name'])) { 2881 $this->generateImpliedEndTags(); 2882 2883 } 2884 2885 if (end($this->stack)->nodeName !== $token['name']) { 2886 /* Now, if the current node is not an element with the 2887 same tag name as that of the token, then this is a parse 2888 error. */ 2889 // w/e 2890 2891 } else { 2892 /* Otherwise, if the current node is an element with 2893 the same tag name as that of the token pop that element 2894 from the stack. */ 2895 array_pop($this->stack); 2896 } 2897 2898 /* In any case, set the form element pointer to null. */ 2899 $this->form_pointer = null; 2900 break; 2901 2902 /* An end tag whose tag name is "p" */ 2903 case 'p': 2904 /* If the stack of open elements has a p element in scope, 2905 then generate implied end tags, except for p elements. */ 2906 if ($this->elementInScope('p')) { 2907 $this->generateImpliedEndTags(array('p')); 2908 2909 /* If the current node is not a p element, then this is 2910 a parse error. */ 2911 // k 2912 2913 /* If the stack of open elements has a p element in 2914 scope, then pop elements from this stack until the stack 2915 no longer has a p element in scope. */ 2916 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 2917 if ($this->elementInScope('p')) { 2918 array_pop($this->stack); 2919 2920 } else { 2921 break; 2922 } 2923 } 2924 } 2925 break; 2926 2927 /* An end tag whose tag name is "dd", "dt", or "li" */ 2928 case 'dd': 2929 case 'dt': 2930 case 'li': 2931 /* If the stack of open elements has an element in scope 2932 whose tag name matches the tag name of the token, then 2933 generate implied end tags, except for elements with the 2934 same tag name as the token. */ 2935 if ($this->elementInScope($token['name'])) { 2936 $this->generateImpliedEndTags(array($token['name'])); 2937 2938 /* If the current node is not an element with the same 2939 tag name as the token, then this is a parse error. */ 2940 // w/e 2941 2942 /* If the stack of open elements has an element in scope 2943 whose tag name matches the tag name of the token, then 2944 pop elements from this stack until an element with that 2945 tag name has been popped from the stack. */ 2946 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 2947 if ($this->stack[$n]->nodeName === $token['name']) { 2948 $n = -1; 2949 } 2950 2951 array_pop($this->stack); 2952 } 2953 } 2954 break; 2955 2956 /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4", 2957 "h5", "h6" */ 2958 case 'h1': 2959 case 'h2': 2960 case 'h3': 2961 case 'h4': 2962 case 'h5': 2963 case 'h6': 2964 $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'); 2965 2966 /* If the stack of open elements has in scope an element whose 2967 tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then 2968 generate implied end tags. */ 2969 if ($this->elementInScope($elements)) { 2970 $this->generateImpliedEndTags(); 2971 2972 /* Now, if the current node is not an element with the same 2973 tag name as that of the token, then this is a parse error. */ 2974 // w/e 2975 2976 /* If the stack of open elements has in scope an element 2977 whose tag name is one of "h1", "h2", "h3", "h4", "h5", or 2978 "h6", then pop elements from the stack until an element 2979 with one of those tag names has been popped from the stack. */ 2980 while ($this->elementInScope($elements)) { 2981 array_pop($this->stack); 2982 } 2983 } 2984 break; 2985 2986 /* An end tag whose tag name is one of: "a", "b", "big", "em", 2987 "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */ 2988 case 'a': 2989 case 'b': 2990 case 'big': 2991 case 'em': 2992 case 'font': 2993 case 'i': 2994 case 'nobr': 2995 case 's': 2996 case 'small': 2997 case 'strike': 2998 case 'strong': 2999 case 'tt': 3000 case 'u': 3001 /* 1. Let the formatting element be the last element in 3002 the list of active formatting elements that: 3003 * is between the end of the list and the last scope 3004 marker in the list, if any, or the start of the list 3005 otherwise, and 3006 * has the same tag name as the token. 3007 */ 3008 while (true) { 3009 for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) { 3010 if ($this->a_formatting[$a] === self::MARKER) { 3011 break; 3012 3013 } elseif ($this->a_formatting[$a]->tagName === $token['name']) { 3014 $formatting_element = $this->a_formatting[$a]; 3015 $in_stack = in_array($formatting_element, $this->stack, true); 3016 $fe_af_pos = $a; 3017 break; 3018 } 3019 } 3020 3021 /* If there is no such node, or, if that node is 3022 also in the stack of open elements but the element 3023 is not in scope, then this is a parse error. Abort 3024 these steps. The token is ignored. */ 3025 if (!isset($formatting_element) || ($in_stack && 3026 !$this->elementInScope($token['name'])) 3027 ) { 3028 break; 3029 3030 /* Otherwise, if there is such a node, but that node 3031 is not in the stack of open elements, then this is a 3032 parse error; remove the element from the list, and 3033 abort these steps. */ 3034 } elseif (isset($formatting_element) && !$in_stack) { 3035 unset($this->a_formatting[$fe_af_pos]); 3036 $this->a_formatting = array_merge($this->a_formatting); 3037 break; 3038 } 3039 3040 /* 2. Let the furthest block be the topmost node in the 3041 stack of open elements that is lower in the stack 3042 than the formatting element, and is not an element in 3043 the phrasing or formatting categories. There might 3044 not be one. */ 3045 $fe_s_pos = array_search($formatting_element, $this->stack, true); 3046 $length = count($this->stack); 3047 3048 for ($s = $fe_s_pos + 1; $s < $length; $s++) { 3049 $category = $this->getElementCategory($this->stack[$s]->nodeName); 3050 3051 if ($category !== self::PHRASING && $category !== self::FORMATTING) { 3052 $furthest_block = $this->stack[$s]; 3053 } 3054 } 3055 3056 /* 3. If there is no furthest block, then the UA must 3057 skip the subsequent steps and instead just pop all 3058 the nodes from the bottom of the stack of open 3059 elements, from the current node up to the formatting 3060 element, and remove the formatting element from the 3061 list of active formatting elements. */ 3062 if (!isset($furthest_block)) { 3063 for ($n = $length - 1; $n >= $fe_s_pos; $n--) { 3064 array_pop($this->stack); 3065 } 3066 3067 unset($this->a_formatting[$fe_af_pos]); 3068 $this->a_formatting = array_merge($this->a_formatting); 3069 break; 3070 } 3071 3072 /* 4. Let the common ancestor be the element 3073 immediately above the formatting element in the stack 3074 of open elements. */ 3075 $common_ancestor = $this->stack[$fe_s_pos - 1]; 3076 3077 /* 5. If the furthest block has a parent node, then 3078 remove the furthest block from its parent node. */ 3079 if ($furthest_block->parentNode !== null) { 3080 $furthest_block->parentNode->removeChild($furthest_block); 3081 } 3082 3083 /* 6. Let a bookmark note the position of the 3084 formatting element in the list of active formatting 3085 elements relative to the elements on either side 3086 of it in the list. */ 3087 $bookmark = $fe_af_pos; 3088 3089 /* 7. Let node and last node be the furthest block. 3090 Follow these steps: */ 3091 $node = $furthest_block; 3092 $last_node = $furthest_block; 3093 3094 while (true) { 3095 for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) { 3096 /* 7.1 Let node be the element immediately 3097 prior to node in the stack of open elements. */ 3098 $node = $this->stack[$n]; 3099 3100 /* 7.2 If node is not in the list of active 3101 formatting elements, then remove node from 3102 the stack of open elements and then go back 3103 to step 1. */ 3104 if (!in_array($node, $this->a_formatting, true)) { 3105 unset($this->stack[$n]); 3106 $this->stack = array_merge($this->stack); 3107 3108 } else { 3109 break; 3110 } 3111 } 3112 3113 /* 7.3 Otherwise, if node is the formatting 3114 element, then go to the next step in the overall 3115 algorithm. */ 3116 if ($node === $formatting_element) { 3117 break; 3118 3119 /* 7.4 Otherwise, if last node is the furthest 3120 block, then move the aforementioned bookmark to 3121 be immediately after the node in the list of 3122 active formatting elements. */ 3123 } elseif ($last_node === $furthest_block) { 3124 $bookmark = array_search($node, $this->a_formatting, true) + 1; 3125 } 3126 3127 /* 7.5 If node has any children, perform a 3128 shallow clone of node, replace the entry for 3129 node in the list of active formatting elements 3130 with an entry for the clone, replace the entry 3131 for node in the stack of open elements with an 3132 entry for the clone, and let node be the clone. */ 3133 if ($node->hasChildNodes()) { 3134 $clone = $node->cloneNode(); 3135 $s_pos = array_search($node, $this->stack, true); 3136 $a_pos = array_search($node, $this->a_formatting, true); 3137 3138 $this->stack[$s_pos] = $clone; 3139 $this->a_formatting[$a_pos] = $clone; 3140 $node = $clone; 3141 } 3142 3143 /* 7.6 Insert last node into node, first removing 3144 it from its previous parent node if any. */ 3145 if ($last_node->parentNode !== null) { 3146 $last_node->parentNode->removeChild($last_node); 3147 } 3148 3149 $node->appendChild($last_node); 3150 3151 /* 7.7 Let last node be node. */ 3152 $last_node = $node; 3153 } 3154 3155 /* 8. Insert whatever last node ended up being in 3156 the previous step into the common ancestor node, 3157 first removing it from its previous parent node if 3158 any. */ 3159 if ($last_node->parentNode !== null) { 3160 $last_node->parentNode->removeChild($last_node); 3161 } 3162 3163 $common_ancestor->appendChild($last_node); 3164 3165 /* 9. Perform a shallow clone of the formatting 3166 element. */ 3167 $clone = $formatting_element->cloneNode(); 3168 3169 /* 10. Take all of the child nodes of the furthest 3170 block and append them to the clone created in the 3171 last step. */ 3172 while ($furthest_block->hasChildNodes()) { 3173 $child = $furthest_block->firstChild; 3174 $furthest_block->removeChild($child); 3175 $clone->appendChild($child); 3176 } 3177 3178 /* 11. Append that clone to the furthest block. */ 3179 $furthest_block->appendChild($clone); 3180 3181 /* 12. Remove the formatting element from the list 3182 of active formatting elements, and insert the clone 3183 into the list of active formatting elements at the 3184 position of the aforementioned bookmark. */ 3185 $fe_af_pos = array_search($formatting_element, $this->a_formatting, true); 3186 unset($this->a_formatting[$fe_af_pos]); 3187 $this->a_formatting = array_merge($this->a_formatting); 3188 3189 $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1); 3190 $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting)); 3191 $this->a_formatting = array_merge($af_part1, array($clone), $af_part2); 3192 3193 /* 13. Remove the formatting element from the stack 3194 of open elements, and insert the clone into the stack 3195 of open elements immediately after (i.e. in a more 3196 deeply nested position than) the position of the 3197 furthest block in that stack. */ 3198 $fe_s_pos = array_search($formatting_element, $this->stack, true); 3199 $fb_s_pos = array_search($furthest_block, $this->stack, true); 3200 unset($this->stack[$fe_s_pos]); 3201 3202 $s_part1 = array_slice($this->stack, 0, $fb_s_pos); 3203 $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack)); 3204 $this->stack = array_merge($s_part1, array($clone), $s_part2); 3205 3206 /* 14. Jump back to step 1 in this series of steps. */ 3207 unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block); 3208 } 3209 break; 3210 3211 /* An end tag token whose tag name is one of: "button", 3212 "marquee", "object" */ 3213 case 'button': 3214 case 'marquee': 3215 case 'object': 3216 /* If the stack of open elements has an element in scope whose 3217 tag name matches the tag name of the token, then generate implied 3218 tags. */ 3219 if ($this->elementInScope($token['name'])) { 3220 $this->generateImpliedEndTags(); 3221 3222 /* Now, if the current node is not an element with the same 3223 tag name as the token, then this is a parse error. */ 3224 // k 3225 3226 /* Now, if the stack of open elements has an element in scope 3227 whose tag name matches the tag name of the token, then pop 3228 elements from the stack until that element has been popped from 3229 the stack, and clear the list of active formatting elements up 3230 to the last marker. */ 3231 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 3232 if ($this->stack[$n]->nodeName === $token['name']) { 3233 $n = -1; 3234 } 3235 3236 array_pop($this->stack); 3237 } 3238 3239 $marker = end(array_keys($this->a_formatting, self::MARKER, true)); 3240 3241 for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) { 3242 array_pop($this->a_formatting); 3243 } 3244 } 3245 break; 3246 3247 /* Or an end tag whose tag name is one of: "area", "basefont", 3248 "bgsound", "br", "embed", "hr", "iframe", "image", "img", 3249 "input", "isindex", "noembed", "noframes", "param", "select", 3250 "spacer", "table", "textarea", "wbr" */ 3251 case 'area': 3252 case 'basefont': 3253 case 'bgsound': 3254 case 'br': 3255 case 'embed': 3256 case 'hr': 3257 case 'iframe': 3258 case 'image': 3259 case 'img': 3260 case 'input': 3261 case 'isindex': 3262 case 'noembed': 3263 case 'noframes': 3264 case 'param': 3265 case 'select': 3266 case 'spacer': 3267 case 'table': 3268 case 'textarea': 3269 case 'wbr': 3270 // Parse error. Ignore the token. 3271 break; 3272 3273 /* An end tag token not covered by the previous entries */ 3274 default: 3275 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 3276 /* Initialise node to be the current node (the bottommost 3277 node of the stack). */ 3278 $node = end($this->stack); 3279 3280 /* If node has the same tag name as the end tag token, 3281 then: */ 3282 if ($token['name'] === $node->nodeName) { 3283 /* Generate implied end tags. */ 3284 $this->generateImpliedEndTags(); 3285 3286 /* If the tag name of the end tag token does not 3287 match the tag name of the current node, this is a 3288 parse error. */ 3289 // k 3290 3291 /* Pop all the nodes from the current node up to 3292 node, including node, then stop this algorithm. */ 3293 for ($x = count($this->stack) - $n; $x >= $n; $x--) { 3294 array_pop($this->stack); 3295 } 3296 3297 } else { 3298 $category = $this->getElementCategory($node); 3299 3300 if ($category !== self::SPECIAL && $category !== self::SCOPING) { 3301 /* Otherwise, if node is in neither the formatting 3302 category nor the phrasing category, then this is a 3303 parse error. Stop this algorithm. The end tag token 3304 is ignored. */ 3305 return false; 3306 } 3307 } 3308 } 3309 break; 3310 } 3311 break; 3312 } 3313 } 3314 3315 private function inTable($token) 3316 { 3317 $clear = array('html', 'table'); 3318 3319 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 3320 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 3321 or U+0020 SPACE */ 3322 if ($token['type'] === HTML5::CHARACTR && 3323 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 3324 ) { 3325 /* Append the character to the current node. */ 3326 $text = $this->dom->createTextNode($token['data']); 3327 end($this->stack)->appendChild($text); 3328 3329 /* A comment token */ 3330 } elseif ($token['type'] === HTML5::COMMENT) { 3331 /* Append a Comment node to the current node with the data 3332 attribute set to the data given in the comment token. */ 3333 $comment = $this->dom->createComment($token['data']); 3334 end($this->stack)->appendChild($comment); 3335 3336 /* A start tag whose tag name is "caption" */ 3337 } elseif ($token['type'] === HTML5::STARTTAG && 3338 $token['name'] === 'caption' 3339 ) { 3340 /* Clear the stack back to a table context. */ 3341 $this->clearStackToTableContext($clear); 3342 3343 /* Insert a marker at the end of the list of active 3344 formatting elements. */ 3345 $this->a_formatting[] = self::MARKER; 3346 3347 /* Insert an HTML element for the token, then switch the 3348 insertion mode to "in caption". */ 3349 $this->insertElement($token); 3350 $this->mode = self::IN_CAPTION; 3351 3352 /* A start tag whose tag name is "colgroup" */ 3353 } elseif ($token['type'] === HTML5::STARTTAG && 3354 $token['name'] === 'colgroup' 3355 ) { 3356 /* Clear the stack back to a table context. */ 3357 $this->clearStackToTableContext($clear); 3358 3359 /* Insert an HTML element for the token, then switch the 3360 insertion mode to "in column group". */ 3361 $this->insertElement($token); 3362 $this->mode = self::IN_CGROUP; 3363 3364 /* A start tag whose tag name is "col" */ 3365 } elseif ($token['type'] === HTML5::STARTTAG && 3366 $token['name'] === 'col' 3367 ) { 3368 $this->inTable( 3369 array( 3370 'name' => 'colgroup', 3371 'type' => HTML5::STARTTAG, 3372 'attr' => array() 3373 ) 3374 ); 3375 3376 $this->inColumnGroup($token); 3377 3378 /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */ 3379 } elseif ($token['type'] === HTML5::STARTTAG && in_array( 3380 $token['name'], 3381 array('tbody', 'tfoot', 'thead') 3382 ) 3383 ) { 3384 /* Clear the stack back to a table context. */ 3385 $this->clearStackToTableContext($clear); 3386 3387 /* Insert an HTML element for the token, then switch the insertion 3388 mode to "in table body". */ 3389 $this->insertElement($token); 3390 $this->mode = self::IN_TBODY; 3391 3392 /* A start tag whose tag name is one of: "td", "th", "tr" */ 3393 } elseif ($token['type'] === HTML5::STARTTAG && 3394 in_array($token['name'], array('td', 'th', 'tr')) 3395 ) { 3396 /* Act as if a start tag token with the tag name "tbody" had been 3397 seen, then reprocess the current token. */ 3398 $this->inTable( 3399 array( 3400 'name' => 'tbody', 3401 'type' => HTML5::STARTTAG, 3402 'attr' => array() 3403 ) 3404 ); 3405 3406 return $this->inTableBody($token); 3407 3408 /* A start tag whose tag name is "table" */ 3409 } elseif ($token['type'] === HTML5::STARTTAG && 3410 $token['name'] === 'table' 3411 ) { 3412 /* Parse error. Act as if an end tag token with the tag name "table" 3413 had been seen, then, if that token wasn't ignored, reprocess the 3414 current token. */ 3415 $this->inTable( 3416 array( 3417 'name' => 'table', 3418 'type' => HTML5::ENDTAG 3419 ) 3420 ); 3421 3422 return $this->mainPhase($token); 3423 3424 /* An end tag whose tag name is "table" */ 3425 } elseif ($token['type'] === HTML5::ENDTAG && 3426 $token['name'] === 'table' 3427 ) { 3428 /* If the stack of open elements does not have an element in table 3429 scope with the same tag name as the token, this is a parse error. 3430 Ignore the token. (innerHTML case) */ 3431 if (!$this->elementInScope($token['name'], true)) { 3432 return false; 3433 3434 /* Otherwise: */ 3435 } else { 3436 /* Generate implied end tags. */ 3437 $this->generateImpliedEndTags(); 3438 3439 /* Now, if the current node is not a table element, then this 3440 is a parse error. */ 3441 // w/e 3442 3443 /* Pop elements from this stack until a table element has been 3444 popped from the stack. */ 3445 while (true) { 3446 $current = end($this->stack)->nodeName; 3447 array_pop($this->stack); 3448 3449 if ($current === 'table') { 3450 break; 3451 } 3452 } 3453 3454 /* Reset the insertion mode appropriately. */ 3455 $this->resetInsertionMode(); 3456 } 3457 3458 /* An end tag whose tag name is one of: "body", "caption", "col", 3459 "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ 3460 } elseif ($token['type'] === HTML5::ENDTAG && in_array( 3461 $token['name'], 3462 array( 3463 'body', 3464 'caption', 3465 'col', 3466 'colgroup', 3467 'html', 3468 'tbody', 3469 'td', 3470 'tfoot', 3471 'th', 3472 'thead', 3473 'tr' 3474 ) 3475 ) 3476 ) { 3477 // Parse error. Ignore the token. 3478 3479 /* Anything else */ 3480 } else { 3481 /* Parse error. Process the token as if the insertion mode was "in 3482 body", with the following exception: */ 3483 3484 /* If the current node is a table, tbody, tfoot, thead, or tr 3485 element, then, whenever a node would be inserted into the current 3486 node, it must instead be inserted into the foster parent element. */ 3487 if (in_array( 3488 end($this->stack)->nodeName, 3489 array('table', 'tbody', 'tfoot', 'thead', 'tr') 3490 ) 3491 ) { 3492 /* The foster parent element is the parent element of the last 3493 table element in the stack of open elements, if there is a 3494 table element and it has such a parent element. If there is no 3495 table element in the stack of open elements (innerHTML case), 3496 then the foster parent element is the first element in the 3497 stack of open elements (the html element). Otherwise, if there 3498 is a table element in the stack of open elements, but the last 3499 table element in the stack of open elements has no parent, or 3500 its parent node is not an element, then the foster parent 3501 element is the element before the last table element in the 3502 stack of open elements. */ 3503 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 3504 if ($this->stack[$n]->nodeName === 'table') { 3505 $table = $this->stack[$n]; 3506 break; 3507 } 3508 } 3509 3510 if (isset($table) && $table->parentNode !== null) { 3511 $this->foster_parent = $table->parentNode; 3512 3513 } elseif (!isset($table)) { 3514 $this->foster_parent = $this->stack[0]; 3515 3516 } elseif (isset($table) && ($table->parentNode === null || 3517 $table->parentNode->nodeType !== XML_ELEMENT_NODE) 3518 ) { 3519 $this->foster_parent = $this->stack[$n - 1]; 3520 } 3521 } 3522 3523 $this->inBody($token); 3524 } 3525 } 3526 3527 private function inCaption($token) 3528 { 3529 /* An end tag whose tag name is "caption" */ 3530 if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') { 3531 /* If the stack of open elements does not have an element in table 3532 scope with the same tag name as the token, this is a parse error. 3533 Ignore the token. (innerHTML case) */ 3534 if (!$this->elementInScope($token['name'], true)) { 3535 // Ignore 3536 3537 /* Otherwise: */ 3538 } else { 3539 /* Generate implied end tags. */ 3540 $this->generateImpliedEndTags(); 3541 3542 /* Now, if the current node is not a caption element, then this 3543 is a parse error. */ 3544 // w/e 3545 3546 /* Pop elements from this stack until a caption element has 3547 been popped from the stack. */ 3548 while (true) { 3549 $node = end($this->stack)->nodeName; 3550 array_pop($this->stack); 3551 3552 if ($node === 'caption') { 3553 break; 3554 } 3555 } 3556 3557 /* Clear the list of active formatting elements up to the last 3558 marker. */ 3559 $this->clearTheActiveFormattingElementsUpToTheLastMarker(); 3560 3561 /* Switch the insertion mode to "in table". */ 3562 $this->mode = self::IN_TABLE; 3563 } 3564 3565 /* A start tag whose tag name is one of: "caption", "col", "colgroup", 3566 "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag 3567 name is "table" */ 3568 } elseif (($token['type'] === HTML5::STARTTAG && in_array( 3569 $token['name'], 3570 array( 3571 'caption', 3572 'col', 3573 'colgroup', 3574 'tbody', 3575 'td', 3576 'tfoot', 3577 'th', 3578 'thead', 3579 'tr' 3580 ) 3581 )) || ($token['type'] === HTML5::ENDTAG && 3582 $token['name'] === 'table') 3583 ) { 3584 /* Parse error. Act as if an end tag with the tag name "caption" 3585 had been seen, then, if that token wasn't ignored, reprocess the 3586 current token. */ 3587 $this->inCaption( 3588 array( 3589 'name' => 'caption', 3590 'type' => HTML5::ENDTAG 3591 ) 3592 ); 3593 3594 return $this->inTable($token); 3595 3596 /* An end tag whose tag name is one of: "body", "col", "colgroup", 3597 "html", "tbody", "td", "tfoot", "th", "thead", "tr" */ 3598 } elseif ($token['type'] === HTML5::ENDTAG && in_array( 3599 $token['name'], 3600 array( 3601 'body', 3602 'col', 3603 'colgroup', 3604 'html', 3605 'tbody', 3606 'tfoot', 3607 'th', 3608 'thead', 3609 'tr' 3610 ) 3611 ) 3612 ) { 3613 // Parse error. Ignore the token. 3614 3615 /* Anything else */ 3616 } else { 3617 /* Process the token as if the insertion mode was "in body". */ 3618 $this->inBody($token); 3619 } 3620 } 3621 3622 private function inColumnGroup($token) 3623 { 3624 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 3625 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 3626 or U+0020 SPACE */ 3627 if ($token['type'] === HTML5::CHARACTR && 3628 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 3629 ) { 3630 /* Append the character to the current node. */ 3631 $text = $this->dom->createTextNode($token['data']); 3632 end($this->stack)->appendChild($text); 3633 3634 /* A comment token */ 3635 } elseif ($token['type'] === HTML5::COMMENT) { 3636 /* Append a Comment node to the current node with the data 3637 attribute set to the data given in the comment token. */ 3638 $comment = $this->dom->createComment($token['data']); 3639 end($this->stack)->appendChild($comment); 3640 3641 /* A start tag whose tag name is "col" */ 3642 } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'col') { 3643 /* Insert a col element for the token. Immediately pop the current 3644 node off the stack of open elements. */ 3645 $this->insertElement($token); 3646 array_pop($this->stack); 3647 3648 /* An end tag whose tag name is "colgroup" */ 3649 } elseif ($token['type'] === HTML5::ENDTAG && 3650 $token['name'] === 'colgroup' 3651 ) { 3652 /* If the current node is the root html element, then this is a 3653 parse error, ignore the token. (innerHTML case) */ 3654 if (end($this->stack)->nodeName === 'html') { 3655 // Ignore 3656 3657 /* Otherwise, pop the current node (which will be a colgroup 3658 element) from the stack of open elements. Switch the insertion 3659 mode to "in table". */ 3660 } else { 3661 array_pop($this->stack); 3662 $this->mode = self::IN_TABLE; 3663 } 3664 3665 /* An end tag whose tag name is "col" */ 3666 } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'col') { 3667 /* Parse error. Ignore the token. */ 3668 3669 /* Anything else */ 3670 } else { 3671 /* Act as if an end tag with the tag name "colgroup" had been seen, 3672 and then, if that token wasn't ignored, reprocess the current token. */ 3673 $this->inColumnGroup( 3674 array( 3675 'name' => 'colgroup', 3676 'type' => HTML5::ENDTAG 3677 ) 3678 ); 3679 3680 return $this->inTable($token); 3681 } 3682 } 3683 3684 private function inTableBody($token) 3685 { 3686 $clear = array('tbody', 'tfoot', 'thead', 'html'); 3687 3688 /* A start tag whose tag name is "tr" */ 3689 if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') { 3690 /* Clear the stack back to a table body context. */ 3691 $this->clearStackToTableContext($clear); 3692 3693 /* Insert a tr element for the token, then switch the insertion 3694 mode to "in row". */ 3695 $this->insertElement($token); 3696 $this->mode = self::IN_ROW; 3697 3698 /* A start tag whose tag name is one of: "th", "td" */ 3699 } elseif ($token['type'] === HTML5::STARTTAG && 3700 ($token['name'] === 'th' || $token['name'] === 'td') 3701 ) { 3702 /* Parse error. Act as if a start tag with the tag name "tr" had 3703 been seen, then reprocess the current token. */ 3704 $this->inTableBody( 3705 array( 3706 'name' => 'tr', 3707 'type' => HTML5::STARTTAG, 3708 'attr' => array() 3709 ) 3710 ); 3711 3712 return $this->inRow($token); 3713 3714 /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ 3715 } elseif ($token['type'] === HTML5::ENDTAG && 3716 in_array($token['name'], array('tbody', 'tfoot', 'thead')) 3717 ) { 3718 /* If the stack of open elements does not have an element in table 3719 scope with the same tag name as the token, this is a parse error. 3720 Ignore the token. */ 3721 if (!$this->elementInScope($token['name'], true)) { 3722 // Ignore 3723 3724 /* Otherwise: */ 3725 } else { 3726 /* Clear the stack back to a table body context. */ 3727 $this->clearStackToTableContext($clear); 3728 3729 /* Pop the current node from the stack of open elements. Switch 3730 the insertion mode to "in table". */ 3731 array_pop($this->stack); 3732 $this->mode = self::IN_TABLE; 3733 } 3734 3735 /* A start tag whose tag name is one of: "caption", "col", "colgroup", 3736 "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */ 3737 } elseif (($token['type'] === HTML5::STARTTAG && in_array( 3738 $token['name'], 3739 array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead') 3740 )) || 3741 ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table') 3742 ) { 3743 /* If the stack of open elements does not have a tbody, thead, or 3744 tfoot element in table scope, this is a parse error. Ignore the 3745 token. (innerHTML case) */ 3746 if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) { 3747 // Ignore. 3748 3749 /* Otherwise: */ 3750 } else { 3751 /* Clear the stack back to a table body context. */ 3752 $this->clearStackToTableContext($clear); 3753 3754 /* Act as if an end tag with the same tag name as the current 3755 node ("tbody", "tfoot", or "thead") had been seen, then 3756 reprocess the current token. */ 3757 $this->inTableBody( 3758 array( 3759 'name' => end($this->stack)->nodeName, 3760 'type' => HTML5::ENDTAG 3761 ) 3762 ); 3763 3764 return $this->mainPhase($token); 3765 } 3766 3767 /* An end tag whose tag name is one of: "body", "caption", "col", 3768 "colgroup", "html", "td", "th", "tr" */ 3769 } elseif ($token['type'] === HTML5::ENDTAG && in_array( 3770 $token['name'], 3771 array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr') 3772 ) 3773 ) { 3774 /* Parse error. Ignore the token. */ 3775 3776 /* Anything else */ 3777 } else { 3778 /* Process the token as if the insertion mode was "in table". */ 3779 $this->inTable($token); 3780 } 3781 } 3782 3783 private function inRow($token) 3784 { 3785 $clear = array('tr', 'html'); 3786 3787 /* A start tag whose tag name is one of: "th", "td" */ 3788 if ($token['type'] === HTML5::STARTTAG && 3789 ($token['name'] === 'th' || $token['name'] === 'td') 3790 ) { 3791 /* Clear the stack back to a table row context. */ 3792 $this->clearStackToTableContext($clear); 3793 3794 /* Insert an HTML element for the token, then switch the insertion 3795 mode to "in cell". */ 3796 $this->insertElement($token); 3797 $this->mode = self::IN_CELL; 3798 3799 /* Insert a marker at the end of the list of active formatting 3800 elements. */ 3801 $this->a_formatting[] = self::MARKER; 3802 3803 /* An end tag whose tag name is "tr" */ 3804 } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') { 3805 /* If the stack of open elements does not have an element in table 3806 scope with the same tag name as the token, this is a parse error. 3807 Ignore the token. (innerHTML case) */ 3808 if (!$this->elementInScope($token['name'], true)) { 3809 // Ignore. 3810 3811 /* Otherwise: */ 3812 } else { 3813 /* Clear the stack back to a table row context. */ 3814 $this->clearStackToTableContext($clear); 3815 3816 /* Pop the current node (which will be a tr element) from the 3817 stack of open elements. Switch the insertion mode to "in table 3818 body". */ 3819 array_pop($this->stack); 3820 $this->mode = self::IN_TBODY; 3821 } 3822 3823 /* A start tag whose tag name is one of: "caption", "col", "colgroup", 3824 "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */ 3825 } elseif ($token['type'] === HTML5::STARTTAG && in_array( 3826 $token['name'], 3827 array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr') 3828 ) 3829 ) { 3830 /* Act as if an end tag with the tag name "tr" had been seen, then, 3831 if that token wasn't ignored, reprocess the current token. */ 3832 $this->inRow( 3833 array( 3834 'name' => 'tr', 3835 'type' => HTML5::ENDTAG 3836 ) 3837 ); 3838 3839 return $this->inCell($token); 3840 3841 /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */ 3842 } elseif ($token['type'] === HTML5::ENDTAG && 3843 in_array($token['name'], array('tbody', 'tfoot', 'thead')) 3844 ) { 3845 /* If the stack of open elements does not have an element in table 3846 scope with the same tag name as the token, this is a parse error. 3847 Ignore the token. */ 3848 if (!$this->elementInScope($token['name'], true)) { 3849 // Ignore. 3850 3851 /* Otherwise: */ 3852 } else { 3853 /* Otherwise, act as if an end tag with the tag name "tr" had 3854 been seen, then reprocess the current token. */ 3855 $this->inRow( 3856 array( 3857 'name' => 'tr', 3858 'type' => HTML5::ENDTAG 3859 ) 3860 ); 3861 3862 return $this->inCell($token); 3863 } 3864 3865 /* An end tag whose tag name is one of: "body", "caption", "col", 3866 "colgroup", "html", "td", "th" */ 3867 } elseif ($token['type'] === HTML5::ENDTAG && in_array( 3868 $token['name'], 3869 array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr') 3870 ) 3871 ) { 3872 /* Parse error. Ignore the token. */ 3873 3874 /* Anything else */ 3875 } else { 3876 /* Process the token as if the insertion mode was "in table". */ 3877 $this->inTable($token); 3878 } 3879 } 3880 3881 private function inCell($token) 3882 { 3883 /* An end tag whose tag name is one of: "td", "th" */ 3884 if ($token['type'] === HTML5::ENDTAG && 3885 ($token['name'] === 'td' || $token['name'] === 'th') 3886 ) { 3887 /* If the stack of open elements does not have an element in table 3888 scope with the same tag name as that of the token, then this is a 3889 parse error and the token must be ignored. */ 3890 if (!$this->elementInScope($token['name'], true)) { 3891 // Ignore. 3892 3893 /* Otherwise: */ 3894 } else { 3895 /* Generate implied end tags, except for elements with the same 3896 tag name as the token. */ 3897 $this->generateImpliedEndTags(array($token['name'])); 3898 3899 /* Now, if the current node is not an element with the same tag 3900 name as the token, then this is a parse error. */ 3901 // k 3902 3903 /* Pop elements from this stack until an element with the same 3904 tag name as the token has been popped from the stack. */ 3905 while (true) { 3906 $node = end($this->stack)->nodeName; 3907 array_pop($this->stack); 3908 3909 if ($node === $token['name']) { 3910 break; 3911 } 3912 } 3913 3914 /* Clear the list of active formatting elements up to the last 3915 marker. */ 3916 $this->clearTheActiveFormattingElementsUpToTheLastMarker(); 3917 3918 /* Switch the insertion mode to "in row". (The current node 3919 will be a tr element at this point.) */ 3920 $this->mode = self::IN_ROW; 3921 } 3922 3923 /* A start tag whose tag name is one of: "caption", "col", "colgroup", 3924 "tbody", "td", "tfoot", "th", "thead", "tr" */ 3925 } elseif ($token['type'] === HTML5::STARTTAG && in_array( 3926 $token['name'], 3927 array( 3928 'caption', 3929 'col', 3930 'colgroup', 3931 'tbody', 3932 'td', 3933 'tfoot', 3934 'th', 3935 'thead', 3936 'tr' 3937 ) 3938 ) 3939 ) { 3940 /* If the stack of open elements does not have a td or th element 3941 in table scope, then this is a parse error; ignore the token. 3942 (innerHTML case) */ 3943 if (!$this->elementInScope(array('td', 'th'), true)) { 3944 // Ignore. 3945 3946 /* Otherwise, close the cell (see below) and reprocess the current 3947 token. */ 3948 } else { 3949 $this->closeCell(); 3950 return $this->inRow($token); 3951 } 3952 3953 /* A start tag whose tag name is one of: "caption", "col", "colgroup", 3954 "tbody", "td", "tfoot", "th", "thead", "tr" */ 3955 } elseif ($token['type'] === HTML5::STARTTAG && in_array( 3956 $token['name'], 3957 array( 3958 'caption', 3959 'col', 3960 'colgroup', 3961 'tbody', 3962 'td', 3963 'tfoot', 3964 'th', 3965 'thead', 3966 'tr' 3967 ) 3968 ) 3969 ) { 3970 /* If the stack of open elements does not have a td or th element 3971 in table scope, then this is a parse error; ignore the token. 3972 (innerHTML case) */ 3973 if (!$this->elementInScope(array('td', 'th'), true)) { 3974 // Ignore. 3975 3976 /* Otherwise, close the cell (see below) and reprocess the current 3977 token. */ 3978 } else { 3979 $this->closeCell(); 3980 return $this->inRow($token); 3981 } 3982 3983 /* An end tag whose tag name is one of: "body", "caption", "col", 3984 "colgroup", "html" */ 3985 } elseif ($token['type'] === HTML5::ENDTAG && in_array( 3986 $token['name'], 3987 array('body', 'caption', 'col', 'colgroup', 'html') 3988 ) 3989 ) { 3990 /* Parse error. Ignore the token. */ 3991 3992 /* An end tag whose tag name is one of: "table", "tbody", "tfoot", 3993 "thead", "tr" */ 3994 } elseif ($token['type'] === HTML5::ENDTAG && in_array( 3995 $token['name'], 3996 array('table', 'tbody', 'tfoot', 'thead', 'tr') 3997 ) 3998 ) { 3999 /* If the stack of open elements does not have an element in table 4000 scope with the same tag name as that of the token (which can only 4001 happen for "tbody", "tfoot" and "thead", or, in the innerHTML case), 4002 then this is a parse error and the token must be ignored. */ 4003 if (!$this->elementInScope($token['name'], true)) { 4004 // Ignore. 4005 4006 /* Otherwise, close the cell (see below) and reprocess the current 4007 token. */ 4008 } else { 4009 $this->closeCell(); 4010 return $this->inRow($token); 4011 } 4012 4013 /* Anything else */ 4014 } else { 4015 /* Process the token as if the insertion mode was "in body". */ 4016 $this->inBody($token); 4017 } 4018 } 4019 4020 private function inSelect($token) 4021 { 4022 /* Handle the token as follows: */ 4023 4024 /* A character token */ 4025 if ($token['type'] === HTML5::CHARACTR) { 4026 /* Append the token's character to the current node. */ 4027 $this->insertText($token['data']); 4028 4029 /* A comment token */ 4030 } elseif ($token['type'] === HTML5::COMMENT) { 4031 /* Append a Comment node to the current node with the data 4032 attribute set to the data given in the comment token. */ 4033 $this->insertComment($token['data']); 4034 4035 /* A start tag token whose tag name is "option" */ 4036 } elseif ($token['type'] === HTML5::STARTTAG && 4037 $token['name'] === 'option' 4038 ) { 4039 /* If the current node is an option element, act as if an end tag 4040 with the tag name "option" had been seen. */ 4041 if (end($this->stack)->nodeName === 'option') { 4042 $this->inSelect( 4043 array( 4044 'name' => 'option', 4045 'type' => HTML5::ENDTAG 4046 ) 4047 ); 4048 } 4049 4050 /* Insert an HTML element for the token. */ 4051 $this->insertElement($token); 4052 4053 /* A start tag token whose tag name is "optgroup" */ 4054 } elseif ($token['type'] === HTML5::STARTTAG && 4055 $token['name'] === 'optgroup' 4056 ) { 4057 /* If the current node is an option element, act as if an end tag 4058 with the tag name "option" had been seen. */ 4059 if (end($this->stack)->nodeName === 'option') { 4060 $this->inSelect( 4061 array( 4062 'name' => 'option', 4063 'type' => HTML5::ENDTAG 4064 ) 4065 ); 4066 } 4067 4068 /* If the current node is an optgroup element, act as if an end tag 4069 with the tag name "optgroup" had been seen. */ 4070 if (end($this->stack)->nodeName === 'optgroup') { 4071 $this->inSelect( 4072 array( 4073 'name' => 'optgroup', 4074 'type' => HTML5::ENDTAG 4075 ) 4076 ); 4077 } 4078 4079 /* Insert an HTML element for the token. */ 4080 $this->insertElement($token); 4081 4082 /* An end tag token whose tag name is "optgroup" */ 4083 } elseif ($token['type'] === HTML5::ENDTAG && 4084 $token['name'] === 'optgroup' 4085 ) { 4086 /* First, if the current node is an option element, and the node 4087 immediately before it in the stack of open elements is an optgroup 4088 element, then act as if an end tag with the tag name "option" had 4089 been seen. */ 4090 $elements_in_stack = count($this->stack); 4091 4092 if ($this->stack[$elements_in_stack - 1]->nodeName === 'option' && 4093 $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup' 4094 ) { 4095 $this->inSelect( 4096 array( 4097 'name' => 'option', 4098 'type' => HTML5::ENDTAG 4099 ) 4100 ); 4101 } 4102 4103 /* If the current node is an optgroup element, then pop that node 4104 from the stack of open elements. Otherwise, this is a parse error, 4105 ignore the token. */ 4106 if ($this->stack[$elements_in_stack - 1] === 'optgroup') { 4107 array_pop($this->stack); 4108 } 4109 4110 /* An end tag token whose tag name is "option" */ 4111 } elseif ($token['type'] === HTML5::ENDTAG && 4112 $token['name'] === 'option' 4113 ) { 4114 /* If the current node is an option element, then pop that node 4115 from the stack of open elements. Otherwise, this is a parse error, 4116 ignore the token. */ 4117 if (end($this->stack)->nodeName === 'option') { 4118 array_pop($this->stack); 4119 } 4120 4121 /* An end tag whose tag name is "select" */ 4122 } elseif ($token['type'] === HTML5::ENDTAG && 4123 $token['name'] === 'select' 4124 ) { 4125 /* If the stack of open elements does not have an element in table 4126 scope with the same tag name as the token, this is a parse error. 4127 Ignore the token. (innerHTML case) */ 4128 if (!$this->elementInScope($token['name'], true)) { 4129 // w/e 4130 4131 /* Otherwise: */ 4132 } else { 4133 /* Pop elements from the stack of open elements until a select 4134 element has been popped from the stack. */ 4135 while (true) { 4136 $current = end($this->stack)->nodeName; 4137 array_pop($this->stack); 4138 4139 if ($current === 'select') { 4140 break; 4141 } 4142 } 4143 4144 /* Reset the insertion mode appropriately. */ 4145 $this->resetInsertionMode(); 4146 } 4147 4148 /* A start tag whose tag name is "select" */ 4149 } elseif ($token['name'] === 'select' && 4150 $token['type'] === HTML5::STARTTAG 4151 ) { 4152 /* Parse error. Act as if the token had been an end tag with the 4153 tag name "select" instead. */ 4154 $this->inSelect( 4155 array( 4156 'name' => 'select', 4157 'type' => HTML5::ENDTAG 4158 ) 4159 ); 4160 4161 /* An end tag whose tag name is one of: "caption", "table", "tbody", 4162 "tfoot", "thead", "tr", "td", "th" */ 4163 } elseif (in_array( 4164 $token['name'], 4165 array( 4166 'caption', 4167 'table', 4168 'tbody', 4169 'tfoot', 4170 'thead', 4171 'tr', 4172 'td', 4173 'th' 4174 ) 4175 ) && $token['type'] === HTML5::ENDTAG 4176 ) { 4177 /* Parse error. */ 4178 // w/e 4179 4180 /* If the stack of open elements has an element in table scope with 4181 the same tag name as that of the token, then act as if an end tag 4182 with the tag name "select" had been seen, and reprocess the token. 4183 Otherwise, ignore the token. */ 4184 if ($this->elementInScope($token['name'], true)) { 4185 $this->inSelect( 4186 array( 4187 'name' => 'select', 4188 'type' => HTML5::ENDTAG 4189 ) 4190 ); 4191 4192 $this->mainPhase($token); 4193 } 4194 4195 /* Anything else */ 4196 } else { 4197 /* Parse error. Ignore the token. */ 4198 } 4199 } 4200 4201 private function afterBody($token) 4202 { 4203 /* Handle the token as follows: */ 4204 4205 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 4206 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 4207 or U+0020 SPACE */ 4208 if ($token['type'] === HTML5::CHARACTR && 4209 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 4210 ) { 4211 /* Process the token as it would be processed if the insertion mode 4212 was "in body". */ 4213 $this->inBody($token); 4214 4215 /* A comment token */ 4216 } elseif ($token['type'] === HTML5::COMMENT) { 4217 /* Append a Comment node to the first element in the stack of open 4218 elements (the html element), with the data attribute set to the 4219 data given in the comment token. */ 4220 $comment = $this->dom->createComment($token['data']); 4221 $this->stack[0]->appendChild($comment); 4222 4223 /* An end tag with the tag name "html" */ 4224 } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') { 4225 /* If the parser was originally created in order to handle the 4226 setting of an element's innerHTML attribute, this is a parse error; 4227 ignore the token. (The element will be an html element in this 4228 case.) (innerHTML case) */ 4229 4230 /* Otherwise, switch to the trailing end phase. */ 4231 $this->phase = self::END_PHASE; 4232 4233 /* Anything else */ 4234 } else { 4235 /* Parse error. Set the insertion mode to "in body" and reprocess 4236 the token. */ 4237 $this->mode = self::IN_BODY; 4238 return $this->inBody($token); 4239 } 4240 } 4241 4242 private function inFrameset($token) 4243 { 4244 /* Handle the token as follows: */ 4245 4246 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 4247 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 4248 U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ 4249 if ($token['type'] === HTML5::CHARACTR && 4250 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 4251 ) { 4252 /* Append the character to the current node. */ 4253 $this->insertText($token['data']); 4254 4255 /* A comment token */ 4256 } elseif ($token['type'] === HTML5::COMMENT) { 4257 /* Append a Comment node to the current node with the data 4258 attribute set to the data given in the comment token. */ 4259 $this->insertComment($token['data']); 4260 4261 /* A start tag with the tag name "frameset" */ 4262 } elseif ($token['name'] === 'frameset' && 4263 $token['type'] === HTML5::STARTTAG 4264 ) { 4265 $this->insertElement($token); 4266 4267 /* An end tag with the tag name "frameset" */ 4268 } elseif ($token['name'] === 'frameset' && 4269 $token['type'] === HTML5::ENDTAG 4270 ) { 4271 /* If the current node is the root html element, then this is a 4272 parse error; ignore the token. (innerHTML case) */ 4273 if (end($this->stack)->nodeName === 'html') { 4274 // Ignore 4275 4276 } else { 4277 /* Otherwise, pop the current node from the stack of open 4278 elements. */ 4279 array_pop($this->stack); 4280 4281 /* If the parser was not originally created in order to handle 4282 the setting of an element's innerHTML attribute (innerHTML case), 4283 and the current node is no longer a frameset element, then change 4284 the insertion mode to "after frameset". */ 4285 $this->mode = self::AFTR_FRAME; 4286 } 4287 4288 /* A start tag with the tag name "frame" */ 4289 } elseif ($token['name'] === 'frame' && 4290 $token['type'] === HTML5::STARTTAG 4291 ) { 4292 /* Insert an HTML element for the token. */ 4293 $this->insertElement($token); 4294 4295 /* Immediately pop the current node off the stack of open elements. */ 4296 array_pop($this->stack); 4297 4298 /* A start tag with the tag name "noframes" */ 4299 } elseif ($token['name'] === 'noframes' && 4300 $token['type'] === HTML5::STARTTAG 4301 ) { 4302 /* Process the token as if the insertion mode had been "in body". */ 4303 $this->inBody($token); 4304 4305 /* Anything else */ 4306 } else { 4307 /* Parse error. Ignore the token. */ 4308 } 4309 } 4310 4311 private function afterFrameset($token) 4312 { 4313 /* Handle the token as follows: */ 4314 4315 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 4316 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 4317 U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */ 4318 if ($token['type'] === HTML5::CHARACTR && 4319 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 4320 ) { 4321 /* Append the character to the current node. */ 4322 $this->insertText($token['data']); 4323 4324 /* A comment token */ 4325 } elseif ($token['type'] === HTML5::COMMENT) { 4326 /* Append a Comment node to the current node with the data 4327 attribute set to the data given in the comment token. */ 4328 $this->insertComment($token['data']); 4329 4330 /* An end tag with the tag name "html" */ 4331 } elseif ($token['name'] === 'html' && 4332 $token['type'] === HTML5::ENDTAG 4333 ) { 4334 /* Switch to the trailing end phase. */ 4335 $this->phase = self::END_PHASE; 4336 4337 /* A start tag with the tag name "noframes" */ 4338 } elseif ($token['name'] === 'noframes' && 4339 $token['type'] === HTML5::STARTTAG 4340 ) { 4341 /* Process the token as if the insertion mode had been "in body". */ 4342 $this->inBody($token); 4343 4344 /* Anything else */ 4345 } else { 4346 /* Parse error. Ignore the token. */ 4347 } 4348 } 4349 4350 private function trailingEndPhase($token) 4351 { 4352 /* After the main phase, as each token is emitted from the tokenisation 4353 stage, it must be processed as described in this section. */ 4354 4355 /* A DOCTYPE token */ 4356 if ($token['type'] === HTML5::DOCTYPE) { 4357 // Parse error. Ignore the token. 4358 4359 /* A comment token */ 4360 } elseif ($token['type'] === HTML5::COMMENT) { 4361 /* Append a Comment node to the Document object with the data 4362 attribute set to the data given in the comment token. */ 4363 $comment = $this->dom->createComment($token['data']); 4364 $this->dom->appendChild($comment); 4365 4366 /* A character token that is one of one of U+0009 CHARACTER TABULATION, 4367 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 4368 or U+0020 SPACE */ 4369 } elseif ($token['type'] === HTML5::CHARACTR && 4370 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']) 4371 ) { 4372 /* Process the token as it would be processed in the main phase. */ 4373 $this->mainPhase($token); 4374 4375 /* A character token that is not one of U+0009 CHARACTER TABULATION, 4376 U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF), 4377 or U+0020 SPACE. Or a start tag token. Or an end tag token. */ 4378 } elseif (($token['type'] === HTML5::CHARACTR && 4379 preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || 4380 $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG 4381 ) { 4382 /* Parse error. Switch back to the main phase and reprocess the 4383 token. */ 4384 $this->phase = self::MAIN_PHASE; 4385 return $this->mainPhase($token); 4386 4387 /* An end-of-file token */ 4388 } elseif ($token['type'] === HTML5::EOF) { 4389 /* OMG DONE!! */ 4390 } 4391 } 4392 4393 private function insertElement($token, $append = true, $check = false) 4394 { 4395 // Proprietary workaround for libxml2's limitations with tag names 4396 if ($check) { 4397 // Slightly modified HTML5 tag-name modification, 4398 // removing anything that's not an ASCII letter, digit, or hyphen 4399 $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']); 4400 // Remove leading hyphens and numbers 4401 $token['name'] = ltrim($token['name'], '-0..9'); 4402 // In theory, this should ever be needed, but just in case 4403 if ($token['name'] === '') { 4404 $token['name'] = 'span'; 4405 } // arbitrary generic choice 4406 } 4407 4408 $el = $this->dom->createElement($token['name']); 4409 4410 foreach ($token['attr'] as $attr) { 4411 if (!$el->hasAttribute($attr['name'])) { 4412 $el->setAttribute($attr['name'], $attr['value']); 4413 } 4414 } 4415 4416 $this->appendToRealParent($el); 4417 $this->stack[] = $el; 4418 4419 return $el; 4420 } 4421 4422 private function insertText($data) 4423 { 4424 $text = $this->dom->createTextNode($data); 4425 $this->appendToRealParent($text); 4426 } 4427 4428 private function insertComment($data) 4429 { 4430 $comment = $this->dom->createComment($data); 4431 $this->appendToRealParent($comment); 4432 } 4433 4434 private function appendToRealParent($node) 4435 { 4436 if ($this->foster_parent === null) { 4437 end($this->stack)->appendChild($node); 4438 4439 } elseif ($this->foster_parent !== null) { 4440 /* If the foster parent element is the parent element of the 4441 last table element in the stack of open elements, then the new 4442 node must be inserted immediately before the last table element 4443 in the stack of open elements in the foster parent element; 4444 otherwise, the new node must be appended to the foster parent 4445 element. */ 4446 for ($n = count($this->stack) - 1; $n >= 0; $n--) { 4447 if ($this->stack[$n]->nodeName === 'table' && 4448 $this->stack[$n]->parentNode !== null 4449 ) { 4450 $table = $this->stack[$n]; 4451 break; 4452 } 4453 } 4454 4455 if (isset($table) && $this->foster_parent->isSameNode($table->parentNode)) { 4456 $this->foster_parent->insertBefore($node, $table); 4457 } else { 4458 $this->foster_parent->appendChild($node); 4459 } 4460 4461 $this->foster_parent = null; 4462 } 4463 } 4464 4465 private function elementInScope($el, $table = false) 4466 { 4467 if (is_array($el)) { 4468 foreach ($el as $element) { 4469 if ($this->elementInScope($element, $table)) { 4470 return true; 4471 } 4472 } 4473 4474 return false; 4475 } 4476 4477 $leng = count($this->stack); 4478 4479 for ($n = 0; $n < $leng; $n++) { 4480 /* 1. Initialise node to be the current node (the bottommost node of 4481 the stack). */ 4482 $node = $this->stack[$leng - 1 - $n]; 4483 4484 if ($node->tagName === $el) { 4485 /* 2. If node is the target node, terminate in a match state. */ 4486 return true; 4487 4488 } elseif ($node->tagName === 'table') { 4489 /* 3. Otherwise, if node is a table element, terminate in a failure 4490 state. */ 4491 return false; 4492 4493 } elseif ($table === true && in_array( 4494 $node->tagName, 4495 array( 4496 'caption', 4497 'td', 4498 'th', 4499 'button', 4500 'marquee', 4501 'object' 4502 ) 4503 ) 4504 ) { 4505 /* 4. Otherwise, if the algorithm is the "has an element in scope" 4506 variant (rather than the "has an element in table scope" variant), 4507 and node is one of the following, terminate in a failure state. */ 4508 return false; 4509 4510 } elseif ($node === $node->ownerDocument->documentElement) { 4511 /* 5. Otherwise, if node is an html element (root element), terminate 4512 in a failure state. (This can only happen if the node is the topmost 4513 node of the stack of open elements, and prevents the next step from 4514 being invoked if there are no more elements in the stack.) */ 4515 return false; 4516 } 4517 4518 /* Otherwise, set node to the previous entry in the stack of open 4519 elements and return to step 2. (This will never fail, since the loop 4520 will always terminate in the previous step if the top of the stack 4521 is reached.) */ 4522 } 4523 } 4524 4525 private function reconstructActiveFormattingElements() 4526 { 4527 /* 1. If there are no entries in the list of active formatting elements, 4528 then there is nothing to reconstruct; stop this algorithm. */ 4529 $formatting_elements = count($this->a_formatting); 4530 4531 if ($formatting_elements === 0) { 4532 return false; 4533 } 4534 4535 /* 3. Let entry be the last (most recently added) element in the list 4536 of active formatting elements. */ 4537 $entry = end($this->a_formatting); 4538 4539 /* 2. If the last (most recently added) entry in the list of active 4540 formatting elements is a marker, or if it is an element that is in the 4541 stack of open elements, then there is nothing to reconstruct; stop this 4542 algorithm. */ 4543 if ($entry === self::MARKER || in_array($entry, $this->stack, true)) { 4544 return false; 4545 } 4546 4547 for ($a = $formatting_elements - 1; $a >= 0; true) { 4548 /* 4. If there are no entries before entry in the list of active 4549 formatting elements, then jump to step 8. */ 4550 if ($a === 0) { 4551 $step_seven = false; 4552 break; 4553 } 4554 4555 /* 5. Let entry be the entry one earlier than entry in the list of 4556 active formatting elements. */ 4557 $a--; 4558 $entry = $this->a_formatting[$a]; 4559 4560 /* 6. If entry is neither a marker nor an element that is also in 4561 thetack of open elements, go to step 4. */ 4562 if ($entry === self::MARKER || in_array($entry, $this->stack, true)) { 4563 break; 4564 } 4565 } 4566 4567 while (true) { 4568 /* 7. Let entry be the element one later than entry in the list of 4569 active formatting elements. */ 4570 if (isset($step_seven) && $step_seven === true) { 4571 $a++; 4572 $entry = $this->a_formatting[$a]; 4573 } 4574 4575 /* 8. Perform a shallow clone of the element entry to obtain clone. */ 4576 $clone = $entry->cloneNode(); 4577 4578 /* 9. Append clone to the current node and push it onto the stack 4579 of open elements so that it is the new current node. */ 4580 end($this->stack)->appendChild($clone); 4581 $this->stack[] = $clone; 4582 4583 /* 10. Replace the entry for entry in the list with an entry for 4584 clone. */ 4585 $this->a_formatting[$a] = $clone; 4586 4587 /* 11. If the entry for clone in the list of active formatting 4588 elements is not the last entry in the list, return to step 7. */ 4589 if (end($this->a_formatting) !== $clone) { 4590 $step_seven = true; 4591 } else { 4592 break; 4593 } 4594 } 4595 } 4596 4597 private function clearTheActiveFormattingElementsUpToTheLastMarker() 4598 { 4599 /* When the steps below require the UA to clear the list of active 4600 formatting elements up to the last marker, the UA must perform the 4601 following steps: */ 4602 4603 while (true) { 4604 /* 1. Let entry be the last (most recently added) entry in the list 4605 of active formatting elements. */ 4606 $entry = end($this->a_formatting); 4607 4608 /* 2. Remove entry from the list of active formatting elements. */ 4609 array_pop($this->a_formatting); 4610 4611 /* 3. If entry was a marker, then stop the algorithm at this point. 4612 The list has been cleared up to the last marker. */ 4613 if ($entry === self::MARKER) { 4614 break; 4615 } 4616 } 4617 } 4618 4619 private function generateImpliedEndTags($exclude = array()) 4620 { 4621 /* When the steps below require the UA to generate implied end tags, 4622 then, if the current node is a dd element, a dt element, an li element, 4623 a p element, a td element, a th element, or a tr element, the UA must 4624 act as if an end tag with the respective tag name had been seen and 4625 then generate implied end tags again. */ 4626 $node = end($this->stack); 4627 $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude); 4628 4629 while (in_array(end($this->stack)->nodeName, $elements)) { 4630 array_pop($this->stack); 4631 } 4632 } 4633 4634 private function getElementCategory($node) 4635 { 4636 $name = $node->tagName; 4637 if (in_array($name, $this->special)) { 4638 return self::SPECIAL; 4639 } elseif (in_array($name, $this->scoping)) { 4640 return self::SCOPING; 4641 } elseif (in_array($name, $this->formatting)) { 4642 return self::FORMATTING; 4643 } else { 4644 return self::PHRASING; 4645 } 4646 } 4647 4648 private function clearStackToTableContext($elements) 4649 { 4650 /* When the steps above require the UA to clear the stack back to a 4651 table context, it means that the UA must, while the current node is not 4652 a table element or an html element, pop elements from the stack of open 4653 elements. If this causes any elements to be popped from the stack, then 4654 this is a parse error. */ 4655 while (true) { 4656 $node = end($this->stack)->nodeName; 4657 4658 if (in_array($node, $elements)) { 4659 break; 4660 } else { 4661 array_pop($this->stack); 4662 } 4663 } 4664 } 4665 4666 private function resetInsertionMode() 4667 { 4668 /* 1. Let last be false. */ 4669 $last = false; 4670 $leng = count($this->stack); 4671 4672 for ($n = $leng - 1; $n >= 0; $n--) { 4673 /* 2. Let node be the last node in the stack of open elements. */ 4674 $node = $this->stack[$n]; 4675 4676 /* 3. If node is the first node in the stack of open elements, then 4677 set last to true. If the element whose innerHTML attribute is being 4678 set is neither a td element nor a th element, then set node to the 4679 element whose innerHTML attribute is being set. (innerHTML case) */ 4680 if ($this->stack[0]->isSameNode($node)) { 4681 $last = true; 4682 } 4683 4684 /* 4. If node is a select element, then switch the insertion mode to 4685 "in select" and abort these steps. (innerHTML case) */ 4686 if ($node->nodeName === 'select') { 4687 $this->mode = self::IN_SELECT; 4688 break; 4689 4690 /* 5. If node is a td or th element, then switch the insertion mode 4691 to "in cell" and abort these steps. */ 4692 } elseif ($node->nodeName === 'td' || $node->nodeName === 'th') { 4693 $this->mode = self::IN_CELL; 4694 break; 4695 4696 /* 6. If node is a tr element, then switch the insertion mode to 4697 "in row" and abort these steps. */ 4698 } elseif ($node->nodeName === 'tr') { 4699 $this->mode = self::IN_ROW; 4700 break; 4701 4702 /* 7. If node is a tbody, thead, or tfoot element, then switch the 4703 insertion mode to "in table body" and abort these steps. */ 4704 } elseif (in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) { 4705 $this->mode = self::IN_TBODY; 4706 break; 4707 4708 /* 8. If node is a caption element, then switch the insertion mode 4709 to "in caption" and abort these steps. */ 4710 } elseif ($node->nodeName === 'caption') { 4711 $this->mode = self::IN_CAPTION; 4712 break; 4713 4714 /* 9. If node is a colgroup element, then switch the insertion mode 4715 to "in column group" and abort these steps. (innerHTML case) */ 4716 } elseif ($node->nodeName === 'colgroup') { 4717 $this->mode = self::IN_CGROUP; 4718 break; 4719 4720 /* 10. If node is a table element, then switch the insertion mode 4721 to "in table" and abort these steps. */ 4722 } elseif ($node->nodeName === 'table') { 4723 $this->mode = self::IN_TABLE; 4724 break; 4725 4726 /* 11. If node is a head element, then switch the insertion mode 4727 to "in body" ("in body"! not "in head"!) and abort these steps. 4728 (innerHTML case) */ 4729 } elseif ($node->nodeName === 'head') { 4730 $this->mode = self::IN_BODY; 4731 break; 4732 4733 /* 12. If node is a body element, then switch the insertion mode to 4734 "in body" and abort these steps. */ 4735 } elseif ($node->nodeName === 'body') { 4736 $this->mode = self::IN_BODY; 4737 break; 4738 4739 /* 13. If node is a frameset element, then switch the insertion 4740 mode to "in frameset" and abort these steps. (innerHTML case) */ 4741 } elseif ($node->nodeName === 'frameset') { 4742 $this->mode = self::IN_FRAME; 4743 break; 4744 4745 /* 14. If node is an html element, then: if the head element 4746 pointer is null, switch the insertion mode to "before head", 4747 otherwise, switch the insertion mode to "after head". In either 4748 case, abort these steps. (innerHTML case) */ 4749 } elseif ($node->nodeName === 'html') { 4750 $this->mode = ($this->head_pointer === null) 4751 ? self::BEFOR_HEAD 4752 : self::AFTER_HEAD; 4753 4754 break; 4755 4756 /* 15. If last is true, then set the insertion mode to "in body" 4757 and abort these steps. (innerHTML case) */ 4758 } elseif ($last) { 4759 $this->mode = self::IN_BODY; 4760 break; 4761 } 4762 } 4763 } 4764 4765 private function closeCell() 4766 { 4767 /* If the stack of open elements has a td or th element in table scope, 4768 then act as if an end tag token with that tag name had been seen. */ 4769 foreach (array('td', 'th') as $cell) { 4770 if ($this->elementInScope($cell, true)) { 4771 $this->inCell( 4772 array( 4773 'name' => $cell, 4774 'type' => HTML5::ENDTAG 4775 ) 4776 ); 4777 4778 break; 4779 } 4780 } 4781 } 4782 4783 public function save() 4784 { 4785 return $this->dom; 4786 } 4787 }
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Aug 11 10:00:09 2016 | Cross-referenced by PHPXref 0.7.1 |