[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/lib/htmlpurifier/HTMLPurifier/Lexer/ -> PH5P.php (source)

   1  <?php
   2  
   3  /**
   4   * Experimental HTML5-based parser using Jeroen van der Meer's PH5P library.
   5   * Occupies space in the HTML5 pseudo-namespace, which may cause conflicts.
   6   *
   7   * @note
   8   *    Recent changes to PHP's DOM extension have resulted in some fatal
   9   *    error conditions with the original version of PH5P. Pending changes,
  10   *    this lexer will punt to DirectLex if DOM throws an exception.
  11   */
  12  
  13  class HTMLPurifier_Lexer_PH5P extends HTMLPurifier_Lexer_DOMLex
  14  {
  15      /**
  16       * @param string $html
  17       * @param HTMLPurifier_Config $config
  18       * @param HTMLPurifier_Context $context
  19       * @return HTMLPurifier_Token[]
  20       */
  21      public function tokenizeHTML($html, $config, $context)
  22      {
  23          $new_html = $this->normalize($html, $config, $context);
  24          $new_html = $this->wrapHTML($new_html, $config, $context);
  25          try {
  26              $parser = new HTML5($new_html);
  27              $doc = $parser->save();
  28          } catch (DOMException $e) {
  29              // Uh oh, it failed. Punt to DirectLex.
  30              $lexer = new HTMLPurifier_Lexer_DirectLex();
  31              $context->register('PH5PError', $e); // save the error, so we can detect it
  32              return $lexer->tokenizeHTML($html, $config, $context); // use original HTML
  33          }
  34          $tokens = array();
  35          $this->tokenizeDOM(
  36              $doc->getElementsByTagName('html')->item(0)-> // <html>
  37                  getElementsByTagName('body')->item(0) //   <body>
  38              ,
  39              $tokens
  40          );
  41          return $tokens;
  42      }
  43  }
  44  
  45  /*
  46  
  47  Copyright 2007 Jeroen van der Meer <http://jero.net/>
  48  
  49  Permission is hereby granted, free of charge, to any person obtaining a
  50  copy of this software and associated documentation files (the
  51  "Software"), to deal in the Software without restriction, including
  52  without limitation the rights to use, copy, modify, merge, publish,
  53  distribute, sublicense, and/or sell copies of the Software, and to
  54  permit persons to whom the Software is furnished to do so, subject to
  55  the following conditions:
  56  
  57  The above copyright notice and this permission notice shall be included
  58  in all copies or substantial portions of the Software.
  59  
  60  THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
  61  OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
  62  MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
  63  IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
  64  CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
  65  TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
  66  SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  67  
  68  */
  69  
  70  class HTML5
  71  {
    /** Raw input string being tokenized (set by the constructor). */
    private $data;
    /** Index into $data of the current input character; starts at -1. */
    private $char;
    /** strlen($data); a position equal to this value is treated as end of input. */
    private $EOF;
    /** Name of the current tokenizer state; "<name>State" is the handler method. Null stops the loop. */
    private $state;
    /** Tree constructor (HTML5TreeConstructer) that receives tokens and builds the document. */
    private $tree;
    /** Tag token currently being assembled by the tag/attribute states. */
    private $token;
    /** Current content model flag: one of PCDATA, RCDATA, CDATA or PLAINTEXT. */
    private $content_model;
    /** Escape flag consulted by the data state while in RCDATA/CDATA content. */
    private $escape = false;
  80      private $entities = array(
  81          'AElig;',
  82          'AElig',
  83          'AMP;',
  84          'AMP',
  85          'Aacute;',
  86          'Aacute',
  87          'Acirc;',
  88          'Acirc',
  89          'Agrave;',
  90          'Agrave',
  91          'Alpha;',
  92          'Aring;',
  93          'Aring',
  94          'Atilde;',
  95          'Atilde',
  96          'Auml;',
  97          'Auml',
  98          'Beta;',
  99          'COPY;',
 100          'COPY',
 101          'Ccedil;',
 102          'Ccedil',
 103          'Chi;',
 104          'Dagger;',
 105          'Delta;',
 106          'ETH;',
 107          'ETH',
 108          'Eacute;',
 109          'Eacute',
 110          'Ecirc;',
 111          'Ecirc',
 112          'Egrave;',
 113          'Egrave',
 114          'Epsilon;',
 115          'Eta;',
 116          'Euml;',
 117          'Euml',
 118          'GT;',
 119          'GT',
 120          'Gamma;',
 121          'Iacute;',
 122          'Iacute',
 123          'Icirc;',
 124          'Icirc',
 125          'Igrave;',
 126          'Igrave',
 127          'Iota;',
 128          'Iuml;',
 129          'Iuml',
 130          'Kappa;',
 131          'LT;',
 132          'LT',
 133          'Lambda;',
 134          'Mu;',
 135          'Ntilde;',
 136          'Ntilde',
 137          'Nu;',
 138          'OElig;',
 139          'Oacute;',
 140          'Oacute',
 141          'Ocirc;',
 142          'Ocirc',
 143          'Ograve;',
 144          'Ograve',
 145          'Omega;',
 146          'Omicron;',
 147          'Oslash;',
 148          'Oslash',
 149          'Otilde;',
 150          'Otilde',
 151          'Ouml;',
 152          'Ouml',
 153          'Phi;',
 154          'Pi;',
 155          'Prime;',
 156          'Psi;',
 157          'QUOT;',
 158          'QUOT',
 159          'REG;',
 160          'REG',
 161          'Rho;',
 162          'Scaron;',
 163          'Sigma;',
 164          'THORN;',
 165          'THORN',
 166          'TRADE;',
 167          'Tau;',
 168          'Theta;',
 169          'Uacute;',
 170          'Uacute',
 171          'Ucirc;',
 172          'Ucirc',
 173          'Ugrave;',
 174          'Ugrave',
 175          'Upsilon;',
 176          'Uuml;',
 177          'Uuml',
 178          'Xi;',
 179          'Yacute;',
 180          'Yacute',
 181          'Yuml;',
 182          'Zeta;',
 183          'aacute;',
 184          'aacute',
 185          'acirc;',
 186          'acirc',
 187          'acute;',
 188          'acute',
 189          'aelig;',
 190          'aelig',
 191          'agrave;',
 192          'agrave',
 193          'alefsym;',
 194          'alpha;',
 195          'amp;',
 196          'amp',
 197          'and;',
 198          'ang;',
 199          'apos;',
 200          'aring;',
 201          'aring',
 202          'asymp;',
 203          'atilde;',
 204          'atilde',
 205          'auml;',
 206          'auml',
 207          'bdquo;',
 208          'beta;',
 209          'brvbar;',
 210          'brvbar',
 211          'bull;',
 212          'cap;',
 213          'ccedil;',
 214          'ccedil',
 215          'cedil;',
 216          'cedil',
 217          'cent;',
 218          'cent',
 219          'chi;',
 220          'circ;',
 221          'clubs;',
 222          'cong;',
 223          'copy;',
 224          'copy',
 225          'crarr;',
 226          'cup;',
 227          'curren;',
 228          'curren',
 229          'dArr;',
 230          'dagger;',
 231          'darr;',
 232          'deg;',
 233          'deg',
 234          'delta;',
 235          'diams;',
 236          'divide;',
 237          'divide',
 238          'eacute;',
 239          'eacute',
 240          'ecirc;',
 241          'ecirc',
 242          'egrave;',
 243          'egrave',
 244          'empty;',
 245          'emsp;',
 246          'ensp;',
 247          'epsilon;',
 248          'equiv;',
 249          'eta;',
 250          'eth;',
 251          'eth',
 252          'euml;',
 253          'euml',
 254          'euro;',
 255          'exist;',
 256          'fnof;',
 257          'forall;',
 258          'frac12;',
 259          'frac12',
 260          'frac14;',
 261          'frac14',
 262          'frac34;',
 263          'frac34',
 264          'frasl;',
 265          'gamma;',
 266          'ge;',
 267          'gt;',
 268          'gt',
 269          'hArr;',
 270          'harr;',
 271          'hearts;',
 272          'hellip;',
 273          'iacute;',
 274          'iacute',
 275          'icirc;',
 276          'icirc',
 277          'iexcl;',
 278          'iexcl',
 279          'igrave;',
 280          'igrave',
 281          'image;',
 282          'infin;',
 283          'int;',
 284          'iota;',
 285          'iquest;',
 286          'iquest',
 287          'isin;',
 288          'iuml;',
 289          'iuml',
 290          'kappa;',
 291          'lArr;',
 292          'lambda;',
 293          'lang;',
 294          'laquo;',
 295          'laquo',
 296          'larr;',
 297          'lceil;',
 298          'ldquo;',
 299          'le;',
 300          'lfloor;',
 301          'lowast;',
 302          'loz;',
 303          'lrm;',
 304          'lsaquo;',
 305          'lsquo;',
 306          'lt;',
 307          'lt',
 308          'macr;',
 309          'macr',
 310          'mdash;',
 311          'micro;',
 312          'micro',
 313          'middot;',
 314          'middot',
 315          'minus;',
 316          'mu;',
 317          'nabla;',
 318          'nbsp;',
 319          'nbsp',
 320          'ndash;',
 321          'ne;',
 322          'ni;',
 323          'not;',
 324          'not',
 325          'notin;',
 326          'nsub;',
 327          'ntilde;',
 328          'ntilde',
 329          'nu;',
 330          'oacute;',
 331          'oacute',
 332          'ocirc;',
 333          'ocirc',
 334          'oelig;',
 335          'ograve;',
 336          'ograve',
 337          'oline;',
 338          'omega;',
 339          'omicron;',
 340          'oplus;',
 341          'or;',
 342          'ordf;',
 343          'ordf',
 344          'ordm;',
 345          'ordm',
 346          'oslash;',
 347          'oslash',
 348          'otilde;',
 349          'otilde',
 350          'otimes;',
 351          'ouml;',
 352          'ouml',
 353          'para;',
 354          'para',
 355          'part;',
 356          'permil;',
 357          'perp;',
 358          'phi;',
 359          'pi;',
 360          'piv;',
 361          'plusmn;',
 362          'plusmn',
 363          'pound;',
 364          'pound',
 365          'prime;',
 366          'prod;',
 367          'prop;',
 368          'psi;',
 369          'quot;',
 370          'quot',
 371          'rArr;',
 372          'radic;',
 373          'rang;',
 374          'raquo;',
 375          'raquo',
 376          'rarr;',
 377          'rceil;',
 378          'rdquo;',
 379          'real;',
 380          'reg;',
 381          'reg',
 382          'rfloor;',
 383          'rho;',
 384          'rlm;',
 385          'rsaquo;',
 386          'rsquo;',
 387          'sbquo;',
 388          'scaron;',
 389          'sdot;',
 390          'sect;',
 391          'sect',
 392          'shy;',
 393          'shy',
 394          'sigma;',
 395          'sigmaf;',
 396          'sim;',
 397          'spades;',
 398          'sub;',
 399          'sube;',
 400          'sum;',
 401          'sup1;',
 402          'sup1',
 403          'sup2;',
 404          'sup2',
 405          'sup3;',
 406          'sup3',
 407          'sup;',
 408          'supe;',
 409          'szlig;',
 410          'szlig',
 411          'tau;',
 412          'there4;',
 413          'theta;',
 414          'thetasym;',
 415          'thinsp;',
 416          'thorn;',
 417          'thorn',
 418          'tilde;',
 419          'times;',
 420          'times',
 421          'trade;',
 422          'uArr;',
 423          'uacute;',
 424          'uacute',
 425          'uarr;',
 426          'ucirc;',
 427          'ucirc',
 428          'ugrave;',
 429          'ugrave',
 430          'uml;',
 431          'uml',
 432          'upsih;',
 433          'upsilon;',
 434          'uuml;',
 435          'uuml',
 436          'weierp;',
 437          'xi;',
 438          'yacute;',
 439          'yacute',
 440          'yen;',
 441          'yen',
 442          'yuml;',
 443          'yuml',
 444          'zeta;',
 445          'zwj;',
 446          'zwnj;'
 447      );
 448  
    // Content model flags (values held by $content_model).
    const PCDATA = 0;
    const RCDATA = 1;
    const CDATA = 2;
    const PLAINTEXT = 3;

    // Token types (the 'type' key of emitted token arrays).
    const DOCTYPE = 0;
    const STARTTAG = 1;
    const ENDTAG = 2;
    const COMMENT = 3;
    // Character data token; note the spelling has no second "E".
    const CHARACTR = 4;
    const EOF = 5;
 460  
 461      public function __construct($data)
 462      {
 463          $this->data = $data;
 464          $this->char = -1;
 465          $this->EOF = strlen($data);
 466          $this->tree = new HTML5TreeConstructer;
 467          $this->content_model = self::PCDATA;
 468  
 469          $this->state = 'data';
 470  
 471          while ($this->state !== null) {
 472              $this->{$this->state . 'State'}();
 473          }
 474      }
 475  
 476      public function save()
 477      {
 478          return $this->tree->save();
 479      }
 480  
 481      private function char()
 482      {
 483          return ($this->char < $this->EOF)
 484              ? $this->data[$this->char]
 485              : false;
 486      }
 487  
 488      private function character($s, $l = 0)
 489      {
 490          if ($s + $l < $this->EOF) {
 491              if ($l === 0) {
 492                  return $this->data[$s];
 493              } else {
 494                  return substr($this->data, $s, $l);
 495              }
 496          }
 497      }
 498  
 499      private function characters($char_class, $start)
 500      {
 501          return preg_replace('#^([' . $char_class . ']+).*#s', '\\1', substr($this->data, $start));
 502      }
 503  
 504      private function dataState()
 505      {
 506          // Consume the next input character
 507          $this->char++;
 508          $char = $this->char();
 509  
 510          if ($char === '&' && ($this->content_model === self::PCDATA || $this->content_model === self::RCDATA)) {
 511              /* U+0026 AMPERSAND (&)
 512              When the content model flag is set to one of the PCDATA or RCDATA
 513              states: switch to the entity data state. Otherwise: treat it as per
 514              the "anything else"    entry below. */
 515              $this->state = 'entityData';
 516  
 517          } elseif ($char === '-') {
 518              /* If the content model flag is set to either the RCDATA state or
 519              the CDATA state, and the escape flag is false, and there are at
 520              least three characters before this one in the input stream, and the
 521              last four characters in the input stream, including this one, are
 522              U+003C LESS-THAN SIGN, U+0021 EXCLAMATION MARK, U+002D HYPHEN-MINUS,
 523              and U+002D HYPHEN-MINUS ("<!--"), then set the escape flag to true. */
 524              if (($this->content_model === self::RCDATA || $this->content_model ===
 525                      self::CDATA) && $this->escape === false &&
 526                  $this->char >= 3 && $this->character($this->char - 4, 4) === '<!--'
 527              ) {
 528                  $this->escape = true;
 529              }
 530  
 531              /* In any case, emit the input character as a character token. Stay
 532              in the data state. */
 533              $this->emitToken(
 534                  array(
 535                      'type' => self::CHARACTR,
 536                      'data' => $char
 537                  )
 538              );
 539  
 540              /* U+003C LESS-THAN SIGN (<) */
 541          } elseif ($char === '<' && ($this->content_model === self::PCDATA ||
 542                  (($this->content_model === self::RCDATA ||
 543                          $this->content_model === self::CDATA) && $this->escape === false))
 544          ) {
 545              /* When the content model flag is set to the PCDATA state: switch
 546              to the tag open state.
 547  
 548              When the content model flag is set to either the RCDATA state or
 549              the CDATA state and the escape flag is false: switch to the tag
 550              open state.
 551  
 552              Otherwise: treat it as per the "anything else" entry below. */
 553              $this->state = 'tagOpen';
 554  
 555              /* U+003E GREATER-THAN SIGN (>) */
 556          } elseif ($char === '>') {
 557              /* If the content model flag is set to either the RCDATA state or
 558              the CDATA state, and the escape flag is true, and the last three
 559              characters in the input stream including this one are U+002D
 560              HYPHEN-MINUS, U+002D HYPHEN-MINUS, U+003E GREATER-THAN SIGN ("-->"),
 561              set the escape flag to false. */
 562              if (($this->content_model === self::RCDATA ||
 563                      $this->content_model === self::CDATA) && $this->escape === true &&
 564                  $this->character($this->char, 3) === '-->'
 565              ) {
 566                  $this->escape = false;
 567              }
 568  
 569              /* In any case, emit the input character as a character token.
 570              Stay in the data state. */
 571              $this->emitToken(
 572                  array(
 573                      'type' => self::CHARACTR,
 574                      'data' => $char
 575                  )
 576              );
 577  
 578          } elseif ($this->char === $this->EOF) {
 579              /* EOF
 580              Emit an end-of-file token. */
 581              $this->EOF();
 582  
 583          } elseif ($this->content_model === self::PLAINTEXT) {
 584              /* When the content model flag is set to the PLAINTEXT state
 585              THIS DIFFERS GREATLY FROM THE SPEC: Get the remaining characters of
 586              the text and emit it as a character token. */
 587              $this->emitToken(
 588                  array(
 589                      'type' => self::CHARACTR,
 590                      'data' => substr($this->data, $this->char)
 591                  )
 592              );
 593  
 594              $this->EOF();
 595  
 596          } else {
 597              /* Anything else
 598              THIS DIFFERS GREATLY FROM THE SPEC: Get as many character that
 599              otherwise would also be treated as a character token and emit it
 600              as a single character token. Stay in the data state. */
 601              $len = strcspn($this->data, '<&', $this->char);
 602              $char = substr($this->data, $this->char, $len);
 603              $this->char += $len - 1;
 604  
 605              $this->emitToken(
 606                  array(
 607                      'type' => self::CHARACTR,
 608                      'data' => $char
 609                  )
 610              );
 611  
 612              $this->state = 'data';
 613          }
 614      }
 615  
 616      private function entityDataState()
 617      {
 618          // Attempt to consume an entity.
 619          $entity = $this->entity();
 620  
 621          // If nothing is returned, emit a U+0026 AMPERSAND character token.
 622          // Otherwise, emit the character token that was returned.
 623          $char = (!$entity) ? '&' : $entity;
 624          $this->emitToken(
 625              array(
 626                  'type' => self::CHARACTR,
 627                  'data' => $char
 628              )
 629          );
 630  
 631          // Finally, switch to the data state.
 632          $this->state = 'data';
 633      }
 634  
 635      private function tagOpenState()
 636      {
 637          switch ($this->content_model) {
 638              case self::RCDATA:
 639              case self::CDATA:
 640                  /* If the next input character is a U+002F SOLIDUS (/) character,
 641                  consume it and switch to the close tag open state. If the next
 642                  input character is not a U+002F SOLIDUS (/) character, emit a
 643                  U+003C LESS-THAN SIGN character token and switch to the data
 644                  state to process the next input character. */
 645                  if ($this->character($this->char + 1) === '/') {
 646                      $this->char++;
 647                      $this->state = 'closeTagOpen';
 648  
 649                  } else {
 650                      $this->emitToken(
 651                          array(
 652                              'type' => self::CHARACTR,
 653                              'data' => '<'
 654                          )
 655                      );
 656  
 657                      $this->state = 'data';
 658                  }
 659                  break;
 660  
 661              case self::PCDATA:
 662                  // If the content model flag is set to the PCDATA state
 663                  // Consume the next input character:
 664                  $this->char++;
 665                  $char = $this->char();
 666  
 667                  if ($char === '!') {
 668                      /* U+0021 EXCLAMATION MARK (!)
 669                      Switch to the markup declaration open state. */
 670                      $this->state = 'markupDeclarationOpen';
 671  
 672                  } elseif ($char === '/') {
 673                      /* U+002F SOLIDUS (/)
 674                      Switch to the close tag open state. */
 675                      $this->state = 'closeTagOpen';
 676  
 677                  } elseif (preg_match('/^[A-Za-z]$/', $char)) {
 678                      /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 679                      Create a new start tag token, set its tag name to the lowercase
 680                      version of the input character (add 0x0020 to the character's code
 681                      point), then switch to the tag name state. (Don't emit the token
 682                      yet; further details will be filled in before it is emitted.) */
 683                      $this->token = array(
 684                          'name' => strtolower($char),
 685                          'type' => self::STARTTAG,
 686                          'attr' => array()
 687                      );
 688  
 689                      $this->state = 'tagName';
 690  
 691                  } elseif ($char === '>') {
 692                      /* U+003E GREATER-THAN SIGN (>)
 693                      Parse error. Emit a U+003C LESS-THAN SIGN character token and a
 694                      U+003E GREATER-THAN SIGN character token. Switch to the data state. */
 695                      $this->emitToken(
 696                          array(
 697                              'type' => self::CHARACTR,
 698                              'data' => '<>'
 699                          )
 700                      );
 701  
 702                      $this->state = 'data';
 703  
 704                  } elseif ($char === '?') {
 705                      /* U+003F QUESTION MARK (?)
 706                      Parse error. Switch to the bogus comment state. */
 707                      $this->state = 'bogusComment';
 708  
 709                  } else {
 710                      /* Anything else
 711                      Parse error. Emit a U+003C LESS-THAN SIGN character token and
 712                      reconsume the current input character in the data state. */
 713                      $this->emitToken(
 714                          array(
 715                              'type' => self::CHARACTR,
 716                              'data' => '<'
 717                          )
 718                      );
 719  
 720                      $this->char--;
 721                      $this->state = 'data';
 722                  }
 723                  break;
 724          }
 725      }
 726  
 727      private function closeTagOpenState()
 728      {
 729          $next_node = strtolower($this->characters('A-Za-z', $this->char + 1));
 730          $the_same = count($this->tree->stack) > 0 && $next_node === end($this->tree->stack)->nodeName;
 731  
 732          if (($this->content_model === self::RCDATA || $this->content_model === self::CDATA) &&
 733              (!$the_same || ($the_same && (!preg_match(
 734                              '/[\t\n\x0b\x0c >\/]/',
 735                              $this->character($this->char + 1 + strlen($next_node))
 736                          ) || $this->EOF === $this->char)))
 737          ) {
 738              /* If the content model flag is set to the RCDATA or CDATA states then
 739              examine the next few characters. If they do not match the tag name of
 740              the last start tag token emitted (case insensitively), or if they do but
 741              they are not immediately followed by one of the following characters:
 742                  * U+0009 CHARACTER TABULATION
 743                  * U+000A LINE FEED (LF)
 744                  * U+000B LINE TABULATION
 745                  * U+000C FORM FEED (FF)
 746                  * U+0020 SPACE
 747                  * U+003E GREATER-THAN SIGN (>)
 748                  * U+002F SOLIDUS (/)
 749                  * EOF
 750              ...then there is a parse error. Emit a U+003C LESS-THAN SIGN character
 751              token, a U+002F SOLIDUS character token, and switch to the data state
 752              to process the next input character. */
 753              $this->emitToken(
 754                  array(
 755                      'type' => self::CHARACTR,
 756                      'data' => '</'
 757                  )
 758              );
 759  
 760              $this->state = 'data';
 761  
 762          } else {
 763              /* Otherwise, if the content model flag is set to the PCDATA state,
 764              or if the next few characters do match that tag name, consume the
 765              next input character: */
 766              $this->char++;
 767              $char = $this->char();
 768  
 769              if (preg_match('/^[A-Za-z]$/', $char)) {
 770                  /* U+0041 LATIN LETTER A through to U+005A LATIN LETTER Z
 771                  Create a new end tag token, set its tag name to the lowercase version
 772                  of the input character (add 0x0020 to the character's code point), then
 773                  switch to the tag name state. (Don't emit the token yet; further details
 774                  will be filled in before it is emitted.) */
 775                  $this->token = array(
 776                      'name' => strtolower($char),
 777                      'type' => self::ENDTAG
 778                  );
 779  
 780                  $this->state = 'tagName';
 781  
 782              } elseif ($char === '>') {
 783                  /* U+003E GREATER-THAN SIGN (>)
 784                  Parse error. Switch to the data state. */
 785                  $this->state = 'data';
 786  
 787              } elseif ($this->char === $this->EOF) {
 788                  /* EOF
 789                  Parse error. Emit a U+003C LESS-THAN SIGN character token and a U+002F
 790                  SOLIDUS character token. Reconsume the EOF character in the data state. */
 791                  $this->emitToken(
 792                      array(
 793                          'type' => self::CHARACTR,
 794                          'data' => '</'
 795                      )
 796                  );
 797  
 798                  $this->char--;
 799                  $this->state = 'data';
 800  
 801              } else {
 802                  /* Parse error. Switch to the bogus comment state. */
 803                  $this->state = 'bogusComment';
 804              }
 805          }
 806      }
 807  
 808      private function tagNameState()
 809      {
 810          // Consume the next input character:
 811          $this->char++;
 812          $char = $this->character($this->char);
 813  
 814          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 815              /* U+0009 CHARACTER TABULATION
 816              U+000A LINE FEED (LF)
 817              U+000B LINE TABULATION
 818              U+000C FORM FEED (FF)
 819              U+0020 SPACE
 820              Switch to the before attribute name state. */
 821              $this->state = 'beforeAttributeName';
 822  
 823          } elseif ($char === '>') {
 824              /* U+003E GREATER-THAN SIGN (>)
 825              Emit the current tag token. Switch to the data state. */
 826              $this->emitToken($this->token);
 827              $this->state = 'data';
 828  
 829          } elseif ($this->char === $this->EOF) {
 830              /* EOF
 831              Parse error. Emit the current tag token. Reconsume the EOF
 832              character in the data state. */
 833              $this->emitToken($this->token);
 834  
 835              $this->char--;
 836              $this->state = 'data';
 837  
 838          } elseif ($char === '/') {
 839              /* U+002F SOLIDUS (/)
 840              Parse error unless this is a permitted slash. Switch to the before
 841              attribute name state. */
 842              $this->state = 'beforeAttributeName';
 843  
 844          } else {
 845              /* Anything else
 846              Append the current input character to the current tag token's tag name.
 847              Stay in the tag name state. */
 848              $this->token['name'] .= strtolower($char);
 849              $this->state = 'tagName';
 850          }
 851      }
 852  
 853      private function beforeAttributeNameState()
 854      {
 855          // Consume the next input character:
 856          $this->char++;
 857          $char = $this->character($this->char);
 858  
 859          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 860              /* U+0009 CHARACTER TABULATION
 861              U+000A LINE FEED (LF)
 862              U+000B LINE TABULATION
 863              U+000C FORM FEED (FF)
 864              U+0020 SPACE
 865              Stay in the before attribute name state. */
 866              $this->state = 'beforeAttributeName';
 867  
 868          } elseif ($char === '>') {
 869              /* U+003E GREATER-THAN SIGN (>)
 870              Emit the current tag token. Switch to the data state. */
 871              $this->emitToken($this->token);
 872              $this->state = 'data';
 873  
 874          } elseif ($char === '/') {
 875              /* U+002F SOLIDUS (/)
 876              Parse error unless this is a permitted slash. Stay in the before
 877              attribute name state. */
 878              $this->state = 'beforeAttributeName';
 879  
 880          } elseif ($this->char === $this->EOF) {
 881              /* EOF
 882              Parse error. Emit the current tag token. Reconsume the EOF
 883              character in the data state. */
 884              $this->emitToken($this->token);
 885  
 886              $this->char--;
 887              $this->state = 'data';
 888  
 889          } else {
 890              /* Anything else
 891              Start a new attribute in the current tag token. Set that attribute's
 892              name to the current input character, and its value to the empty string.
 893              Switch to the attribute name state. */
 894              $this->token['attr'][] = array(
 895                  'name' => strtolower($char),
 896                  'value' => null
 897              );
 898  
 899              $this->state = 'attributeName';
 900          }
 901      }
 902  
 903      private function attributeNameState()
 904      {
 905          // Consume the next input character:
 906          $this->char++;
 907          $char = $this->character($this->char);
 908  
 909          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 910              /* U+0009 CHARACTER TABULATION
 911              U+000A LINE FEED (LF)
 912              U+000B LINE TABULATION
 913              U+000C FORM FEED (FF)
 914              U+0020 SPACE
 915              Stay in the before attribute name state. */
 916              $this->state = 'afterAttributeName';
 917  
 918          } elseif ($char === '=') {
 919              /* U+003D EQUALS SIGN (=)
 920              Switch to the before attribute value state. */
 921              $this->state = 'beforeAttributeValue';
 922  
 923          } elseif ($char === '>') {
 924              /* U+003E GREATER-THAN SIGN (>)
 925              Emit the current tag token. Switch to the data state. */
 926              $this->emitToken($this->token);
 927              $this->state = 'data';
 928  
 929          } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
 930              /* U+002F SOLIDUS (/)
 931              Parse error unless this is a permitted slash. Switch to the before
 932              attribute name state. */
 933              $this->state = 'beforeAttributeName';
 934  
 935          } elseif ($this->char === $this->EOF) {
 936              /* EOF
 937              Parse error. Emit the current tag token. Reconsume the EOF
 938              character in the data state. */
 939              $this->emitToken($this->token);
 940  
 941              $this->char--;
 942              $this->state = 'data';
 943  
 944          } else {
 945              /* Anything else
 946              Append the current input character to the current attribute's name.
 947              Stay in the attribute name state. */
 948              $last = count($this->token['attr']) - 1;
 949              $this->token['attr'][$last]['name'] .= strtolower($char);
 950  
 951              $this->state = 'attributeName';
 952          }
 953      }
 954  
 955      private function afterAttributeNameState()
 956      {
 957          // Consume the next input character:
 958          $this->char++;
 959          $char = $this->character($this->char);
 960  
 961          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
 962              /* U+0009 CHARACTER TABULATION
 963              U+000A LINE FEED (LF)
 964              U+000B LINE TABULATION
 965              U+000C FORM FEED (FF)
 966              U+0020 SPACE
 967              Stay in the after attribute name state. */
 968              $this->state = 'afterAttributeName';
 969  
 970          } elseif ($char === '=') {
 971              /* U+003D EQUALS SIGN (=)
 972              Switch to the before attribute value state. */
 973              $this->state = 'beforeAttributeValue';
 974  
 975          } elseif ($char === '>') {
 976              /* U+003E GREATER-THAN SIGN (>)
 977              Emit the current tag token. Switch to the data state. */
 978              $this->emitToken($this->token);
 979              $this->state = 'data';
 980  
 981          } elseif ($char === '/' && $this->character($this->char + 1) !== '>') {
 982              /* U+002F SOLIDUS (/)
 983              Parse error unless this is a permitted slash. Switch to the
 984              before attribute name state. */
 985              $this->state = 'beforeAttributeName';
 986  
 987          } elseif ($this->char === $this->EOF) {
 988              /* EOF
 989              Parse error. Emit the current tag token. Reconsume the EOF
 990              character in the data state. */
 991              $this->emitToken($this->token);
 992  
 993              $this->char--;
 994              $this->state = 'data';
 995  
 996          } else {
 997              /* Anything else
 998              Start a new attribute in the current tag token. Set that attribute's
 999              name to the current input character, and its value to the empty string.
1000              Switch to the attribute name state. */
1001              $this->token['attr'][] = array(
1002                  'name' => strtolower($char),
1003                  'value' => null
1004              );
1005  
1006              $this->state = 'attributeName';
1007          }
1008      }
1009  
1010      private function beforeAttributeValueState()
1011      {
1012          // Consume the next input character:
1013          $this->char++;
1014          $char = $this->character($this->char);
1015  
1016          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1017              /* U+0009 CHARACTER TABULATION
1018              U+000A LINE FEED (LF)
1019              U+000B LINE TABULATION
1020              U+000C FORM FEED (FF)
1021              U+0020 SPACE
1022              Stay in the before attribute value state. */
1023              $this->state = 'beforeAttributeValue';
1024  
1025          } elseif ($char === '"') {
1026              /* U+0022 QUOTATION MARK (")
1027              Switch to the attribute value (double-quoted) state. */
1028              $this->state = 'attributeValueDoubleQuoted';
1029  
1030          } elseif ($char === '&') {
1031              /* U+0026 AMPERSAND (&)
1032              Switch to the attribute value (unquoted) state and reconsume
1033              this input character. */
1034              $this->char--;
1035              $this->state = 'attributeValueUnquoted';
1036  
1037          } elseif ($char === '\'') {
1038              /* U+0027 APOSTROPHE (')
1039              Switch to the attribute value (single-quoted) state. */
1040              $this->state = 'attributeValueSingleQuoted';
1041  
1042          } elseif ($char === '>') {
1043              /* U+003E GREATER-THAN SIGN (>)
1044              Emit the current tag token. Switch to the data state. */
1045              $this->emitToken($this->token);
1046              $this->state = 'data';
1047  
1048          } else {
1049              /* Anything else
1050              Append the current input character to the current attribute's value.
1051              Switch to the attribute value (unquoted) state. */
1052              $last = count($this->token['attr']) - 1;
1053              $this->token['attr'][$last]['value'] .= $char;
1054  
1055              $this->state = 'attributeValueUnquoted';
1056          }
1057      }
1058  
1059      private function attributeValueDoubleQuotedState()
1060      {
1061          // Consume the next input character:
1062          $this->char++;
1063          $char = $this->character($this->char);
1064  
1065          if ($char === '"') {
1066              /* U+0022 QUOTATION MARK (")
1067              Switch to the before attribute name state. */
1068              $this->state = 'beforeAttributeName';
1069  
1070          } elseif ($char === '&') {
1071              /* U+0026 AMPERSAND (&)
1072              Switch to the entity in attribute value state. */
1073              $this->entityInAttributeValueState('double');
1074  
1075          } elseif ($this->char === $this->EOF) {
1076              /* EOF
1077              Parse error. Emit the current tag token. Reconsume the character
1078              in the data state. */
1079              $this->emitToken($this->token);
1080  
1081              $this->char--;
1082              $this->state = 'data';
1083  
1084          } else {
1085              /* Anything else
1086              Append the current input character to the current attribute's value.
1087              Stay in the attribute value (double-quoted) state. */
1088              $last = count($this->token['attr']) - 1;
1089              $this->token['attr'][$last]['value'] .= $char;
1090  
1091              $this->state = 'attributeValueDoubleQuoted';
1092          }
1093      }
1094  
1095      private function attributeValueSingleQuotedState()
1096      {
1097          // Consume the next input character:
1098          $this->char++;
1099          $char = $this->character($this->char);
1100  
1101          if ($char === '\'') {
1102              /* U+0022 QUOTATION MARK (')
1103              Switch to the before attribute name state. */
1104              $this->state = 'beforeAttributeName';
1105  
1106          } elseif ($char === '&') {
1107              /* U+0026 AMPERSAND (&)
1108              Switch to the entity in attribute value state. */
1109              $this->entityInAttributeValueState('single');
1110  
1111          } elseif ($this->char === $this->EOF) {
1112              /* EOF
1113              Parse error. Emit the current tag token. Reconsume the character
1114              in the data state. */
1115              $this->emitToken($this->token);
1116  
1117              $this->char--;
1118              $this->state = 'data';
1119  
1120          } else {
1121              /* Anything else
1122              Append the current input character to the current attribute's value.
1123              Stay in the attribute value (single-quoted) state. */
1124              $last = count($this->token['attr']) - 1;
1125              $this->token['attr'][$last]['value'] .= $char;
1126  
1127              $this->state = 'attributeValueSingleQuoted';
1128          }
1129      }
1130  
1131      private function attributeValueUnquotedState()
1132      {
1133          // Consume the next input character:
1134          $this->char++;
1135          $char = $this->character($this->char);
1136  
1137          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1138              /* U+0009 CHARACTER TABULATION
1139              U+000A LINE FEED (LF)
1140              U+000B LINE TABULATION
1141              U+000C FORM FEED (FF)
1142              U+0020 SPACE
1143              Switch to the before attribute name state. */
1144              $this->state = 'beforeAttributeName';
1145  
1146          } elseif ($char === '&') {
1147              /* U+0026 AMPERSAND (&)
1148              Switch to the entity in attribute value state. */
1149              $this->entityInAttributeValueState();
1150  
1151          } elseif ($char === '>') {
1152              /* U+003E GREATER-THAN SIGN (>)
1153              Emit the current tag token. Switch to the data state. */
1154              $this->emitToken($this->token);
1155              $this->state = 'data';
1156  
1157          } else {
1158              /* Anything else
1159              Append the current input character to the current attribute's value.
1160              Stay in the attribute value (unquoted) state. */
1161              $last = count($this->token['attr']) - 1;
1162              $this->token['attr'][$last]['value'] .= $char;
1163  
1164              $this->state = 'attributeValueUnquoted';
1165          }
1166      }
1167  
1168      private function entityInAttributeValueState()
1169      {
1170          // Attempt to consume an entity.
1171          $entity = $this->entity();
1172  
1173          // If nothing is returned, append a U+0026 AMPERSAND character to the
1174          // current attribute's value. Otherwise, emit the character token that
1175          // was returned.
1176          $char = (!$entity)
1177              ? '&'
1178              : $entity;
1179  
1180          $last = count($this->token['attr']) - 1;
1181          $this->token['attr'][$last]['value'] .= $char;
1182      }
1183  
1184      private function bogusCommentState()
1185      {
1186          /* Consume every character up to the first U+003E GREATER-THAN SIGN
1187          character (>) or the end of the file (EOF), whichever comes first. Emit
1188          a comment token whose data is the concatenation of all the characters
1189          starting from and including the character that caused the state machine
1190          to switch into the bogus comment state, up to and including the last
1191          consumed character before the U+003E character, if any, or up to the
1192          end of the file otherwise. (If the comment was started by the end of
1193          the file (EOF), the token is empty.) */
1194          $data = $this->characters('^>', $this->char);
1195          $this->emitToken(
1196              array(
1197                  'data' => $data,
1198                  'type' => self::COMMENT
1199              )
1200          );
1201  
1202          $this->char += strlen($data);
1203  
1204          /* Switch to the data state. */
1205          $this->state = 'data';
1206  
1207          /* If the end of the file was reached, reconsume the EOF character. */
1208          if ($this->char === $this->EOF) {
1209              $this->char = $this->EOF - 1;
1210          }
1211      }
1212  
1213      private function markupDeclarationOpenState()
1214      {
1215          /* If the next two characters are both U+002D HYPHEN-MINUS (-)
1216          characters, consume those two characters, create a comment token whose
1217          data is the empty string, and switch to the comment state. */
1218          if ($this->character($this->char + 1, 2) === '--') {
1219              $this->char += 2;
1220              $this->state = 'comment';
1221              $this->token = array(
1222                  'data' => null,
1223                  'type' => self::COMMENT
1224              );
1225  
1226              /* Otherwise if the next seven chacacters are a case-insensitive match
1227              for the word "DOCTYPE", then consume those characters and switch to the
1228              DOCTYPE state. */
1229          } elseif (strtolower($this->character($this->char + 1, 7)) === 'doctype') {
1230              $this->char += 7;
1231              $this->state = 'doctype';
1232  
1233              /* Otherwise, is is a parse error. Switch to the bogus comment state.
1234              The next character that is consumed, if any, is the first character
1235              that will be in the comment. */
1236          } else {
1237              $this->char++;
1238              $this->state = 'bogusComment';
1239          }
1240      }
1241  
1242      private function commentState()
1243      {
1244          /* Consume the next input character: */
1245          $this->char++;
1246          $char = $this->char();
1247  
1248          /* U+002D HYPHEN-MINUS (-) */
1249          if ($char === '-') {
1250              /* Switch to the comment dash state  */
1251              $this->state = 'commentDash';
1252  
1253              /* EOF */
1254          } elseif ($this->char === $this->EOF) {
1255              /* Parse error. Emit the comment token. Reconsume the EOF character
1256              in the data state. */
1257              $this->emitToken($this->token);
1258              $this->char--;
1259              $this->state = 'data';
1260  
1261              /* Anything else */
1262          } else {
1263              /* Append the input character to the comment token's data. Stay in
1264              the comment state. */
1265              $this->token['data'] .= $char;
1266          }
1267      }
1268  
1269      private function commentDashState()
1270      {
1271          /* Consume the next input character: */
1272          $this->char++;
1273          $char = $this->char();
1274  
1275          /* U+002D HYPHEN-MINUS (-) */
1276          if ($char === '-') {
1277              /* Switch to the comment end state  */
1278              $this->state = 'commentEnd';
1279  
1280              /* EOF */
1281          } elseif ($this->char === $this->EOF) {
1282              /* Parse error. Emit the comment token. Reconsume the EOF character
1283              in the data state. */
1284              $this->emitToken($this->token);
1285              $this->char--;
1286              $this->state = 'data';
1287  
1288              /* Anything else */
1289          } else {
1290              /* Append a U+002D HYPHEN-MINUS (-) character and the input
1291              character to the comment token's data. Switch to the comment state. */
1292              $this->token['data'] .= '-' . $char;
1293              $this->state = 'comment';
1294          }
1295      }
1296  
1297      private function commentEndState()
1298      {
1299          /* Consume the next input character: */
1300          $this->char++;
1301          $char = $this->char();
1302  
1303          if ($char === '>') {
1304              $this->emitToken($this->token);
1305              $this->state = 'data';
1306  
1307          } elseif ($char === '-') {
1308              $this->token['data'] .= '-';
1309  
1310          } elseif ($this->char === $this->EOF) {
1311              $this->emitToken($this->token);
1312              $this->char--;
1313              $this->state = 'data';
1314  
1315          } else {
1316              $this->token['data'] .= '--' . $char;
1317              $this->state = 'comment';
1318          }
1319      }
1320  
1321      private function doctypeState()
1322      {
1323          /* Consume the next input character: */
1324          $this->char++;
1325          $char = $this->char();
1326  
1327          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1328              $this->state = 'beforeDoctypeName';
1329  
1330          } else {
1331              $this->char--;
1332              $this->state = 'beforeDoctypeName';
1333          }
1334      }
1335  
1336      private function beforeDoctypeNameState()
1337      {
1338          /* Consume the next input character: */
1339          $this->char++;
1340          $char = $this->char();
1341  
1342          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1343              // Stay in the before DOCTYPE name state.
1344  
1345          } elseif (preg_match('/^[a-z]$/', $char)) {
1346              $this->token = array(
1347                  'name' => strtoupper($char),
1348                  'type' => self::DOCTYPE,
1349                  'error' => true
1350              );
1351  
1352              $this->state = 'doctypeName';
1353  
1354          } elseif ($char === '>') {
1355              $this->emitToken(
1356                  array(
1357                      'name' => null,
1358                      'type' => self::DOCTYPE,
1359                      'error' => true
1360                  )
1361              );
1362  
1363              $this->state = 'data';
1364  
1365          } elseif ($this->char === $this->EOF) {
1366              $this->emitToken(
1367                  array(
1368                      'name' => null,
1369                      'type' => self::DOCTYPE,
1370                      'error' => true
1371                  )
1372              );
1373  
1374              $this->char--;
1375              $this->state = 'data';
1376  
1377          } else {
1378              $this->token = array(
1379                  'name' => $char,
1380                  'type' => self::DOCTYPE,
1381                  'error' => true
1382              );
1383  
1384              $this->state = 'doctypeName';
1385          }
1386      }
1387  
1388      private function doctypeNameState()
1389      {
1390          /* Consume the next input character: */
1391          $this->char++;
1392          $char = $this->char();
1393  
1394          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1395              $this->state = 'AfterDoctypeName';
1396  
1397          } elseif ($char === '>') {
1398              $this->emitToken($this->token);
1399              $this->state = 'data';
1400  
1401          } elseif (preg_match('/^[a-z]$/', $char)) {
1402              $this->token['name'] .= strtoupper($char);
1403  
1404          } elseif ($this->char === $this->EOF) {
1405              $this->emitToken($this->token);
1406              $this->char--;
1407              $this->state = 'data';
1408  
1409          } else {
1410              $this->token['name'] .= $char;
1411          }
1412  
1413          $this->token['error'] = ($this->token['name'] === 'HTML')
1414              ? false
1415              : true;
1416      }
1417  
1418      private function afterDoctypeNameState()
1419      {
1420          /* Consume the next input character: */
1421          $this->char++;
1422          $char = $this->char();
1423  
1424          if (preg_match('/^[\t\n\x0b\x0c ]$/', $char)) {
1425              // Stay in the DOCTYPE name state.
1426  
1427          } elseif ($char === '>') {
1428              $this->emitToken($this->token);
1429              $this->state = 'data';
1430  
1431          } elseif ($this->char === $this->EOF) {
1432              $this->emitToken($this->token);
1433              $this->char--;
1434              $this->state = 'data';
1435  
1436          } else {
1437              $this->token['error'] = true;
1438              $this->state = 'bogusDoctype';
1439          }
1440      }
1441  
1442      private function bogusDoctypeState()
1443      {
1444          /* Consume the next input character: */
1445          $this->char++;
1446          $char = $this->char();
1447  
1448          if ($char === '>') {
1449              $this->emitToken($this->token);
1450              $this->state = 'data';
1451  
1452          } elseif ($this->char === $this->EOF) {
1453              $this->emitToken($this->token);
1454              $this->char--;
1455              $this->state = 'data';
1456  
1457          } else {
1458              // Stay in the bogus DOCTYPE state.
1459          }
1460      }
1461  
1462      private function entity()
1463      {
1464          $start = $this->char;
1465  
1466          // This section defines how to consume an entity. This definition is
1467          // used when parsing entities in text and in attributes.
1468  
1469          // The behaviour depends on the identity of the next character (the
1470          // one immediately after the U+0026 AMPERSAND character):
1471  
1472          switch ($this->character($this->char + 1)) {
1473              // U+0023 NUMBER SIGN (#)
1474              case '#':
1475  
1476                  // The behaviour further depends on the character after the
1477                  // U+0023 NUMBER SIGN:
1478                  switch ($this->character($this->char + 1)) {
1479                      // U+0078 LATIN SMALL LETTER X
1480                      // U+0058 LATIN CAPITAL LETTER X
1481                      case 'x':
1482                      case 'X':
1483                          // Follow the steps below, but using the range of
1484                          // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1485                          // NINE, U+0061 LATIN SMALL LETTER A through to U+0066
1486                          // LATIN SMALL LETTER F, and U+0041 LATIN CAPITAL LETTER
1487                          // A, through to U+0046 LATIN CAPITAL LETTER F (in other
1488                          // words, 0-9, A-F, a-f).
1489                          $char = 1;
1490                          $char_class = '0-9A-Fa-f';
1491                          break;
1492  
1493                      // Anything else
1494                      default:
1495                          // Follow the steps below, but using the range of
1496                          // characters U+0030 DIGIT ZERO through to U+0039 DIGIT
1497                          // NINE (i.e. just 0-9).
1498                          $char = 0;
1499                          $char_class = '0-9';
1500                          break;
1501                  }
1502  
1503                  // Consume as many characters as match the range of characters
1504                  // given above.
1505                  $this->char++;
1506                  $e_name = $this->characters($char_class, $this->char + $char + 1);
1507                  $entity = $this->character($start, $this->char);
1508                  $cond = strlen($e_name) > 0;
1509  
1510                  // The rest of the parsing happens bellow.
1511                  break;
1512  
1513              // Anything else
1514              default:
1515                  // Consume the maximum number of characters possible, with the
1516                  // consumed characters case-sensitively matching one of the
1517                  // identifiers in the first column of the entities table.
1518                  $e_name = $this->characters('0-9A-Za-z;', $this->char + 1);
1519                  $len = strlen($e_name);
1520  
1521                  for ($c = 1; $c <= $len; $c++) {
1522                      $id = substr($e_name, 0, $c);
1523                      $this->char++;
1524  
1525                      if (in_array($id, $this->entities)) {
1526                          if ($e_name[$c - 1] !== ';') {
1527                              if ($c < $len && $e_name[$c] == ';') {
1528                                  $this->char++; // consume extra semicolon
1529                              }
1530                          }
1531                          $entity = $id;
1532                          break;
1533                      }
1534                  }
1535  
1536                  $cond = isset($entity);
1537                  // The rest of the parsing happens bellow.
1538                  break;
1539          }
1540  
1541          if (!$cond) {
1542              // If no match can be made, then this is a parse error. No
1543              // characters are consumed, and nothing is returned.
1544              $this->char = $start;
1545              return false;
1546          }
1547  
1548          // Return a character token for the character corresponding to the
1549          // entity name (as given by the second column of the entities table).
1550          return html_entity_decode('&' . $entity . ';', ENT_QUOTES, 'UTF-8');
1551      }
1552  
1553      private function emitToken($token)
1554      {
1555          $emit = $this->tree->emitToken($token);
1556  
1557          if (is_int($emit)) {
1558              $this->content_model = $emit;
1559  
1560          } elseif ($token['type'] === self::ENDTAG) {
1561              $this->content_model = self::PCDATA;
1562          }
1563      }
1564  
1565      private function EOF()
1566      {
1567          $this->state = null;
1568          $this->tree->emitToken(
1569              array(
1570                  'type' => self::EOF
1571              )
1572          );
1573      }
1574  }
1575  
1576  class HTML5TreeConstructer
1577  {
1578      public $stack = array();
1579  
1580      private $phase;
1581      private $mode;
1582      private $dom;
1583      private $foster_parent = null;
1584      private $a_formatting = array();
1585  
1586      private $head_pointer = null;
1587      private $form_pointer = null;
1588  
1589      private $scoping = array('button', 'caption', 'html', 'marquee', 'object', 'table', 'td', 'th');
1590      private $formatting = array(
1591          'a',
1592          'b',
1593          'big',
1594          'em',
1595          'font',
1596          'i',
1597          'nobr',
1598          's',
1599          'small',
1600          'strike',
1601          'strong',
1602          'tt',
1603          'u'
1604      );
1605      private $special = array(
1606          'address',
1607          'area',
1608          'base',
1609          'basefont',
1610          'bgsound',
1611          'blockquote',
1612          'body',
1613          'br',
1614          'center',
1615          'col',
1616          'colgroup',
1617          'dd',
1618          'dir',
1619          'div',
1620          'dl',
1621          'dt',
1622          'embed',
1623          'fieldset',
1624          'form',
1625          'frame',
1626          'frameset',
1627          'h1',
1628          'h2',
1629          'h3',
1630          'h4',
1631          'h5',
1632          'h6',
1633          'head',
1634          'hr',
1635          'iframe',
1636          'image',
1637          'img',
1638          'input',
1639          'isindex',
1640          'li',
1641          'link',
1642          'listing',
1643          'menu',
1644          'meta',
1645          'noembed',
1646          'noframes',
1647          'noscript',
1648          'ol',
1649          'optgroup',
1650          'option',
1651          'p',
1652          'param',
1653          'plaintext',
1654          'pre',
1655          'script',
1656          'select',
1657          'spacer',
1658          'style',
1659          'tbody',
1660          'textarea',
1661          'tfoot',
1662          'thead',
1663          'title',
1664          'tr',
1665          'ul',
1666          'wbr'
1667      );
1668  
1669      // The different phases.
1670      const INIT_PHASE = 0;
1671      const ROOT_PHASE = 1;
1672      const MAIN_PHASE = 2;
1673      const END_PHASE = 3;
1674  
1675      // The different insertion modes for the main phase.
1676      const BEFOR_HEAD = 0;
1677      const IN_HEAD = 1;
1678      const AFTER_HEAD = 2;
1679      const IN_BODY = 3;
1680      const IN_TABLE = 4;
1681      const IN_CAPTION = 5;
1682      const IN_CGROUP = 6;
1683      const IN_TBODY = 7;
1684      const IN_ROW = 8;
1685      const IN_CELL = 9;
1686      const IN_SELECT = 10;
1687      const AFTER_BODY = 11;
1688      const IN_FRAME = 12;
1689      const AFTR_FRAME = 13;
1690  
1691      // The different types of elements.
1692      const SPECIAL = 0;
1693      const SCOPING = 1;
1694      const FORMATTING = 2;
1695      const PHRASING = 3;
1696  
1697      const MARKER = 0;
1698  
1699      public function __construct()
1700      {
1701          $this->phase = self::INIT_PHASE;
1702          $this->mode = self::BEFOR_HEAD;
1703          $this->dom = new DOMDocument;
1704  
1705          $this->dom->encoding = 'UTF-8';
1706          $this->dom->preserveWhiteSpace = true;
1707          $this->dom->substituteEntities = true;
1708          $this->dom->strictErrorChecking = false;
1709      }
1710  
1711      // Process tag tokens
1712      public function emitToken($token)
1713      {
1714          switch ($this->phase) {
1715              case self::INIT_PHASE:
1716                  return $this->initPhase($token);
1717                  break;
1718              case self::ROOT_PHASE:
1719                  return $this->rootElementPhase($token);
1720                  break;
1721              case self::MAIN_PHASE:
1722                  return $this->mainPhase($token);
1723                  break;
1724              case self::END_PHASE :
1725                  return $this->trailingEndPhase($token);
1726                  break;
1727          }
1728      }
1729  
1730      private function initPhase($token)
1731      {
1732          /* Initially, the tree construction stage must handle each token
1733          emitted from the tokenisation stage as follows: */
1734  
1735          /* A DOCTYPE token that is marked as being in error
1736          A comment token
1737          A start tag token
1738          An end tag token
1739          A character token that is not one of one of U+0009 CHARACTER TABULATION,
1740              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1741              or U+0020 SPACE
1742          An end-of-file token */
1743          if ((isset($token['error']) && $token['error']) ||
1744              $token['type'] === HTML5::COMMENT ||
1745              $token['type'] === HTML5::STARTTAG ||
1746              $token['type'] === HTML5::ENDTAG ||
1747              $token['type'] === HTML5::EOF ||
1748              ($token['type'] === HTML5::CHARACTR && isset($token['data']) &&
1749                  !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data']))
1750          ) {
1751              /* This specification does not define how to handle this case. In
1752              particular, user agents may ignore the entirety of this specification
1753              altogether for such documents, and instead invoke special parse modes
1754              with a greater emphasis on backwards compatibility. */
1755  
1756              $this->phase = self::ROOT_PHASE;
1757              return $this->rootElementPhase($token);
1758  
1759              /* A DOCTYPE token marked as being correct */
1760          } elseif (isset($token['error']) && !$token['error']) {
1761              /* Append a DocumentType node to the Document  node, with the name
1762              attribute set to the name given in the DOCTYPE token (which will be
1763              "HTML"), and the other attributes specific to DocumentType objects
1764              set to null, empty lists, or the empty string as appropriate. */
1765              $doctype = new DOMDocumentType(null, null, 'HTML');
1766  
1767              /* Then, switch to the root element phase of the tree construction
1768              stage. */
1769              $this->phase = self::ROOT_PHASE;
1770  
1771              /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1772              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1773              or U+0020 SPACE */
1774          } elseif (isset($token['data']) && preg_match(
1775                  '/^[\t\n\x0b\x0c ]+$/',
1776                  $token['data']
1777              )
1778          ) {
1779              /* Append that character  to the Document node. */
1780              $text = $this->dom->createTextNode($token['data']);
1781              $this->dom->appendChild($text);
1782          }
1783      }
1784  
1785      private function rootElementPhase($token)
1786      {
1787          /* After the initial phase, as each token is emitted from the tokenisation
1788          stage, it must be processed as described in this section. */
1789  
1790          /* A DOCTYPE token */
1791          if ($token['type'] === HTML5::DOCTYPE) {
1792              // Parse error. Ignore the token.
1793  
1794              /* A comment token */
1795          } elseif ($token['type'] === HTML5::COMMENT) {
1796              /* Append a Comment node to the Document object with the data
1797              attribute set to the data given in the comment token. */
1798              $comment = $this->dom->createComment($token['data']);
1799              $this->dom->appendChild($comment);
1800  
1801              /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1802              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1803              or U+0020 SPACE */
1804          } elseif ($token['type'] === HTML5::CHARACTR &&
1805              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1806          ) {
1807              /* Append that character  to the Document node. */
1808              $text = $this->dom->createTextNode($token['data']);
1809              $this->dom->appendChild($text);
1810  
1811              /* A character token that is not one of U+0009 CHARACTER TABULATION,
1812                  U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED
1813                  (FF), or U+0020 SPACE
1814              A start tag token
1815              An end tag token
1816              An end-of-file token */
1817          } elseif (($token['type'] === HTML5::CHARACTR &&
1818                  !preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
1819              $token['type'] === HTML5::STARTTAG ||
1820              $token['type'] === HTML5::ENDTAG ||
1821              $token['type'] === HTML5::EOF
1822          ) {
1823              /* Create an HTMLElement node with the tag name html, in the HTML
1824              namespace. Append it to the Document object. Switch to the main
1825              phase and reprocess the current token. */
1826              $html = $this->dom->createElement('html');
1827              $this->dom->appendChild($html);
1828              $this->stack[] = $html;
1829  
1830              $this->phase = self::MAIN_PHASE;
1831              return $this->mainPhase($token);
1832          }
1833      }
1834  
1835      private function mainPhase($token)
1836      {
1837          /* Tokens in the main phase must be handled as follows: */
1838  
1839          /* A DOCTYPE token */
1840          if ($token['type'] === HTML5::DOCTYPE) {
1841              // Parse error. Ignore the token.
1842  
1843              /* A start tag token with the tag name "html" */
1844          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'html') {
1845              /* If this start tag token was not the first start tag token, then
1846              it is a parse error. */
1847  
1848              /* For each attribute on the token, check to see if the attribute
1849              is already present on the top element of the stack of open elements.
1850              If it is not, add the attribute and its corresponding value to that
1851              element. */
1852              foreach ($token['attr'] as $attr) {
1853                  if (!$this->stack[0]->hasAttribute($attr['name'])) {
1854                      $this->stack[0]->setAttribute($attr['name'], $attr['value']);
1855                  }
1856              }
1857  
1858              /* An end-of-file token */
1859          } elseif ($token['type'] === HTML5::EOF) {
1860              /* Generate implied end tags. */
1861              $this->generateImpliedEndTags();
1862  
1863              /* Anything else. */
1864          } else {
1865              /* Depends on the insertion mode: */
1866              switch ($this->mode) {
1867                  case self::BEFOR_HEAD:
1868                      return $this->beforeHead($token);
1869                      break;
1870                  case self::IN_HEAD:
1871                      return $this->inHead($token);
1872                      break;
1873                  case self::AFTER_HEAD:
1874                      return $this->afterHead($token);
1875                      break;
1876                  case self::IN_BODY:
1877                      return $this->inBody($token);
1878                      break;
1879                  case self::IN_TABLE:
1880                      return $this->inTable($token);
1881                      break;
1882                  case self::IN_CAPTION:
1883                      return $this->inCaption($token);
1884                      break;
1885                  case self::IN_CGROUP:
1886                      return $this->inColumnGroup($token);
1887                      break;
1888                  case self::IN_TBODY:
1889                      return $this->inTableBody($token);
1890                      break;
1891                  case self::IN_ROW:
1892                      return $this->inRow($token);
1893                      break;
1894                  case self::IN_CELL:
1895                      return $this->inCell($token);
1896                      break;
1897                  case self::IN_SELECT:
1898                      return $this->inSelect($token);
1899                      break;
1900                  case self::AFTER_BODY:
1901                      return $this->afterBody($token);
1902                      break;
1903                  case self::IN_FRAME:
1904                      return $this->inFrameset($token);
1905                      break;
1906                  case self::AFTR_FRAME:
1907                      return $this->afterFrameset($token);
1908                      break;
1909                  case self::END_PHASE:
1910                      return $this->trailingEndPhase($token);
1911                      break;
1912              }
1913          }
1914      }
1915  
1916      private function beforeHead($token)
1917      {
1918          /* Handle the token as follows: */
1919  
1920          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1921          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1922          or U+0020 SPACE */
1923          if ($token['type'] === HTML5::CHARACTR &&
1924              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
1925          ) {
1926              /* Append the character to the current node. */
1927              $this->insertText($token['data']);
1928  
1929              /* A comment token */
1930          } elseif ($token['type'] === HTML5::COMMENT) {
1931              /* Append a Comment node to the current node with the data attribute
1932              set to the data given in the comment token. */
1933              $this->insertComment($token['data']);
1934  
1935              /* A start tag token with the tag name "head" */
1936          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') {
1937              /* Create an element for the token, append the new element to the
1938              current node and push it onto the stack of open elements. */
1939              $element = $this->insertElement($token);
1940  
1941              /* Set the head element pointer to this new element node. */
1942              $this->head_pointer = $element;
1943  
1944              /* Change the insertion mode to "in head". */
1945              $this->mode = self::IN_HEAD;
1946  
1947              /* A start tag token whose tag name is one of: "base", "link", "meta",
1948              "script", "style", "title". Or an end tag with the tag name "html".
1949              Or a character token that is not one of U+0009 CHARACTER TABULATION,
1950              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1951              or U+0020 SPACE. Or any other start tag token */
1952          } elseif ($token['type'] === HTML5::STARTTAG ||
1953              ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') ||
1954              ($token['type'] === HTML5::CHARACTR && !preg_match(
1955                      '/^[\t\n\x0b\x0c ]$/',
1956                      $token['data']
1957                  ))
1958          ) {
1959              /* Act as if a start tag token with the tag name "head" and no
1960              attributes had been seen, then reprocess the current token. */
1961              $this->beforeHead(
1962                  array(
1963                      'name' => 'head',
1964                      'type' => HTML5::STARTTAG,
1965                      'attr' => array()
1966                  )
1967              );
1968  
1969              return $this->inHead($token);
1970  
1971              /* Any other end tag */
1972          } elseif ($token['type'] === HTML5::ENDTAG) {
1973              /* Parse error. Ignore the token. */
1974          }
1975      }
1976  
1977      private function inHead($token)
1978      {
1979          /* Handle the token as follows: */
1980  
1981          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1982          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1983          or U+0020 SPACE.
1984  
1985          THIS DIFFERS FROM THE SPEC: If the current node is either a title, style
1986          or script element, append the character to the current node regardless
1987          of its content. */
1988          if (($token['type'] === HTML5::CHARACTR &&
1989                  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) || (
1990                  $token['type'] === HTML5::CHARACTR && in_array(
1991                      end($this->stack)->nodeName,
1992                      array('title', 'style', 'script')
1993                  ))
1994          ) {
1995              /* Append the character to the current node. */
1996              $this->insertText($token['data']);
1997  
1998              /* A comment token */
1999          } elseif ($token['type'] === HTML5::COMMENT) {
2000              /* Append a Comment node to the current node with the data attribute
2001              set to the data given in the comment token. */
2002              $this->insertComment($token['data']);
2003  
2004          } elseif ($token['type'] === HTML5::ENDTAG &&
2005              in_array($token['name'], array('title', 'style', 'script'))
2006          ) {
2007              array_pop($this->stack);
2008              return HTML5::PCDATA;
2009  
2010              /* A start tag with the tag name "title" */
2011          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'title') {
2012              /* Create an element for the token and append the new element to the
2013              node pointed to by the head element pointer, or, if that is null
2014              (innerHTML case), to the current node. */
2015              if ($this->head_pointer !== null) {
2016                  $element = $this->insertElement($token, false);
2017                  $this->head_pointer->appendChild($element);
2018  
2019              } else {
2020                  $element = $this->insertElement($token);
2021              }
2022  
2023              /* Switch the tokeniser's content model flag  to the RCDATA state. */
2024              return HTML5::RCDATA;
2025  
2026              /* A start tag with the tag name "style" */
2027          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'style') {
2028              /* Create an element for the token and append the new element to the
2029              node pointed to by the head element pointer, or, if that is null
2030              (innerHTML case), to the current node. */
2031              if ($this->head_pointer !== null) {
2032                  $element = $this->insertElement($token, false);
2033                  $this->head_pointer->appendChild($element);
2034  
2035              } else {
2036                  $this->insertElement($token);
2037              }
2038  
2039              /* Switch the tokeniser's content model flag  to the CDATA state. */
2040              return HTML5::CDATA;
2041  
2042              /* A start tag with the tag name "script" */
2043          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'script') {
2044              /* Create an element for the token. */
2045              $element = $this->insertElement($token, false);
2046              $this->head_pointer->appendChild($element);
2047  
2048              /* Switch the tokeniser's content model flag  to the CDATA state. */
2049              return HTML5::CDATA;
2050  
2051              /* A start tag with the tag name "base", "link", or "meta" */
2052          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2053                  $token['name'],
2054                  array('base', 'link', 'meta')
2055              )
2056          ) {
2057              /* Create an element for the token and append the new element to the
2058              node pointed to by the head element pointer, or, if that is null
2059              (innerHTML case), to the current node. */
2060              if ($this->head_pointer !== null) {
2061                  $element = $this->insertElement($token, false);
2062                  $this->head_pointer->appendChild($element);
2063                  array_pop($this->stack);
2064  
2065              } else {
2066                  $this->insertElement($token);
2067              }
2068  
2069              /* An end tag with the tag name "head" */
2070          } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'head') {
2071              /* If the current node is a head element, pop the current node off
2072              the stack of open elements. */
2073              if ($this->head_pointer->isSameNode(end($this->stack))) {
2074                  array_pop($this->stack);
2075  
2076                  /* Otherwise, this is a parse error. */
2077              } else {
2078                  // k
2079              }
2080  
2081              /* Change the insertion mode to "after head". */
2082              $this->mode = self::AFTER_HEAD;
2083  
2084              /* A start tag with the tag name "head" or an end tag except "html". */
2085          } elseif (($token['type'] === HTML5::STARTTAG && $token['name'] === 'head') ||
2086              ($token['type'] === HTML5::ENDTAG && $token['name'] !== 'html')
2087          ) {
2088              // Parse error. Ignore the token.
2089  
2090              /* Anything else */
2091          } else {
2092              /* If the current node is a head element, act as if an end tag
2093              token with the tag name "head" had been seen. */
2094              if ($this->head_pointer->isSameNode(end($this->stack))) {
2095                  $this->inHead(
2096                      array(
2097                          'name' => 'head',
2098                          'type' => HTML5::ENDTAG
2099                      )
2100                  );
2101  
2102                  /* Otherwise, change the insertion mode to "after head". */
2103              } else {
2104                  $this->mode = self::AFTER_HEAD;
2105              }
2106  
2107              /* Then, reprocess the current token. */
2108              return $this->afterHead($token);
2109          }
2110      }
2111  
2112      private function afterHead($token)
2113      {
2114          /* Handle the token as follows: */
2115  
2116          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
2117          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
2118          or U+0020 SPACE */
2119          if ($token['type'] === HTML5::CHARACTR &&
2120              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
2121          ) {
2122              /* Append the character to the current node. */
2123              $this->insertText($token['data']);
2124  
2125              /* A comment token */
2126          } elseif ($token['type'] === HTML5::COMMENT) {
2127              /* Append a Comment node to the current node with the data attribute
2128              set to the data given in the comment token. */
2129              $this->insertComment($token['data']);
2130  
2131              /* A start tag token with the tag name "body" */
2132          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'body') {
2133              /* Insert a body element for the token. */
2134              $this->insertElement($token);
2135  
2136              /* Change the insertion mode to "in body". */
2137              $this->mode = self::IN_BODY;
2138  
2139              /* A start tag token with the tag name "frameset" */
2140          } elseif ($token['type'] === HTML5::STARTTAG && $token['name'] === 'frameset') {
2141              /* Insert a frameset element for the token. */
2142              $this->insertElement($token);
2143  
2144              /* Change the insertion mode to "in frameset". */
2145              $this->mode = self::IN_FRAME;
2146  
2147              /* A start tag token whose tag name is one of: "base", "link", "meta",
2148              "script", "style", "title" */
2149          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
2150                  $token['name'],
2151                  array('base', 'link', 'meta', 'script', 'style', 'title')
2152              )
2153          ) {
2154              /* Parse error. Switch the insertion mode back to "in head" and
2155              reprocess the token. */
2156              $this->mode = self::IN_HEAD;
2157              return $this->inHead($token);
2158  
2159              /* Anything else */
2160          } else {
2161              /* Act as if a start tag token with the tag name "body" and no
2162              attributes had been seen, and then reprocess the current token. */
2163              $this->afterHead(
2164                  array(
2165                      'name' => 'body',
2166                      'type' => HTML5::STARTTAG,
2167                      'attr' => array()
2168                  )
2169              );
2170  
2171              return $this->inBody($token);
2172          }
2173      }
2174  
2175      private function inBody($token)
2176      {
2177          /* Handle the token as follows: */
2178  
2179          switch ($token['type']) {
2180              /* A character token */
2181              case HTML5::CHARACTR:
2182                  /* Reconstruct the active formatting elements, if any. */
2183                  $this->reconstructActiveFormattingElements();
2184  
2185                  /* Append the token's character to the current node. */
2186                  $this->insertText($token['data']);
2187                  break;
2188  
2189              /* A comment token */
2190              case HTML5::COMMENT:
2191                  /* Append a Comment node to the current node with the data
2192                  attribute set to the data given in the comment token. */
2193                  $this->insertComment($token['data']);
2194                  break;
2195  
2196              case HTML5::STARTTAG:
2197                  switch ($token['name']) {
2198                      /* A start tag token whose tag name is one of: "script",
2199                      "style" */
2200                      case 'script':
2201                      case 'style':
2202                          /* Process the token as if the insertion mode had been "in
2203                          head". */
2204                          return $this->inHead($token);
2205                          break;
2206  
2207                      /* A start tag token whose tag name is one of: "base", "link",
2208                      "meta", "title" */
2209                      case 'base':
2210                      case 'link':
2211                      case 'meta':
2212                      case 'title':
2213                          /* Parse error. Process the token as if the insertion mode
2214                          had    been "in head". */
2215                          return $this->inHead($token);
2216                          break;
2217  
2218                      /* A start tag token with the tag name "body" */
2219                      case 'body':
2220                          /* Parse error. If the second element on the stack of open
2221                          elements is not a body element, or, if the stack of open
2222                          elements has only one node on it, then ignore the token.
2223                          (innerHTML case) */
2224                          if (count($this->stack) === 1 || $this->stack[1]->nodeName !== 'body') {
2225                              // Ignore
2226  
2227                              /* Otherwise, for each attribute on the token, check to see
2228                              if the attribute is already present on the body element (the
2229                              second element)    on the stack of open elements. If it is not,
2230                              add the attribute and its corresponding value to that
2231                              element. */
2232                          } else {
2233                              foreach ($token['attr'] as $attr) {
2234                                  if (!$this->stack[1]->hasAttribute($attr['name'])) {
2235                                      $this->stack[1]->setAttribute($attr['name'], $attr['value']);
2236                                  }
2237                              }
2238                          }
2239                          break;
2240  
2241                      /* A start tag whose tag name is one of: "address",
2242                      "blockquote", "center", "dir", "div", "dl", "fieldset",
2243                      "listing", "menu", "ol", "p", "ul" */
2244                      case 'address':
2245                      case 'blockquote':
2246                      case 'center':
2247                      case 'dir':
2248                      case 'div':
2249                      case 'dl':
2250                      case 'fieldset':
2251                      case 'listing':
2252                      case 'menu':
2253                      case 'ol':
2254                      case 'p':
2255                      case 'ul':
2256                          /* If the stack of open elements has a p element in scope,
2257                          then act as if an end tag with the tag name p had been
2258                          seen. */
2259                          if ($this->elementInScope('p')) {
2260                              $this->emitToken(
2261                                  array(
2262                                      'name' => 'p',
2263                                      'type' => HTML5::ENDTAG
2264                                  )
2265                              );
2266                          }
2267  
2268                          /* Insert an HTML element for the token. */
2269                          $this->insertElement($token);
2270                          break;
2271  
2272                      /* A start tag whose tag name is "form" */
2273                      case 'form':
2274                          /* If the form element pointer is not null, ignore the
2275                          token with a parse error. */
2276                          if ($this->form_pointer !== null) {
2277                              // Ignore.
2278  
2279                              /* Otherwise: */
2280                          } else {
2281                              /* If the stack of open elements has a p element in
2282                              scope, then act as if an end tag with the tag name p
2283                              had been seen. */
2284                              if ($this->elementInScope('p')) {
2285                                  $this->emitToken(
2286                                      array(
2287                                          'name' => 'p',
2288                                          'type' => HTML5::ENDTAG
2289                                      )
2290                                  );
2291                              }
2292  
2293                              /* Insert an HTML element for the token, and set the
2294                              form element pointer to point to the element created. */
2295                              $element = $this->insertElement($token);
2296                              $this->form_pointer = $element;
2297                          }
2298                          break;
2299  
2300                      /* A start tag whose tag name is "li", "dd" or "dt" */
2301                      case 'li':
2302                      case 'dd':
2303                      case 'dt':
2304                          /* If the stack of open elements has a p  element in scope,
2305                          then act as if an end tag with the tag name p had been
2306                          seen. */
2307                          if ($this->elementInScope('p')) {
2308                              $this->emitToken(
2309                                  array(
2310                                      'name' => 'p',
2311                                      'type' => HTML5::ENDTAG
2312                                  )
2313                              );
2314                          }
2315  
2316                          $stack_length = count($this->stack) - 1;
2317  
2318                          for ($n = $stack_length; 0 <= $n; $n--) {
2319                              /* 1. Initialise node to be the current node (the
2320                              bottommost node of the stack). */
2321                              $stop = false;
2322                              $node = $this->stack[$n];
2323                              $cat = $this->getElementCategory($node->tagName);
2324  
2325                              /* 2. If node is an li, dd or dt element, then pop all
2326                              the    nodes from the current node up to node, including
2327                              node, then stop this algorithm. */
2328                              if ($token['name'] === $node->tagName || ($token['name'] !== 'li'
2329                                      && ($node->tagName === 'dd' || $node->tagName === 'dt'))
2330                              ) {
2331                                  for ($x = $stack_length; $x >= $n; $x--) {
2332                                      array_pop($this->stack);
2333                                  }
2334  
2335                                  break;
2336                              }
2337  
2338                              /* 3. If node is not in the formatting category, and is
2339                              not    in the phrasing category, and is not an address or
2340                              div element, then stop this algorithm. */
2341                              if ($cat !== self::FORMATTING && $cat !== self::PHRASING &&
2342                                  $node->tagName !== 'address' && $node->tagName !== 'div'
2343                              ) {
2344                                  break;
2345                              }
2346                          }
2347  
2348                          /* Finally, insert an HTML element with the same tag
2349                          name as the    token's. */
2350                          $this->insertElement($token);
2351                          break;
2352  
2353                      /* A start tag token whose tag name is "plaintext" */
2354                      case 'plaintext':
2355                          /* If the stack of open elements has a p  element in scope,
2356                          then act as if an end tag with the tag name p had been
2357                          seen. */
2358                          if ($this->elementInScope('p')) {
2359                              $this->emitToken(
2360                                  array(
2361                                      'name' => 'p',
2362                                      'type' => HTML5::ENDTAG
2363                                  )
2364                              );
2365                          }
2366  
2367                          /* Insert an HTML element for the token. */
2368                          $this->insertElement($token);
2369  
2370                          return HTML5::PLAINTEXT;
2371                          break;
2372  
2373                      /* A start tag whose tag name is one of: "h1", "h2", "h3", "h4",
2374                      "h5", "h6" */
2375                      case 'h1':
2376                      case 'h2':
2377                      case 'h3':
2378                      case 'h4':
2379                      case 'h5':
2380                      case 'h6':
2381                          /* If the stack of open elements has a p  element in scope,
2382                          then act as if an end tag with the tag name p had been seen. */
2383                          if ($this->elementInScope('p')) {
2384                              $this->emitToken(
2385                                  array(
2386                                      'name' => 'p',
2387                                      'type' => HTML5::ENDTAG
2388                                  )
2389                              );
2390                          }
2391  
2392                          /* If the stack of open elements has in scope an element whose
2393                          tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2394                          this is a parse error; pop elements from the stack until an
2395                          element with one of those tag names has been popped from the
2396                          stack. */
2397                          while ($this->elementInScope(array('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))) {
2398                              array_pop($this->stack);
2399                          }
2400  
2401                          /* Insert an HTML element for the token. */
2402                          $this->insertElement($token);
2403                          break;
2404  
2405                      /* A start tag whose tag name is "a" */
2406                      case 'a':
2407                          /* If the list of active formatting elements contains
2408                          an element whose tag name is "a" between the end of the
2409                          list and the last marker on the list (or the start of
2410                          the list if there is no marker on the list), then this
2411                          is a parse error; act as if an end tag with the tag name
2412                          "a" had been seen, then remove that element from the list
2413                          of active formatting elements and the stack of open
2414                          elements if the end tag didn't already remove it (it
2415                          might not have if the element is not in table scope). */
2416                          $leng = count($this->a_formatting);
2417  
2418                          for ($n = $leng - 1; $n >= 0; $n--) {
2419                              if ($this->a_formatting[$n] === self::MARKER) {
2420                                  break;
2421  
2422                              } elseif ($this->a_formatting[$n]->nodeName === 'a') {
2423                                  $this->emitToken(
2424                                      array(
2425                                          'name' => 'a',
2426                                          'type' => HTML5::ENDTAG
2427                                      )
2428                                  );
2429                                  break;
2430                              }
2431                          }
2432  
2433                          /* Reconstruct the active formatting elements, if any. */
2434                          $this->reconstructActiveFormattingElements();
2435  
2436                          /* Insert an HTML element for the token. */
2437                          $el = $this->insertElement($token);
2438  
2439                          /* Add that element to the list of active formatting
2440                          elements. */
2441                          $this->a_formatting[] = $el;
2442                          break;
2443  
2444                      /* A start tag whose tag name is one of: "b", "big", "em", "font",
2445                      "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2446                      case 'b':
2447                      case 'big':
2448                      case 'em':
2449                      case 'font':
2450                      case 'i':
2451                      case 'nobr':
2452                      case 's':
2453                      case 'small':
2454                      case 'strike':
2455                      case 'strong':
2456                      case 'tt':
2457                      case 'u':
2458                          /* Reconstruct the active formatting elements, if any. */
2459                          $this->reconstructActiveFormattingElements();
2460  
2461                          /* Insert an HTML element for the token. */
2462                          $el = $this->insertElement($token);
2463  
2464                          /* Add that element to the list of active formatting
2465                          elements. */
2466                          $this->a_formatting[] = $el;
2467                          break;
2468  
2469                      /* A start tag token whose tag name is "button" */
2470                      case 'button':
2471                          /* If the stack of open elements has a button element in scope,
2472                          then this is a parse error; act as if an end tag with the tag
2473                          name "button" had been seen, then reprocess the token. (We don't
2474                          do that. Unnecessary.) */
2475                          if ($this->elementInScope('button')) {
2476                              $this->inBody(
2477                                  array(
2478                                      'name' => 'button',
2479                                      'type' => HTML5::ENDTAG
2480                                  )
2481                              );
2482                          }
2483  
2484                          /* Reconstruct the active formatting elements, if any. */
2485                          $this->reconstructActiveFormattingElements();
2486  
2487                          /* Insert an HTML element for the token. */
2488                          $this->insertElement($token);
2489  
2490                          /* Insert a marker at the end of the list of active
2491                          formatting elements. */
2492                          $this->a_formatting[] = self::MARKER;
2493                          break;
2494  
2495                      /* A start tag token whose tag name is one of: "marquee", "object" */
2496                      case 'marquee':
2497                      case 'object':
2498                          /* Reconstruct the active formatting elements, if any. */
2499                          $this->reconstructActiveFormattingElements();
2500  
2501                          /* Insert an HTML element for the token. */
2502                          $this->insertElement($token);
2503  
2504                          /* Insert a marker at the end of the list of active
2505                          formatting elements. */
2506                          $this->a_formatting[] = self::MARKER;
2507                          break;
2508  
2509                      /* A start tag token whose tag name is "xmp" */
2510                      case 'xmp':
2511                          /* Reconstruct the active formatting elements, if any. */
2512                          $this->reconstructActiveFormattingElements();
2513  
2514                          /* Insert an HTML element for the token. */
2515                          $this->insertElement($token);
2516  
2517                          /* Switch the content model flag to the CDATA state. */
2518                          return HTML5::CDATA;
2519                          break;
2520  
2521                      /* A start tag whose tag name is "table" */
2522                      case 'table':
2523                          /* If the stack of open elements has a p element in scope,
2524                          then act as if an end tag with the tag name p had been seen. */
2525                          if ($this->elementInScope('p')) {
2526                              $this->emitToken(
2527                                  array(
2528                                      'name' => 'p',
2529                                      'type' => HTML5::ENDTAG
2530                                  )
2531                              );
2532                          }
2533  
2534                          /* Insert an HTML element for the token. */
2535                          $this->insertElement($token);
2536  
2537                          /* Change the insertion mode to "in table". */
2538                          $this->mode = self::IN_TABLE;
2539                          break;
2540  
2541                      /* A start tag whose tag name is one of: "area", "basefont",
2542                      "bgsound", "br", "embed", "img", "param", "spacer", "wbr" */
2543                      case 'area':
2544                      case 'basefont':
2545                      case 'bgsound':
2546                      case 'br':
2547                      case 'embed':
2548                      case 'img':
2549                      case 'param':
2550                      case 'spacer':
2551                      case 'wbr':
2552                          /* Reconstruct the active formatting elements, if any. */
2553                          $this->reconstructActiveFormattingElements();
2554  
2555                          /* Insert an HTML element for the token. */
2556                          $this->insertElement($token);
2557  
2558                          /* Immediately pop the current node off the stack of open elements. */
2559                          array_pop($this->stack);
2560                          break;
2561  
2562                      /* A start tag whose tag name is "hr" */
2563                      case 'hr':
2564                          /* If the stack of open elements has a p element in scope,
2565                          then act as if an end tag with the tag name p had been seen. */
2566                          if ($this->elementInScope('p')) {
2567                              $this->emitToken(
2568                                  array(
2569                                      'name' => 'p',
2570                                      'type' => HTML5::ENDTAG
2571                                  )
2572                              );
2573                          }
2574  
2575                          /* Insert an HTML element for the token. */
2576                          $this->insertElement($token);
2577  
2578                          /* Immediately pop the current node off the stack of open elements. */
2579                          array_pop($this->stack);
2580                          break;
2581  
2582                      /* A start tag whose tag name is "image" */
2583                      case 'image':
2584                          /* Parse error. Change the token's tag name to "img" and
2585                          reprocess it. (Don't ask.) */
2586                          $token['name'] = 'img';
2587                          return $this->inBody($token);
2588                          break;
2589  
2590                      /* A start tag whose tag name is "input" */
2591                      case 'input':
2592                          /* Reconstruct the active formatting elements, if any. */
2593                          $this->reconstructActiveFormattingElements();
2594  
2595                          /* Insert an input element for the token. */
2596                          $element = $this->insertElement($token, false);
2597  
2598                          /* If the form element pointer is not null, then associate the
2599                          input element with the form element pointed to by the form
2600                          element pointer. */
2601                          $this->form_pointer !== null
2602                              ? $this->form_pointer->appendChild($element)
2603                              : end($this->stack)->appendChild($element);
2604  
2605                          /* Pop that input element off the stack of open elements. */
2606                          array_pop($this->stack);
2607                          break;
2608  
2609                      /* A start tag whose tag name is "isindex" */
2610                      case 'isindex':
2611                          /* Parse error. */
2612                          // w/e
2613  
2614                          /* If the form element pointer is not null,
2615                          then ignore the token. */
2616                          if ($this->form_pointer === null) {
2617                              /* Act as if a start tag token with the tag name "form" had
2618                              been seen. */
2619                              $this->inBody(
2620                                  array(
2621                                      'name' => 'body',
2622                                      'type' => HTML5::STARTTAG,
2623                                      'attr' => array()
2624                                  )
2625                              );
2626  
2627                              /* Act as if a start tag token with the tag name "hr" had
2628                              been seen. */
2629                              $this->inBody(
2630                                  array(
2631                                      'name' => 'hr',
2632                                      'type' => HTML5::STARTTAG,
2633                                      'attr' => array()
2634                                  )
2635                              );
2636  
2637                              /* Act as if a start tag token with the tag name "p" had
2638                              been seen. */
2639                              $this->inBody(
2640                                  array(
2641                                      'name' => 'p',
2642                                      'type' => HTML5::STARTTAG,
2643                                      'attr' => array()
2644                                  )
2645                              );
2646  
2647                              /* Act as if a start tag token with the tag name "label"
2648                              had been seen. */
2649                              $this->inBody(
2650                                  array(
2651                                      'name' => 'label',
2652                                      'type' => HTML5::STARTTAG,
2653                                      'attr' => array()
2654                                  )
2655                              );
2656  
2657                              /* Act as if a stream of character tokens had been seen. */
2658                              $this->insertText(
2659                                  'This is a searchable index. ' .
2660                                  'Insert your search keywords here: '
2661                              );
2662  
2663                              /* Act as if a start tag token with the tag name "input"
2664                              had been seen, with all the attributes from the "isindex"
2665                              token, except with the "name" attribute set to the value
2666                              "isindex" (ignoring any explicit "name" attribute). */
2667                              $attr = $token['attr'];
2668                              $attr[] = array('name' => 'name', 'value' => 'isindex');
2669  
2670                              $this->inBody(
2671                                  array(
2672                                      'name' => 'input',
2673                                      'type' => HTML5::STARTTAG,
2674                                      'attr' => $attr
2675                                  )
2676                              );
2677  
2678                              /* Act as if a stream of character tokens had been seen
2679                              (see below for what they should say). */
2680                              $this->insertText(
2681                                  'This is a searchable index. ' .
2682                                  'Insert your search keywords here: '
2683                              );
2684  
2685                              /* Act as if an end tag token with the tag name "label"
2686                              had been seen. */
2687                              $this->inBody(
2688                                  array(
2689                                      'name' => 'label',
2690                                      'type' => HTML5::ENDTAG
2691                                  )
2692                              );
2693  
2694                              /* Act as if an end tag token with the tag name "p" had
2695                              been seen. */
2696                              $this->inBody(
2697                                  array(
2698                                      'name' => 'p',
2699                                      'type' => HTML5::ENDTAG
2700                                  )
2701                              );
2702  
2703                              /* Act as if a start tag token with the tag name "hr" had
2704                              been seen. */
2705                              $this->inBody(
2706                                  array(
2707                                      'name' => 'hr',
2708                                      'type' => HTML5::ENDTAG
2709                                  )
2710                              );
2711  
2712                              /* Act as if an end tag token with the tag name "form" had
2713                              been seen. */
2714                              $this->inBody(
2715                                  array(
2716                                      'name' => 'form',
2717                                      'type' => HTML5::ENDTAG
2718                                  )
2719                              );
2720                          }
2721                          break;
2722  
2723                      /* A start tag whose tag name is "textarea" */
2724                      case 'textarea':
2725                          $this->insertElement($token);
2726  
2727                          /* Switch the tokeniser's content model flag to the
2728                          RCDATA state. */
2729                          return HTML5::RCDATA;
2730                          break;
2731  
2732                      /* A start tag whose tag name is one of: "iframe", "noembed",
2733                      "noframes" */
2734                      case 'iframe':
2735                      case 'noembed':
2736                      case 'noframes':
2737                          $this->insertElement($token);
2738  
2739                          /* Switch the tokeniser's content model flag to the CDATA state. */
2740                          return HTML5::CDATA;
2741                          break;
2742  
2743                      /* A start tag whose tag name is "select" */
2744                      case 'select':
2745                          /* Reconstruct the active formatting elements, if any. */
2746                          $this->reconstructActiveFormattingElements();
2747  
2748                          /* Insert an HTML element for the token. */
2749                          $this->insertElement($token);
2750  
2751                          /* Change the insertion mode to "in select". */
2752                          $this->mode = self::IN_SELECT;
2753                          break;
2754  
2755                      /* A start or end tag whose tag name is one of: "caption", "col",
2756                      "colgroup", "frame", "frameset", "head", "option", "optgroup",
2757                      "tbody", "td", "tfoot", "th", "thead", "tr". */
2758                      case 'caption':
2759                      case 'col':
2760                      case 'colgroup':
2761                      case 'frame':
2762                      case 'frameset':
2763                      case 'head':
2764                      case 'option':
2765                      case 'optgroup':
2766                      case 'tbody':
2767                      case 'td':
2768                      case 'tfoot':
2769                      case 'th':
2770                      case 'thead':
2771                      case 'tr':
2772                          // Parse error. Ignore the token.
2773                          break;
2774  
2775                      /* A start or end tag whose tag name is one of: "event-source",
2776                      "section", "nav", "article", "aside", "header", "footer",
2777                      "datagrid", "command" */
2778                      case 'event-source':
2779                      case 'section':
2780                      case 'nav':
2781                      case 'article':
2782                      case 'aside':
2783                      case 'header':
2784                      case 'footer':
2785                      case 'datagrid':
2786                      case 'command':
2787                          // Work in progress!
2788                          break;
2789  
2790                      /* A start tag token not covered by the previous entries */
2791                      default:
2792                          /* Reconstruct the active formatting elements, if any. */
2793                          $this->reconstructActiveFormattingElements();
2794  
2795                          $this->insertElement($token, true, true);
2796                          break;
2797                  }
2798                  break;
2799  
2800              case HTML5::ENDTAG:
2801                  switch ($token['name']) {
2802                      /* An end tag with the tag name "body" */
2803                      case 'body':
2804                          /* If the second element in the stack of open elements is
2805                          not a body element, this is a parse error. Ignore the token.
2806                          (innerHTML case) */
2807                          if (count($this->stack) < 2 || $this->stack[1]->nodeName !== 'body') {
2808                              // Ignore.
2809  
2810                              /* If the current node is not the body element, then this
2811                              is a parse error. */
2812                          } elseif (end($this->stack)->nodeName !== 'body') {
2813                              // Parse error.
2814                          }
2815  
2816                          /* Change the insertion mode to "after body". */
2817                          $this->mode = self::AFTER_BODY;
2818                          break;
2819  
2820                      /* An end tag with the tag name "html" */
2821                      case 'html':
2822                          /* Act as if an end tag with tag name "body" had been seen,
2823                          then, if that token wasn't ignored, reprocess the current
2824                          token. */
2825                          $this->inBody(
2826                              array(
2827                                  'name' => 'body',
2828                                  'type' => HTML5::ENDTAG
2829                              )
2830                          );
2831  
2832                          return $this->afterBody($token);
2833                          break;
2834  
2835                      /* An end tag whose tag name is one of: "address", "blockquote",
2836                      "center", "dir", "div", "dl", "fieldset", "listing", "menu",
2837                      "ol", "pre", "ul" */
2838                      case 'address':
2839                      case 'blockquote':
2840                      case 'center':
2841                      case 'dir':
2842                      case 'div':
2843                      case 'dl':
2844                      case 'fieldset':
2845                      case 'listing':
2846                      case 'menu':
2847                      case 'ol':
2848                      case 'pre':
2849                      case 'ul':
2850                          /* If the stack of open elements has an element in scope
2851                          with the same tag name as that of the token, then generate
2852                          implied end tags. */
2853                          if ($this->elementInScope($token['name'])) {
2854                              $this->generateImpliedEndTags();
2855  
2856                              /* Now, if the current node is not an element with
2857                              the same tag name as that of the token, then this
2858                              is a parse error. */
2859                              // w/e
2860  
2861                              /* If the stack of open elements has an element in
2862                              scope with the same tag name as that of the token,
2863                              then pop elements from this stack until an element
2864                              with that tag name has been popped from the stack. */
2865                              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2866                                  if ($this->stack[$n]->nodeName === $token['name']) {
2867                                      $n = -1;
2868                                  }
2869  
2870                                  array_pop($this->stack);
2871                              }
2872                          }
2873                          break;
2874  
2875                      /* An end tag whose tag name is "form" */
2876                      case 'form':
2877                          /* If the stack of open elements has an element in scope
2878                          with the same tag name as that of the token, then generate
2879                          implied end tags. */
2880                          if ($this->elementInScope($token['name'])) {
2881                              $this->generateImpliedEndTags();
2882  
2883                          }
2884  
2885                          if (end($this->stack)->nodeName !== $token['name']) {
2886                              /* Now, if the current node is not an element with the
2887                              same tag name as that of the token, then this is a parse
2888                              error. */
2889                              // w/e
2890  
2891                          } else {
2892                              /* Otherwise, if the current node is an element with
2893                              the same tag name as that of the token pop that element
2894                              from the stack. */
2895                              array_pop($this->stack);
2896                          }
2897  
2898                          /* In any case, set the form element pointer to null. */
2899                          $this->form_pointer = null;
2900                          break;
2901  
2902                      /* An end tag whose tag name is "p" */
2903                      case 'p':
2904                          /* If the stack of open elements has a p element in scope,
2905                          then generate implied end tags, except for p elements. */
2906                          if ($this->elementInScope('p')) {
2907                              $this->generateImpliedEndTags(array('p'));
2908  
2909                              /* If the current node is not a p element, then this is
2910                              a parse error. */
2911                              // k
2912  
2913                              /* If the stack of open elements has a p element in
2914                              scope, then pop elements from this stack until the stack
2915                              no longer has a p element in scope. */
2916                              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2917                                  if ($this->elementInScope('p')) {
2918                                      array_pop($this->stack);
2919  
2920                                  } else {
2921                                      break;
2922                                  }
2923                              }
2924                          }
2925                          break;
2926  
2927                      /* An end tag whose tag name is "dd", "dt", or "li" */
2928                      case 'dd':
2929                      case 'dt':
2930                      case 'li':
2931                          /* If the stack of open elements has an element in scope
2932                          whose tag name matches the tag name of the token, then
2933                          generate implied end tags, except for elements with the
2934                          same tag name as the token. */
2935                          if ($this->elementInScope($token['name'])) {
2936                              $this->generateImpliedEndTags(array($token['name']));
2937  
2938                              /* If the current node is not an element with the same
2939                              tag name as the token, then this is a parse error. */
2940                              // w/e
2941  
2942                              /* If the stack of open elements has an element in scope
2943                              whose tag name matches the tag name of the token, then
2944                              pop elements from this stack until an element with that
2945                              tag name has been popped from the stack. */
2946                              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
2947                                  if ($this->stack[$n]->nodeName === $token['name']) {
2948                                      $n = -1;
2949                                  }
2950  
2951                                  array_pop($this->stack);
2952                              }
2953                          }
2954                          break;
2955  
2956                      /* An end tag whose tag name is one of: "h1", "h2", "h3", "h4",
2957                      "h5", "h6" */
2958                      case 'h1':
2959                      case 'h2':
2960                      case 'h3':
2961                      case 'h4':
2962                      case 'h5':
2963                      case 'h6':
2964                          $elements = array('h1', 'h2', 'h3', 'h4', 'h5', 'h6');
2965  
2966                          /* If the stack of open elements has in scope an element whose
2967                          tag name is one of "h1", "h2", "h3", "h4", "h5", or "h6", then
2968                          generate implied end tags. */
2969                          if ($this->elementInScope($elements)) {
2970                              $this->generateImpliedEndTags();
2971  
2972                              /* Now, if the current node is not an element with the same
2973                              tag name as that of the token, then this is a parse error. */
2974                              // w/e
2975  
2976                              /* If the stack of open elements has in scope an element
2977                              whose tag name is one of "h1", "h2", "h3", "h4", "h5", or
2978                              "h6", then pop elements from the stack until an element
2979                              with one of those tag names has been popped from the stack. */
2980                              while ($this->elementInScope($elements)) {
2981                                  array_pop($this->stack);
2982                              }
2983                          }
2984                          break;
2985  
2986                      /* An end tag whose tag name is one of: "a", "b", "big", "em",
2987                      "font", "i", "nobr", "s", "small", "strike", "strong", "tt", "u" */
2988                      case 'a':
2989                      case 'b':
2990                      case 'big':
2991                      case 'em':
2992                      case 'font':
2993                      case 'i':
2994                      case 'nobr':
2995                      case 's':
2996                      case 'small':
2997                      case 'strike':
2998                      case 'strong':
2999                      case 'tt':
3000                      case 'u':
3001                          /* 1. Let the formatting element be the last element in
3002                          the list of active formatting elements that:
3003                              * is between the end of the list and the last scope
3004                              marker in the list, if any, or the start of the list
3005                              otherwise, and
3006                              * has the same tag name as the token.
3007                          */
3008                          while (true) {
3009                              for ($a = count($this->a_formatting) - 1; $a >= 0; $a--) {
3010                                  if ($this->a_formatting[$a] === self::MARKER) {
3011                                      break;
3012  
3013                                  } elseif ($this->a_formatting[$a]->tagName === $token['name']) {
3014                                      $formatting_element = $this->a_formatting[$a];
3015                                      $in_stack = in_array($formatting_element, $this->stack, true);
3016                                      $fe_af_pos = $a;
3017                                      break;
3018                                  }
3019                              }
3020  
3021                              /* If there is no such node, or, if that node is
3022                              also in the stack of open elements but the element
3023                              is not in scope, then this is a parse error. Abort
3024                              these steps. The token is ignored. */
3025                              if (!isset($formatting_element) || ($in_stack &&
3026                                      !$this->elementInScope($token['name']))
3027                              ) {
3028                                  break;
3029  
3030                                  /* Otherwise, if there is such a node, but that node
3031                                  is not in the stack of open elements, then this is a
3032                                  parse error; remove the element from the list, and
3033                                  abort these steps. */
3034                              } elseif (isset($formatting_element) && !$in_stack) {
3035                                  unset($this->a_formatting[$fe_af_pos]);
3036                                  $this->a_formatting = array_merge($this->a_formatting);
3037                                  break;
3038                              }
3039  
3040                              /* 2. Let the furthest block be the topmost node in the
3041                              stack of open elements that is lower in the stack
3042                              than the formatting element, and is not an element in
3043                              the phrasing or formatting categories. There might
3044                              not be one. */
3045                              $fe_s_pos = array_search($formatting_element, $this->stack, true);
3046                              $length = count($this->stack);
3047  
3048                              for ($s = $fe_s_pos + 1; $s < $length; $s++) {
3049                                  $category = $this->getElementCategory($this->stack[$s]->nodeName);
3050  
3051                                  if ($category !== self::PHRASING && $category !== self::FORMATTING) {
3052                                      $furthest_block = $this->stack[$s];
3053                                  }
3054                              }
3055  
3056                              /* 3. If there is no furthest block, then the UA must
3057                              skip the subsequent steps and instead just pop all
3058                              the nodes from the bottom of the stack of open
3059                              elements, from the current node up to the formatting
3060                              element, and remove the formatting element from the
3061                              list of active formatting elements. */
3062                              if (!isset($furthest_block)) {
3063                                  for ($n = $length - 1; $n >= $fe_s_pos; $n--) {
3064                                      array_pop($this->stack);
3065                                  }
3066  
3067                                  unset($this->a_formatting[$fe_af_pos]);
3068                                  $this->a_formatting = array_merge($this->a_formatting);
3069                                  break;
3070                              }
3071  
3072                              /* 4. Let the common ancestor be the element
3073                              immediately above the formatting element in the stack
3074                              of open elements. */
3075                              $common_ancestor = $this->stack[$fe_s_pos - 1];
3076  
3077                              /* 5. If the furthest block has a parent node, then
3078                              remove the furthest block from its parent node. */
3079                              if ($furthest_block->parentNode !== null) {
3080                                  $furthest_block->parentNode->removeChild($furthest_block);
3081                              }
3082  
3083                              /* 6. Let a bookmark note the position of the
3084                              formatting element in the list of active formatting
3085                              elements relative to the elements on either side
3086                              of it in the list. */
3087                              $bookmark = $fe_af_pos;
3088  
3089                              /* 7. Let node and last node  be the furthest block.
3090                              Follow these steps: */
3091                              $node = $furthest_block;
3092                              $last_node = $furthest_block;
3093  
3094                              while (true) {
3095                                  for ($n = array_search($node, $this->stack, true) - 1; $n >= 0; $n--) {
3096                                      /* 7.1 Let node be the element immediately
3097                                      prior to node in the stack of open elements. */
3098                                      $node = $this->stack[$n];
3099  
3100                                      /* 7.2 If node is not in the list of active
3101                                      formatting elements, then remove node from
3102                                      the stack of open elements and then go back
3103                                      to step 1. */
3104                                      if (!in_array($node, $this->a_formatting, true)) {
3105                                          unset($this->stack[$n]);
3106                                          $this->stack = array_merge($this->stack);
3107  
3108                                      } else {
3109                                          break;
3110                                      }
3111                                  }
3112  
3113                                  /* 7.3 Otherwise, if node is the formatting
3114                                  element, then go to the next step in the overall
3115                                  algorithm. */
3116                                  if ($node === $formatting_element) {
3117                                      break;
3118  
3119                                      /* 7.4 Otherwise, if last node is the furthest
3120                                      block, then move the aforementioned bookmark to
3121                                      be immediately after the node in the list of
3122                                      active formatting elements. */
3123                                  } elseif ($last_node === $furthest_block) {
3124                                      $bookmark = array_search($node, $this->a_formatting, true) + 1;
3125                                  }
3126  
3127                                  /* 7.5 If node has any children, perform a
3128                                  shallow clone of node, replace the entry for
3129                                  node in the list of active formatting elements
3130                                  with an entry for the clone, replace the entry
3131                                  for node in the stack of open elements with an
3132                                  entry for the clone, and let node be the clone. */
3133                                  if ($node->hasChildNodes()) {
3134                                      $clone = $node->cloneNode();
3135                                      $s_pos = array_search($node, $this->stack, true);
3136                                      $a_pos = array_search($node, $this->a_formatting, true);
3137  
3138                                      $this->stack[$s_pos] = $clone;
3139                                      $this->a_formatting[$a_pos] = $clone;
3140                                      $node = $clone;
3141                                  }
3142  
3143                                  /* 7.6 Insert last node into node, first removing
3144                                  it from its previous parent node if any. */
3145                                  if ($last_node->parentNode !== null) {
3146                                      $last_node->parentNode->removeChild($last_node);
3147                                  }
3148  
3149                                  $node->appendChild($last_node);
3150  
3151                                  /* 7.7 Let last node be node. */
3152                                  $last_node = $node;
3153                              }
3154  
3155                              /* 8. Insert whatever last node ended up being in
3156                              the previous step into the common ancestor node,
3157                              first removing it from its previous parent node if
3158                              any. */
3159                              if ($last_node->parentNode !== null) {
3160                                  $last_node->parentNode->removeChild($last_node);
3161                              }
3162  
3163                              $common_ancestor->appendChild($last_node);
3164  
3165                              /* 9. Perform a shallow clone of the formatting
3166                              element. */
3167                              $clone = $formatting_element->cloneNode();
3168  
3169                              /* 10. Take all of the child nodes of the furthest
3170                              block and append them to the clone created in the
3171                              last step. */
3172                              while ($furthest_block->hasChildNodes()) {
3173                                  $child = $furthest_block->firstChild;
3174                                  $furthest_block->removeChild($child);
3175                                  $clone->appendChild($child);
3176                              }
3177  
3178                              /* 11. Append that clone to the furthest block. */
3179                              $furthest_block->appendChild($clone);
3180  
3181                              /* 12. Remove the formatting element from the list
3182                              of active formatting elements, and insert the clone
3183                              into the list of active formatting elements at the
3184                              position of the aforementioned bookmark. */
3185                              $fe_af_pos = array_search($formatting_element, $this->a_formatting, true);
3186                              unset($this->a_formatting[$fe_af_pos]);
3187                              $this->a_formatting = array_merge($this->a_formatting);
3188  
3189                              $af_part1 = array_slice($this->a_formatting, 0, $bookmark - 1);
3190                              $af_part2 = array_slice($this->a_formatting, $bookmark, count($this->a_formatting));
3191                              $this->a_formatting = array_merge($af_part1, array($clone), $af_part2);
3192  
3193                              /* 13. Remove the formatting element from the stack
3194                              of open elements, and insert the clone into the stack
3195                              of open elements immediately after (i.e. in a more
3196                              deeply nested position than) the position of the
3197                              furthest block in that stack. */
3198                              $fe_s_pos = array_search($formatting_element, $this->stack, true);
3199                              $fb_s_pos = array_search($furthest_block, $this->stack, true);
3200                              unset($this->stack[$fe_s_pos]);
3201  
3202                              $s_part1 = array_slice($this->stack, 0, $fb_s_pos);
3203                              $s_part2 = array_slice($this->stack, $fb_s_pos + 1, count($this->stack));
3204                              $this->stack = array_merge($s_part1, array($clone), $s_part2);
3205  
3206                              /* 14. Jump back to step 1 in this series of steps. */
3207                              unset($formatting_element, $fe_af_pos, $fe_s_pos, $furthest_block);
3208                          }
3209                          break;
3210  
3211                      /* An end tag token whose tag name is one of: "button",
3212                      "marquee", "object" */
3213                      case 'button':
3214                      case 'marquee':
3215                      case 'object':
3216                          /* If the stack of open elements has an element in scope whose
3217                          tag name matches the tag name of the token, then generate implied
3218                          tags. */
3219                          if ($this->elementInScope($token['name'])) {
3220                              $this->generateImpliedEndTags();
3221  
3222                              /* Now, if the current node is not an element with the same
3223                              tag name as the token, then this is a parse error. */
3224                              // k
3225  
3226                              /* Now, if the stack of open elements has an element in scope
3227                              whose tag name matches the tag name of the token, then pop
3228                              elements from the stack until that element has been popped from
3229                              the stack, and clear the list of active formatting elements up
3230                              to the last marker. */
3231                              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3232                                  if ($this->stack[$n]->nodeName === $token['name']) {
3233                                      $n = -1;
3234                                  }
3235  
3236                                  array_pop($this->stack);
3237                              }
3238  
3239                              $marker = end(array_keys($this->a_formatting, self::MARKER, true));
3240  
3241                              for ($n = count($this->a_formatting) - 1; $n > $marker; $n--) {
3242                                  array_pop($this->a_formatting);
3243                              }
3244                          }
3245                          break;
3246  
3247                      /* Or an end tag whose tag name is one of: "area", "basefont",
3248                      "bgsound", "br", "embed", "hr", "iframe", "image", "img",
3249                      "input", "isindex", "noembed", "noframes", "param", "select",
3250                      "spacer", "table", "textarea", "wbr" */
3251                      case 'area':
3252                      case 'basefont':
3253                      case 'bgsound':
3254                      case 'br':
3255                      case 'embed':
3256                      case 'hr':
3257                      case 'iframe':
3258                      case 'image':
3259                      case 'img':
3260                      case 'input':
3261                      case 'isindex':
3262                      case 'noembed':
3263                      case 'noframes':
3264                      case 'param':
3265                      case 'select':
3266                      case 'spacer':
3267                      case 'table':
3268                      case 'textarea':
3269                      case 'wbr':
3270                          // Parse error. Ignore the token.
3271                          break;
3272  
3273                      /* An end tag token not covered by the previous entries */
3274                      default:
3275                          for ($n = count($this->stack) - 1; $n >= 0; $n--) {
3276                              /* Initialise node to be the current node (the bottommost
3277                              node of the stack). */
3278                              $node = end($this->stack);
3279  
3280                              /* If node has the same tag name as the end tag token,
3281                              then: */
3282                              if ($token['name'] === $node->nodeName) {
3283                                  /* Generate implied end tags. */
3284                                  $this->generateImpliedEndTags();
3285  
3286                                  /* If the tag name of the end tag token does not
3287                                  match the tag name of the current node, this is a
3288                                  parse error. */
3289                                  // k
3290  
3291                                  /* Pop all the nodes from the current node up to
3292                                  node, including node, then stop this algorithm. */
3293                                  for ($x = count($this->stack) - $n; $x >= $n; $x--) {
3294                                      array_pop($this->stack);
3295                                  }
3296  
3297                              } else {
3298                                  $category = $this->getElementCategory($node);
3299  
3300                                  if ($category !== self::SPECIAL && $category !== self::SCOPING) {
3301                                      /* Otherwise, if node is in neither the formatting
3302                                      category nor the phrasing category, then this is a
3303                                      parse error. Stop this algorithm. The end tag token
3304                                      is ignored. */
3305                                      return false;
3306                                  }
3307                              }
3308                          }
3309                          break;
3310                  }
3311                  break;
3312          }
3313      }
3314  
/**
 * Processes one token while the insertion mode is "in table".
 *
 * Whitespace and comments are appended to the current node. Table-structure
 * start tags ("caption", "colgroup", "col", "tbody", "tfoot", "thead", "td",
 * "th", "tr") open the required element(s) and switch the insertion mode.
 * A "table" end tag pops the stack down to and including the table element.
 * Any other token is handed to inBody() after the foster parent has been
 * selected.
 *
 * @param array $token Token array. 'type' is always read; 'name' is read for
 *                     start/end tags and 'data' for character and comment
 *                     tokens.
 * @return mixed False when a "table" end tag (real or implied) is ignored;
 *               the result of inTableBody()/mainPhase() when the token is
 *               reprocessed there; null otherwise.
 */
private function inTable($token)
{
    $clear = array('html', 'table');

    /* A character token that is one of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        /* Append the character to the current node. */
        $text = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($text);

        /* A comment token */
    } elseif ($token['type'] === HTML5::COMMENT) {
        /* Append a Comment node to the current node with the data
        attribute set to the data given in the comment token. */
        $comment = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($comment);

        /* A start tag whose tag name is "caption" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'caption'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert a marker at the end of the list of active
        formatting elements. */
        $this->a_formatting[] = self::MARKER;

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in caption". */
        $this->insertElement($token);
        $this->mode = self::IN_CAPTION;

        /* A start tag whose tag name is "colgroup" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'colgroup'
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the
        insertion mode to "in column group". */
        $this->insertElement($token);
        $this->mode = self::IN_CGROUP;

        /* A start tag whose tag name is "col" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'col'
    ) {
        /* Act as if a start tag token with the tag name "colgroup" had
        been seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'colgroup',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        $this->inColumnGroup($token);

        /* A start tag whose tag name is one of: "tbody", "tfoot", "thead" */
    } elseif ($token['type'] === HTML5::STARTTAG && in_array(
            $token['name'],
            array('tbody', 'tfoot', 'thead'),
            true
        )
    ) {
        /* Clear the stack back to a table context. */
        $this->clearStackToTableContext($clear);

        /* Insert an HTML element for the token, then switch the insertion
        mode to "in table body". */
        $this->insertElement($token);
        $this->mode = self::IN_TBODY;

        /* A start tag whose tag name is one of: "td", "th", "tr" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        in_array($token['name'], array('td', 'th', 'tr'), true)
    ) {
        /* Act as if a start tag token with the tag name "tbody" had been
        seen, then reprocess the current token. */
        $this->inTable(
            array(
                'name' => 'tbody',
                'type' => HTML5::STARTTAG,
                'attr' => array()
            )
        );

        return $this->inTableBody($token);

        /* A start tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::STARTTAG &&
        $token['name'] === 'table'
    ) {
        /* Parse error. Act as if an end tag token with the tag name "table"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $closed = $this->inTable(
            array(
                'name' => 'table',
                'type' => HTML5::ENDTAG
            )
        );

        /* The end tag branch below returns false only when it ignored the
        token. In that case nothing was popped and the insertion mode is
        unchanged, so reprocessing would land here again; ignore the start
        tag instead. */
        if ($closed === false) {
            return false;
        }

        return $this->mainPhase($token);

        /* An end tag whose tag name is "table" */
    } elseif ($token['type'] === HTML5::ENDTAG &&
        $token['name'] === 'table'
    ) {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            return false;

            /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a table element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a table element has been
            popped from the stack. */
            while (true) {
                $current = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($current === 'table') {
                    break;
                }
            }

            /* Reset the insertion mode appropriately. */
            $this->resetInsertionMode();
        }

        /* An end tag whose tag name is one of: "body", "caption", "col",
        "colgroup", "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array(
                'body',
                'caption',
                'col',
                'colgroup',
                'html',
                'tbody',
                'td',
                'tfoot',
                'th',
                'thead',
                'tr'
            ),
            true
        )
    ) {
        // Parse error. Ignore the token.

        /* Anything else */
    } else {
        /* Parse error. Process the token as if the insertion mode was "in
        body", with the following exception: */

        /* If the current node is a table, tbody, tfoot, thead, or tr
        element, then, whenever a node would be inserted into the current
        node, it must instead be inserted into the foster parent element. */
        if (in_array(
            end($this->stack)->nodeName,
            array('table', 'tbody', 'tfoot', 'thead', 'tr'),
            true
        )
        ) {
            /* The foster parent element is the parent element of the last
            table element in the stack of open elements, if there is a
            table element and it has such a parent element. If there is no
            table element in the stack of open elements (innerHTML case),
            then the foster parent element is the first element in the
            stack of open elements (the html element). Otherwise, if there
            is a table element in the stack of open elements, but the last
            table element in the stack of open elements has no parent, or
            its parent node is not an element, then the foster parent
            element is the element before the last table element in the
            stack of open elements. */
            for ($n = count($this->stack) - 1; $n >= 0; $n--) {
                if ($this->stack[$n]->nodeName === 'table') {
                    $table = $this->stack[$n];
                    break;
                }
            }

            if (!isset($table)) {
                $this->foster_parent = $this->stack[0];

            } elseif ($table->parentNode !== null &&
                $table->parentNode->nodeType === XML_ELEMENT_NODE
            ) {
                $this->foster_parent = $table->parentNode;

            } else {
                /* No parent, or the parent is not an element: $n still
                holds the table's position from the loop above. */
                $this->foster_parent = $this->stack[$n - 1];
            }
        }

        $this->inBody($token);
    }
}
3526  
/**
 * Processes one token while the insertion mode is "in caption".
 *
 * A "caption" end tag closes the caption and switches back to "in table".
 * Table-structure start tags and a "table" end tag first imply a "caption"
 * end tag and are then reprocessed by inTable(). Stray end tags for other
 * table parts are ignored. Everything else is handled by inBody().
 *
 * @param array $token Token array. 'type' is always read; 'name' is read for
 *                     start and end tags.
 * @return mixed The result of inTable() when the token is reprocessed there;
 *               null otherwise.
 */
private function inCaption($token)
{
    /* An end tag whose tag name is "caption" */
    if ($token['type'] === HTML5::ENDTAG && $token['name'] === 'caption') {
        /* If the stack of open elements does not have an element in table
        scope with the same tag name as the token, this is a parse error.
        Ignore the token. (innerHTML case) */
        if (!$this->elementInScope($token['name'], true)) {
            // Ignore

            /* Otherwise: */
        } else {
            /* Generate implied end tags. */
            $this->generateImpliedEndTags();

            /* Now, if the current node is not a caption element, then this
            is a parse error. */
            // w/e

            /* Pop elements from this stack until a caption element has
            been popped from the stack. */
            while (true) {
                $node = end($this->stack)->nodeName;
                array_pop($this->stack);

                if ($node === 'caption') {
                    break;
                }
            }

            /* Clear the list of active formatting elements up to the last
            marker. */
            $this->clearTheActiveFormattingElementsUpToTheLastMarker();

            /* Switch the insertion mode to "in table". */
            $this->mode = self::IN_TABLE;
        }

        /* A start tag whose tag name is one of: "caption", "col", "colgroup",
        "tbody", "td", "tfoot", "th", "thead", "tr", or an end tag whose tag
        name is "table" */
    } elseif (($token['type'] === HTML5::STARTTAG && in_array(
                $token['name'],
                array(
                    'caption',
                    'col',
                    'colgroup',
                    'tbody',
                    'td',
                    'tfoot',
                    'th',
                    'thead',
                    'tr'
                ),
                true
            )) || ($token['type'] === HTML5::ENDTAG &&
            $token['name'] === 'table')
    ) {
        /* Parse error. Act as if an end tag with the tag name "caption"
        had been seen, then, if that token wasn't ignored, reprocess the
        current token. */
        $this->inCaption(
            array(
                'name' => 'caption',
                'type' => HTML5::ENDTAG
            )
        );

        return $this->inTable($token);

        /* An end tag whose tag name is one of: "body", "col", "colgroup",
        "html", "tbody", "td", "tfoot", "th", "thead", "tr" */
    } elseif ($token['type'] === HTML5::ENDTAG && in_array(
            $token['name'],
            array(
                'body',
                'col',
                'colgroup',
                'html',
                'tbody',
                'td', // was missing, so </td> fell through to inBody()
                'tfoot',
                'th',
                'thead',
                'tr'
            ),
            true
        )
    ) {
        // Parse error. Ignore the token.

        /* Anything else */
    } else {
        /* Process the token as if the insertion mode was "in body". */
        $this->inBody($token);
    }
}
3621  
/**
 * Processes one token while the insertion mode is "in column group".
 *
 * Whitespace and comments are appended to the current node, "col" start
 * tags insert an element that is popped again immediately, a "colgroup"
 * end tag pops the current node and switches to "in table", and a "col"
 * end tag is ignored. Any other token implies a "colgroup" end tag and is
 * then passed to inTable().
 *
 * @param array $token Token array. 'type' is always read; 'name' is read for
 *                     start and end tags, 'data' for character and comment
 *                     tokens.
 * @return mixed The result of inTable() for the "anything else" case; null
 *               otherwise.
 */
private function inColumnGroup($token)
{
    $isStartTag = $token['type'] === HTML5::STARTTAG;
    $isEndTag = $token['type'] === HTML5::ENDTAG;

    /* Character token made up solely of U+0009 CHARACTER TABULATION,
    U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
    or U+0020 SPACE: append it to the current node. */
    if ($token['type'] === HTML5::CHARACTR &&
        preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
    ) {
        $whitespace = $this->dom->createTextNode($token['data']);
        end($this->stack)->appendChild($whitespace);
        return;
    }

    /* Comment token: append a Comment node carrying the token's data to
    the current node. */
    if ($token['type'] === HTML5::COMMENT) {
        $commentNode = $this->dom->createComment($token['data']);
        end($this->stack)->appendChild($commentNode);
        return;
    }

    /* Start tag "col": insert a col element for the token and pop it off
    the stack of open elements straight away. */
    if ($isStartTag && $token['name'] === 'col') {
        $this->insertElement($token);
        array_pop($this->stack);
        return;
    }

    /* End tag "colgroup" */
    if ($isEndTag && $token['name'] === 'colgroup') {
        /* When the current node is the root html element this is a parse
        error and the token is ignored (innerHTML case). Otherwise pop the
        current node (which will be a colgroup element) and switch the
        insertion mode to "in table". */
        if (end($this->stack)->nodeName !== 'html') {
            array_pop($this->stack);
            $this->mode = self::IN_TABLE;
        }
        return;
    }

    /* End tag "col": parse error, the token is ignored. */
    if ($isEndTag && $token['name'] === 'col') {
        return;
    }

    /* Anything else: act as if an end tag with the tag name "colgroup" had
    been seen, then reprocess the current token. */
    $impliedEndTag = array(
        'name' => 'colgroup',
        'type' => HTML5::ENDTAG
    );
    $this->inColumnGroup($impliedEndTag);

    return $this->inTable($token);
}
3683  
3684      private function inTableBody($token)
3685      {
3686          $clear = array('tbody', 'tfoot', 'thead', 'html');
3687  
3688          /* A start tag whose tag name is "tr" */
3689          if ($token['type'] === HTML5::STARTTAG && $token['name'] === 'tr') {
3690              /* Clear the stack back to a table body context. */
3691              $this->clearStackToTableContext($clear);
3692  
3693              /* Insert a tr element for the token, then switch the insertion
3694              mode to "in row". */
3695              $this->insertElement($token);
3696              $this->mode = self::IN_ROW;
3697  
3698              /* A start tag whose tag name is one of: "th", "td" */
3699          } elseif ($token['type'] === HTML5::STARTTAG &&
3700              ($token['name'] === 'th' || $token['name'] === 'td')
3701          ) {
3702              /* Parse error. Act as if a start tag with the tag name "tr" had
3703              been seen, then reprocess the current token. */
3704              $this->inTableBody(
3705                  array(
3706                      'name' => 'tr',
3707                      'type' => HTML5::STARTTAG,
3708                      'attr' => array()
3709                  )
3710              );
3711  
3712              return $this->inRow($token);
3713  
3714              /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3715          } elseif ($token['type'] === HTML5::ENDTAG &&
3716              in_array($token['name'], array('tbody', 'tfoot', 'thead'))
3717          ) {
3718              /* If the stack of open elements does not have an element in table
3719              scope with the same tag name as the token, this is a parse error.
3720              Ignore the token. */
3721              if (!$this->elementInScope($token['name'], true)) {
3722                  // Ignore
3723  
3724                  /* Otherwise: */
3725              } else {
3726                  /* Clear the stack back to a table body context. */
3727                  $this->clearStackToTableContext($clear);
3728  
3729                  /* Pop the current node from the stack of open elements. Switch
3730                  the insertion mode to "in table". */
3731                  array_pop($this->stack);
3732                  $this->mode = self::IN_TABLE;
3733              }
3734  
3735              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3736              "tbody", "tfoot", "thead", or an end tag whose tag name is "table" */
3737          } elseif (($token['type'] === HTML5::STARTTAG && in_array(
3738                      $token['name'],
3739                      array('caption', 'col', 'colgroup', 'tbody', 'tfoor', 'thead')
3740                  )) ||
3741              ($token['type'] === HTML5::STARTTAG && $token['name'] === 'table')
3742          ) {
3743              /* If the stack of open elements does not have a tbody, thead, or
3744              tfoot element in table scope, this is a parse error. Ignore the
3745              token. (innerHTML case) */
3746              if (!$this->elementInScope(array('tbody', 'thead', 'tfoot'), true)) {
3747                  // Ignore.
3748  
3749                  /* Otherwise: */
3750              } else {
3751                  /* Clear the stack back to a table body context. */
3752                  $this->clearStackToTableContext($clear);
3753  
3754                  /* Act as if an end tag with the same tag name as the current
3755                  node ("tbody", "tfoot", or "thead") had been seen, then
3756                  reprocess the current token. */
3757                  $this->inTableBody(
3758                      array(
3759                          'name' => end($this->stack)->nodeName,
3760                          'type' => HTML5::ENDTAG
3761                      )
3762                  );
3763  
3764                  return $this->mainPhase($token);
3765              }
3766  
3767              /* An end tag whose tag name is one of: "body", "caption", "col",
3768              "colgroup", "html", "td", "th", "tr" */
3769          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3770                  $token['name'],
3771                  array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
3772              )
3773          ) {
3774              /* Parse error. Ignore the token. */
3775  
3776              /* Anything else */
3777          } else {
3778              /* Process the token as if the insertion mode was "in table". */
3779              $this->inTable($token);
3780          }
3781      }
3782  
3783      private function inRow($token)
3784      {
3785          $clear = array('tr', 'html');
3786  
3787          /* A start tag whose tag name is one of: "th", "td" */
3788          if ($token['type'] === HTML5::STARTTAG &&
3789              ($token['name'] === 'th' || $token['name'] === 'td')
3790          ) {
3791              /* Clear the stack back to a table row context. */
3792              $this->clearStackToTableContext($clear);
3793  
3794              /* Insert an HTML element for the token, then switch the insertion
3795              mode to "in cell". */
3796              $this->insertElement($token);
3797              $this->mode = self::IN_CELL;
3798  
3799              /* Insert a marker at the end of the list of active formatting
3800              elements. */
3801              $this->a_formatting[] = self::MARKER;
3802  
3803              /* An end tag whose tag name is "tr" */
3804          } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'tr') {
3805              /* If the stack of open elements does not have an element in table
3806              scope with the same tag name as the token, this is a parse error.
3807              Ignore the token. (innerHTML case) */
3808              if (!$this->elementInScope($token['name'], true)) {
3809                  // Ignore.
3810  
3811                  /* Otherwise: */
3812              } else {
3813                  /* Clear the stack back to a table row context. */
3814                  $this->clearStackToTableContext($clear);
3815  
3816                  /* Pop the current node (which will be a tr element) from the
3817                  stack of open elements. Switch the insertion mode to "in table
3818                  body". */
3819                  array_pop($this->stack);
3820                  $this->mode = self::IN_TBODY;
3821              }
3822  
3823              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3824              "tbody", "tfoot", "thead", "tr" or an end tag whose tag name is "table" */
3825          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3826                  $token['name'],
3827                  array('caption', 'col', 'colgroup', 'tbody', 'tfoot', 'thead', 'tr')
3828              )
3829          ) {
3830              /* Act as if an end tag with the tag name "tr" had been seen, then,
3831              if that token wasn't ignored, reprocess the current token. */
3832              $this->inRow(
3833                  array(
3834                      'name' => 'tr',
3835                      'type' => HTML5::ENDTAG
3836                  )
3837              );
3838  
3839              return $this->inCell($token);
3840  
3841              /* An end tag whose tag name is one of: "tbody", "tfoot", "thead" */
3842          } elseif ($token['type'] === HTML5::ENDTAG &&
3843              in_array($token['name'], array('tbody', 'tfoot', 'thead'))
3844          ) {
3845              /* If the stack of open elements does not have an element in table
3846              scope with the same tag name as the token, this is a parse error.
3847              Ignore the token. */
3848              if (!$this->elementInScope($token['name'], true)) {
3849                  // Ignore.
3850  
3851                  /* Otherwise: */
3852              } else {
3853                  /* Otherwise, act as if an end tag with the tag name "tr" had
3854                  been seen, then reprocess the current token. */
3855                  $this->inRow(
3856                      array(
3857                          'name' => 'tr',
3858                          'type' => HTML5::ENDTAG
3859                      )
3860                  );
3861  
3862                  return $this->inCell($token);
3863              }
3864  
3865              /* An end tag whose tag name is one of: "body", "caption", "col",
3866              "colgroup", "html", "td", "th" */
3867          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3868                  $token['name'],
3869                  array('body', 'caption', 'col', 'colgroup', 'html', 'td', 'th', 'tr')
3870              )
3871          ) {
3872              /* Parse error. Ignore the token. */
3873  
3874              /* Anything else */
3875          } else {
3876              /* Process the token as if the insertion mode was "in table". */
3877              $this->inTable($token);
3878          }
3879      }
3880  
3881      private function inCell($token)
3882      {
3883          /* An end tag whose tag name is one of: "td", "th" */
3884          if ($token['type'] === HTML5::ENDTAG &&
3885              ($token['name'] === 'td' || $token['name'] === 'th')
3886          ) {
3887              /* If the stack of open elements does not have an element in table
3888              scope with the same tag name as that of the token, then this is a
3889              parse error and the token must be ignored. */
3890              if (!$this->elementInScope($token['name'], true)) {
3891                  // Ignore.
3892  
3893                  /* Otherwise: */
3894              } else {
3895                  /* Generate implied end tags, except for elements with the same
3896                  tag name as the token. */
3897                  $this->generateImpliedEndTags(array($token['name']));
3898  
3899                  /* Now, if the current node is not an element with the same tag
3900                  name as the token, then this is a parse error. */
3901                  // k
3902  
3903                  /* Pop elements from this stack until an element with the same
3904                  tag name as the token has been popped from the stack. */
3905                  while (true) {
3906                      $node = end($this->stack)->nodeName;
3907                      array_pop($this->stack);
3908  
3909                      if ($node === $token['name']) {
3910                          break;
3911                      }
3912                  }
3913  
3914                  /* Clear the list of active formatting elements up to the last
3915                  marker. */
3916                  $this->clearTheActiveFormattingElementsUpToTheLastMarker();
3917  
3918                  /* Switch the insertion mode to "in row". (The current node
3919                  will be a tr element at this point.) */
3920                  $this->mode = self::IN_ROW;
3921              }
3922  
3923              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3924              "tbody", "td", "tfoot", "th", "thead", "tr" */
3925          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3926                  $token['name'],
3927                  array(
3928                      'caption',
3929                      'col',
3930                      'colgroup',
3931                      'tbody',
3932                      'td',
3933                      'tfoot',
3934                      'th',
3935                      'thead',
3936                      'tr'
3937                  )
3938              )
3939          ) {
3940              /* If the stack of open elements does not have a td or th element
3941              in table scope, then this is a parse error; ignore the token.
3942              (innerHTML case) */
3943              if (!$this->elementInScope(array('td', 'th'), true)) {
3944                  // Ignore.
3945  
3946                  /* Otherwise, close the cell (see below) and reprocess the current
3947                  token. */
3948              } else {
3949                  $this->closeCell();
3950                  return $this->inRow($token);
3951              }
3952  
3953              /* A start tag whose tag name is one of: "caption", "col", "colgroup",
3954              "tbody", "td", "tfoot", "th", "thead", "tr" */
3955          } elseif ($token['type'] === HTML5::STARTTAG && in_array(
3956                  $token['name'],
3957                  array(
3958                      'caption',
3959                      'col',
3960                      'colgroup',
3961                      'tbody',
3962                      'td',
3963                      'tfoot',
3964                      'th',
3965                      'thead',
3966                      'tr'
3967                  )
3968              )
3969          ) {
3970              /* If the stack of open elements does not have a td or th element
3971              in table scope, then this is a parse error; ignore the token.
3972              (innerHTML case) */
3973              if (!$this->elementInScope(array('td', 'th'), true)) {
3974                  // Ignore.
3975  
3976                  /* Otherwise, close the cell (see below) and reprocess the current
3977                  token. */
3978              } else {
3979                  $this->closeCell();
3980                  return $this->inRow($token);
3981              }
3982  
3983              /* An end tag whose tag name is one of: "body", "caption", "col",
3984              "colgroup", "html" */
3985          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3986                  $token['name'],
3987                  array('body', 'caption', 'col', 'colgroup', 'html')
3988              )
3989          ) {
3990              /* Parse error. Ignore the token. */
3991  
3992              /* An end tag whose tag name is one of: "table", "tbody", "tfoot",
3993              "thead", "tr" */
3994          } elseif ($token['type'] === HTML5::ENDTAG && in_array(
3995                  $token['name'],
3996                  array('table', 'tbody', 'tfoot', 'thead', 'tr')
3997              )
3998          ) {
3999              /* If the stack of open elements does not have an element in table
4000              scope with the same tag name as that of the token (which can only
4001              happen for "tbody", "tfoot" and "thead", or, in the innerHTML case),
4002              then this is a parse error and the token must be ignored. */
4003              if (!$this->elementInScope($token['name'], true)) {
4004                  // Ignore.
4005  
4006                  /* Otherwise, close the cell (see below) and reprocess the current
4007                  token. */
4008              } else {
4009                  $this->closeCell();
4010                  return $this->inRow($token);
4011              }
4012  
4013              /* Anything else */
4014          } else {
4015              /* Process the token as if the insertion mode was "in body". */
4016              $this->inBody($token);
4017          }
4018      }
4019  
4020      private function inSelect($token)
4021      {
4022          /* Handle the token as follows: */
4023  
4024          /* A character token */
4025          if ($token['type'] === HTML5::CHARACTR) {
4026              /* Append the token's character to the current node. */
4027              $this->insertText($token['data']);
4028  
4029              /* A comment token */
4030          } elseif ($token['type'] === HTML5::COMMENT) {
4031              /* Append a Comment node to the current node with the data
4032              attribute set to the data given in the comment token. */
4033              $this->insertComment($token['data']);
4034  
4035              /* A start tag token whose tag name is "option" */
4036          } elseif ($token['type'] === HTML5::STARTTAG &&
4037              $token['name'] === 'option'
4038          ) {
4039              /* If the current node is an option element, act as if an end tag
4040              with the tag name "option" had been seen. */
4041              if (end($this->stack)->nodeName === 'option') {
4042                  $this->inSelect(
4043                      array(
4044                          'name' => 'option',
4045                          'type' => HTML5::ENDTAG
4046                      )
4047                  );
4048              }
4049  
4050              /* Insert an HTML element for the token. */
4051              $this->insertElement($token);
4052  
4053              /* A start tag token whose tag name is "optgroup" */
4054          } elseif ($token['type'] === HTML5::STARTTAG &&
4055              $token['name'] === 'optgroup'
4056          ) {
4057              /* If the current node is an option element, act as if an end tag
4058              with the tag name "option" had been seen. */
4059              if (end($this->stack)->nodeName === 'option') {
4060                  $this->inSelect(
4061                      array(
4062                          'name' => 'option',
4063                          'type' => HTML5::ENDTAG
4064                      )
4065                  );
4066              }
4067  
4068              /* If the current node is an optgroup element, act as if an end tag
4069              with the tag name "optgroup" had been seen. */
4070              if (end($this->stack)->nodeName === 'optgroup') {
4071                  $this->inSelect(
4072                      array(
4073                          'name' => 'optgroup',
4074                          'type' => HTML5::ENDTAG
4075                      )
4076                  );
4077              }
4078  
4079              /* Insert an HTML element for the token. */
4080              $this->insertElement($token);
4081  
4082              /* An end tag token whose tag name is "optgroup" */
4083          } elseif ($token['type'] === HTML5::ENDTAG &&
4084              $token['name'] === 'optgroup'
4085          ) {
4086              /* First, if the current node is an option element, and the node
4087              immediately before it in the stack of open elements is an optgroup
4088              element, then act as if an end tag with the tag name "option" had
4089              been seen. */
4090              $elements_in_stack = count($this->stack);
4091  
4092              if ($this->stack[$elements_in_stack - 1]->nodeName === 'option' &&
4093                  $this->stack[$elements_in_stack - 2]->nodeName === 'optgroup'
4094              ) {
4095                  $this->inSelect(
4096                      array(
4097                          'name' => 'option',
4098                          'type' => HTML5::ENDTAG
4099                      )
4100                  );
4101              }
4102  
4103              /* If the current node is an optgroup element, then pop that node
4104              from the stack of open elements. Otherwise, this is a parse error,
4105              ignore the token. */
4106              if ($this->stack[$elements_in_stack - 1] === 'optgroup') {
4107                  array_pop($this->stack);
4108              }
4109  
4110              /* An end tag token whose tag name is "option" */
4111          } elseif ($token['type'] === HTML5::ENDTAG &&
4112              $token['name'] === 'option'
4113          ) {
4114              /* If the current node is an option element, then pop that node
4115              from the stack of open elements. Otherwise, this is a parse error,
4116              ignore the token. */
4117              if (end($this->stack)->nodeName === 'option') {
4118                  array_pop($this->stack);
4119              }
4120  
4121              /* An end tag whose tag name is "select" */
4122          } elseif ($token['type'] === HTML5::ENDTAG &&
4123              $token['name'] === 'select'
4124          ) {
4125              /* If the stack of open elements does not have an element in table
4126              scope with the same tag name as the token, this is a parse error.
4127              Ignore the token. (innerHTML case) */
4128              if (!$this->elementInScope($token['name'], true)) {
4129                  // w/e
4130  
4131                  /* Otherwise: */
4132              } else {
4133                  /* Pop elements from the stack of open elements until a select
4134                  element has been popped from the stack. */
4135                  while (true) {
4136                      $current = end($this->stack)->nodeName;
4137                      array_pop($this->stack);
4138  
4139                      if ($current === 'select') {
4140                          break;
4141                      }
4142                  }
4143  
4144                  /* Reset the insertion mode appropriately. */
4145                  $this->resetInsertionMode();
4146              }
4147  
4148              /* A start tag whose tag name is "select" */
4149          } elseif ($token['name'] === 'select' &&
4150              $token['type'] === HTML5::STARTTAG
4151          ) {
4152              /* Parse error. Act as if the token had been an end tag with the
4153              tag name "select" instead. */
4154              $this->inSelect(
4155                  array(
4156                      'name' => 'select',
4157                      'type' => HTML5::ENDTAG
4158                  )
4159              );
4160  
4161              /* An end tag whose tag name is one of: "caption", "table", "tbody",
4162              "tfoot", "thead", "tr", "td", "th" */
4163          } elseif (in_array(
4164                  $token['name'],
4165                  array(
4166                      'caption',
4167                      'table',
4168                      'tbody',
4169                      'tfoot',
4170                      'thead',
4171                      'tr',
4172                      'td',
4173                      'th'
4174                  )
4175              ) && $token['type'] === HTML5::ENDTAG
4176          ) {
4177              /* Parse error. */
4178              // w/e
4179  
4180              /* If the stack of open elements has an element in table scope with
4181              the same tag name as that of the token, then act as if an end tag
4182              with the tag name "select" had been seen, and reprocess the token.
4183              Otherwise, ignore the token. */
4184              if ($this->elementInScope($token['name'], true)) {
4185                  $this->inSelect(
4186                      array(
4187                          'name' => 'select',
4188                          'type' => HTML5::ENDTAG
4189                      )
4190                  );
4191  
4192                  $this->mainPhase($token);
4193              }
4194  
4195              /* Anything else */
4196          } else {
4197              /* Parse error. Ignore the token. */
4198          }
4199      }
4200  
4201      private function afterBody($token)
4202      {
4203          /* Handle the token as follows: */
4204  
4205          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4206          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4207          or U+0020 SPACE */
4208          if ($token['type'] === HTML5::CHARACTR &&
4209              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4210          ) {
4211              /* Process the token as it would be processed if the insertion mode
4212              was "in body". */
4213              $this->inBody($token);
4214  
4215              /* A comment token */
4216          } elseif ($token['type'] === HTML5::COMMENT) {
4217              /* Append a Comment node to the first element in the stack of open
4218              elements (the html element), with the data attribute set to the
4219              data given in the comment token. */
4220              $comment = $this->dom->createComment($token['data']);
4221              $this->stack[0]->appendChild($comment);
4222  
4223              /* An end tag with the tag name "html" */
4224          } elseif ($token['type'] === HTML5::ENDTAG && $token['name'] === 'html') {
4225              /* If the parser was originally created in order to handle the
4226              setting of an element's innerHTML attribute, this is a parse error;
4227              ignore the token. (The element will be an html element in this
4228              case.) (innerHTML case) */
4229  
4230              /* Otherwise, switch to the trailing end phase. */
4231              $this->phase = self::END_PHASE;
4232  
4233              /* Anything else */
4234          } else {
4235              /* Parse error. Set the insertion mode to "in body" and reprocess
4236              the token. */
4237              $this->mode = self::IN_BODY;
4238              return $this->inBody($token);
4239          }
4240      }
4241  
4242      private function inFrameset($token)
4243      {
4244          /* Handle the token as follows: */
4245  
4246          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4247          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4248          U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
4249          if ($token['type'] === HTML5::CHARACTR &&
4250              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4251          ) {
4252              /* Append the character to the current node. */
4253              $this->insertText($token['data']);
4254  
4255              /* A comment token */
4256          } elseif ($token['type'] === HTML5::COMMENT) {
4257              /* Append a Comment node to the current node with the data
4258              attribute set to the data given in the comment token. */
4259              $this->insertComment($token['data']);
4260  
4261              /* A start tag with the tag name "frameset" */
4262          } elseif ($token['name'] === 'frameset' &&
4263              $token['type'] === HTML5::STARTTAG
4264          ) {
4265              $this->insertElement($token);
4266  
4267              /* An end tag with the tag name "frameset" */
4268          } elseif ($token['name'] === 'frameset' &&
4269              $token['type'] === HTML5::ENDTAG
4270          ) {
4271              /* If the current node is the root html element, then this is a
4272              parse error; ignore the token. (innerHTML case) */
4273              if (end($this->stack)->nodeName === 'html') {
4274                  // Ignore
4275  
4276              } else {
4277                  /* Otherwise, pop the current node from the stack of open
4278                  elements. */
4279                  array_pop($this->stack);
4280  
4281                  /* If the parser was not originally created in order to handle
4282                  the setting of an element's innerHTML attribute (innerHTML case),
4283                  and the current node is no longer a frameset element, then change
4284                  the insertion mode to "after frameset". */
4285                  $this->mode = self::AFTR_FRAME;
4286              }
4287  
4288              /* A start tag with the tag name "frame" */
4289          } elseif ($token['name'] === 'frame' &&
4290              $token['type'] === HTML5::STARTTAG
4291          ) {
4292              /* Insert an HTML element for the token. */
4293              $this->insertElement($token);
4294  
4295              /* Immediately pop the current node off the stack of open elements. */
4296              array_pop($this->stack);
4297  
4298              /* A start tag with the tag name "noframes" */
4299          } elseif ($token['name'] === 'noframes' &&
4300              $token['type'] === HTML5::STARTTAG
4301          ) {
4302              /* Process the token as if the insertion mode had been "in body". */
4303              $this->inBody($token);
4304  
4305              /* Anything else */
4306          } else {
4307              /* Parse error. Ignore the token. */
4308          }
4309      }
4310  
4311      private function afterFrameset($token)
4312      {
4313          /* Handle the token as follows: */
4314  
4315          /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4316          U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4317          U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
4318          if ($token['type'] === HTML5::CHARACTR &&
4319              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4320          ) {
4321              /* Append the character to the current node. */
4322              $this->insertText($token['data']);
4323  
4324              /* A comment token */
4325          } elseif ($token['type'] === HTML5::COMMENT) {
4326              /* Append a Comment node to the current node with the data
4327              attribute set to the data given in the comment token. */
4328              $this->insertComment($token['data']);
4329  
4330              /* An end tag with the tag name "html" */
4331          } elseif ($token['name'] === 'html' &&
4332              $token['type'] === HTML5::ENDTAG
4333          ) {
4334              /* Switch to the trailing end phase. */
4335              $this->phase = self::END_PHASE;
4336  
4337              /* A start tag with the tag name "noframes" */
4338          } elseif ($token['name'] === 'noframes' &&
4339              $token['type'] === HTML5::STARTTAG
4340          ) {
4341              /* Process the token as if the insertion mode had been "in body". */
4342              $this->inBody($token);
4343  
4344              /* Anything else */
4345          } else {
4346              /* Parse error. Ignore the token. */
4347          }
4348      }
4349  
4350      private function trailingEndPhase($token)
4351      {
4352          /* After the main phase, as each token is emitted from the tokenisation
4353          stage, it must be processed as described in this section. */
4354  
4355          /* A DOCTYPE token */
4356          if ($token['type'] === HTML5::DOCTYPE) {
4357              // Parse error. Ignore the token.
4358  
4359              /* A comment token */
4360          } elseif ($token['type'] === HTML5::COMMENT) {
4361              /* Append a Comment node to the Document object with the data
4362              attribute set to the data given in the comment token. */
4363              $comment = $this->dom->createComment($token['data']);
4364              $this->dom->appendChild($comment);
4365  
4366              /* A character token that is one of one of U+0009 CHARACTER TABULATION,
4367              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4368              or U+0020 SPACE */
4369          } elseif ($token['type'] === HTML5::CHARACTR &&
4370              preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])
4371          ) {
4372              /* Process the token as it would be processed in the main phase. */
4373              $this->mainPhase($token);
4374  
4375              /* A character token that is not one of U+0009 CHARACTER TABULATION,
4376              U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
4377              or U+0020 SPACE. Or a start tag token. Or an end tag token. */
4378          } elseif (($token['type'] === HTML5::CHARACTR &&
4379                  preg_match('/^[\t\n\x0b\x0c ]+$/', $token['data'])) ||
4380              $token['type'] === HTML5::STARTTAG || $token['type'] === HTML5::ENDTAG
4381          ) {
4382              /* Parse error. Switch back to the main phase and reprocess the
4383              token. */
4384              $this->phase = self::MAIN_PHASE;
4385              return $this->mainPhase($token);
4386  
4387              /* An end-of-file token */
4388          } elseif ($token['type'] === HTML5::EOF) {
4389              /* OMG DONE!! */
4390          }
4391      }
4392  
4393      private function insertElement($token, $append = true, $check = false)
4394      {
4395          // Proprietary workaround for libxml2's limitations with tag names
4396          if ($check) {
4397              // Slightly modified HTML5 tag-name modification,
4398              // removing anything that's not an ASCII letter, digit, or hyphen
4399              $token['name'] = preg_replace('/[^a-z0-9-]/i', '', $token['name']);
4400              // Remove leading hyphens and numbers
4401              $token['name'] = ltrim($token['name'], '-0..9');
4402              // In theory, this should ever be needed, but just in case
4403              if ($token['name'] === '') {
4404                  $token['name'] = 'span';
4405              } // arbitrary generic choice
4406          }
4407  
4408          $el = $this->dom->createElement($token['name']);
4409  
4410          foreach ($token['attr'] as $attr) {
4411              if (!$el->hasAttribute($attr['name'])) {
4412                  $el->setAttribute($attr['name'], $attr['value']);
4413              }
4414          }
4415  
4416          $this->appendToRealParent($el);
4417          $this->stack[] = $el;
4418  
4419          return $el;
4420      }
4421  
4422      private function insertText($data)
4423      {
4424          $text = $this->dom->createTextNode($data);
4425          $this->appendToRealParent($text);
4426      }
4427  
4428      private function insertComment($data)
4429      {
4430          $comment = $this->dom->createComment($data);
4431          $this->appendToRealParent($comment);
4432      }
4433  
4434      private function appendToRealParent($node)
4435      {
4436          if ($this->foster_parent === null) {
4437              end($this->stack)->appendChild($node);
4438  
4439          } elseif ($this->foster_parent !== null) {
4440              /* If the foster parent element is the parent element of the
4441              last table element in the stack of open elements, then the new
4442              node must be inserted immediately before the last table element
4443              in the stack of open elements in the foster parent element;
4444              otherwise, the new node must be appended to the foster parent
4445              element. */
4446              for ($n = count($this->stack) - 1; $n >= 0; $n--) {
4447                  if ($this->stack[$n]->nodeName === 'table' &&
4448                      $this->stack[$n]->parentNode !== null
4449                  ) {
4450                      $table = $this->stack[$n];
4451                      break;
4452                  }
4453              }
4454  
4455              if (isset($table) && $this->foster_parent->isSameNode($table->parentNode)) {
4456                  $this->foster_parent->insertBefore($node, $table);
4457              } else {
4458                  $this->foster_parent->appendChild($node);
4459              }
4460  
4461              $this->foster_parent = null;
4462          }
4463      }
4464  
4465      private function elementInScope($el, $table = false)
4466      {
4467          if (is_array($el)) {
4468              foreach ($el as $element) {
4469                  if ($this->elementInScope($element, $table)) {
4470                      return true;
4471                  }
4472              }
4473  
4474              return false;
4475          }
4476  
4477          $leng = count($this->stack);
4478  
4479          for ($n = 0; $n < $leng; $n++) {
4480              /* 1. Initialise node to be the current node (the bottommost node of
4481              the stack). */
4482              $node = $this->stack[$leng - 1 - $n];
4483  
4484              if ($node->tagName === $el) {
4485                  /* 2. If node is the target node, terminate in a match state. */
4486                  return true;
4487  
4488              } elseif ($node->tagName === 'table') {
4489                  /* 3. Otherwise, if node is a table element, terminate in a failure
4490                  state. */
4491                  return false;
4492  
4493              } elseif ($table === true && in_array(
4494                      $node->tagName,
4495                      array(
4496                          'caption',
4497                          'td',
4498                          'th',
4499                          'button',
4500                          'marquee',
4501                          'object'
4502                      )
4503                  )
4504              ) {
4505                  /* 4. Otherwise, if the algorithm is the "has an element in scope"
4506                  variant (rather than the "has an element in table scope" variant),
4507                  and node is one of the following, terminate in a failure state. */
4508                  return false;
4509  
4510              } elseif ($node === $node->ownerDocument->documentElement) {
4511                  /* 5. Otherwise, if node is an html element (root element), terminate
4512                  in a failure state. (This can only happen if the node is the topmost
4513                  node of the    stack of open elements, and prevents the next step from
4514                  being invoked if there are no more elements in the stack.) */
4515                  return false;
4516              }
4517  
4518              /* Otherwise, set node to the previous entry in the stack of open
4519              elements and return to step 2. (This will never fail, since the loop
4520              will always terminate in the previous step if the top of the stack
4521              is reached.) */
4522          }
4523      }
4524  
4525      private function reconstructActiveFormattingElements()
4526      {
4527          /* 1. If there are no entries in the list of active formatting elements,
4528          then there is nothing to reconstruct; stop this algorithm. */
4529          $formatting_elements = count($this->a_formatting);
4530  
4531          if ($formatting_elements === 0) {
4532              return false;
4533          }
4534  
4535          /* 3. Let entry be the last (most recently added) element in the list
4536          of active formatting elements. */
4537          $entry = end($this->a_formatting);
4538  
4539          /* 2. If the last (most recently added) entry in the list of active
4540          formatting elements is a marker, or if it is an element that is in the
4541          stack of open elements, then there is nothing to reconstruct; stop this
4542          algorithm. */
4543          if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
4544              return false;
4545          }
4546  
4547          for ($a = $formatting_elements - 1; $a >= 0; true) {
4548              /* 4. If there are no entries before entry in the list of active
4549              formatting elements, then jump to step 8. */
4550              if ($a === 0) {
4551                  $step_seven = false;
4552                  break;
4553              }
4554  
4555              /* 5. Let entry be the entry one earlier than entry in the list of
4556              active formatting elements. */
4557              $a--;
4558              $entry = $this->a_formatting[$a];
4559  
4560              /* 6. If entry is neither a marker nor an element that is also in
4561              thetack of open elements, go to step 4. */
4562              if ($entry === self::MARKER || in_array($entry, $this->stack, true)) {
4563                  break;
4564              }
4565          }
4566  
4567          while (true) {
4568              /* 7. Let entry be the element one later than entry in the list of
4569              active formatting elements. */
4570              if (isset($step_seven) && $step_seven === true) {
4571                  $a++;
4572                  $entry = $this->a_formatting[$a];
4573              }
4574  
4575              /* 8. Perform a shallow clone of the element entry to obtain clone. */
4576              $clone = $entry->cloneNode();
4577  
4578              /* 9. Append clone to the current node and push it onto the stack
4579              of open elements  so that it is the new current node. */
4580              end($this->stack)->appendChild($clone);
4581              $this->stack[] = $clone;
4582  
4583              /* 10. Replace the entry for entry in the list with an entry for
4584              clone. */
4585              $this->a_formatting[$a] = $clone;
4586  
4587              /* 11. If the entry for clone in the list of active formatting
4588              elements is not the last entry in the list, return to step 7. */
4589              if (end($this->a_formatting) !== $clone) {
4590                  $step_seven = true;
4591              } else {
4592                  break;
4593              }
4594          }
4595      }
4596  
4597      private function clearTheActiveFormattingElementsUpToTheLastMarker()
4598      {
4599          /* When the steps below require the UA to clear the list of active
4600          formatting elements up to the last marker, the UA must perform the
4601          following steps: */
4602  
4603          while (true) {
4604              /* 1. Let entry be the last (most recently added) entry in the list
4605              of active formatting elements. */
4606              $entry = end($this->a_formatting);
4607  
4608              /* 2. Remove entry from the list of active formatting elements. */
4609              array_pop($this->a_formatting);
4610  
4611              /* 3. If entry was a marker, then stop the algorithm at this point.
4612              The list has been cleared up to the last marker. */
4613              if ($entry === self::MARKER) {
4614                  break;
4615              }
4616          }
4617      }
4618  
4619      private function generateImpliedEndTags($exclude = array())
4620      {
4621          /* When the steps below require the UA to generate implied end tags,
4622          then, if the current node is a dd element, a dt element, an li element,
4623          a p element, a td element, a th  element, or a tr element, the UA must
4624          act as if an end tag with the respective tag name had been seen and
4625          then generate implied end tags again. */
4626          $node = end($this->stack);
4627          $elements = array_diff(array('dd', 'dt', 'li', 'p', 'td', 'th', 'tr'), $exclude);
4628  
4629          while (in_array(end($this->stack)->nodeName, $elements)) {
4630              array_pop($this->stack);
4631          }
4632      }
4633  
4634      private function getElementCategory($node)
4635      {
4636          $name = $node->tagName;
4637          if (in_array($name, $this->special)) {
4638              return self::SPECIAL;
4639          } elseif (in_array($name, $this->scoping)) {
4640              return self::SCOPING;
4641          } elseif (in_array($name, $this->formatting)) {
4642              return self::FORMATTING;
4643          } else {
4644              return self::PHRASING;
4645          }
4646      }
4647  
4648      private function clearStackToTableContext($elements)
4649      {
4650          /* When the steps above require the UA to clear the stack back to a
4651          table context, it means that the UA must, while the current node is not
4652          a table element or an html element, pop elements from the stack of open
4653          elements. If this causes any elements to be popped from the stack, then
4654          this is a parse error. */
4655          while (true) {
4656              $node = end($this->stack)->nodeName;
4657  
4658              if (in_array($node, $elements)) {
4659                  break;
4660              } else {
4661                  array_pop($this->stack);
4662              }
4663          }
4664      }
4665  
4666      private function resetInsertionMode()
4667      {
4668          /* 1. Let last be false. */
4669          $last = false;
4670          $leng = count($this->stack);
4671  
4672          for ($n = $leng - 1; $n >= 0; $n--) {
4673              /* 2. Let node be the last node in the stack of open elements. */
4674              $node = $this->stack[$n];
4675  
4676              /* 3. If node is the first node in the stack of open elements, then
4677              set last to true. If the element whose innerHTML  attribute is being
4678              set is neither a td  element nor a th element, then set node to the
4679              element whose innerHTML  attribute is being set. (innerHTML  case) */
4680              if ($this->stack[0]->isSameNode($node)) {
4681                  $last = true;
4682              }
4683  
4684              /* 4. If node is a select element, then switch the insertion mode to
4685              "in select" and abort these steps. (innerHTML case) */
4686              if ($node->nodeName === 'select') {
4687                  $this->mode = self::IN_SELECT;
4688                  break;
4689  
4690                  /* 5. If node is a td or th element, then switch the insertion mode
4691                  to "in cell" and abort these steps. */
4692              } elseif ($node->nodeName === 'td' || $node->nodeName === 'th') {
4693                  $this->mode = self::IN_CELL;
4694                  break;
4695  
4696                  /* 6. If node is a tr element, then switch the insertion mode to
4697                  "in    row" and abort these steps. */
4698              } elseif ($node->nodeName === 'tr') {
4699                  $this->mode = self::IN_ROW;
4700                  break;
4701  
4702                  /* 7. If node is a tbody, thead, or tfoot element, then switch the
4703                  insertion mode to "in table body" and abort these steps. */
4704              } elseif (in_array($node->nodeName, array('tbody', 'thead', 'tfoot'))) {
4705                  $this->mode = self::IN_TBODY;
4706                  break;
4707  
4708                  /* 8. If node is a caption element, then switch the insertion mode
4709                  to "in caption" and abort these steps. */
4710              } elseif ($node->nodeName === 'caption') {
4711                  $this->mode = self::IN_CAPTION;
4712                  break;
4713  
4714                  /* 9. If node is a colgroup element, then switch the insertion mode
4715                  to "in column group" and abort these steps. (innerHTML case) */
4716              } elseif ($node->nodeName === 'colgroup') {
4717                  $this->mode = self::IN_CGROUP;
4718                  break;
4719  
4720                  /* 10. If node is a table element, then switch the insertion mode
4721                  to "in table" and abort these steps. */
4722              } elseif ($node->nodeName === 'table') {
4723                  $this->mode = self::IN_TABLE;
4724                  break;
4725  
4726                  /* 11. If node is a head element, then switch the insertion mode
4727                  to "in body" ("in body"! not "in head"!) and abort these steps.
4728                  (innerHTML case) */
4729              } elseif ($node->nodeName === 'head') {
4730                  $this->mode = self::IN_BODY;
4731                  break;
4732  
4733                  /* 12. If node is a body element, then switch the insertion mode to
4734                  "in body" and abort these steps. */
4735              } elseif ($node->nodeName === 'body') {
4736                  $this->mode = self::IN_BODY;
4737                  break;
4738  
4739                  /* 13. If node is a frameset element, then switch the insertion
4740                  mode to "in frameset" and abort these steps. (innerHTML case) */
4741              } elseif ($node->nodeName === 'frameset') {
4742                  $this->mode = self::IN_FRAME;
4743                  break;
4744  
4745                  /* 14. If node is an html element, then: if the head element
4746                  pointer is null, switch the insertion mode to "before head",
4747                  otherwise, switch the insertion mode to "after head". In either
4748                  case, abort these steps. (innerHTML case) */
4749              } elseif ($node->nodeName === 'html') {
4750                  $this->mode = ($this->head_pointer === null)
4751                      ? self::BEFOR_HEAD
4752                      : self::AFTER_HEAD;
4753  
4754                  break;
4755  
4756                  /* 15. If last is true, then set the insertion mode to "in body"
4757                  and    abort these steps. (innerHTML case) */
4758              } elseif ($last) {
4759                  $this->mode = self::IN_BODY;
4760                  break;
4761              }
4762          }
4763      }
4764  
4765      private function closeCell()
4766      {
4767          /* If the stack of open elements has a td or th element in table scope,
4768          then act as if an end tag token with that tag name had been seen. */
4769          foreach (array('td', 'th') as $cell) {
4770              if ($this->elementInScope($cell, true)) {
4771                  $this->inCell(
4772                      array(
4773                          'name' => $cell,
4774                          'type' => HTML5::ENDTAG
4775                      )
4776                  );
4777  
4778                  break;
4779              }
4780          }
4781      }
4782  
4783      public function save()
4784      {
4785          return $this->dom;
4786      }
4787  }


Generated: Thu Aug 11 10:00:09 2016 Cross-referenced by PHPXref 0.7.1