[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/lib/phpexcel/PHPExcel/Reader/ -> HTML.php (source)

   1  <?php
   2  
   3  if (defined('PHPEXCEL_ROOT') === false) {
   4      /**
   5       * @ignore
   6       */
   7      define('PHPEXCEL_ROOT', dirname(__FILE__) . '/../../');
   8      require PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php';
   9  }
  10  
  11  /**
  12   * PHPExcel_Reader_HTML
  13   *
  14   * Copyright (c) 2006 - 2015 PHPExcel
  15   *
  16   * This library is free software; you can redistribute it and/or
  17   * modify it under the terms of the GNU Lesser General Public
  18   * License as published by the Free Software Foundation; either
  19   * version 2.1 of the License, or (at your option) any later version.
  20   *
  21   * This library is distributed in the hope that it will be useful,
  22   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  23   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  24   * Lesser General Public License for more details.
  25   *
  26   * You should have received a copy of the GNU Lesser General Public
  27   * License along with this library; if not, write to the Free Software
  28   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
  29   *
  30   * @category   PHPExcel
  31   * @package    PHPExcel_Reader
  32   * @copyright  Copyright (c) 2006 - 2015 PHPExcel (http://www.codeplex.com/PHPExcel)
  33   * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt    LGPL
  34   * @version    ##VERSION##, ##DATE##
  35   */
  36  /** Reader that loads the contents of an HTML file into a PHPExcel worksheet */
  37  class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader
  38  {
  39  
  40      /**
  41       * Input encoding name, held for the setInputEncoding()/getInputEncoding() accessors
  42       *
  43       * @var string
  44       */
  45      protected $inputEncoding = 'ANSI';
  46  
  47      /**
  48       * Index of the worksheet that loadIntoExisting() activates and populates
  49       *
  50       * @var int
  51       */
  52      protected $sheetIndex = 0;
  53  
  54      /**
  55       * Style arrays applied to generated cells, keyed by the HTML tag name that triggers them
  56       *
  57       * @var array
  58       */
  59      protected $formats = array(
  60          'h1' => array(
  61              'font' => array(
  62                  'bold' => true,
  63                  'size' => 24,
  64              ),
  65          ), //    Bold, 24pt
  66          'h2' => array(
  67              'font' => array(
  68                  'bold' => true,
  69                  'size' => 18,
  70              ),
  71          ), //    Bold, 18pt
  72          'h3' => array(
  73              'font' => array(
  74                  'bold' => true,
  75                  'size' => 13.5,
  76              ),
  77          ), //    Bold, 13.5pt
  78          'h4' => array(
  79              'font' => array(
  80                  'bold' => true,
  81                  'size' => 12,
  82              ),
  83          ), //    Bold, 12pt
  84          'h5' => array(
  85              'font' => array(
  86                  'bold' => true,
  87                  'size' => 10,
  88              ),
  89          ), //    Bold, 10pt
  90          'h6' => array(
  91              'font' => array(
  92                  'bold' => true,
  93                  'size' => 7.5,
  94              ),
  95          ), //    Bold, 7.5pt
  96          'a' => array(
  97              'font' => array(
  98                  'underline' => true,
  99                  'color' => array(
 100                      'argb' => PHPExcel_Style_Color::COLOR_BLUE,
 101                  ),
 102              ),
 103          ), //    Blue underlined
 104          'hr' => array(
 105              'borders' => array(
 106                  'bottom' => array(
 107                      'style' => PHPExcel_Style_Border::BORDER_THIN,
 108                      'color' => array(
 109                          'argb' => PHPExcel_Style_Color::COLOR_BLACK, // keyed like the 'a' colour above
 110                      ),
 111                  ),
 112              ),
 113          ), //    Thin black bottom border
 114      );
 115  
 116      protected $rowspan = array(); // cell coordinate (column . row) => true for cells claimed by a rowspan merge
 117  
 118      /**
 119       * Create a new PHPExcel_Reader_HTML with a default read filter installed
 120       */
 121      public function __construct()
 122      {
 123          $this->readFilter = new PHPExcel_Reader_DefaultReadFilter();
 124      }
 125  
 126      /**
 127       * Sniff the start of the currently open file to decide whether it looks like HTML
 128       *
 129       * @return boolean True when the sampled bytes contain markup, false otherwise
 130       */
 131      protected function isValidFormat()
 132      {
 133          //    Only the first 2048 bytes are sampled from the already opened file handle
 134          $sample = fread($this->fileHandle, 2048);
 135          if (strpos($sample, '<') === false) {
 136              return false;
 137          }
 138  
 139          //    strip_tags() only shortens the sample when it really contains tags
 140          return strlen($sample) !== strlen(strip_tags($sample));
 141      }
 142  
 143      /**
 144       * Read an HTML file into a freshly created PHPExcel workbook
 145       *
 146       * @param  string                    $pFilename Path of the HTML file to read
 147       * @return PHPExcel                  The workbook returned by loadIntoExisting()
 148       * @throws PHPExcel_Reader_Exception
 149       */
 150      public function load($pFilename)
 151      {
 152          //    Hand a new, empty workbook to loadIntoExisting() and pass its result straight back
 153          return $this->loadIntoExisting(
 154              $pFilename,
 155              new PHPExcel()
 156          );
 157      }
 158  
 159      /**
 160       * Set input encoding
 161       *
 162       * @param  string               $pValue Input encoding (defaults to 'ANSI')
            * @return PHPExcel_Reader_HTML This reader, so calls can be chained
 163       */
 164      public function setInputEncoding($pValue = 'ANSI')
 165      {
 166          $this->inputEncoding = $pValue;
 167  
 168          return $this;
 169      }
 170  
 171      /**
 172       * Get the input encoding currently stored on this reader
 173       *
 174       * @return string
 175       */
 176      public function getInputEncoding()
 177      {
 178          return $this->inputEncoding;
 179      }
 180  
 181      //    Parser state. The data array is marked as for testing only; the original note says to write to the PHPExcel object once tests are complete
 182      protected $dataArray = array(); // [row][column] => value recorded by flushCell()
 183      protected $tableLevel = 0; // current <table> nesting depth; 0 means outside any table
 184      protected $nestedColumn = array('A'); // start column remembered for each table nesting level
 185  
          /**
           * Enter a new table nesting level and remember the column it starts in
           *
           * @param  string $column Column the parser is currently on
           * @return string The start column recorded for the new level
           */
 186      protected function setTableStartColumn($column)
 187      {
 188          //    A table opened outside any other table always starts in column A
 189          $startColumn = ($this->tableLevel == 0) ? 'A' : $column;
 190          ++$this->tableLevel;
 191          $this->nestedColumn[$this->tableLevel] = $startColumn;
 192  
 193          //    Same value that was just stored for the new level
 194          return $startColumn;
 195      }
 196  
          /**
           * Get the start column recorded for the current table nesting level
           *
           * @return string
           */
 197      protected function getTableStartColumn()
 198      {
 199          return $this->nestedColumn[$this->tableLevel];
 200      }
 201  
          /**
           * Leave the current table nesting level
           *
           * @return string The last entry removed from the nested start-column stack
           */
 202      protected function releaseTableStartColumn()
 203      {
 204          $released = array_pop($this->nestedColumn);
 205          --$this->tableLevel;
 206          return $released;
 207      }
 208  
          /**
           * Write the pending cell text to the sheet, then empty the buffer
           *
           * String content is written only when something remains after trimming, and is
           * mirrored into $this->dataArray. Non-string content (rich text, still a TODO)
           * is only recorded in $this->dataArray.
           *
           * @param mixed  $sheet       Object providing setCellValue(); not type-hinted
           * @param string $column      Column part of the target cell coordinate
           * @param int    $row         Row part of the target cell coordinate
           * @param mixed  $cellContent Buffer, passed by reference; always reset to ''
           */
 209      protected function flushCell($sheet, $column, $row, &$cellContent)
 210      {
 211          if (!is_string($cellContent)) {
 212              //    We have a Rich Text run
 213              //    TODO
 214              $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
 215          } elseif (trim($cellContent) > '') {
 216              //    Simple string content; blank strings are skipped entirely
 217              //    Write to the worksheet; the original note says the cell is returned (third
 218              //    argument) to ease later styling, but the return value is not used here
 219              $sheet->setCellValue($column . $row, $cellContent, true);
 220              //    Mirror the value into the test data array
 221              $this->dataArray[$row][$column] = $cellContent;
 222          }
 223  
 224          //    The buffer is shared by reference with the caller, so reset it for
 225          //    the next cell
 226          $cellContent = '';
 227      }
 228  
          /**
           * Recursively walk the children of a DOM node and write their content to the sheet
           *
           * Text nodes are appended to the $cellContent buffer (while it is a string).
           * Element nodes are dispatched on their tag name: some flush the buffer into the
           * current cell, some move the row/column cursor, and most recurse into their own
           * children. $row, $column and $cellContent are passed by reference, so every
           * level of the recursion shares one cursor and one buffer.
           *
           * @param DOMNode $element     Node whose child nodes are processed
           * @param mixed   $sheet       Worksheet being populated (expected to be a PHPExcel_Worksheet; not type-hinted)
           * @param int     $row         Current row, updated in place
           * @param string  $column      Current column letter(s), updated in place
           * @param mixed   $cellContent Pending cell text, updated in place
           * @param mixed   $format      Not referenced anywhere in this method body
           */
 229      protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null)
 230      {
 231          foreach ($element->childNodes as $child) {
 232              if ($child instanceof DOMText) {
                          //    Trim the node text, then collapse remaining whitespace runs to a single space
 233                  $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
 234                  if (is_string($cellContent)) {
 235                      //    simply append the text if the cell content is a plain text string
 236                      $cellContent .= $domText;
 237                  } else {
 238                      //    but if we have a rich text run instead, we need to append it correctly
 239                      //    TODO
 240                  }
 241              } elseif ($child instanceof DOMElement) {
 242  //                echo '<b>DOM ELEMENT: </b>' , strtoupper($child->nodeName) , '<br />';
 243  
                          //    Collect the element's attributes as name => value pairs
 244                  $attributeArray = array();
 245                  foreach ($child->attributes as $attribute) {
 246  //                    echo '<b>ATTRIBUTE: </b>' , $attribute->name , ' => ' , $attribute->value , '<br />';
 247                      $attributeArray[$attribute->name] = $attribute->value;
 248                  }
 249  
 250                  switch ($child->nodeName) {
 251                      case 'meta':
 252                          foreach ($attributeArray as $attributeName => $attributeValue) {
 253                              switch ($attributeName) {
 254                                  case 'content':
 255                                      //    TODO
 256                                      //    Extract character set, so we can convert to UTF-8 if required
 257                                      break;
 258                              }
 259                          }
 260                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 261                          break;
 262                      case 'title':
                                  //    The text gathered from the title element becomes the sheet title; the buffer is then cleared
 263                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 264                          $sheet->setTitle($cellContent);
 265                          $cellContent = '';
 266                          break;
 267                      case 'span':
 268                      case 'div':
 269                      case 'font':
 270                      case 'i':
 271                      case 'em':
 272                      case 'strong':
 273                      case 'b':
 274  //                        echo 'STYLING, SPAN OR DIV<br />';
                                  //    Inline content is separated from its neighbours by a single space on each side
 275                          if ($cellContent > '') {
 276                              $cellContent .= ' ';
 277                          }
 278                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 279                          if ($cellContent > '') {
 280                              $cellContent .= ' ';
 281                          }
 282  //                        echo 'END OF STYLING, SPAN OR DIV<br />';
 283                          break;
 284                      case 'hr':
 285                          $this->flushCell($sheet, $column, $row, $cellContent);
 286                          ++$row;
 287                          if (isset($this->formats[$child->nodeName])) {
 288                              $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
 289                          } else {
 290                              $cellContent = '----------';
 291                              $this->flushCell($sheet, $column, $row, $cellContent);
 292                          }
 293                          ++$row;
 294                          // Add a break after a horizontal rule, simply by allowing the code to dropthru
 295                      case 'br':
 296                          if ($this->tableLevel > 0) {
 297                              //    If we're inside a table, replace with a \n
 298                              $cellContent .= "\n";
 299                          } else {
 300                              //    Otherwise flush our existing content and move the row cursor on
 301                              $this->flushCell($sheet, $column, $row, $cellContent);
 302                              ++$row;
 303                          }
 304  //                        echo 'HARD LINE BREAK: ' , '<br />';
 305                          break;
 306                      case 'a':
 307  //                        echo 'START OF HYPERLINK: ' , '<br />';
 308                          foreach ($attributeArray as $attributeName => $attributeValue) {
 309                              switch ($attributeName) {
 310                                  case 'href':
 311  //                                    echo 'Link to ' , $attributeValue , '<br />';
 312                                      $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
 313                                      if (isset($this->formats[$child->nodeName])) {
 314                                          $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
 315                                      }
 316                                      break;
 317                              }
 318                          }
 319                          $cellContent .= ' ';
 320                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 321  //                        echo 'END OF HYPERLINK:' , '<br />';
 322                          break;
 323                      case 'h1':
 324                      case 'h2':
 325                      case 'h3':
 326                      case 'h4':
 327                      case 'h5':
 328                      case 'h6':
 329                      case 'ol':
 330                      case 'ul':
 331                      case 'p':
 332                          if ($this->tableLevel > 0) {
 333                              //    If we're inside a table, replace with a \n
 334                              $cellContent .= "\n";
 335  //                            echo 'LIST ENTRY: ' , '<br />';
 336                              $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 337  //                            echo 'END OF LIST ENTRY:' , '<br />';
 338                          } else {
                                      //    Outside a table each block element gets a row of its own
 339                              if ($cellContent > '') {
 340                                  $this->flushCell($sheet, $column, $row, $cellContent);
 341                                  $row++;
 342                              }
 343  //                            echo 'START OF PARAGRAPH: ' , '<br />';
 344                              $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 345  //                            echo 'END OF PARAGRAPH:' , '<br />';
 346                              $this->flushCell($sheet, $column, $row, $cellContent);
 347  
 348                              if (isset($this->formats[$child->nodeName])) {
 349                                  $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
 350                              }
 351  
 352                              $row++;
 353                              $column = 'A';
 354                          }
 355                          break;
 356                      case 'li':
 357                          if ($this->tableLevel > 0) {
 358                              //    If we're inside a table, replace with a \n
 359                              $cellContent .= "\n";
 360  //                            echo 'LIST ENTRY: ' , '<br />';
 361                              $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 362  //                            echo 'END OF LIST ENTRY:' , '<br />';
 363                          } else {
 364                              if ($cellContent > '') {
 365                                  $this->flushCell($sheet, $column, $row, $cellContent);
 366                              }
 367                              ++$row;
 368  //                            echo 'LIST ENTRY: ' , '<br />';
 369                              $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 370  //                            echo 'END OF LIST ENTRY:' , '<br />';
 371                              $this->flushCell($sheet, $column, $row, $cellContent);
 372                              $column = 'A';
 373                          }
 374                          break;
 375                      case 'table':
                                  //    Flush pending text, then record the start column for this nesting level
 376                          $this->flushCell($sheet, $column, $row, $cellContent);
 377                          $column = $this->setTableStartColumn($column);
 378  //                        echo 'START OF TABLE LEVEL ' , $this->tableLevel , '<br />';
 379                          if ($this->tableLevel > 1) {
 380                              --$row;
 381                          }
 382                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 383  //                        echo 'END OF TABLE LEVEL ' , $this->tableLevel , '<br />';
 384                          $column = $this->releaseTableStartColumn();
 385                          if ($this->tableLevel > 1) {
 386                              ++$column;
 387                          } else {
 388                              ++$row;
 389                          }
 390                          break;
 391                      case 'thead':
 392                      case 'tbody':
 393                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 394                          break;
 395                      case 'tr':
                                  //    Each table row restarts at the table's start column with an empty buffer
 396                          $column = $this->getTableStartColumn();
 397                          $cellContent = '';
 398  //                        echo 'START OF TABLE ' , $this->tableLevel , ' ROW<br />';
 399                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 400                          ++$row;
 401  //                        echo 'END OF TABLE ' , $this->tableLevel , ' ROW<br />';
 402                          break;
 403                      case 'th':
 404                      case 'td':
 405  //                        echo 'START OF TABLE ' , $this->tableLevel , ' CELL<br />';
 406                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 407  //                        echo 'END OF TABLE ' , $this->tableLevel , ' CELL<br />';
 408  
                                  //    Skip columns on this row that an earlier rowspan merge has already claimed
 409                          while (isset($this->rowspan[$column . $row])) {
 410                              ++$column;
 411                          }
 412  
 413                          $this->flushCell($sheet, $column, $row, $cellContent);
 414  
 415  //                        if (isset($attributeArray['style']) && !empty($attributeArray['style'])) {
 416  //                            $styleAry = $this->getPhpExcelStyleArray($attributeArray['style']);
 417  //
 418  //                            if (!empty($styleAry)) {
 419  //                                $sheet->getStyle($column . $row)->applyFromArray($styleAry);
 420  //                            }
 421  //                        }
 422  
 423                          if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
 424                              //create merging rowspan and colspan
 425                              $columnTo = $column;
 426                              for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
 427                                  ++$columnTo;
 428                              }
 429                              $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
 430                              foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
 431                                  $this->rowspan[$value] = true;
 432                              }
 433                              $sheet->mergeCells($range);
 434                              $column = $columnTo;
 435                          } elseif (isset($attributeArray['rowspan'])) {
 436                              //create merging rowspan
 437                              $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
 438                              foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
 439                                  $this->rowspan[$value] = true;
 440                              }
 441                              $sheet->mergeCells($range);
 442                          } elseif (isset($attributeArray['colspan'])) {
 443                              //create merging colspan
 444                              $columnTo = $column;
 445                              for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
 446                                  ++$columnTo;
 447                              }
 448                              $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
 449                              $column = $columnTo;
 450                          }
 451                          ++$column;
 452                          break;
 453                      case 'body':
                                  //    The document body resets the cursor to cell A1 and the table nesting depth to zero
 454                          $row = 1;
 455                          $column = 'A';
 456                          $content = ''; // NOTE(review): $content is never read in this method; $cellContent is the buffer actually used
 457                          $this->tableLevel = 0;
 458                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 459                          break;
 460                      default:
                                  //    Any other element is transparent: just descend into its children
 461                          $this->processDomElement($child, $sheet, $row, $column, $cellContent);
 462                  }
 463              }
 464          }
 465      }
 466  
 467      /**
 468       * Read an HTML file into the sheet at $this->sheetIndex of an existing workbook
 469       *
 470       * @param  string                    $pFilename   Path of the HTML file to read
 471       * @param  PHPExcel                  $objPHPExcel Workbook to load into; returned once populated
 472       * @return PHPExcel
 473       * @throws PHPExcel_Reader_Exception When the file fails HTML validation or cannot be parsed into a DOM
 474       */
 475      public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
 476      {
 477          // Open file to validate
 478          $this->openFile($pFilename);
 479          if (!$this->isValidFormat()) {
 480              fclose($this->fileHandle);
 481              throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
 482          }
 483          //    Close after validating
 484          fclose($this->fileHandle);
 485  
 486          // Make sure a sheet exists at the requested index, creating sheets as needed
 487          while ($objPHPExcel->getSheetCount() <= $this->sheetIndex) {
 488              $objPHPExcel->createSheet();
 489          }
 490          $objPHPExcel->setActiveSheetIndex($this->sheetIndex);
 491  
 492          //    Create a new DOM object
 493          $dom = new DOMDocument();
 494          //    Reload the HTML file into the DOM object
 495          $loaded = $dom->loadHTML($this->securityScanFile($pFilename));
 496          if ($loaded === false) {
 497              throw new PHPExcel_Reader_Exception('Failed to load ' . $pFilename . ' as a DOM Document');
 498          }
 499  
 500          //    Discard white space. NOTE(review): this is set after loadHTML(), so it may have no effect on the parsed document - confirm
 501          $dom->preserveWhiteSpace = false;
 502  
 503          $row = 0;
 504          $column = 'A';
 505          $content = '';
 506          $this->processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $content);
 507  
 508          // Return
 509          return $objPHPExcel;
 510      }
 511  
 512      /**
 513       * Get the index of the worksheet that loadIntoExisting() will load into
 514       *
 515       * @return int
 516       */
 517      public function getSheetIndex()
 518      {
 519          return $this->sheetIndex;
 520      }
 521  
 522      /**
 523       * Set the index of the worksheet that loadIntoExisting() will load into
 524       *
 525       * @param  int                  $pValue Sheet index (defaults to 0)
 526       * @return PHPExcel_Reader_HTML This reader, so calls can be chained
 527       */
 528      public function setSheetIndex($pValue = 0)
 529      {
 530          $this->sheetIndex = $pValue;
 531  
 532          return $this;
 533      }
 534  
 535      /**
 536       * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks
 537       *
            * The match tolerates an optional NUL byte before, between and after the
            * characters of the marker.
            *
 538       * @param  string $xml Content to inspect
            * @return string The unmodified $xml when no entity declaration is found
 539       * @throws PHPExcel_Reader_Exception When an entity declaration is detected
 540       */
 541      public function securityScan($xml)
 542      {
 543          $marker = implode('\\0?', str_split('<!ENTITY'));
 544          if (preg_match('/\\0?' . $marker . '\\0?/', $xml)) {
 545              throw new PHPExcel_Reader_Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
 546          }
 547          return $xml;
 548      }
 549  }


Generated: Thu Aug 11 10:00:09 2016 Cross-referenced by PHPXref 0.7.1