[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
<?php

/** PHPExcel root directory */
if (!defined('PHPEXCEL_ROOT')) {
    /**
     * @ignore
     */
    define('PHPEXCEL_ROOT', dirname(__FILE__) . '/../../');
    require (PHPEXCEL_ROOT . 'PHPExcel/Autoloader.php');
}

/**
 * PHPExcel_Reader_HTML
 *
 * Reads an HTML document into a worksheet by walking its DOM tree: block
 * elements (headings, paragraphs, list items, rules) advance the row cursor,
 * table cells advance the column cursor, and text nodes are accumulated into
 * the content of the current cell.
 *
 * Copyright (c) 2006 - 2015 PHPExcel
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
 *
 * @category   PHPExcel
 * @package    PHPExcel_Reader
 * @copyright  Copyright (c) 2006 - 2015 PHPExcel (http://www.codeplex.com/PHPExcel)
 * @license    http://www.gnu.org/licenses/old-licenses/lgpl-2.1.txt    LGPL
 * @version    ##VERSION##, ##DATE##
 */
class PHPExcel_Reader_HTML extends PHPExcel_Reader_Abstract implements PHPExcel_Reader_IReader
{

    /**
     * Input encoding
     *
     * Stored and exposed through the getter/setter below; nothing in this
     * class reads it while parsing.
     *
     * @var string
     */
    protected $inputEncoding = 'ANSI';

    /**
     * Sheet index to read
     *
     * @var int
     */
    protected $sheetIndex = 0;

    /**
     * Formats
     *
     * Style arrays keyed by HTML tag name; applied with applyFromArray() to
     * the cell that receives the content of the matching element.
     *
     * @var array
     */
    protected $formats = array(
        'h1' => array(
            'font' => array(
                'bold' => true,
                'size' => 24,
            ),
        ), //    Bold, 24pt
        'h2' => array(
            'font' => array(
                'bold' => true,
                'size' => 18,
            ),
        ), //    Bold, 18pt
        'h3' => array(
            'font' => array(
                'bold' => true,
                'size' => 13.5,
            ),
        ), //    Bold, 13.5pt
        'h4' => array(
            'font' => array(
                'bold' => true,
                'size' => 12,
            ),
        ), //    Bold, 12pt
        'h5' => array(
            'font' => array(
                'bold' => true,
                'size' => 10,
            ),
        ), //    Bold, 10pt
        'h6' => array(
            'font' => array(
                'bold' => true,
                'size' => 7.5,
            ),
        ), //    Bold, 7.5pt
        'a' => array(
            'font' => array(
                'underline' => true,
                'color' => array(
                    'argb' => PHPExcel_Style_Color::COLOR_BLUE,
                ),
            ),
        ), //    Blue underlined
        'hr' => array(
            'borders' => array(
                'bottom' => array(
                    'style' => PHPExcel_Style_Border::BORDER_THIN,
                    'color' => array(
                        // Keyed by 'argb' like the hyperlink colour above; an
                        // unkeyed value is presumably ignored by the style API
                        // (see PHPExcel_Style_Color::applyFromArray)
                        'argb' => PHPExcel_Style_Color::COLOR_BLACK,
                    ),
                ),
            ),
        ), //    Bottom border
    );

    /**
     * Cell coordinates already covered by a rowspan/colspan+rowspan merge,
     * stored as coordinate => true so later cells can skip over them
     *
     * @var array
     */
    protected $rowspan = array();

    /**
     * Create a new PHPExcel_Reader_HTML
     */
    public function __construct()
    {
        $this->readFilter = new PHPExcel_Reader_DefaultReadFilter();
    }

    /**
     * Validate that the current file is an HTML file
     *
     * The check is heuristic: the sample must contain a '<' and strip_tags()
     * must actually remove something from it.
     *
     * @return boolean
     */
    protected function isValidFormat()
    {
        //    Reading 2048 bytes should be enough to validate that the format is HTML
        $data = fread($this->fileHandle, 2048);
        if ((strpos($data, '<') !== false) &&
                (strlen($data) !== strlen(strip_tags($data)))) {
            return true;
        }

        return false;
    }

    /**
     * Loads PHPExcel from file
     *
     * @param  string                    $pFilename
     * @return PHPExcel
     * @throws PHPExcel_Reader_Exception
     */
    public function load($pFilename)
    {
        // Create new PHPExcel
        $objPHPExcel = new PHPExcel();

        // Load into this instance
        return $this->loadIntoExisting($pFilename, $objPHPExcel);
    }

    /**
     * Set input encoding
     *
     * @param  string               $pValue Input encoding
     * @return PHPExcel_Reader_HTML
     */
    public function setInputEncoding($pValue = 'ANSI')
    {
        $this->inputEncoding = $pValue;

        return $this;
    }

    /**
     * Get input encoding
     *
     * @return string
     */
    public function getInputEncoding()
    {
        return $this->inputEncoding;
    }

    //    Data Array used for testing only, should write to PHPExcel object on completion of tests
    protected $dataArray = array();
    //    Current table nesting depth (0 = not inside any table)
    protected $tableLevel = 0;
    //    Start column recorded for each table nesting level, indexed by level
    protected $nestedColumn = array('A');

    /**
     * Enter a (possibly nested) table and record the column it starts in
     *
     * A top-level table always starts in column 'A', whatever column is passed.
     *
     * @param  string $column Column the cursor is in when the table is opened
     * @return string Start column recorded for the new table level
     */
    protected function setTableStartColumn($column)
    {
        if ($this->tableLevel == 0) {
            $column = 'A';
        }
        ++$this->tableLevel;
        $this->nestedColumn[$this->tableLevel] = $column;

        return $this->nestedColumn[$this->tableLevel];
    }

    /**
     * Get the start column recorded for the current table level
     *
     * @return string
     */
    protected function getTableStartColumn()
    {
        return $this->nestedColumn[$this->tableLevel];
    }

    /**
     * Leave the current table level
     *
     * @return string The start column that was recorded for the level just left
     */
    protected function releaseTableStartColumn()
    {
        --$this->tableLevel;

        return array_pop($this->nestedColumn);
    }

    /**
     * Write the accumulated cell content to the worksheet and reset the buffer
     *
     * String content that is empty after trimming is not written.
     * $cellContent is always reset to an empty string afterwards.
     *
     * @param PHPExcel_Worksheet $sheet       Worksheet being populated
     * @param string             $column      Column letter(s) of the target cell
     * @param int                $row         Row number of the target cell
     * @param mixed              $cellContent Buffered content (passed by reference, cleared on return)
     */
    protected function flushCell($sheet, $column, $row, &$cellContent)
    {
        if (is_string($cellContent)) {
            //    Simple String content
            if (trim($cellContent) > '') {
                //    Only actually write it if there's content in the string
                //    echo 'FLUSH CELL: ' , $column , $row , ' => ' , $cellContent , '<br />';
                //    Write to worksheet to be done here...
                //    ... we return the cell so we can mess about with styles more easily
                $sheet->setCellValue($column . $row, $cellContent, true);
                $this->dataArray[$row][$column] = $cellContent;
            }
        } else {
            //    We have a Rich Text run
            //    TODO
            $this->dataArray[$row][$column] = 'RICH TEXT: ' . $cellContent;
        }
        $cellContent = (string) '';
    }

    /**
     * Recursively walk the children of a DOM node and transfer them to the worksheet
     *
     * $row, $column and $cellContent are shared cursors passed by reference,
     * so changes made while processing one element are seen by its siblings
     * and ancestors.
     *
     * @param DOMNode            $element     Node whose children are processed
     * @param PHPExcel_Worksheet $sheet       Worksheet being populated
     * @param int                $row         Current row cursor (by reference)
     * @param string             $column      Current column cursor (by reference)
     * @param mixed              $cellContent Text buffered for the current cell (by reference)
     * @param mixed              $format      Not used by this implementation
     */
    protected function processDomElement(DOMNode $element, $sheet, &$row, &$column, &$cellContent, $format = null)
    {
        foreach ($element->childNodes as $child) {
            if ($child instanceof DOMText) {
                //    Collapse every run of whitespace in the text node to a single space
                $domText = preg_replace('/\s+/u', ' ', trim($child->nodeValue));
                if (is_string($cellContent)) {
                    //    simply append the text if the cell content is a plain text string
                    $cellContent .= $domText;
                } else {
                    //    but if we have a rich text run instead, we need to append it correctly
                    //    TODO
                }
            } elseif ($child instanceof DOMElement) {
                //    echo '<b>DOM ELEMENT: </b>' , strtoupper($child->nodeName) , '<br />';

                $attributeArray = array();
                foreach ($child->attributes as $attribute) {
                    //    echo '<b>ATTRIBUTE: </b>' , $attribute->name , ' => ' , $attribute->value , '<br />';
                    $attributeArray[$attribute->name] = $attribute->value;
                }

                switch ($child->nodeName) {
                    case 'meta':
                        foreach ($attributeArray as $attributeName => $attributeValue) {
                            switch ($attributeName) {
                                case 'content':
                                    //    TODO
                                    //    Extract character set, so we can convert to UTF-8 if required
                                    break;
                            }
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'title':
                        //    The document title becomes the worksheet title, not cell content
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        $sheet->setTitle($cellContent);
                        $cellContent = '';
                        break;
                    case 'span':
                    case 'div':
                    case 'font':
                    case 'i':
                    case 'em':
                    case 'strong':
                    case 'b':
                        //    echo 'STYLING, SPAN OR DIV<br />';
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        if ($cellContent > '') {
                            $cellContent .= ' ';
                        }
                        //    echo 'END OF STYLING, SPAN OR DIV<br />';
                        break;
                    case 'hr':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        ++$row;
                        if (isset($this->formats[$child->nodeName])) {
                            $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                        } else {
                            $cellContent = '----------';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                        }
                        ++$row;
                        //    Add a break after a horizontal rule, simply by allowing the code to dropthru
                    case 'br':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                        } else {
                            //    Otherwise flush our existing content and move the row cursor on
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            ++$row;
                        }
                        //    echo 'HARD LINE BREAK: ' , '<br />';
                        break;
                    case 'a':
                        //    echo 'START OF HYPERLINK: ' , '<br />';
                        foreach ($attributeArray as $attributeName => $attributeValue) {
                            switch ($attributeName) {
                                case 'href':
                                    //    echo 'Link to ' , $attributeValue , '<br />';
                                    $sheet->getCell($column . $row)->getHyperlink()->setUrl($attributeValue);
                                    if (isset($this->formats[$child->nodeName])) {
                                        $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                                    }
                                    break;
                            }
                        }
                        $cellContent .= ' ';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        //    echo 'END OF HYPERLINK:' , '<br />';
                        break;
                    case 'h1':
                    case 'h2':
                    case 'h3':
                    case 'h4':
                    case 'h5':
                    case 'h6':
                    case 'ol':
                    case 'ul':
                    case 'p':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            //    echo 'LIST ENTRY: ' , '<br />';
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            //    echo 'END OF LIST ENTRY:' , '<br />';
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                                $row++;
                            }
                            //    echo 'START OF PARAGRAPH: ' , '<br />';
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            //    echo 'END OF PARAGRAPH:' , '<br />';
                            $this->flushCell($sheet, $column, $row, $cellContent);

                            if (isset($this->formats[$child->nodeName])) {
                                $sheet->getStyle($column . $row)->applyFromArray($this->formats[$child->nodeName]);
                            }

                            $row++;
                            $column = 'A';
                        }
                        break;
                    case 'li':
                        if ($this->tableLevel > 0) {
                            //    If we're inside a table, replace with a \n
                            $cellContent .= "\n";
                            //    echo 'LIST ENTRY: ' , '<br />';
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            //    echo 'END OF LIST ENTRY:' , '<br />';
                        } else {
                            if ($cellContent > '') {
                                $this->flushCell($sheet, $column, $row, $cellContent);
                            }
                            ++$row;
                            //    echo 'LIST ENTRY: ' , '<br />';
                            $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                            //    echo 'END OF LIST ENTRY:' , '<br />';
                            $this->flushCell($sheet, $column, $row, $cellContent);
                            $column = 'A';
                        }
                        break;
                    case 'table':
                        $this->flushCell($sheet, $column, $row, $cellContent);
                        $column = $this->setTableStartColumn($column);
                        //    echo 'START OF TABLE LEVEL ' , $this->tableLevel , '<br />';
                        if ($this->tableLevel > 1) {
                            //    Step the row cursor back one row for a nested table
                            --$row;
                        }
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        //    echo 'END OF TABLE LEVEL ' , $this->tableLevel , '<br />';
                        $column = $this->releaseTableStartColumn();
                        if ($this->tableLevel > 1) {
                            ++$column;
                        } else {
                            ++$row;
                        }
                        break;
                    case 'thead':
                    case 'tbody':
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    case 'tr':
                        $column = $this->getTableStartColumn();
                        $cellContent = '';
                        //    echo 'START OF TABLE ' , $this->tableLevel , ' ROW<br />';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        ++$row;
                        //    echo 'END OF TABLE ' , $this->tableLevel , ' ROW<br />';
                        break;
                    case 'th':
                    case 'td':
                        //    echo 'START OF TABLE ' , $this->tableLevel , ' CELL<br />';
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        //    echo 'END OF TABLE ' , $this->tableLevel , ' CELL<br />';

                        //    Skip over cells already claimed by a rowspan merge from an earlier row
                        while (isset($this->rowspan[$column . $row])) {
                            ++$column;
                        }

                        $this->flushCell($sheet, $column, $row, $cellContent);

//                        if (isset($attributeArray['style']) && !empty($attributeArray['style'])) {
//                            $styleAry = $this->getPhpExcelStyleArray($attributeArray['style']);
//
//                            if (!empty($styleAry)) {
//                                $sheet->getStyle($column . $row)->applyFromArray($styleAry);
//                            }
//                        }

                        if (isset($attributeArray['rowspan']) && isset($attributeArray['colspan'])) {
                            //create merging rowspan and colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
                                ++$columnTo;
                            }
                            $range = $column . $row . ':' . $columnTo . ($row + $attributeArray['rowspan'] - 1);
                            foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                            $column = $columnTo;
                        } elseif (isset($attributeArray['rowspan'])) {
                            //create merging rowspan
                            $range = $column . $row . ':' . $column . ($row + $attributeArray['rowspan'] - 1);
                            foreach (\PHPExcel_Cell::extractAllCellReferencesInRange($range) as $value) {
                                $this->rowspan[$value] = true;
                            }
                            $sheet->mergeCells($range);
                        } elseif (isset($attributeArray['colspan'])) {
                            //create merging colspan
                            $columnTo = $column;
                            for ($i = 0; $i < $attributeArray['colspan'] - 1; $i++) {
                                ++$columnTo;
                            }
                            $sheet->mergeCells($column . $row . ':' . $columnTo . $row);
                            $column = $columnTo;
                        }
                        ++$column;
                        break;
                    case 'body':
                        //    Reset the cursors: body content starts at A1, outside any table
                        $row = 1;
                        $column = 'A';
                        $this->tableLevel = 0;
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                        break;
                    default:
                        $this->processDomElement($child, $sheet, $row, $column, $cellContent);
                }
            }
        }
    }

    /**
     * Loads PHPExcel from file into PHPExcel instance
     *
     * @param  string                    $pFilename
     * @param  PHPExcel                  $objPHPExcel
     * @return PHPExcel
     * @throws PHPExcel_Reader_Exception If the file does not look like HTML or cannot be parsed into a DOM
     */
    public function loadIntoExisting($pFilename, PHPExcel $objPHPExcel)
    {
        // Open file to validate
        $this->openFile($pFilename);
        if (!$this->isValidFormat()) {
            fclose($this->fileHandle);
            throw new PHPExcel_Reader_Exception($pFilename . " is an Invalid HTML file.");
        }
        //    Close after validating
        fclose($this->fileHandle);

        // Make sure the workbook has a sheet at the requested index, then select it
        while ($objPHPExcel->getSheetCount() <= $this->sheetIndex) {
            $objPHPExcel->createSheet();
        }
        $objPHPExcel->setActiveSheetIndex($this->sheetIndex);

        //    Create a new DOM object
        $dom = new DOMDocument;
        //    Reload the HTML file into the DOM object
        $loaded = $dom->loadHTML($this->securityScanFile($pFilename));
        if ($loaded === false) {
            // The message parts must be concatenated: passing them as separate
            // arguments would supply the filename as the exception code
            throw new PHPExcel_Reader_Exception('Failed to load ' . $pFilename . ' as a DOM Document');
        }

        //    Discard white space
        //    NOTE(review): set after loadHTML(), so it may have no effect on the parsed document - confirm
        $dom->preserveWhiteSpace = false;

        $row = 0;
        $column = 'A';
        $content = '';
        $this->processDomElement($dom, $objPHPExcel->getActiveSheet(), $row, $column, $content);

        // Return
        return $objPHPExcel;
    }

    /**
     * Get sheet index
     *
     * @return int
     */
    public function getSheetIndex()
    {
        return $this->sheetIndex;
    }

    /**
     * Set sheet index
     *
     * @param  int                  $pValue Sheet index
     * @return PHPExcel_Reader_HTML
     */
    public function setSheetIndex($pValue = 0)
    {
        $this->sheetIndex = $pValue;

        return $this;
    }

    /**
     * Scan the XML for use of <!ENTITY to prevent XXE/XEE attacks
     *
     * The pattern tolerates an optional NUL byte before, between and after the
     * characters of "<!ENTITY" (presumably so that multi-byte encodings of the
     * declaration are detected as well).
     *
     * @param  string                    $xml
     * @return string                    The input, unchanged, when no ENTITY declaration is found
     * @throws PHPExcel_Reader_Exception
     */
    public function securityScan($xml)
    {
        $pattern = '/\\0?' . implode('\\0?', str_split('<!ENTITY')) . '\\0?/';
        if (preg_match($pattern, $xml)) {
            throw new PHPExcel_Reader_Exception('Detected use of ENTITY in XML, spreadsheet file load() aborted to prevent XXE/XEE attacks');
        }
        return $xml;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Aug 11 10:00:09 2016 | Cross-referenced by PHPXref 0.7.1 |