[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
<?php

/*
 * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
 *
 * This script is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * The GNU General Public License can be found at
 * http://www.gnu.org/copyleft/gpl.html.
 *
 * This script is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 */

namespace Html2Text;

/**
 * Converts HTML markup into formatted plain text.
 *
 * The conversion is a pipeline of regular-expression passes (see
 * converter()): BLOCKQUOTE and PRE bodies are pre-processed, known tags are
 * rewritten to text equivalents, remaining tags are stripped, entities are
 * decoded, blank lines are normalised and the result is optionally
 * word-wrapped. The result is cached until new HTML is supplied via setHtml().
 */
class Html2Text
{
    const ENCODING = 'UTF-8';

    /**
     * Flags passed to html_entity_decode() and htmlspecialchars().
     * Assigned in the constructor because the value depends on the PHP version.
     *
     * @type int
     */
    protected $htmlFuncFlags;

    /**
     * Contains the HTML content to convert.
     *
     * @type string
     */
    protected $html;

    /**
     * Contains the converted, formatted text.
     *
     * @type string
     */
    protected $text;

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $replace.
     *
     * The patterns are applied in order by a single preg_replace() call, so
     * newlines are already collapsed to spaces by the time the tag patterns run.
     *
     * @type array
     * @see $replace
     */
    protected $search = array(
        "/\r/",                                           // Non-legal carriage return
        "/[\n\t]+/",                                      // Newlines and tabs
        '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
        '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
        '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
        '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
        '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
        '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
        '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
        '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
        '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
        '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
        '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
        '/<li\b[^>]*>/i',                                 // <li>
        '/<hr\b[^>]*>/i',                                 // <hr>
        '/<div\b[^>]*>/i',                                // <div>
        '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
        '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
        '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
        '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
        '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @type array
     * @see $search
     */
    protected $replace = array(
        '',                              // Non-legal carriage return
        ' ',                             // Newlines and tabs
        '',                              // <head>
        '',                              // <script>s -- which strip_tags supposedly has problems with
        '',                              // <style>s -- which strip_tags supposedly has problems with
        '_\\1_',                         // <i>
        '_\\1_',                         // <em>
        "\n\n",                          // <ul> and </ul>
        "\n\n",                          // <ol> and </ol>
        "\n\n",                          // <dl> and </dl>
        "\t* \\1\n",                     // <li> and </li>
        " \\1\n",                        // <dd> and </dd>
        "\t* \\1",                       // <dt> and </dt>
        "\n\t* ",                        // <li>
        "\n-------------------------\n", // <hr>
        "<div>\n",                       // <div>
        "\n\n",                          // <table> and </table>
        "\n",                            // <tr> and </tr>
        "\t\t\\1\n",                     // <td> and </td>
        "",                              // <span class="_html2text_ignore">...</span>
        '[\\2]',                         // <img> with alt tag
    );

    /**
     * List of preg* regular expression patterns to search for,
     * used in conjunction with $entReplace.
     *
     * The first two entries match Windows-1252 numeric character references,
     * which html_entity_decode() does not turn into the intended symbols, so
     * they are mapped explicitly before the generic decode runs.
     *
     * @type array
     * @see $entReplace
     */
    protected $entSearch = array(
        '/&#153;/i',      // TM symbol in win-1252
        '/&#151;/i',      // m-dash in win-1252
        '/&(amp|#38);/i', // Ampersand: see converter()
        '/[ ]{2,}/',      // Runs of spaces, post-handling
    );

    /**
     * List of pattern replacements corresponding to patterns searched.
     *
     * @type array
     * @see $entSearch
     */
    protected $entReplace = array(
        '™',         // TM symbol
        '—',         // m-dash
        '|+|amp|+|', // Ampersand: see converter()
        ' ',         // Runs of spaces, post-handling
    );

    /**
     * List of preg* regular expression patterns to search for
     * and replace using callback function.
     *
     * The first capture group of every pattern is the tag name that
     * pregCallback() dispatches on.
     *
     * @type array
     */
    protected $callbackSearch = array(
        '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
        '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
        '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
        '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
        '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
        '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
        '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
    );

    /**
     * List of preg* regular expression patterns to search for in PRE body,
     * used in conjunction with $preReplace.
     *
     * @type array
     * @see $preReplace
     */
    protected $preSearch = array(
        "/\n/",
        "/\t/",
        '/ /',
        '/<pre[^>]*>/',
        '/<\/pre>/'
    );

    /**
     * List of pattern replacements corresponding to patterns searched for PRE body.
     *
     * Whitespace is encoded as <br> tags and &nbsp; entities so that it
     * survives the newline/tab and run-of-spaces collapsing performed later in
     * converter(); the entities are decoded (to U+00A0) only after that
     * collapsing has happened.
     *
     * @type array
     * @see $preSearch
     */
    protected $preReplace = array(
        '<br>',
        '&nbsp;&nbsp;&nbsp;&nbsp;',
        '&nbsp;',
        '',
        '',
    );

    /**
     * Temporary workspace used during PRE processing.
     *
     * @type string
     */
    protected $preContent = '';

    /**
     * Contains the base URL that relative links should resolve to.
     *
     * @type string
     */
    protected $baseurl = '';

    /**
     * Indicates whether content in the $html variable has been converted yet.
     *
     * @type boolean
     * @see $html, $text
     */
    protected $converted = false;

    /**
     * Contains URL addresses from links to be rendered in plain text.
     *
     * @type array
     * @see buildlinkList()
     */
    protected $linkList = array();

    /**
     * Various configuration options (able to be set in the constructor)
     *
     * @type array
     */
    protected $options = array(
        'do_links' => 'inline', // 'none'
                                // 'inline' (show links inline)
                                // 'nextline' (show links on the next line)
                                // 'table' (if a table of link URLs should be listed after the text.
                                // 'bbcode' (show links as bbcode)

        'width' => 70,          // Maximum width of the formatted text, in columns.
                                // Set this value to 0 (or less) to ignore word wrapping
                                // and not constrain text to a fixed-width column.
    );

    /**
     * Handles the deprecated constructor signature ($html, $fromFile, $options).
     *
     * @param string  $html     Source HTML
     * @param boolean $fromFile No longer supported; a truthy value makes set_html() throw
     * @param array   $options  Set configuration options
     */
    private function legacyConstruct($html = '', $fromFile = false, array $options = array())
    {
        $this->set_html($html, $fromFile);
        $this->options = array_merge($this->options, $options);
    }

    /**
     * @param string $html    Source HTML
     * @param array  $options Set configuration options. A non-array value here
     *                        selects the deprecated ($html, $fromFile, $options)
     *                        signature.
     */
    public function __construct($html = '', $options = array())
    {
        // Assign the entity flags before branching so that BOTH construction
        // paths have them; without this the legacy path leaves the property
        // null and entity decoding silently degrades.
        $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
            ? ENT_COMPAT
            : ENT_COMPAT | ENT_HTML5;

        // for backwards compatibility
        if (!is_array($options)) {
            call_user_func_array(array($this, 'legacyConstruct'), func_get_args());

            return;
        }

        $this->html = $html;
        $this->options = array_merge($this->options, $options);
    }

    /**
     * Set the source HTML and invalidate any previously converted text.
     *
     * @param string $html HTML source content
     */
    public function setHtml($html)
    {
        $this->html = $html;
        $this->converted = false;
    }

    /**
     * @deprecated Use setHtml() instead.
     *
     * @param string  $html      HTML source content
     * @param boolean $from_file No longer supported
     * @throws \InvalidArgumentException When $from_file is truthy
     */
    public function set_html($html, $from_file = false)
    {
        if ($from_file) {
            throw new \InvalidArgumentException("Argument from_file no longer supported");
        }

        return $this->setHtml($html);
    }

    /**
     * Returns the text, converted from HTML. The conversion runs on the first
     * call after the HTML was set and is cached afterwards.
     *
     * @return string
     */
    public function getText()
    {
        if (!$this->converted) {
            $this->convert();
        }

        return $this->text;
    }

    /**
     * @deprecated Use getText() instead.
     */
    public function get_text()
    {
        return $this->getText();
    }

    /**
     * @deprecated Print the result of getText() instead.
     */
    public function print_text()
    {
        print $this->getText();
    }

    /**
     * @deprecated Print the result of getText() instead.
     */
    public function p()
    {
        return $this->print_text();
    }

    /**
     * Sets a base URL to handle relative links.
     *
     * @param string $baseurl
     */
    public function setBaseUrl($baseurl)
    {
        $this->baseurl = $baseurl;
    }

    /**
     * @deprecated Use setBaseUrl() instead.
     */
    public function set_base_url($baseurl)
    {
        return $this->setBaseUrl($baseurl);
    }

    /**
     * Runs the conversion with the mbstring internal encoding temporarily
     * switched to self::ENCODING, restoring the caller's encoding afterwards.
     */
    protected function convert()
    {
        $origEncoding = mb_internal_encoding();
        mb_internal_encoding(self::ENCODING);

        // try/finally is not available on the oldest PHP versions this class
        // still branches for, so restore the encoding by catch-and-rethrow.
        try {
            $this->doConvert();
        } catch (\Exception $e) {
            mb_internal_encoding($origEncoding);
            throw $e;
        }

        mb_internal_encoding($origEncoding);
    }

    /**
     * Converts $html into $text, appending the collected link table (if any),
     * and marks the instance as converted.
     */
    protected function doConvert()
    {
        $this->linkList = array();

        $text = trim($this->html);

        $this->converter($text);

        // Only populated when links are rendered with the 'table' method.
        if ($this->linkList) {
            $text .= "\n\nLinks:\n------\n";
            foreach ($this->linkList as $i => $url) {
                $text .= '[' . ($i + 1) . '] ' . $url . "\n";
            }
        }

        $this->text = $text;

        $this->converted = true;
    }

    /**
     * Converts the given HTML to plain text in place.
     *
     * Also called recursively by convertBlockquotes() for each blockquote body.
     *
     * @param string $text HTML on input, plain text on output (by reference)
     */
    protected function converter(&$text)
    {
        $this->convertBlockquotes($text);
        $this->convertPre($text);
        $text = preg_replace($this->search, $this->replace, $text);
        $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
        $text = strip_tags($text);
        $text = preg_replace($this->entSearch, $this->entReplace, $text);
        $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);

        // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
        $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);

        // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
        // This properly handles situation of "&amp;quot;" in input string
        $text = str_replace('|+|amp|+|', '&', $text);

        // Normalise empty lines
        $text = preg_replace("/\n\s+\n/", "\n\n", $text);
        $text = preg_replace("/[\n]{3,}/", "\n\n", $text);

        // remove leading empty lines (can be produced by eg. P tag on the beginning)
        $text = ltrim($text, "\n");

        if ($this->options['width'] > 0) {
            $text = wordwrap($text, $this->options['width']);
        }
    }

    /**
     * Helper function called by preg_replace() on link replacement.
     *
     * Maintains an internal list of links to be displayed at the end of the
     * text, with numeric indices to the original point in the text they
     * appeared. Also makes an effort at identifying and handling absolute
     * and relative links.
     *
     * @param  string      $link         URL of the link
     * @param  string      $display      Part of the text to associate number with
     * @param  string|null $linkOverride Link method to use instead of the 'do_links' option
     * @return string
     */
    protected function buildlinkList($link, $display, $linkOverride = null)
    {
        $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
        if ($linkMethod == 'none') {
            return $display;
        }

        // Ignored link types
        if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
            return $display;
        }

        if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
            // Link carries its own scheme: use it as-is.
            $url = $link;
        } else {
            // Relative link: prefix with the base URL, inserting a slash if needed.
            $url = $this->baseurl;
            if (mb_substr($link, 0, 1) != '/') {
                $url .= '/';
            }
            $url .= $link;
        }

        if ($linkMethod == 'table') {
            // Re-use the index of a URL that is already in the table.
            // Strict comparison so that distinct URLs are never treated as equal.
            if (($index = array_search($url, $this->linkList, true)) === false) {
                $index = count($this->linkList);
                $this->linkList[] = $url;
            }

            return $display . ' [' . ($index + 1) . ']';
        } elseif ($linkMethod == 'nextline') {
            return $display . "\n[" . $url . ']';
        } elseif ($linkMethod == 'bbcode') {
            return sprintf('[url=%s]%s[/url]', $url, $display);
        } else { // link_method defaults to inline
            return $display . ' [' . $url . ']';
        }
    }

    /**
     * Helper function for PRE body conversion.
     *
     * Replaces each PRE element, one at a time, with a DIV whose whitespace is
     * encoded via $preSearch/$preReplace so that the later passes keep it.
     *
     * @param string $text HTML content (by reference)
     */
    protected function convertPre(&$text)
    {
        // get the content of PRE element
        while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
            // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
            $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);

            // Run our defined tags search-and-replace with callback
            $this->preContent = preg_replace_callback(
                $this->callbackSearch,
                array($this, 'pregCallback'),
                $this->preContent
            );

            // convert the content
            $this->preContent = sprintf(
                '<div><br>%s<br></div>',
                preg_replace($this->preSearch, $this->preReplace, $this->preContent)
            );

            // replace the content (use callback because content can contain $0 variable)
            $text = preg_replace_callback(
                '/<pre[^>]*>.*<\/pre>/ismU',
                array($this, 'pregPreCallback'),
                $text,
                1
            );

            // free memory
            $this->preContent = '';
        }
    }

    /**
     * Helper function for BLOCKQUOTE body conversion.
     *
     * Each outermost blockquote is converted on its own (nested ones are
     * handled by the recursive converter() call), prefixed with "> " citation
     * markers and wrapped in a PRE element for convertPre() to pick up.
     *
     * @param string $text HTML content (by reference)
     */
    protected function convertBlockquotes(&$text)
    {
        if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
            $originalText = $text;
            $start = 0;  // character offset of the current outermost opening tag
            $taglen = 0; // length of that opening tag
            $level = 0;  // current nesting depth
            $diff = 0;   // characters removed so far from $text relative to $originalText
            foreach ($matches[0] as $m) {
                // PREG_OFFSET_CAPTURE yields byte offsets; convert to a character offset
                $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
                if ($m[0][0] == '<' && $m[0][1] == '/') {
                    $level--;
                    if ($level < 0) {
                        $level = 0; // malformed HTML: go to next blockquote
                    } elseif ($level > 0) {
                        // skip inner blockquote
                    } else {
                        $end = $m[1];
                        $len = $end - $taglen - $start;
                        // Get blockquote content
                        $body = mb_substr($text, $start + $taglen - $diff, $len);

                        // Set text width (leave room for the "> " marker)
                        $pWidth = $this->options['width'];
                        if ($this->options['width'] > 0) {
                            $this->options['width'] -= 2;
                        }
                        // Convert blockquote content
                        $body = trim($body);
                        $this->converter($body);
                        // Add citation markers and create PRE block
                        $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
                        $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
                        // Re-set text width
                        $this->options['width'] = $pWidth;
                        // Replace content
                        $text = mb_substr($text, 0, $start - $diff)
                            . $body
                            . mb_substr($text, $end + mb_strlen($m[0]) - $diff);

                        $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
                        unset($body);
                    }
                } else {
                    if ($level == 0) {
                        $start = $m[1];
                        $taglen = mb_strlen($m[0]);
                    }
                    $level++;
                }
            }
        }
    }

    /**
     * Callback function for preg_replace_callback use.
     *
     * Dispatches on the tag name captured in $matches[1] by $callbackSearch.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregCallback($matches)
    {
        switch (mb_strtolower($matches[1])) {
            case 'p':
                // Replace newlines with spaces.
                $para = str_replace("\n", " ", $matches[3]);

                // Trim trailing and leading whitespace within the tag.
                $para = trim($para);

                // Add trailing newlines for this para.
                return "\n" . $para . "\n";
            case 'br':
                return "\n";
            case 'b':
            case 'strong':
                return $this->toupper($matches[3]);
            case 'th':
                return $this->toupper("\t\t" . $matches[3] . "\n");
            case 'h':
                return $this->toupper("\n\n" . $matches[3] . "\n\n");
            case 'a':
                // override the link method
                $linkOverride = null;
                if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
                    $linkOverride = $linkOverrideMatch[1];
                }
                // Remove spaces in URL (#1487805)
                $url = str_replace(' ', '', $matches[3]);

                return $this->buildlinkList($url, $matches[5], $linkOverride);
        }

        return '';
    }

    /**
     * Callback function for preg_replace_callback use in PRE content handler.
     *
     * Ignores the match and returns the prepared $preContent.
     *
     * @param  array  $matches PREG matches
     * @return string
     */
    protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
    {
        return $this->preContent;
    }

    /**
     * Strtoupper function with HTML tags and entities handling.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function toupper($str)
    {
        // string can contain HTML tags
        $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);

        // convert toupper only the text between HTML tags
        foreach ($chunks as $i => $chunk) {
            if ($chunk[0] != '<') {
                $chunks[$i] = $this->strtoupper($chunk);
            }
        }

        return implode($chunks);
    }

    /**
     * Strtoupper multibyte wrapper function with HTML entities handling.
     *
     * Entities are decoded before upper-casing (so entity names are not
     * upper-cased) and special characters are re-encoded afterwards.
     *
     * @param  string $str Text to convert
     * @return string Converted text
     */
    protected function strtoupper($str)
    {
        $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
        $str = mb_strtoupper($str);
        $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);

        return $str;
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Thu Aug 11 10:00:09 2016 | Cross-referenced by PHPXref 0.7.1 |