PHPXRef 0.7.1 : Unnamed Project : /lib/html2text/Html2Text.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  
   3  /*
   4   * Copyright (c) 2005-2007 Jon Abernathy <jon@chuggnutt.com>
   5   *
   6   * This script is free software; you can redistribute it and/or modify
   7   * it under the terms of the GNU General Public License as published by
   8   * the Free Software Foundation; either version 2 of the License, or
   9   * (at your option) any later version.
  10   *
  11   * The GNU General Public License can be found at
  12   * http://www.gnu.org/copyleft/gpl.html.
  13   *
  14   * This script is distributed in the hope that it will be useful,
  15   * but WITHOUT ANY WARRANTY; without even the implied warranty of
  16   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  17   * GNU General Public License for more details.
  18   */
  19  
  20  namespace Html2Text;
  21  
  22  class Html2Text
  23  {
  24      const ENCODING = 'UTF-8';
  25  
  26      protected $htmlFuncFlags;
  27  
  28      /**
  29       * Contains the HTML content to convert.
  30       *
  31       * @type string
  32       */
  33      protected $html;
  34  
  35      /**
  36       * Contains the converted, formatted text.
  37       *
  38       * @type string
  39       */
  40      protected $text;
  41  
  42      /**
  43       * List of preg* regular expression patterns to search for,
  44       * used in conjunction with $replace.
  45       *
  46       * @type array
  47       * @see $replace
  48       */
  49      protected $search = array(
  50          "/\r/",                                           // Non-legal carriage return
  51          "/[\n\t]+/",                                      // Newlines and tabs
  52          '/<head\b[^>]*>.*?<\/head>/i',                    // <head>
  53          '/<script\b[^>]*>.*?<\/script>/i',                // <script>s -- which strip_tags supposedly has problems with
  54          '/<style\b[^>]*>.*?<\/style>/i',                  // <style>s -- which strip_tags supposedly has problems with
  55          '/<i\b[^>]*>(.*?)<\/i>/i',                        // <i>
  56          '/<em\b[^>]*>(.*?)<\/em>/i',                      // <em>
  57          '/(<ul\b[^>]*>|<\/ul>)/i',                        // <ul> and </ul>
  58          '/(<ol\b[^>]*>|<\/ol>)/i',                        // <ol> and </ol>
  59          '/(<dl\b[^>]*>|<\/dl>)/i',                        // <dl> and </dl>
  60          '/<li\b[^>]*>(.*?)<\/li>/i',                      // <li> and </li>
  61          '/<dd\b[^>]*>(.*?)<\/dd>/i',                      // <dd> and </dd>
  62          '/<dt\b[^>]*>(.*?)<\/dt>/i',                      // <dt> and </dt>
  63          '/<li\b[^>]*>/i',                                 // <li>
  64          '/<hr\b[^>]*>/i',                                 // <hr>
  65          '/<div\b[^>]*>/i',                                // <div>
  66          '/(<table\b[^>]*>|<\/table>)/i',                  // <table> and </table>
  67          '/(<tr\b[^>]*>|<\/tr>)/i',                        // <tr> and </tr>
  68          '/<td\b[^>]*>(.*?)<\/td>/i',                      // <td> and </td>
  69          '/<span class="_html2text_ignore">.+?<\/span>/i', // <span class="_html2text_ignore">...</span>
  70          '/<(img)\b[^>]*alt=\"([^>"]+)\"[^>]*>/i',         // <img> with alt tag
  71      );
  72  
  73      /**
  74       * List of pattern replacements corresponding to patterns searched.
  75       *
  76       * @type array
  77       * @see $search
  78       */
  79      protected $replace = array(
  80          '',                              // Non-legal carriage return
  81          ' ',                             // Newlines and tabs
  82          '',                              // <head>
  83          '',                              // <script>s -- which strip_tags supposedly has problems with
  84          '',                              // <style>s -- which strip_tags supposedly has problems with
  85          '_\\1_',                         // <i>
  86          '_\\1_',                         // <em>
  87          "\n\n",                          // <ul> and </ul>
  88          "\n\n",                          // <ol> and </ol>
  89          "\n\n",                          // <dl> and </dl>
  90          "\t* \\1\n",                     // <li> and </li>
  91          " \\1\n",                        // <dd> and </dd>
  92          "\t* \\1",                       // <dt> and </dt>
  93          "\n\t* ",                        // <li>
  94          "\n-------------------------\n", // <hr>
  95          "<div>\n",                       // <div>
  96          "\n\n",                          // <table> and </table>
  97          "\n",                            // <tr> and </tr>
  98          "\t\t\\1\n",                     // <td> and </td>
  99          "",                              // <span class="_html2text_ignore">...</span>
 100          '[\\2]',                         // <img> with alt tag
 101      );
 102  
 103      /**
 104       * List of preg* regular expression patterns to search for,
 105       * used in conjunction with $entReplace.
 106       *
 107       * @type array
 108       * @see $entReplace
 109       */
 110      protected $entSearch = array(
 111          '/&#153;/i',                                     // TM symbol in win-1252
 112          '/&#151;/i',                                     // m-dash in win-1252
 113          '/&(amp|#38);/i',                                // Ampersand: see converter()
 114          '/[ ]{2,}/',                                     // Runs of spaces, post-handling
 115      );
 116  
 117      /**
 118       * List of pattern replacements corresponding to patterns searched.
 119       *
 120       * @type array
 121       * @see $entSearch
 122       */
 123      protected $entReplace = array(
 124          '™',         // TM symbol
 125          '—',         // m-dash
 126          '|+|amp|+|', // Ampersand: see converter()
 127          ' ',         // Runs of spaces, post-handling
 128      );
 129  
 130      /**
 131       * List of preg* regular expression patterns to search for
 132       * and replace using callback function.
 133       *
 134       * @type array
 135       */
 136      protected $callbackSearch = array(
 137          '/<(h)[123456]( [^>]*)?>(.*?)<\/h[123456]>/i',           // h1 - h6
 138          '/[ ]*<(p)( [^>]*)?>(.*?)<\/p>[ ]*/si',                  // <p> with surrounding whitespace.
 139          '/<(br)[^>]*>[ ]*/i',                                    // <br> with leading whitespace after the newline.
 140          '/<(b)( [^>]*)?>(.*?)<\/b>/i',                           // <b>
 141          '/<(strong)( [^>]*)?>(.*?)<\/strong>/i',                 // <strong>
 142          '/<(th)( [^>]*)?>(.*?)<\/th>/i',                         // <th> and </th>
 143          '/<(a) [^>]*href=("|\')([^"\']+)\2([^>]*)>(.*?)<\/a>/i'  // <a href="">
 144      );
 145  
 146      /**
 147       * List of preg* regular expression patterns to search for in PRE body,
 148       * used in conjunction with $preReplace.
 149       *
 150       * @type array
 151       * @see $preReplace
 152       */
 153      protected $preSearch = array(
 154          "/\n/",
 155          "/\t/",
 156          '/ /',
 157          '/<pre[^>]*>/',
 158          '/<\/pre>/'
 159      );
 160  
 161      /**
 162       * List of pattern replacements corresponding to patterns searched for PRE body.
 163       *
 164       * @type array
 165       * @see $preSearch
 166       */
 167      protected $preReplace = array(
 168          '<br>',
 169          '&nbsp;&nbsp;&nbsp;&nbsp;',
 170          '&nbsp;',
 171          '',
 172          '',
 173      );
 174  
 175      /**
 176       * Temporary workspace used during PRE processing.
 177       *
 178       * @type string
 179       */
 180      protected $preContent = '';
 181  
 182      /**
 183       * Contains the base URL that relative links should resolve to.
 184       *
 185       * @type string
 186       */
 187      protected $baseurl = '';
 188  
 189      /**
 190       * Indicates whether content in the $html variable has been converted yet.
 191       *
 192       * @type boolean
 193       * @see $html, $text
 194       */
 195      protected $converted = false;
 196  
 197      /**
 198       * Contains URL addresses from links to be rendered in plain text.
 199       *
 200       * @type array
 201       * @see buildlinkList()
 202       */
 203      protected $linkList = array();
 204  
 205      /**
 206       * Various configuration options (able to be set in the constructor)
 207       *
 208       * @type array
 209       */
 210      protected $options = array(
 211          'do_links' => 'inline', // 'none'
 212                                  // 'inline' (show links inline)
 213                                  // 'nextline' (show links on the next line)
 214                                  // 'table' (if a table of link URLs should be listed after the text.
 215                                  // 'bbcode' (show links as bbcode)
 216  
 217          'width' => 70,          //  Maximum width of the formatted text, in columns.
 218                                  //  Set this value to 0 (or less) to ignore word wrapping
 219                                  //  and not constrain text to a fixed-width column.
 220      );
 221  
 222      private function legacyConstruct($html = '', $fromFile = false, array $options = array())
 223      {
 224          $this->set_html($html, $fromFile);
 225          $this->options = array_merge($this->options, $options);
 226      }
 227  
 228      /**
 229       * @param string $html    Source HTML
 230       * @param array  $options Set configuration options
 231       */
 232      public function __construct($html = '', $options = array())
 233      {
 234          // for backwards compatibility
 235          if (!is_array($options)) {
 236              return call_user_func_array(array($this, 'legacyConstruct'), func_get_args());
 237          }
 238  
 239          $this->html = $html;
 240          $this->options = array_merge($this->options, $options);
 241          $this->htmlFuncFlags = (PHP_VERSION_ID < 50400)
 242              ? ENT_COMPAT
 243              : ENT_COMPAT | ENT_HTML5;
 244      }
 245  
 246      /**
 247       * Set the source HTML
 248       *
 249       * @param string $html HTML source content
 250       */
 251      public function setHtml($html)
 252      {
 253          $this->html = $html;
 254          $this->converted = false;
 255      }
 256  
 257      /**
 258       * @deprecated
 259       */
 260      public function set_html($html, $from_file = false)
 261      {
 262          if ($from_file) {
 263              throw new \InvalidArgumentException("Argument from_file no longer supported");
 264          }
 265  
 266          return $this->setHtml($html);
 267      }
 268  
 269      /**
 270       * Returns the text, converted from HTML.
 271       *
 272       * @return string
 273       */
 274      public function getText()
 275      {
 276          if (!$this->converted) {
 277              $this->convert();
 278          }
 279  
 280          return $this->text;
 281      }
 282  
 283      /**
 284       * @deprecated
 285       */
 286      public function get_text()
 287      {
 288          return $this->getText();
 289      }
 290  
 291      /**
 292       * @deprecated
 293       */
 294      public function print_text()
 295      {
 296          print $this->getText();
 297      }
 298  
 299      /**
 300       * @deprecated
 301       */
 302      public function p()
 303      {
 304          return $this->print_text();
 305      }
 306  
 307      /**
 308       * Sets a base URL to handle relative links.
 309       *
 310       * @param string $baseurl
 311       */
 312      public function setBaseUrl($baseurl)
 313      {
 314          $this->baseurl = $baseurl;
 315      }
 316  
 317      /**
 318       * @deprecated
 319       */
 320      public function set_base_url($baseurl)
 321      {
 322          return $this->setBaseUrl($baseurl);
 323      }
 324  
 325      protected function convert()
 326      {
 327         $origEncoding = mb_internal_encoding();
 328         mb_internal_encoding(self::ENCODING);
 329  
 330         $this->doConvert();
 331  
 332         mb_internal_encoding($origEncoding);
 333      }
 334  
 335      protected function doConvert()
 336      {
 337          $this->linkList = array();
 338  
 339          $text = trim($this->html);
 340  
 341          $this->converter($text);
 342  
 343          if ($this->linkList) {
 344              $text .= "\n\nLinks:\n------\n";
 345              foreach ($this->linkList as $i => $url) {
 346                  $text .= '[' . ($i + 1) . '] ' . $url . "\n";
 347              }
 348          }
 349  
 350          $this->text = $text;
 351  
 352          $this->converted = true;
 353      }
 354  
 355      protected function converter(&$text)
 356      {
 357          $this->convertBlockquotes($text);
 358          $this->convertPre($text);
 359          $text = preg_replace($this->search, $this->replace, $text);
 360          $text = preg_replace_callback($this->callbackSearch, array($this, 'pregCallback'), $text);
 361          $text = strip_tags($text);
 362          $text = preg_replace($this->entSearch, $this->entReplace, $text);
 363          $text = html_entity_decode($text, $this->htmlFuncFlags, self::ENCODING);
 364  
 365          // Remove unknown/unhandled entities (this cannot be done in search-and-replace block)
 366          $text = preg_replace('/&([a-zA-Z0-9]{2,6}|#[0-9]{2,4});/', '', $text);
 367  
 368          // Convert "|+|amp|+|" into "&", need to be done after handling of unknown entities
 369          // This properly handles situation of "&amp;quot;" in input string
 370          $text = str_replace('|+|amp|+|', '&', $text);
 371  
 372          // Normalise empty lines
 373          $text = preg_replace("/\n\s+\n/", "\n\n", $text);
 374          $text = preg_replace("/[\n]{3,}/", "\n\n", $text);
 375  
 376          // remove leading empty lines (can be produced by eg. P tag on the beginning)
 377          $text = ltrim($text, "\n");
 378  
 379          if ($this->options['width'] > 0) {
 380              $text = wordwrap($text, $this->options['width']);
 381          }
 382      }
 383  
 384      /**
 385       * Helper function called by preg_replace() on link replacement.
 386       *
 387       * Maintains an internal list of links to be displayed at the end of the
 388       * text, with numeric indices to the original point in the text they
 389       * appeared. Also makes an effort at identifying and handling absolute
 390       * and relative links.
 391       *
 392       * @param  string $link          URL of the link
 393       * @param  string $display       Part of the text to associate number with
 394       * @param  null   $linkOverride
 395       * @return string
 396       */
 397      protected function buildlinkList($link, $display, $linkOverride = null)
 398      {
 399          $linkMethod = ($linkOverride) ? $linkOverride : $this->options['do_links'];
 400          if ($linkMethod == 'none') {
 401              return $display;
 402          }
 403  
 404          // Ignored link types
 405          if (preg_match('!^(javascript:|mailto:|#)!i', $link)) {
 406              return $display;
 407          }
 408  
 409          if (preg_match('!^([a-z][a-z0-9.+-]+:)!i', $link)) {
 410              $url = $link;
 411          } else {
 412              $url = $this->baseurl;
 413              if (mb_substr($link, 0, 1) != '/') {
 414                  $url .= '/';
 415              }
 416              $url .= $link;
 417          }
 418  
 419          if ($linkMethod == 'table') {
 420              if (($index = array_search($url, $this->linkList)) === false) {
 421                  $index = count($this->linkList);
 422                  $this->linkList[] = $url;
 423              }
 424  
 425              return $display . ' [' . ($index + 1) . ']';
 426          } elseif ($linkMethod == 'nextline') {
 427              return $display . "\n[" . $url . ']';
 428          } elseif ($linkMethod == 'bbcode') {
 429              return sprintf('[url=%s]%s[/url]', $url, $display);
 430          } else { // link_method defaults to inline
 431              return $display . ' [' . $url . ']';
 432          }
 433      }
 434  
 435      protected function convertPre(&$text)
 436      {
 437          // get the content of PRE element
 438          while (preg_match('/<pre[^>]*>(.*)<\/pre>/ismU', $text, $matches)) {
 439              // Replace br tags with newlines to prevent the search-and-replace callback from killing whitespace
 440              $this->preContent = preg_replace('/(<br\b[^>]*>)/i', "\n", $matches[1]);
 441  
 442              // Run our defined tags search-and-replace with callback
 443              $this->preContent = preg_replace_callback(
 444                  $this->callbackSearch,
 445                  array($this, 'pregCallback'),
 446                  $this->preContent
 447              );
 448  
 449              // convert the content
 450              $this->preContent = sprintf(
 451                  '<div><br>%s<br></div>',
 452                  preg_replace($this->preSearch, $this->preReplace, $this->preContent)
 453              );
 454  
 455              // replace the content (use callback because content can contain $0 variable)
 456              $text = preg_replace_callback(
 457                  '/<pre[^>]*>.*<\/pre>/ismU',
 458                  array($this, 'pregPreCallback'),
 459                  $text,
 460                  1
 461              );
 462  
 463              // free memory
 464              $this->preContent = '';
 465          }
 466      }
 467  
 468      /**
 469       * Helper function for BLOCKQUOTE body conversion.
 470       *
 471       * @param string $text HTML content
 472       */
 473      protected function convertBlockquotes(&$text)
 474      {
 475          if (preg_match_all('/<\/*blockquote[^>]*>/i', $text, $matches, PREG_OFFSET_CAPTURE)) {
 476              $originalText = $text;
 477              $start = 0;
 478              $taglen = 0;
 479              $level = 0;
 480              $diff = 0;
 481              foreach ($matches[0] as $m) {
 482                  $m[1] = mb_strlen(substr($originalText, 0, $m[1]));
 483                  if ($m[0][0] == '<' && $m[0][1] == '/') {
 484                      $level--;
 485                      if ($level < 0) {
 486                          $level = 0; // malformed HTML: go to next blockquote
 487                      } elseif ($level > 0) {
 488                          // skip inner blockquote
 489                      } else {
 490                          $end = $m[1];
 491                          $len = $end - $taglen - $start;
 492                          // Get blockquote content
 493                          $body = mb_substr($text, $start + $taglen - $diff, $len);
 494  
 495                          // Set text width
 496                          $pWidth = $this->options['width'];
 497                          if ($this->options['width'] > 0) $this->options['width'] -= 2;
 498                          // Convert blockquote content
 499                          $body = trim($body);
 500                          $this->converter($body);
 501                          // Add citation markers and create PRE block
 502                          $body = preg_replace('/((^|\n)>*)/', '\\1> ', trim($body));
 503                          $body = '<pre>' . htmlspecialchars($body, $this->htmlFuncFlags, self::ENCODING) . '</pre>';
 504                          // Re-set text width
 505                          $this->options['width'] = $pWidth;
 506                          // Replace content
 507                          $text = mb_substr($text, 0, $start - $diff)
 508                              . $body
 509                              . mb_substr($text, $end + mb_strlen($m[0]) - $diff);
 510  
 511                          $diff += $len + $taglen + mb_strlen($m[0]) - mb_strlen($body);
 512                          unset($body);
 513                      }
 514                  } else {
 515                      if ($level == 0) {
 516                          $start = $m[1];
 517                          $taglen = mb_strlen($m[0]);
 518                      }
 519                      $level++;
 520                  }
 521              }
 522          }
 523      }
 524  
 525      /**
 526       * Callback function for preg_replace_callback use.
 527       *
 528       * @param  array  $matches PREG matches
 529       * @return string
 530       */
 531      protected function pregCallback($matches)
 532      {
 533          switch (mb_strtolower($matches[1])) {
 534              case 'p':
 535                  // Replace newlines with spaces.
 536                  $para = str_replace("\n", " ", $matches[3]);
 537  
 538                  // Trim trailing and leading whitespace within the tag.
 539                  $para = trim($para);
 540  
 541                  // Add trailing newlines for this para.
 542                  return "\n" . $para . "\n";
 543              case 'br':
 544                  return "\n";
 545              case 'b':
 546              case 'strong':
 547                  return $this->toupper($matches[3]);
 548              case 'th':
 549                  return $this->toupper("\t\t" . $matches[3] . "\n");
 550              case 'h':
 551                  return $this->toupper("\n\n" . $matches[3] . "\n\n");
 552              case 'a':
 553                  // override the link method
 554                  $linkOverride = null;
 555                  if (preg_match('/_html2text_link_(\w+)/', $matches[4], $linkOverrideMatch)) {
 556                      $linkOverride = $linkOverrideMatch[1];
 557                  }
 558                  // Remove spaces in URL (#1487805)
 559                  $url = str_replace(' ', '', $matches[3]);
 560  
 561                  return $this->buildlinkList($url, $matches[5], $linkOverride);
 562          }
 563  
 564          return '';
 565      }
 566  
 567      /**
 568       * Callback function for preg_replace_callback use in PRE content handler.
 569       *
 570       * @param  array  $matches PREG matches
 571       * @return string
 572       */
 573      protected function pregPreCallback(/** @noinspection PhpUnusedParameterInspection */ $matches)
 574      {
 575          return $this->preContent;
 576      }
 577  
 578      /**
 579       * Strtoupper function with HTML tags and entities handling.
 580       *
 581       * @param  string $str Text to convert
 582       * @return string Converted text
 583       */
 584      protected function toupper($str)
 585      {
 586          // string can contain HTML tags
 587          $chunks = preg_split('/(<[^>]*>)/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
 588  
 589          // convert toupper only the text between HTML tags
 590          foreach ($chunks as $i => $chunk) {
 591              if ($chunk[0] != '<') {
 592                  $chunks[$i] = $this->strtoupper($chunk);
 593              }
 594          }
 595  
 596          return implode($chunks);
 597      }
 598  
 599      /**
 600       * Strtoupper multibyte wrapper function with HTML entities handling.
 601       *
 602       * @param  string $str Text to convert
 603       * @return string Converted text
 604       */
 605      protected function strtoupper($str)
 606      {
 607          $str = html_entity_decode($str, $this->htmlFuncFlags, self::ENCODING);
 608          $str = mb_strtoupper($str);
 609          $str = htmlspecialchars($str, $this->htmlFuncFlags, self::ENCODING);
 610  
 611          return $str;
 612      }
 613  }
PHP Cross Reference of Unnamed Project

/lib/html2text/ -> Html2Text.php (source)