[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/repository/url/ -> locallib.php (source)

   1  <?php
   2  
   3  /**
   4   * Copyright (c) 2008, David R. Nadeau, NadeauSoftware.com.
   5   * All rights reserved.
   6   *
   7   * Redistribution and use in source and binary forms, with or without
   8   * modification, are permitted provided that the following conditions
   9   * are met:
  10   *
  11   *    * Redistributions of source code must retain the above copyright
  12   *      notice, this list of conditions and the following disclaimer.
  13   *
  14   *    * Redistributions in binary form must reproduce the above
  15   *      copyright notice, this list of conditions and the following
  16   *      disclaimer in the documentation and/or other materials provided
  17   *      with the distribution.
  18   *
  19   *    * Neither the names of David R. Nadeau or NadeauSoftware.com, nor
  20   *      the names of its contributors may be used to endorse or promote
  21   *      products derived from this software without specific prior
  22   *      written permission.
  23   *
  24   * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
  25   * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
  26   * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
  27   * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
  28   * COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
  29   * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING,
  30   * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  31   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
  32   * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  33   * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY
  34   * WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY
  35   * OF SUCH DAMAGE.
  36   */
  37  
  38  /*
  39   * This is a BSD License approved by the Open Source Initiative (OSI).
  40   * See:  http://www.opensource.org/licenses/bsd-license.php
  41   */
  42  
  43  defined('MOODLE_INTERNAL') || die();
  44  
  45  /**
  46   * Combine a base URL and a relative URL to produce a new
  47   * absolute URL.  The base URL is often the URL of a page,
  48   * and the relative URL is a URL embedded on that page.
  49   *
  50   * This function implements the "absolutize" algorithm from
  51   * the RFC3986 specification for URLs.
  52   *
  53   * This function supports multi-byte characters with the UTF-8 encoding,
  54   * per the URL specification.
  55   *
  56   * Parameters:
  57   *     baseUrl        the absolute base URL.
  58   *
  59   *     url        the relative URL to convert.
  60   *
  61   * Return values:
  62   *     An absolute URL that combines parts of the base and relative
  63   *     URLs, or FALSE if the base URL is not absolute or if either
  64   *     URL cannot be parsed.
  65   */
  66  function url_to_absolute( $baseUrl, $relativeUrl )
  67  {
  68      // If relative URL has a scheme, clean path and return.
  69      $r = split_url( $relativeUrl );
  70      if ( $r === FALSE )
  71          return FALSE;
  72      if ( !empty( $r['scheme'] ) )
  73      {
  74          if ( !empty( $r['path'] ) && $r['path'][0] == '/' )
  75              $r['path'] = url_remove_dot_segments( $r['path'] );
  76          return join_url( $r );
  77      }
  78  
  79      // Make sure the base URL is absolute.
  80      $b = split_url( $baseUrl );
  81      if ( $b === FALSE || empty( $b['scheme'] ) || empty( $b['host'] ) )
  82          return FALSE;
  83      $r['scheme'] = $b['scheme'];
  84      if (empty($b['path'])) {
  85          $b['path'] = '';
  86      }
  87  
  88      // If relative URL has an authority, clean path and return.
  89      if ( isset( $r['host'] ) )
  90      {
  91          if ( !empty( $r['path'] ) )
  92              $r['path'] = url_remove_dot_segments( $r['path'] );
  93          return join_url( $r );
  94      }
  95      unset( $r['port'] );
  96      unset( $r['user'] );
  97      unset( $r['pass'] );
  98  
  99      // Copy base authority.
 100      $r['host'] = $b['host'];
 101      if ( isset( $b['port'] ) ) $r['port'] = $b['port'];
 102      if ( isset( $b['user'] ) ) $r['user'] = $b['user'];
 103      if ( isset( $b['pass'] ) ) $r['pass'] = $b['pass'];
 104  
 105      // If relative URL has no path, use base path
 106      if ( empty( $r['path'] ) )
 107      {
 108          if ( !empty( $b['path'] ) )
 109              $r['path'] = $b['path'];
 110          if ( !isset( $r['query'] ) && isset( $b['query'] ) )
 111              $r['query'] = $b['query'];
 112          return join_url( $r );
 113      }
 114  
 115      // If relative URL path doesn't start with /, merge with base path.
 116      if ($r['path'][0] != '/') {
 117          $base = core_text::strrchr($b['path'], '/', TRUE);
 118          if ($base === FALSE) {
 119              $base = '';
 120          }
 121          $r['path'] = $base . '/' . $r['path'];
 122      }
 123      $r['path'] = url_remove_dot_segments($r['path']);
 124      return join_url($r);
 125  }
 126  
 127  /**
 128   * Filter out "." and ".." segments from a URL's path and return
 129   * the result.
 130   *
 131   * This function implements the "remove_dot_segments" algorithm from
 132   * the RFC3986 specification for URLs.
 133   *
 134   * This function supports multi-byte characters with the UTF-8 encoding,
 135   * per the URL specification.
 136   *
 137   * Parameters:
 138   *     path    the path to filter
 139   *
 140   * Return values:
 141   *     The filtered path with "." and ".." removed.
 142   */
 143  function url_remove_dot_segments( $path )
 144  {
 145      // multi-byte character explode
 146      $inSegs  = preg_split( '!/!u', $path );
 147      $outSegs = array( );
 148      foreach ( $inSegs as $seg )
 149      {
 150          if ( $seg == '' || $seg == '.')
 151              continue;
 152          if ( $seg == '..' )
 153              array_pop( $outSegs );
 154          else
 155              array_push( $outSegs, $seg );
 156      }
 157      $outPath = implode( '/', $outSegs );
 158  
 159      if ($path[0] == '/') {
 160          $outPath = '/' . $outPath;
 161      }
 162  
 163      // Compare last multi-byte character against '/'.
 164      if ($outPath != '/' && (core_text::strlen($path) - 1) == core_text::strrpos($path, '/', 'UTF-8')) {
 165          $outPath .= '/';
 166      }
 167      return $outPath;
 168  }
 169  
 170  /**
 171   * This function parses an absolute or relative URL and splits it
 172   * into individual components.
 173   *
 174   * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 175   * A portion of the ABNFs are repeated here:
 176   *
 177   *    URI-reference    = URI
 178   *            / relative-ref
 179   *
 180   *    URI        = scheme ":" hier-part [ "?" query ] [ "#" fragment ]
 181   *
 182   *    relative-ref    = relative-part [ "?" query ] [ "#" fragment ]
 183   *
 184   *    hier-part    = "//" authority path-abempty
 185   *            / path-absolute
 186   *            / path-rootless
 187   *            / path-empty
 188   *
 189   *    relative-part    = "//" authority path-abempty
 190   *            / path-absolute
 191   *            / path-noscheme
 192   *            / path-empty
 193   *
 194   *    authority    = [ userinfo "@" ] host [ ":" port ]
 195   *
 196   * So, a URL has the following major components:
 197   *
 198   *    scheme
 199   *        The name of a method used to interpret the rest of
 200   *        the URL.  Examples:  "http", "https", "mailto", "file'.
 201   *
 202   *    authority
 203   *        The name of the authority governing the URL's name
 204   *        space.  Examples:  "example.com", "user@example.com",
 205   *        "example.com:80", "user:password@example.com:80".
 206   *
 207   *        The authority may include a host name, port number,
 208   *        user name, and password.
 209   *
 210   *        The host may be a name, an IPv4 numeric address, or
 211   *        an IPv6 numeric address.
 212   *
 213   *    path
 214   *        The hierarchical path to the URL's resource.
 215   *        Examples:  "/index.htm", "/scripts/page.php".
 216   *
 217   *    query
 218   *        The data for a query.  Examples:  "?search=google.com".
 219   *
 220   *    fragment
 221   *        The name of a secondary resource relative to that named
 222   *        by the path.  Examples:  "#section1", "#header".
 223   *
 224   * An "absolute" URL must include a scheme and path.  The authority, query,
 225   * and fragment components are optional.
 226   *
 227   * A "relative" URL does not include a scheme and must include a path.  The
 228   * authority, query, and fragment components are optional.
 229   *
 230   * This function splits the $url argument into the following components
 231   * and returns them in an associative array.  Keys to that array include:
 232   *
 233   *    "scheme"    The scheme, such as "http".
 234   *    "host"        The host name, IPv4, or IPv6 address.
 235   *    "port"        The port number.
 236   *    "user"        The user name.
 237   *    "pass"        The user password.
 238   *    "path"        The path, such as a file path for "http".
 239   *    "query"        The query.
 240   *    "fragment"    The fragment.
 241   *
 242   * One or more of these may not be present, depending upon the URL.
 243   *
 244   * Optionally, the "user", "pass", "host" (if a name, not an IP address),
 245   * "path", "query", and "fragment" may have percent-encoded characters
 246   * decoded.  The "scheme" and "port" cannot include percent-encoded
 247   * characters and are never decoded.  Decoding occurs after the URL has
 248   * been parsed.
 249   *
 250   * Parameters:
 251   *     url        the URL to parse.
 252   *
 253   *     decode        an optional boolean flag selecting whether
 254   *             to decode percent encoding or not.  Default = TRUE.
 255   *
 256   * Return values:
 257   *     the associative array of URL parts, or FALSE if the URL is
 258   *     too malformed to recognize any parts.
 259   */
 260  function split_url( $url, $decode=FALSE)
 261  {
 262      // Character sets from RFC3986.
 263      $xunressub     = 'a-zA-Z\d\-._~\!$&\'()*+,;=';
 264      $xpchar        = $xunressub . ':@% ';
 265  
 266      // Scheme from RFC3986.
 267      $xscheme        = '([a-zA-Z][a-zA-Z\d+-.]*)';
 268  
 269      // User info (user + password) from RFC3986.
 270      $xuserinfo     = '((['  . $xunressub . '%]*)' .
 271                       '(:([' . $xunressub . ':%]*))?)';
 272  
 273      // IPv4 from RFC3986 (without digit constraints).
 274      $xipv4         = '(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})';
 275  
 276      // IPv6 from RFC2732 (without digit and grouping constraints).
 277      $xipv6         = '(\[([a-fA-F\d.:]+)\])';
 278  
 279      // Host name from RFC1035.  Technically, must start with a letter.
 280      // Relax that restriction to better parse URL structure, then
 281      // leave host name validation to application.
 282      $xhost_name    = '([a-zA-Z\d-.%]+)';
 283  
 284      // Authority from RFC3986.  Skip IP future.
 285      $xhost         = '(' . $xhost_name . '|' . $xipv4 . '|' . $xipv6 . ')';
 286      $xport         = '(\d*)';
 287      $xauthority    = '((' . $xuserinfo . '@)?' . $xhost .
 288                   '?(:' . $xport . ')?)';
 289  
 290      // Path from RFC3986.  Blend absolute & relative for efficiency.
 291      $xslash_seg    = '(/[' . $xpchar . ']*)';
 292      $xpath_authabs = '((//' . $xauthority . ')((/[' . $xpchar . ']*)*))';
 293      $xpath_rel     = '([' . $xpchar . ']+' . $xslash_seg . '*)';
 294      $xpath_abs     = '(/(' . $xpath_rel . ')?)';
 295      $xapath        = '(' . $xpath_authabs . '|' . $xpath_abs .
 296               '|' . $xpath_rel . ')';
 297  
 298      // Query and fragment from RFC3986.
 299      $xqueryfrag    = '([' . $xpchar . '/?' . ']*)';
 300  
 301      // URL.
 302      $xurl          = '^(' . $xscheme . ':)?' .  $xapath . '?' .
 303                       '(\?' . $xqueryfrag . ')?(#' . $xqueryfrag . ')?$';
 304  
 305  
 306      // Split the URL into components.
 307      if ( !preg_match( '!' . $xurl . '!', $url, $m ) )
 308          return FALSE;
 309  
 310      if ( !empty($m[2]) )        $parts['scheme']  = strtolower($m[2]);
 311  
 312      if ( !empty($m[7]) ) {
 313          if ( isset( $m[9] ) )    $parts['user']    = $m[9];
 314          else            $parts['user']    = '';
 315      }
 316      if ( !empty($m[10]) )        $parts['pass']    = $m[11];
 317  
 318      if ( !empty($m[13]) )        $h=$parts['host'] = $m[13];
 319      else if ( !empty($m[14]) )    $parts['host']    = $m[14];
 320      else if ( !empty($m[16]) )    $parts['host']    = $m[16];
 321      else if ( !empty( $m[5] ) )    $parts['host']    = '';
 322      if ( !empty($m[17]) )        $parts['port']    = $m[18];
 323  
 324      if ( !empty($m[19]) )        $parts['path']    = $m[19];
 325      else if ( !empty($m[21]) )    $parts['path']    = $m[21];
 326      else if ( !empty($m[25]) )    $parts['path']    = $m[25];
 327  
 328      if ( !empty($m[27]) )        $parts['query']   = $m[28];
 329      if ( !empty($m[29]) )        $parts['fragment']= $m[30];
 330  
 331      if ( !$decode )
 332          return $parts;
 333      if ( !empty($parts['user']) )
 334          $parts['user']     = rawurldecode( $parts['user'] );
 335      if ( !empty($parts['pass']) )
 336          $parts['pass']     = rawurldecode( $parts['pass'] );
 337      if ( !empty($parts['path']) )
 338          $parts['path']     = rawurldecode( $parts['path'] );
 339      if ( isset($h) )
 340          $parts['host']     = rawurldecode( $parts['host'] );
 341      if ( !empty($parts['query']) )
 342          $parts['query']    = rawurldecode( $parts['query'] );
 343      if ( !empty($parts['fragment']) )
 344          $parts['fragment'] = rawurldecode( $parts['fragment'] );
 345      return $parts;
 346  }
 347  
 348  /**
 349   * This function joins together URL components to form a complete URL.
 350   *
 351   * RFC3986 specifies the components of a Uniform Resource Identifier (URI).
 352   * This function implements the specification's "component recomposition"
 353   * algorithm for combining URI components into a full URI string.
 354   *
 355   * The $parts argument is an associative array containing zero or
 356   * more of the following:
 357   *
 358   *    "scheme"    The scheme, such as "http".
 359   *    "host"        The host name, IPv4, or IPv6 address.
 360   *    "port"        The port number.
 361   *    "user"        The user name.
 362   *    "pass"        The user password.
 363   *    "path"        The path, such as a file path for "http".
 364   *    "query"        The query.
 365   *    "fragment"    The fragment.
 366   *
 367   * The "port", "user", and "pass" values are only used when a "host"
 368   * is present.
 369   *
 370   * The optional $encode argument indicates if appropriate URL components
 371   * should be percent-encoded as they are assembled into the URL.  Encoding
 372   * is only applied to the "user", "pass", "host" (if a host name, not an
 373   * IP address), "path", "query", and "fragment" components.  The "scheme"
 374   * and "port" are never encoded.  When a "scheme" and "host" are both
 375   * present, the "path" is presumed to be hierarchical and encoding
 376   * processes each segment of the hierarchy separately (i.e., the slashes
 377   * are left alone).
 378   *
 379   * The assembled URL string is returned.
 380   *
 381   * Parameters:
 382   *     parts        an associative array of strings containing the
 383   *             individual parts of a URL.
 384   *
 385   *     encode        an optional boolean flag selecting whether
 386   *             to do percent encoding or not.  Default = true.
 387   *
 388   * Return values:
 389   *     Returns the assembled URL string.  The string is an absolute
 390   *     URL if a scheme is supplied, and a relative URL if not.  An
 391   *     empty string is returned if the $parts array does not contain
 392   *     any of the needed values.
 393   */
 394  function join_url( $parts, $encode=FALSE)
 395  {
 396      if ( $encode )
 397      {
 398          if ( isset( $parts['user'] ) )
 399              $parts['user']     = rawurlencode( $parts['user'] );
 400          if ( isset( $parts['pass'] ) )
 401              $parts['pass']     = rawurlencode( $parts['pass'] );
 402          if ( isset( $parts['host'] ) &&
 403              !preg_match( '!^(\[[\da-f.:]+\]])|([\da-f.:]+)$!ui', $parts['host'] ) )
 404              $parts['host']     = rawurlencode( $parts['host'] );
 405          if ( !empty( $parts['path'] ) )
 406              $parts['path']     = preg_replace( '!%2F!ui', '/',
 407                  rawurlencode( $parts['path'] ) );
 408          if ( isset( $parts['query'] ) )
 409              $parts['query']    = rawurlencode( $parts['query'] );
 410          if ( isset( $parts['fragment'] ) )
 411              $parts['fragment'] = rawurlencode( $parts['fragment'] );
 412      }
 413  
 414      $url = '';
 415      if ( !empty( $parts['scheme'] ) )
 416          $url .= $parts['scheme'] . ':';
 417      if ( isset( $parts['host'] ) )
 418      {
 419          $url .= '//';
 420          if ( isset( $parts['user'] ) )
 421          {
 422              $url .= $parts['user'];
 423              if ( isset( $parts['pass'] ) )
 424                  $url .= ':' . $parts['pass'];
 425              $url .= '@';
 426          }
 427          if ( preg_match( '!^[\da-f]*:[\da-f.:]+$!ui', $parts['host'] ) )
 428              $url .= '[' . $parts['host'] . ']';    // IPv6
 429          else
 430              $url .= $parts['host'];            // IPv4 or name
 431          if ( isset( $parts['port'] ) )
 432              $url .= ':' . $parts['port'];
 433          if ( !empty( $parts['path'] ) && $parts['path'][0] != '/' )
 434              $url .= '/';
 435      }
 436      if ( !empty( $parts['path'] ) )
 437          $url .= $parts['path'];
 438      if ( isset( $parts['query'] ) )
 439          $url .= '?' . $parts['query'];
 440      if ( isset( $parts['fragment'] ) )
 441          $url .= '#' . $parts['fragment'];
 442      return $url;
 443  }
 444  
 445  /**
 446   * This function encodes URL to form a URL which is properly
 447   * percent encoded to replace disallowed characters.
 448   *
 449   * RFC3986 specifies the allowed characters in the URL as well as
 450   * reserved characters in the URL. This function replaces all the
 451   * disallowed characters in the URL with their repective percent
 452   * encodings. Already encoded characters are not encoded again,
 453   * such as '%20' is not encoded to '%2520'.
 454   *
 455   * Parameters:
 456   *     url        the url to encode.
 457   *
 458   * Return values:
 459   *     Returns the encoded URL string.
 460   */
 461  function encode_url($url) {
 462    $reserved = array(
 463      ":" => '!%3A!ui',
 464      "/" => '!%2F!ui',
 465      "?" => '!%3F!ui',
 466      "#" => '!%23!ui',
 467      "[" => '!%5B!ui',
 468      "]" => '!%5D!ui',
 469      "@" => '!%40!ui',
 470      "!" => '!%21!ui',
 471      "$" => '!%24!ui',
 472      "&" => '!%26!ui',
 473      "'" => '!%27!ui',
 474      "(" => '!%28!ui',
 475      ")" => '!%29!ui',
 476      "*" => '!%2A!ui',
 477      "+" => '!%2B!ui',
 478      "," => '!%2C!ui',
 479      ";" => '!%3B!ui',
 480      "=" => '!%3D!ui',
 481      "%" => '!%25!ui',
 482    );
 483  
 484    $url = rawurlencode($url);
 485    $url = preg_replace(array_values($reserved), array_keys($reserved), $url);
 486    return $url;
 487  }
 488  
 489  /**
 490   * Extract URLs from a web page.
 491   *
 492   * URLs are extracted from a long list of tags and attributes as defined
 493   * by the HTML 2.0, HTML 3.2, HTML 4.01, and draft HTML 5.0 specifications.
 494   * URLs are also extracted from tags and attributes that are common
 495   * extensions of HTML, from the draft Forms 2.0 specification, from XHTML,
 496   * and from WML 1.3 and 2.0.
 497   *
 498   * The function returns an associative array of associative arrays of
 499   * arrays of URLs.  The outermost array's keys are the tag (element) name,
 500   * such as "a" for <a> or "img" for <img>.  The values for these entries
 501   * are associative arrays where the keys are attribute names for those
 502   * tags, such as "href" for <a href="...">.  Finally, the values for
 503   * those arrays are URLs found in those tags and attributes throughout
 504   * the text.
 505   *
 506   * Parameters:
 507   *     text        the UTF-8 text to scan
 508   *
 509   * Return values:
 510   *     an associative array where keys are tags and values are an
 511   *     associative array where keys are attributes and values are
 512   *     an array of URLs.
 513   *
 514   * See:
 515   *     http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_web_page
 516   */
 517  function extract_html_urls( $text )
 518  {
 519      $match_elements = array(
 520          // HTML
 521          array('element'=>'a',        'attribute'=>'href'),        // 2.0
 522          array('element'=>'a',        'attribute'=>'urn'),        // 2.0
 523          array('element'=>'base',    'attribute'=>'href'),        // 2.0
 524          array('element'=>'form',    'attribute'=>'action'),        // 2.0
 525          array('element'=>'img',        'attribute'=>'src'),        // 2.0
 526          array('element'=>'link',    'attribute'=>'href'),        // 2.0
 527  
 528          array('element'=>'applet',    'attribute'=>'code'),        // 3.2
 529          array('element'=>'applet',    'attribute'=>'codebase'),    // 3.2
 530          array('element'=>'area',    'attribute'=>'href'),        // 3.2
 531          array('element'=>'body',    'attribute'=>'background'),    // 3.2
 532          array('element'=>'img',        'attribute'=>'usemap'),        // 3.2
 533          array('element'=>'input',    'attribute'=>'src'),        // 3.2
 534  
 535          array('element'=>'applet',    'attribute'=>'archive'),    // 4.01
 536          array('element'=>'applet',    'attribute'=>'object'),        // 4.01
 537          array('element'=>'blockquote',    'attribute'=>'cite'),        // 4.01
 538          array('element'=>'del',        'attribute'=>'cite'),        // 4.01
 539          array('element'=>'frame',    'attribute'=>'longdesc'),    // 4.01
 540          array('element'=>'frame',    'attribute'=>'src'),        // 4.01
 541          array('element'=>'head',    'attribute'=>'profile'),    // 4.01
 542          array('element'=>'iframe',    'attribute'=>'longdesc'),    // 4.01
 543          array('element'=>'iframe',    'attribute'=>'src'),        // 4.01
 544          array('element'=>'img',        'attribute'=>'longdesc'),    // 4.01
 545          array('element'=>'input',    'attribute'=>'usemap'),        // 4.01
 546          array('element'=>'ins',        'attribute'=>'cite'),        // 4.01
 547          array('element'=>'object',    'attribute'=>'archive'),    // 4.01
 548          array('element'=>'object',    'attribute'=>'classid'),    // 4.01
 549          array('element'=>'object',    'attribute'=>'codebase'),    // 4.01
 550          array('element'=>'object',    'attribute'=>'data'),        // 4.01
 551          array('element'=>'object',    'attribute'=>'usemap'),        // 4.01
 552          array('element'=>'q',        'attribute'=>'cite'),        // 4.01
 553          array('element'=>'script',    'attribute'=>'src'),        // 4.01
 554  
 555          array('element'=>'audio',    'attribute'=>'src'),        // 5.0
 556          array('element'=>'command',    'attribute'=>'icon'),        // 5.0
 557          array('element'=>'embed',    'attribute'=>'src'),        // 5.0
 558          array('element'=>'event-source','attribute'=>'src'),        // 5.0
 559          array('element'=>'html',    'attribute'=>'manifest'),    // 5.0
 560          array('element'=>'source',    'attribute'=>'src'),        // 5.0
 561          array('element'=>'video',    'attribute'=>'src'),        // 5.0
 562          array('element'=>'video',    'attribute'=>'poster'),        // 5.0
 563  
 564          array('element'=>'bgsound',    'attribute'=>'src'),        // Extension
 565          array('element'=>'body',    'attribute'=>'credits'),    // Extension
 566          array('element'=>'body',    'attribute'=>'instructions'),    // Extension
 567          array('element'=>'body',    'attribute'=>'logo'),        // Extension
 568          array('element'=>'div',        'attribute'=>'href'),        // Extension
 569          array('element'=>'div',        'attribute'=>'src'),        // Extension
 570          array('element'=>'embed',    'attribute'=>'code'),        // Extension
 571          array('element'=>'embed',    'attribute'=>'pluginspage'),    // Extension
 572          array('element'=>'html',    'attribute'=>'background'),    // Extension
 573          array('element'=>'ilayer',    'attribute'=>'src'),        // Extension
 574          array('element'=>'img',        'attribute'=>'dynsrc'),        // Extension
 575          array('element'=>'img',        'attribute'=>'lowsrc'),        // Extension
 576          array('element'=>'input',    'attribute'=>'dynsrc'),        // Extension
 577          array('element'=>'input',    'attribute'=>'lowsrc'),        // Extension
 578          array('element'=>'table',    'attribute'=>'background'),    // Extension
 579          array('element'=>'td',        'attribute'=>'background'),    // Extension
 580          array('element'=>'th',        'attribute'=>'background'),    // Extension
 581          array('element'=>'layer',    'attribute'=>'src'),        // Extension
 582          array('element'=>'xml',        'attribute'=>'src'),        // Extension
 583  
 584          array('element'=>'button',    'attribute'=>'action'),        // Forms 2.0
 585          array('element'=>'datalist',    'attribute'=>'data'),        // Forms 2.0
 586          array('element'=>'form',    'attribute'=>'data'),        // Forms 2.0
 587          array('element'=>'input',    'attribute'=>'action'),        // Forms 2.0
 588          array('element'=>'select',    'attribute'=>'data'),        // Forms 2.0
 589  
 590          // XHTML
 591          array('element'=>'html',    'attribute'=>'xmlns'),
 592  
 593          // WML
 594          array('element'=>'access',    'attribute'=>'path'),        // 1.3
 595          array('element'=>'card',    'attribute'=>'onenterforward'),    // 1.3
 596          array('element'=>'card',    'attribute'=>'onenterbackward'),// 1.3
 597          array('element'=>'card',    'attribute'=>'ontimer'),    // 1.3
 598          array('element'=>'go',        'attribute'=>'href'),        // 1.3
 599          array('element'=>'option',    'attribute'=>'onpick'),        // 1.3
 600          array('element'=>'template',    'attribute'=>'onenterforward'),    // 1.3
 601          array('element'=>'template',    'attribute'=>'onenterbackward'),// 1.3
 602          array('element'=>'template',    'attribute'=>'ontimer'),    // 1.3
 603          array('element'=>'wml',        'attribute'=>'xmlns'),        // 2.0
 604      );
 605  
 606      $match_metas = array(
 607          'content-base',
 608          'content-location',
 609          'referer',
 610          'location',
 611          'refresh',
 612      );
 613  
 614      // Extract all elements
 615      if ( !preg_match_all( '/<([a-z][^>]*)>/iu', $text, $matches ) )
 616          return array( );
 617      $elements = $matches[1];
 618      $value_pattern = '=(("([^"]*)")|([^\s]*))';
 619  
 620      // Match elements and attributes
 621      foreach ( $match_elements as $match_element )
 622      {
 623          $name = $match_element['element'];
 624          $attr = $match_element['attribute'];
 625          $pattern = '/^' . $name . '\s.*' . $attr . $value_pattern . '/iu';
 626          if ( $name == 'object' )
 627              $split_pattern = '/\s*/u';    // Space-separated URL list
 628          else if ( $name == 'archive' )
 629              $split_pattern = '/,\s*/u';    // Comma-separated URL list
 630          else
 631              unset( $split_pattern );    // Single URL
 632          foreach ( $elements as $element )
 633          {
 634              if ( !preg_match( $pattern, $element, $match ) )
 635                  continue;
 636              $m = empty($match[3]) ? (!empty($match[4])?$match[4]:'') : $match[3];
 637              if ( !isset( $split_pattern ) )
 638                  $urls[$name][$attr][] = $m;
 639              else
 640              {
 641                  $msplit = preg_split( $split_pattern, $m );
 642                  foreach ( $msplit as $ms )
 643                      $urls[$name][$attr][] = $ms;
 644              }
 645          }
 646      }
 647  
 648      // Match meta http-equiv elements
 649      foreach ( $match_metas as $match_meta )
 650      {
 651          $attr_pattern    = '/http-equiv="?' . $match_meta . '"?/iu';
 652          $content_pattern = '/content'  . $value_pattern . '/iu';
 653          $refresh_pattern = '/\d*;\s*(url=)?(.*)$/iu';
 654          foreach ( $elements as $element )
 655          {
 656              if ( !preg_match( '/^meta/iu', $element ) ||
 657                  !preg_match( $attr_pattern, $element ) ||
 658                  !preg_match( $content_pattern, $element, $match ) )
 659                  continue;
 660              $m = empty($match[3]) ? $match[4] : $match[3];
 661              if ( $match_meta != 'refresh' )
 662                  $urls['meta']['http-equiv'][] = $m;
 663              else if ( preg_match( $refresh_pattern, $m, $match ) )
 664                  $urls['meta']['http-equiv'][] = $match[2];
 665          }
 666      }
 667  
 668      // Match style attributes
 669      $urls['style'] = array( );
 670      $style_pattern = '/style' . $value_pattern . '/iu';
 671      foreach ( $elements as $element )
 672      {
 673          if ( !preg_match( $style_pattern, $element, $match ) )
 674              continue;
 675          $m = empty($match[3]) ? $match[4] : $match[3];
 676          $style_urls = extract_css_urls( $m );
 677          if ( !empty( $style_urls ) )
 678              $urls['style'] = array_merge_recursive(
 679                  $urls['style'], $style_urls );
 680      }
 681  
 682      // Match style bodies
 683      if ( preg_match_all( '/<style[^>]*>(.*?)<\/style>/siu', $text, $style_bodies ) )
 684      {
 685          foreach ( $style_bodies[1] as $style_body )
 686          {
 687              $style_urls = extract_css_urls( $style_body );
 688              if ( !empty( $style_urls ) )
 689                  $urls['style'] = array_merge_recursive(
 690                      $urls['style'], $style_urls );
 691          }
 692      }
 693      if ( empty($urls['style']) )
 694          unset( $urls['style'] );
 695  
 696      return $urls;
 697  }
 698  /**
 699   * Extract URLs from UTF-8 CSS text.
 700   *
 701   * URLs within @import statements and url() property functions are extracted
 702   * and returned in an associative array of arrays.  Array keys indicate
 703   * the use context for the URL, including:
 704   *
 705   *     "import"
 706   *     "property"
 707   *
 708   * Each value in the associative array is an array of URLs.
 709   *
 710   * Parameters:
 711   *     text        the UTF-8 text to scan
 712   *
 713   * Return values:
 714   *     an associative array of arrays of URLs.
 715   *
 716   * See:
 717   *     http://nadeausoftware.com/articles/2008/01/php_tip_how_extract_urls_css_file
 718   */
 719  function extract_css_urls( $text )
 720  {
 721      $urls = array( );
 722  
 723      $url_pattern     = '(([^\\\\\'", \(\)]*(\\\\.)?)+)';
 724      $urlfunc_pattern = 'url\(\s*[\'"]?' . $url_pattern . '[\'"]?\s*\)';
 725      $pattern         = '/(' .
 726           '(@import\s*[\'"]' . $url_pattern     . '[\'"])' .
 727          '|(@import\s*'      . $urlfunc_pattern . ')'      .
 728          '|('                . $urlfunc_pattern . ')'      .  ')/iu';
 729      if ( !preg_match_all( $pattern, $text, $matches ) )
 730          return $urls;
 731  
 732      // @import '...'
 733      // @import "..."
 734      foreach ( $matches[3] as $match )
 735          if ( !empty($match) )
 736              $urls['import'][] =
 737                  preg_replace( '/\\\\(.)/u', '\\1', $match );
 738  
 739      // @import url(...)
 740      // @import url('...')
 741      // @import url("...")
 742      foreach ( $matches[7] as $match )
 743          if ( !empty($match) )
 744              $urls['import'][] =
 745                  preg_replace( '/\\\\(.)/u', '\\1', $match );
 746  
 747      // url(...)
 748      // url('...')
 749      // url("...")
 750      foreach ( $matches[11] as $match )
 751          if ( !empty($match) )
 752              $urls['property'][] =
 753                  preg_replace( '/\\\\(.)/u', '\\1', $match );
 754  
 755      return $urls;
 756  }


Generated: Thu Aug 11 10:00:09 2016 Cross-referenced by PHPXref 0.7.1