PHPXRef 0.7.1 : Unnamed Project : /lib/horde/framework/Horde/String.php source

[Summary view] [Print] [Text view]
   1  <?php
   2  /**
   3   * Provides static methods for charset and locale safe string manipulation.
   4   *
   5   * Copyright 2003-2014 Horde LLC (http://www.horde.org/)
   6   *
   7   * See the enclosed file COPYING for license information (LGPL). If you
   8   * did not receive this file, see http://www.horde.org/licenses/lgpl21.
   9   *
  10   * @author   Jan Schneider <jan@horde.org>
  11   * @category Horde
  12   * @license  http://www.horde.org/licenses/lgpl21 LGPL 2.1
  13   * @package  Util
  14   */
  15  class Horde_String
  16  {
  17      /**
  18       * lower() cache.
  19       *
  20       * @var array
  21       */
  22      static protected $_lowers = array();
  23  
  24      /**
  25       * upper() cache.
  26       *
  27       * @var array
  28       */
  29      static protected $_uppers = array();
  30  
  31      /**
  32       * Converts a string from one charset to another.
  33       *
  34       * Uses the iconv or the mbstring extensions.
  35       * The original string is returned if conversion failed or none
  36       * of the extensions were available.
  37       *
  38       * @param mixed $input    The data to be converted. If $input is an
  39       *                        array, the array's values get converted
  40       *                        recursively.
  41       * @param string $from    The string's current charset.
  42       * @param string $to      The charset to convert the string to.
  43       * @param boolean $force  Force conversion?
  44       *
  45       * @return mixed  The converted input data.
  46       */
  47      static public function convertCharset($input, $from, $to, $force = false)
  48      {
  49          /* Don't bother converting numbers. */
  50          if (is_numeric($input)) {
  51              return $input;
  52          }
  53  
  54          /* If the from and to character sets are identical, return now. */
  55          if (!$force && $from == $to) {
  56              return $input;
  57          }
  58          $from = self::lower($from);
  59          $to = self::lower($to);
  60          if (!$force && $from == $to) {
  61              return $input;
  62          }
  63  
  64          if (is_array($input)) {
  65              $tmp = array();
  66              reset($input);
  67              while (list($key, $val) = each($input)) {
  68                  $tmp[self::_convertCharset($key, $from, $to)] = self::convertCharset($val, $from, $to, $force);
  69              }
  70              return $tmp;
  71          }
  72  
  73          if (is_object($input)) {
  74              // PEAR_Error/Exception objects are almost guaranteed to contain
  75              // recursion, which will cause a segfault in PHP. We should never
  76              // reach this line, but add a check.
  77              if (($input instanceof Exception) ||
  78                  ($input instanceof PEAR_Error)) {
  79                  return '';
  80              }
  81  
  82              $input = clone $input;
  83              $vars = get_object_vars($input);
  84              while (list($key, $val) = each($vars)) {
  85                  $input->$key = self::convertCharset($val, $from, $to, $force);
  86              }
  87              return $input;
  88          }
  89  
  90          if (!is_string($input)) {
  91              return $input;
  92          }
  93  
  94          return self::_convertCharset($input, $from, $to);
  95      }
  96  
  97      /**
  98       * Internal function used to do charset conversion.
  99       *
 100       * @param string $input  See self::convertCharset().
 101       * @param string $from   See self::convertCharset().
 102       * @param string $to     See self::convertCharset().
 103       *
 104       * @return string  The converted string.
 105       */
 106      static protected function _convertCharset($input, $from, $to)
 107      {
 108          /* Use utf8_[en|de]code() if possible and if the string isn't too
 109           * large (less than 16 MB = 16 * 1024 * 1024 = 16777216 bytes) - these
 110           * functions use more memory. */
 111          if (Horde_Util::extensionExists('xml') &&
 112              ((strlen($input) < 16777216) ||
 113               !Horde_Util::extensionExists('iconv') ||
 114               !Horde_Util::extensionExists('mbstring'))) {
 115              if (($to == 'utf-8') &&
 116                  in_array($from, array('iso-8859-1', 'us-ascii', 'utf-8'))) {
 117                  return utf8_encode($input);
 118              }
 119  
 120              if (($from == 'utf-8') &&
 121                  in_array($to, array('iso-8859-1', 'us-ascii', 'utf-8'))) {
 122                  return utf8_decode($input);
 123              }
 124          }
 125  
 126          /* Try UTF7-IMAP conversions. */
 127          if (($from == 'utf7-imap') || ($to == 'utf7-imap')) {
 128              try {
 129                  if ($from == 'utf7-imap') {
 130                      return self::convertCharset(Horde_Imap_Client_Utf7imap::Utf7ImapToUtf8($input), 'UTF-8', $to);
 131                  } else {
 132                      if ($from == 'utf-8') {
 133                          $conv = $input;
 134                      } else {
 135                          $conv = self::convertCharset($input, $from, 'UTF-8');
 136                      }
 137                      return Horde_Imap_Client_Utf7imap::Utf8ToUtf7Imap($conv);
 138                  }
 139              } catch (Horde_Imap_Client_Exception $e) {
 140                  return $input;
 141              }
 142          }
 143  
 144          /* Try iconv with transliteration. */
 145          if (Horde_Util::extensionExists('iconv')) {
 146              unset($php_errormsg);
 147              ini_set('track_errors', 1);
 148              $out = @iconv($from, $to . '//TRANSLIT', $input);
 149              $errmsg = isset($php_errormsg);
 150              ini_restore('track_errors');
 151              if (!$errmsg && $out !== false) {
 152                  return $out;
 153              }
 154          }
 155  
 156          /* Try mbstring. */
 157          if (Horde_Util::extensionExists('mbstring')) {
 158              $out = @mb_convert_encoding($input, $to, self::_mbstringCharset($from));
 159              if (!empty($out)) {
 160                  return $out;
 161              }
 162          }
 163  
 164          return $input;
 165      }
 166  
 167      /**
 168       * Makes a string lowercase.
 169       *
 170       * @param string $string   The string to be converted.
 171       * @param boolean $locale  If true the string will be converted based on
 172       *                         a given charset, locale independent else.
 173       * @param string $charset  If $locale is true, the charset to use when
 174       *                         converting.
 175       *
 176       * @return string  The string with lowercase characters.
 177       */
 178      static public function lower($string, $locale = false, $charset = null)
 179      {
 180          if ($locale) {
 181              if (Horde_Util::extensionExists('mbstring')) {
 182                  if (is_null($charset)) {
 183                      throw new InvalidArgumentException('$charset argument must not be null');
 184                  }
 185                  $ret = @mb_strtolower($string, self::_mbstringCharset($charset));
 186                  if (!empty($ret)) {
 187                      return $ret;
 188                  }
 189              }
 190              return strtolower($string);
 191          }
 192  
 193          if (!isset(self::$_lowers[$string])) {
 194              $language = setlocale(LC_CTYPE, 0);
 195              setlocale(LC_CTYPE, 'C');
 196              self::$_lowers[$string] = strtolower($string);
 197              setlocale(LC_CTYPE, $language);
 198          }
 199  
 200          return self::$_lowers[$string];
 201      }
 202  
 203      /**
 204       * Makes a string uppercase.
 205       *
 206       * @param string $string   The string to be converted.
 207       * @param boolean $locale  If true the string will be converted based on a
 208       *                         given charset, locale independent else.
 209       * @param string $charset  If $locale is true, the charset to use when
 210       *                         converting. If not provided the current charset.
 211       *
 212       * @return string  The string with uppercase characters.
 213       */
 214      static public function upper($string, $locale = false, $charset = null)
 215      {
 216          if ($locale) {
 217              if (Horde_Util::extensionExists('mbstring')) {
 218                  if (is_null($charset)) {
 219                      throw new InvalidArgumentException('$charset argument must not be null');
 220                  }
 221                  $ret = @mb_strtoupper($string, self::_mbstringCharset($charset));
 222                  if (!empty($ret)) {
 223                      return $ret;
 224                  }
 225              }
 226              return strtoupper($string);
 227          }
 228  
 229          if (!isset(self::$_uppers[$string])) {
 230              $language = setlocale(LC_CTYPE, 0);
 231              setlocale(LC_CTYPE, 'C');
 232              self::$_uppers[$string] = strtoupper($string);
 233              setlocale(LC_CTYPE, $language);
 234          }
 235  
 236          return self::$_uppers[$string];
 237      }
 238  
 239      /**
 240       * Returns a string with the first letter capitalized if it is
 241       * alphabetic.
 242       *
 243       * @param string $string   The string to be capitalized.
 244       * @param boolean $locale  If true the string will be converted based on a
 245       *                         given charset, locale independent else.
 246       * @param string $charset  The charset to use, defaults to current charset.
 247       *
 248       * @return string  The capitalized string.
 249       */
 250      static public function ucfirst($string, $locale = false, $charset = null)
 251      {
 252          if ($locale) {
 253              if (is_null($charset)) {
 254                  throw new InvalidArgumentException('$charset argument must not be null');
 255              }
 256              $first = self::substr($string, 0, 1, $charset);
 257              if (self::isAlpha($first, $charset)) {
 258                  $string = self::upper($first, true, $charset) . self::substr($string, 1, null, $charset);
 259              }
 260          } else {
 261              $string = self::upper(substr($string, 0, 1), false) . substr($string, 1);
 262          }
 263  
 264          return $string;
 265      }
 266  
 267      /**
 268       * Returns a string with the first letter of each word capitalized if it is
 269       * alphabetic.
 270       *
 271       * Sentences are split into words at whitespace.
 272       *
 273       * @param string $string   The string to be capitalized.
 274       * @param boolean $locale  If true the string will be converted based on a
 275       *                         given charset, locale independent else.
 276       * @param string $charset  The charset to use, defaults to current charset.
 277       *
 278       * @return string  The capitalized string.
 279       */
 280      static public function ucwords($string, $locale = false, $charset = null)
 281      {
 282          $words = preg_split('/(\s+)/', $string, -1, PREG_SPLIT_DELIM_CAPTURE);
 283          for ($i = 0, $c = count($words); $i < $c; $i += 2) {
 284              $words[$i] = self::ucfirst($words[$i], $locale, $charset);
 285          }
 286          return implode('', $words);
 287      }
 288  
 289      /**
 290       * Returns part of a string.
 291       *
 292       * @param string $string   The string to be converted.
 293       * @param integer $start   The part's start position, zero based.
 294       * @param integer $length  The part's length.
 295       * @param string $charset  The charset to use when calculating the part's
 296       *                         position and length, defaults to current
 297       *                         charset.
 298       *
 299       * @return string  The string's part.
 300       */
 301      static public function substr($string, $start, $length = null,
 302                                    $charset = 'UTF-8')
 303      {
 304          if (is_null($length)) {
 305              $length = self::length($string, $charset) - $start;
 306          }
 307  
 308          if ($length == 0) {
 309              return '';
 310          }
 311  
 312          /* Try mbstring. */
 313          if (Horde_Util::extensionExists('mbstring')) {
 314              $ret = @mb_substr($string, $start, $length, self::_mbstringCharset($charset));
 315  
 316              /* mb_substr() returns empty string on failure. */
 317              if (strlen($ret)) {
 318                  return $ret;
 319              }
 320          }
 321  
 322          /* Try iconv. */
 323          if (Horde_Util::extensionExists('iconv')) {
 324              $ret = @iconv_substr($string, $start, $length, $charset);
 325  
 326              /* iconv_substr() returns false on failure. */
 327              if ($ret !== false) {
 328                  return $ret;
 329              }
 330          }
 331  
 332          return substr($string, $start, $length);
 333      }
 334  
 335      /**
 336       * Returns the character (not byte) length of a string.
 337       *
 338       * @param string $string  The string to return the length of.
 339       * @param string $charset The charset to use when calculating the string's
 340       *                        length.
 341       *
 342       * @return integer  The string's length.
 343       */
 344      static public function length($string, $charset = 'UTF-8')
 345      {
 346          $charset = self::lower($charset);
 347  
 348          if ($charset == 'utf-8' || $charset == 'utf8') {
 349              return strlen(utf8_decode($string));
 350          }
 351  
 352          if (Horde_Util::extensionExists('mbstring')) {
 353              $ret = @mb_strlen($string, self::_mbstringCharset($charset));
 354              if (!empty($ret)) {
 355                  return $ret;
 356              }
 357          }
 358  
 359          return strlen($string);
 360      }
 361  
 362      /**
 363       * Returns the numeric position of the first occurrence of $needle
 364       * in the $haystack string.
 365       *
 366       * @param string $haystack  The string to search through.
 367       * @param string $needle    The string to search for.
 368       * @param integer $offset   Allows to specify which character in haystack
 369       *                          to start searching.
 370       * @param string $charset   The charset to use when searching for the
 371       *                          $needle string.
 372       *
 373       * @return integer  The position of first occurrence.
 374       */
 375      static public function pos($haystack, $needle, $offset = 0,
 376                                 $charset = 'UTF-8')
 377      {
 378          if (Horde_Util::extensionExists('mbstring')) {
 379              $track_errors = ini_set('track_errors', 1);
 380              $ret = @mb_strpos($haystack, $needle, $offset, self::_mbstringCharset($charset));
 381              ini_set('track_errors', $track_errors);
 382              if (!isset($php_errormsg)) {
 383                  return $ret;
 384              }
 385          }
 386  
 387          return strpos($haystack, $needle, $offset);
 388      }
 389  
 390      /**
 391       * Returns the numeric position of the last occurrence of $needle
 392       * in the $haystack string.
 393       *
 394       * @param string $haystack  The string to search through.
 395       * @param string $needle    The string to search for.
 396       * @param integer $offset   Allows to specify which character in haystack
 397       *                          to start searching.
 398       * @param string $charset   The charset to use when searching for the
 399       *                          $needle string.
 400       *
 401       * @return integer  The position of last occurrence.
 402       */
 403      static public function rpos($haystack, $needle, $offset = 0,
 404                                  $charset = 'UTF-8')
 405      {
 406          if (Horde_Util::extensionExists('mbstring')) {
 407              $track_errors = ini_set('track_errors', 1);
 408              $ret = @mb_strrpos($haystack, $needle, $offset, self::_mbstringCharset($charset));
 409              ini_set('track_errors', $track_errors);
 410              if (!isset($php_errormsg)) {
 411                  return $ret;
 412              }
 413          }
 414  
 415          return strrpos($haystack, $needle, $offset);
 416      }
 417  
 418      /**
 419       * Returns a string padded to a certain length with another string.
 420       * This method behaves exactly like str_pad() but is multibyte safe.
 421       *
 422       * @param string $input    The string to be padded.
 423       * @param integer $length  The length of the resulting string.
 424       * @param string $pad      The string to pad the input string with. Must
 425       *                         be in the same charset like the input string.
 426       * @param const $type      The padding type. One of STR_PAD_LEFT,
 427       *                         STR_PAD_RIGHT, or STR_PAD_BOTH.
 428       * @param string $charset  The charset of the input and the padding
 429       *                         strings.
 430       *
 431       * @return string  The padded string.
 432       */
 433      static public function pad($input, $length, $pad = ' ',
 434                                 $type = STR_PAD_RIGHT, $charset = 'UTF-8')
 435      {
 436          $mb_length = self::length($input, $charset);
 437          $sb_length = strlen($input);
 438          $pad_length = self::length($pad, $charset);
 439  
 440          /* Return if we already have the length. */
 441          if ($mb_length >= $length) {
 442              return $input;
 443          }
 444  
 445          /* Shortcut for single byte strings. */
 446          if ($mb_length == $sb_length && $pad_length == strlen($pad)) {
 447              return str_pad($input, $length, $pad, $type);
 448          }
 449  
 450          switch ($type) {
 451          case STR_PAD_LEFT:
 452              $left = $length - $mb_length;
 453              $output = self::substr(str_repeat($pad, ceil($left / $pad_length)), 0, $left, $charset) . $input;
 454              break;
 455  
 456          case STR_PAD_BOTH:
 457              $left = floor(($length - $mb_length) / 2);
 458              $right = ceil(($length - $mb_length) / 2);
 459              $output = self::substr(str_repeat($pad, ceil($left / $pad_length)), 0, $left, $charset) .
 460                  $input .
 461                  self::substr(str_repeat($pad, ceil($right / $pad_length)), 0, $right, $charset);
 462              break;
 463  
 464          case STR_PAD_RIGHT:
 465              $right = $length - $mb_length;
 466              $output = $input . self::substr(str_repeat($pad, ceil($right / $pad_length)), 0, $right, $charset);
 467              break;
 468          }
 469  
 470          return $output;
 471      }
 472  
 473      /**
 474       * Wraps the text of a message.
 475       *
 476       * @param string $string         String containing the text to wrap.
 477       * @param integer $width         Wrap the string at this number of
 478       *                               characters.
 479       * @param string $break          Character(s) to use when breaking lines.
 480       * @param boolean $cut           Whether to cut inside words if a line
 481       *                               can't be wrapped.
 482       * @param boolean $line_folding  Whether to apply line folding rules per
 483       *                               RFC 822 or similar. The correct break
 484       *                               characters including leading whitespace
 485       *                               have to be specified too.
 486       *
 487       * @return string  String containing the wrapped text.
 488       */
 489      static public function wordwrap($string, $width = 75, $break = "\n",
 490                                      $cut = false, $line_folding = false)
 491      {
 492          $wrapped = '';
 493  
 494          while (self::length($string, 'UTF-8') > $width) {
 495              $line = self::substr($string, 0, $width, 'UTF-8');
 496              $string = self::substr($string, self::length($line, 'UTF-8'), null, 'UTF-8');
 497  
 498              // Make sure we didn't cut a word, unless we want hard breaks
 499              // anyway.
 500              if (!$cut && preg_match('/^(.+?)((\s|\r?\n).*)/us', $string, $match)) {
 501                  $line .= $match[1];
 502                  $string = $match[2];
 503              }
 504  
 505              // Wrap at existing line breaks.
 506              if (preg_match('/^(.*?)(\r?\n)(.*)$/su', $line, $match)) {
 507                  $wrapped .= $match[1] . $match[2];
 508                  $string = $match[3] . $string;
 509                  continue;
 510              }
 511  
 512              // Wrap at the last colon or semicolon followed by a whitespace if
 513              // doing line folding.
 514              if ($line_folding &&
 515                  preg_match('/^(.*?)(;|:)(\s+.*)$/u', $line, $match)) {
 516                  $wrapped .= $match[1] . $match[2] . $break;
 517                  $string = $match[3] . $string;
 518                  continue;
 519              }
 520  
 521              // Wrap at the last whitespace of $line.
 522              $sub = $line_folding
 523                  ? '(.+[^\s])'
 524                  : '(.*)';
 525  
 526              if (preg_match('/^' . $sub . '(\s+)(.*)$/u', $line, $match)) {
 527                  $wrapped .= $match[1] . $break;
 528                  $string = ($line_folding ? $match[2] : '') . $match[3] . $string;
 529                  continue;
 530              }
 531  
 532              // Hard wrap if necessary.
 533              if ($cut) {
 534                  $wrapped .= $line . $break;
 535                  continue;
 536              }
 537  
 538              $wrapped .= $line;
 539          }
 540  
 541          return $wrapped . $string;
 542      }
 543  
 544      /**
 545       * Wraps the text of a message.
 546       *
 547       * @param string $text        String containing the text to wrap.
 548       * @param integer $length     Wrap $text at this number of characters.
 549       * @param string $break_char  Character(s) to use when breaking lines.
 550       * @param boolean $quote      Ignore lines that are wrapped with the '>'
 551       *                            character (RFC 2646)? If true, we don't
 552       *                            remove any padding whitespace at the end of
 553       *                            the string.
 554       *
 555       * @return string  String containing the wrapped text.
 556       */
 557      static public function wrap($text, $length = 80, $break_char = "\n",
 558                                  $quote = false)
 559      {
 560          $paragraphs = array();
 561  
 562          foreach (preg_split('/\r?\n/', $text) as $input) {
 563              if ($quote && (strpos($input, '>') === 0)) {
 564                  $line = $input;
 565              } else {
 566                  /* We need to handle the Usenet-style signature line
 567                   * separately; since the space after the two dashes is
 568                   * REQUIRED, we don't want to trim the line. */
 569                  if ($input != '-- ') {
 570                      $input = rtrim($input);
 571                  }
 572                  $line = self::wordwrap($input, $length, $break_char);
 573              }
 574  
 575              $paragraphs[] = $line;
 576          }
 577  
 578          return implode($break_char, $paragraphs);
 579      }
 580  
 581      /**
 582       * Return a truncated string, suitable for notifications.
 583       *
 584       * @param string $text     The original string.
 585       * @param integer $length  The maximum length.
 586       *
 587       * @return string  The truncated string, if longer than $length.
 588       */
 589      static public function truncate($text, $length = 100)
 590      {
 591          return (self::length($text) > $length)
 592              ? rtrim(self::substr($text, 0, $length - 3)) . '...'
 593              : $text;
 594      }
 595  
 596      /**
 597       * Return an abbreviated string, with characters in the middle of the
 598       * excessively long string replaced by '...'.
 599       *
 600       * @param string $text     The original string.
 601       * @param integer $length  The length at which to abbreviate.
 602       *
 603       * @return string  The abbreviated string, if longer than $length.
 604       */
 605      static public function abbreviate($text, $length = 20)
 606      {
 607          return (self::length($text) > $length)
 608              ? rtrim(self::substr($text, 0, round(($length - 3) / 2))) . '...' . ltrim(self::substr($text, (($length - 3) / 2) * -1))
 609              : $text;
 610      }
 611  
 612      /**
 613       * Returns the common leading part of two strings.
 614       *
 615       * @param string $str1  A string.
 616       * @param string $str2  Another string.
 617       *
 618       * @return string  The start of $str1 and $str2 that is identical in both.
 619       */
 620      static public function common($str1, $str2)
 621      {
 622          for ($result = '', $i = 0;
 623               isset($str1[$i]) && isset($str2[$i]) && $str1[$i] == $str2[$i];
 624               $i++) {
 625              $result .= $str1[$i];
 626          }
 627          return $result;
 628      }
 629  
 630      /**
 631       * Returns true if every character in the parameter is an alphabetic
 632       * character.
 633       *
 634       * @param string $string   The string to test.
 635       * @param string $charset  The charset to use when testing the string.
 636       *
 637       * @return boolean  True if the parameter was alphabetic only.
 638       */
 639      static public function isAlpha($string, $charset)
 640      {
 641          if (!Horde_Util::extensionExists('mbstring')) {
 642              return ctype_alpha($string);
 643          }
 644  
 645          $charset = self::_mbstringCharset($charset);
 646          $old_charset = mb_regex_encoding();
 647  
 648          if ($charset != $old_charset) {
 649              @mb_regex_encoding($charset);
 650          }
 651          $alpha = !@mb_ereg_match('[^[:alpha:]]', $string);
 652          if ($charset != $old_charset) {
 653              @mb_regex_encoding($old_charset);
 654          }
 655  
 656          return $alpha;
 657      }
 658  
 659      /**
 660       * Returns true if every character in the parameter is a lowercase letter in
 661       * the current locale.
 662       *
 663       * @param string $string   The string to test.
 664       * @param string $charset  The charset to use when testing the string.
 665       *
 666       * @return boolean  True if the parameter was lowercase.
 667       */
 668      static public function isLower($string, $charset)
 669      {
 670          return ((self::lower($string, true, $charset) === $string) &&
 671                  self::isAlpha($string, $charset));
 672      }
 673  
 674      /**
 675       * Returns true if every character in the parameter is an uppercase letter
 676       * in the current locale.
 677       *
 678       * @param string $string   The string to test.
 679       * @param string $charset  The charset to use when testing the string.
 680       *
 681       * @return boolean  True if the parameter was uppercase.
 682       */
 683      static public function isUpper($string, $charset)
 684      {
 685          return ((self::upper($string, true, $charset) === $string) &&
 686                  self::isAlpha($string, $charset));
 687      }
 688  
 689      /**
 690       * Performs a multibyte safe regex match search on the text provided.
 691       *
 692       * @param string $text     The text to search.
 693       * @param array $regex     The regular expressions to use, without perl
 694       *                         regex delimiters (e.g. '/' or '|').
 695       * @param string $charset  The character set of the text.
 696       *
 697       * @return array  The matches array from the first regex that matches.
 698       */
 699      static public function regexMatch($text, $regex, $charset = null)
 700      {
 701          if (!empty($charset)) {
 702              $regex = self::convertCharset($regex, $charset, 'utf-8');
 703              $text = self::convertCharset($text, $charset, 'utf-8');
 704          }
 705  
 706          $matches = array();
 707          foreach ($regex as $val) {
 708              if (preg_match('/' . $val . '/u', $text, $matches)) {
 709                  break;
 710              }
 711          }
 712  
 713          if (!empty($charset)) {
 714              $matches = self::convertCharset($matches, 'utf-8', $charset);
 715          }
 716  
 717          return $matches;
 718      }
 719  
 720      /**
 721       * Check to see if a string is valid UTF-8.
 722       *
 723       * @param string $text  The text to check.
 724       *
 725       * @return boolean  True if valid UTF-8.
 726       */
 727      static public function validUtf8($text)
 728      {
 729          $text = strval($text);
 730  
 731          for ($i = 0, $len = strlen($text); $i < $len; ++$i) {
 732              $c = ord($text[$i]);
 733  
 734              if ($c > 128) {
 735                  if ($c > 247) {
 736                      // STD 63 (RFC 3629) eliminates 5 & 6-byte characters.
 737                      return false;
 738                  } elseif ($c > 239) {
 739                      $j = 3;
 740                  } elseif ($c > 223) {
 741                      $j = 2;
 742                  } elseif ($c > 191) {
 743                      $j = 1;
 744                  } else {
 745                      return false;
 746                  }
 747  
 748                  if (($i + $j) > $len) {
 749                      return false;
 750                  }
 751  
 752                  do {
 753                      $c = ord($text[++$i]);
 754                      if (($c < 128) || ($c > 191)) {
 755                          return false;
 756                      }
 757                  } while (--$j);
 758              }
 759          }
 760  
 761          return true;
 762      }
 763  
 764      /**
 765       * Workaround charsets that don't work with mbstring functions.
 766       *
 767       * @param string $charset  The original charset.
 768       *
 769       * @return string  The charset to use with mbstring functions.
 770       */
 771      static protected function _mbstringCharset($charset)
 772      {
 773          /* mbstring functions do not handle the 'ks_c_5601-1987' &
 774           * 'ks_c_5601-1989' charsets. However, these charsets are used, for
 775           * example, by various versions of Outlook to send Korean characters.
 776           * Use UHC (CP949) encoding instead. See, e.g.,
 777           * http://lists.w3.org/Archives/Public/ietf-charsets/2001AprJun/0030.html */
 778          return in_array(self::lower($charset), array('ks_c_5601-1987', 'ks_c_5601-1989'))
 779              ? 'UHC'
 780              : $charset;
 781      }
 782  
 783      /**
 784       * Strip UTF-8 byte order mark (BOM) from string data.
 785       *
 786       * @param string $str  Input string (UTF-8).
 787       *
 788       * @return string  Stripped string (UTF-8).
 789       */
 790      static public function trimUtf8Bom($str)
 791      {
 792          return (substr($str, 0, 3) == pack('CCC', 239, 187, 191))
 793              ? substr($str, 3)
 794              : $str;
 795      }
 796  
 797  }
PHP Cross Reference of Unnamed Project

/lib/horde/framework/Horde/ -> String.php (source)