PHPXRef 0.7.1 : Unnamed Project : /lib/typo3/class.t3lib

[Summary view] [Print] [Text view]
   1  <?php
   2  /***************************************************************
   3   *  Copyright notice
   4   *
   5   *  (c) 2003-2011 Kasper Skårhøj (kasperYYYY@typo3.com)
   6   *  All rights reserved
   7   *
   8   *  This script is part of the Typo3 project. The Typo3 project is
   9   *  free software; you can redistribute it and/or modify
  10   *  it under the terms of the GNU General Public License as published by
  11   *  the Free Software Foundation; either version 2 of the License, or
  12   *  (at your option) any later version.
  13   *
  14   *  The GNU General Public License can be found at
  15   *  http://www.gnu.org/copyleft/gpl.html.
  16   *
  17   *  This script is distributed in the hope that it will be useful,
  18   *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  19   *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20   *  GNU General Public License for more details.
  21   *
  22   *  This copyright notice MUST APPEAR in all copies of the script!
  23   ***************************************************************/
  24  /**
  25   * Class for conversion between charsets.
  26   *
  27   * @author    Kasper Skårhøj <kasperYYYY@typo3.com>
  28   * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
  29   */
  30  
  31  
  32  /**
  33   * Notes on UTF-8
  34   *
  35   * Functions working on UTF-8 strings:
  36   *
  37   * - strchr/strstr
  38   * - strrchr
  39   * - substr_count
  40   * - implode/explode/join
  41   *
  42   * Functions nearly working on UTF-8 strings:
  43   *
  44   * - strlen: returns the length in BYTES, if you need the length in CHARACTERS use utf8_strlen
  45   * - trim/ltrim/rtrim: the second parameter 'charlist' won't work for characters not contained in 7-bit ASCII
  46   * - strpos/strrpos: they return the BYTE position, if you need the CHARACTER position use utf8_strpos/utf8_strrpos
  47   * - htmlentities: charset support for UTF-8 only since PHP 4.3.0
 * - preg_*: UTF-8 support is compiled into PHP's PCRE by default nowadays, but could be unavailable; the "u" pattern modifier must be used
  49   *
  50   * Functions NOT working on UTF-8 strings:
  51   *
  52   * - str*cmp
  53   * - stristr
  54   * - stripos
  55   * - substr
  56   * - strrev
  57   * - split/spliti
  58   * - ...
  59   *
  60   */
  61  /**
  62   * Class for conversion between charsets
  63   *
  64   * @author    Kasper Skårhøj <kasperYYYY@typo3.com>
  65   * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
  66   * @package TYPO3
  67   * @subpackage t3lib
  68   */
  69  class t3lib_cs {
  70  
  71      /**
  72       * @var t3lib_l10n_Locales
  73       */
  74      protected $locales;
  75  
  76      var $noCharByteVal = 63; // ASCII Value for chars with no equivalent.
  77  
  78          // This is the array where parsed conversion tables are stored (cached)
  79      var $parsedCharsets = array();
  80  
  81          // An array where case folding data will be stored (cached)
  82      var $caseFolding = array();
  83  
  84          // An array where charset-to-ASCII mappings are stored (cached)
  85      var $toASCII = array();
  86  
  87          // This tells the converter which charsets has two bytes per char:
  88      var $twoByteSets = array(
  89          'ucs-2' => 1, // 2-byte Unicode
  90      );
  91  
  92          // This tells the converter which charsets has four bytes per char:
  93      var $fourByteSets = array(
  94          'ucs-4' => 1, // 4-byte Unicode
  95          'utf-32' => 1, // 4-byte Unicode (limited to the 21-bits of UTF-16)
  96      );
  97  
  98          // This tells the converter which charsets use a scheme like the Extended Unix Code:
  99      var $eucBasedSets = array(
 100          'gb2312' => 1, // Chinese, simplified.
 101          'big5' => 1, // Chinese, traditional.
 102          'euc-kr' => 1, // Korean
 103          'shift_jis' => 1, // Japanese - WARNING: Shift-JIS includes half-width katakana single-bytes characters above 0x80!
 104      );
 105  
 106          // see    http://developer.apple.com/documentation/macos8/TextIntlSvcs/TextEncodingConversionManager/TEC1.5/TEC.b0.html
 107          // http://czyborra.com/charsets/iso8859.html
 108      var $synonyms = array(
 109          'us' => 'ascii',
 110          'us-ascii' => 'ascii',
 111          'cp819' => 'iso-8859-1',
 112          'ibm819' => 'iso-8859-1',
 113          'iso-ir-100' => 'iso-8859-1',
 114          'iso-ir-101' => 'iso-8859-2',
 115          'iso-ir-109' => 'iso-8859-3',
 116          'iso-ir-110' => 'iso-8859-4',
 117          'iso-ir-144' => 'iso-8859-5',
 118          'iso-ir-127' => 'iso-8859-6',
 119          'iso-ir-126' => 'iso-8859-7',
 120          'iso-ir-138' => 'iso-8859-8',
 121          'iso-ir-148' => 'iso-8859-9',
 122          'iso-ir-157' => 'iso-8859-10',
 123          'iso-ir-179' => 'iso-8859-13',
 124          'iso-ir-199' => 'iso-8859-14',
 125          'iso-ir-203' => 'iso-8859-15',
 126          'csisolatin1' => 'iso-8859-1',
 127          'csisolatin2' => 'iso-8859-2',
 128          'csisolatin3' => 'iso-8859-3',
 129          'csisolatin5' => 'iso-8859-9',
 130          'csisolatin8' => 'iso-8859-14',
 131          'csisolatin9' => 'iso-8859-15',
 132          'csisolatingreek' => 'iso-8859-7',
 133          'iso-celtic' => 'iso-8859-14',
 134          'latin1' => 'iso-8859-1',
 135          'latin2' => 'iso-8859-2',
 136          'latin3' => 'iso-8859-3',
 137          'latin5' => 'iso-8859-9',
 138          'latin6' => 'iso-8859-10',
 139          'latin8' => 'iso-8859-14',
 140          'latin9' => 'iso-8859-15',
 141          'l1' => 'iso-8859-1',
 142          'l2' => 'iso-8859-2',
 143          'l3' => 'iso-8859-3',
 144          'l5' => 'iso-8859-9',
 145          'l6' => 'iso-8859-10',
 146          'l8' => 'iso-8859-14',
 147          'l9' => 'iso-8859-15',
 148          'cyrillic' => 'iso-8859-5',
 149          'arabic' => 'iso-8859-6',
 150          'tis-620' => 'iso-8859-11',
 151          'win874' => 'windows-874',
 152          'win1250' => 'windows-1250',
 153          'win1251' => 'windows-1251',
 154          'win1252' => 'windows-1252',
 155          'win1253' => 'windows-1253',
 156          'win1254' => 'windows-1254',
 157          'win1255' => 'windows-1255',
 158          'win1256' => 'windows-1256',
 159          'win1257' => 'windows-1257',
 160          'win1258' => 'windows-1258',
 161          'cp1250' => 'windows-1250',
 162          'cp1251' => 'windows-1251',
 163          'cp1252' => 'windows-1252',
 164          'ms-ee' => 'windows-1250',
 165          'ms-ansi' => 'windows-1252',
 166          'ms-greek' => 'windows-1253',
 167          'ms-turk' => 'windows-1254',
 168          'winbaltrim' => 'windows-1257',
 169          'koi-8ru' => 'koi-8r',
 170          'koi8r' => 'koi-8r',
 171          'cp878' => 'koi-8r',
 172          'mac' => 'macroman',
 173          'macintosh' => 'macroman',
 174          'euc-cn' => 'gb2312',
 175          'x-euc-cn' => 'gb2312',
 176          'euccn' => 'gb2312',
 177          'cp936' => 'gb2312',
 178          'big-5' => 'big5',
 179          'cp950' => 'big5',
 180          'eucjp' => 'euc-jp',
 181          'sjis' => 'shift_jis',
 182          'shift-jis' => 'shift_jis',
 183          'cp932' => 'shift_jis',
 184          'cp949' => 'euc-kr',
 185          'utf7' => 'utf-7',
 186          'utf8' => 'utf-8',
 187          'utf16' => 'utf-16',
 188          'utf32' => 'utf-32',
 189          'utf8' => 'utf-8',
 190          'ucs2' => 'ucs-2',
 191          'ucs4' => 'ucs-4',
 192      );
 193  
 194          // mapping of iso-639-1 language codes to script names
 195      var $lang_to_script = array(
 196              // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
 197          'af' => 'west_european', //Afrikaans
 198          'ar' => 'arabic',
 199          'bg' => 'cyrillic', // Bulgarian
 200          'bs' => 'east_european', // Bosnian
 201          'cs' => 'east_european', // Czech
 202          'da' => 'west_european', // Danish
 203          'de' => 'west_european', // German
 204          'es' => 'west_european', // Spanish
 205          'et' => 'estonian',
 206          'eo' => 'unicode', // Esperanto
 207          'eu' => 'west_european', // Basque
 208          'fa' => 'arabic', // Persian
 209          'fi' => 'west_european', // Finish
 210          'fo' => 'west_european', // Faroese
 211          'fr' => 'west_european', // French
 212          'ga' => 'west_european', // Irish
 213          'gl' => 'west_european', // Galician
 214          'gr' => 'greek',
 215          'he' => 'hebrew', // Hebrew (since 1998)
 216          'hi' => 'unicode', // Hindi
 217          'hr' => 'east_european', // Croatian
 218          'hu' => 'east_european', // Hungarian
 219          'iw' => 'hebrew', // Hebrew (til 1998)
 220          'is' => 'west_european', // Icelandic
 221          'it' => 'west_european', // Italian
 222          'ja' => 'japanese',
 223          'ka' => 'unicode', // Georgian
 224          'kl' => 'west_european', // Greenlandic
 225          'km' => 'unicode', // Khmer
 226          'ko' => 'korean',
 227          'lt' => 'lithuanian',
 228          'lv' => 'west_european', // Latvian/Lettish
 229          'nl' => 'west_european', // Dutch
 230          'no' => 'west_european', // Norwegian
 231          'nb' => 'west_european', // Norwegian Bokmal
 232          'nn' => 'west_european', // Norwegian Nynorsk
 233          'pl' => 'east_european', // Polish
 234          'pt' => 'west_european', // Portuguese
 235          'ro' => 'east_european', // Romanian
 236          'ru' => 'cyrillic', // Russian
 237          'sk' => 'east_european', // Slovak
 238          'sl' => 'east_european', // Slovenian
 239          'sr' => 'cyrillic', // Serbian
 240          'sv' => 'west_european', // Swedish
 241          'sq' => 'albanian', // Albanian
 242          'th' => 'thai',
 243          'uk' => 'cyrillic', // Ukranian
 244          'vi' => 'vietnamese',
 245          'zh' => 'chinese',
 246              // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
 247              // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
 248          'afk'=> 'west_european', // Afrikaans
 249          'ara' => 'arabic',
 250          'bgr' => 'cyrillic', // Bulgarian
 251          'cat' => 'west_european', // Catalan
 252          'chs' => 'simpl_chinese',
 253          'cht' => 'trad_chinese',
 254          'csy' => 'east_european', // Czech
 255          'dan' => 'west_european', // Danisch
 256          'deu' => 'west_european', // German
 257          'dea' => 'west_european', // German (Austrian)
 258          'des' => 'west_european', // German (Swiss)
 259          'ena' => 'west_european', // English (Australian)
 260          'enc' => 'west_european', // English (Canadian)
 261          'eng' => 'west_european', // English
 262          'enz' => 'west_european', // English (New Zealand)
 263          'enu' => 'west_european', // English (United States)
 264          'euq' => 'west_european', // Basque
 265          'fos' => 'west_european', // Faroese
 266          'far' => 'arabic', // Persian
 267          'fin' => 'west_european', // Finish
 268          'fra' => 'west_european', // French
 269          'frb' => 'west_european', // French (Belgian)
 270          'frc' => 'west_european', // French (Canadian)
 271          'frs' => 'west_european', // French (Swiss)
 272          'geo' => 'unicode', // Georgian
 273          'glg' => 'west_european', // Galician
 274          'ell' => 'greek',
 275          'heb' => 'hebrew',
 276          'hin' => 'unicode', // Hindi
 277          'hun' => 'east_european', // Hungarian
 278          'isl' => 'west_euorpean', // Icelandic
 279          'ita' => 'west_european', // Italian
 280          'its' => 'west_european', // Italian (Swiss)
 281          'jpn' => 'japanese',
 282          'khm' => 'unicode', // Khmer
 283          'kor' => 'korean',
 284          'lth' => 'lithuanian',
 285          'lvi' => 'west_european', // Latvian/Lettish
 286          'msl' => 'west_european', // Malay
 287          'nlb' => 'west_european', // Dutch (Belgian)
 288          'nld' => 'west_european', // Dutch
 289          'nor' => 'west_european', // Norwegian (bokmal)
 290          'non' => 'west_european', // Norwegian (nynorsk)
 291          'plk' => 'east_european', // Polish
 292          'ptg' => 'west_european', // Portuguese
 293          'ptb' => 'west_european', // Portuguese (Brazil)
 294          'rom' => 'east_european', // Romanian
 295          'rus' => 'cyrillic', // Russian
 296          'slv' => 'east_european', // Slovenian
 297          'sky' => 'east_european', // Slovak
 298          'srl' => 'east_european', // Serbian (Latin)
 299          'srb' => 'cyrillic', // Serbian (Cyrillic)
 300          'esp' => 'west_european', // Spanish (trad. sort)
 301          'esm' => 'west_european', // Spanish (Mexican)
 302          'esn' => 'west_european', // Spanish (internat. sort)
 303          'sve' => 'west_european', // Swedish
 304          'sqi' => 'albanian', // Albanian
 305          'tha' => 'thai',
 306          'trk' => 'turkish',
 307          'ukr' => 'cyrillic', // Ukrainian
 308              // English language names
 309          'afrikaans' => 'west_european',
 310          'albanian' => 'albanian',
 311          'arabic' => 'arabic',
 312          'basque' => 'west_european',
 313          'bosnian' => 'east_european',
 314          'bulgarian' => 'east_european',
 315          'catalan' => 'west_european',
 316          'croatian' => 'east_european',
 317          'czech' => 'east_european',
 318          'danish' => 'west_european',
 319          'dutch' => 'west_european',
 320          'english' => 'west_european',
 321          'esperanto' => 'unicode',
 322          'estonian' => 'estonian',
 323          'faroese' => 'west_european',
 324          'farsi' => 'arabic',
 325          'finnish' => 'west_european',
 326          'french' => 'west_european',
 327          'galician' => 'west_european',
 328          'georgian' => 'unicode',
 329          'german' => 'west_european',
 330          'greek' => 'greek',
 331          'greenlandic' => 'west_european',
 332          'hebrew' => 'hebrew',
 333          'hindi' => 'unicode',
 334          'hungarian' => 'east_european',
 335          'icelandic' => 'west_european',
 336          'italian' => 'west_european',
 337          'khmer' => 'unicode',
 338          'latvian' => 'west_european',
 339          'lettish' => 'west_european',
 340          'lithuanian' => 'lithuanian',
 341          'malay' => 'west_european',
 342          'norwegian' => 'west_european',
 343          'persian' => 'arabic',
 344          'polish' => 'east_european',
 345          'portuguese' => 'west_european',
 346          'russian' => 'cyrillic',
 347          'romanian' => 'east_european',
 348          'serbian' => 'cyrillic',
 349          'slovak' => 'east_european',
 350          'slovenian' => 'east_european',
 351          'spanish' => 'west_european',
 352          'svedish' => 'west_european',
 353          'that' => 'thai',
 354          'turkish' => 'turkish',
 355          'ukrainian' => 'cyrillic',
 356      );
 357  
 358          // mapping of language (family) names to charsets on Unix
 359      var $script_to_charset_unix = array(
 360          'west_european' => 'iso-8859-1',
 361          'estonian' => 'iso-8859-1',
 362          'east_european' => 'iso-8859-2',
 363          'baltic' => 'iso-8859-4',
 364          'cyrillic' => 'iso-8859-5',
 365          'arabic' => 'iso-8859-6',
 366          'greek' => 'iso-8859-7',
 367          'hebrew' => 'iso-8859-8',
 368          'turkish' => 'iso-8859-9',
 369          'thai' => 'iso-8859-11', // = TIS-620
 370          'lithuanian' => 'iso-8859-13',
 371          'chinese' => 'gb2312', // = euc-cn
 372          'japanese' => 'euc-jp',
 373          'korean' => 'euc-kr',
 374          'simpl_chinese' => 'gb2312',
 375          'trad_chinese' => 'big5',
 376          'vietnamese' => '',
 377          'unicode' => 'utf-8',
 378          'albanian' => 'utf-8'
 379      );
 380  
 381          // mapping of language (family) names to charsets on Windows
 382      var $script_to_charset_windows = array(
 383          'east_european' => 'windows-1250',
 384          'cyrillic' => 'windows-1251',
 385          'west_european' => 'windows-1252',
 386          'greek' => 'windows-1253',
 387          'turkish' => 'windows-1254',
 388          'hebrew' => 'windows-1255',
 389          'arabic' => 'windows-1256',
 390          'baltic' => 'windows-1257',
 391          'estonian' => 'windows-1257',
 392          'lithuanian' => 'windows-1257',
 393          'vietnamese' => 'windows-1258',
 394          'thai' => 'cp874',
 395          'korean' => 'cp949',
 396          'chinese' => 'gb2312',
 397          'japanese' => 'shift_jis',
 398          'simpl_chinese' => 'gb2312',
 399          'trad_chinese' => 'big5',
 400          'albanian' => 'windows-1250',
 401          'unicode' => 'utf-8'
 402      );
 403  
 404          // mapping of locale names to charsets
 405      var $locale_to_charset = array(
 406          'japanese.euc' => 'euc-jp',
 407          'ja_jp.ujis' => 'euc-jp',
 408          'korean.euc' => 'euc-kr',
 409          'sr@Latn' => 'iso-8859-2',
 410          'zh_cn' => 'gb2312',
 411          'zh_hk' => 'big5',
 412          'zh_tw' => 'big5',
 413      );
 414  
 415          // TYPO3 specific: Array with the system charsets used for each system language in TYPO3:
 416          // Empty values means "iso-8859-1"
 417      var $charSetArray = array(
 418          'af' => '',
 419          'ar' => 'iso-8859-6',
 420          'ba' => 'iso-8859-2',
 421          'bg' => 'windows-1251',
 422          'br' => '',
 423          'ca' => 'iso-8859-15',
 424          'ch' => 'gb2312',
 425          'cs' => 'windows-1250',
 426          'cz' => 'windows-1250',
 427          'da' => '',
 428          'de' => '',
 429          'dk' => '',
 430          'el' => 'iso-8859-7',
 431          'eo' => 'utf-8',
 432          'es' => '',
 433          'et' => 'iso-8859-4',
 434          'eu' => '',
 435          'fa' => 'utf-8',
 436          'fi' => '',
 437          'fo' => 'utf-8',
 438          'fr' => '',
 439          'fr_CA' => '',
 440          'ga' => '',
 441          'ge' => 'utf-8',
 442          'gl' => '',
 443          'gr' => 'iso-8859-7',
 444          'he' => 'utf-8',
 445          'hi' => 'utf-8',
 446          'hk' => 'big5',
 447          'hr' => 'windows-1250',
 448          'hu' => 'iso-8859-2',
 449          'is' => 'utf-8',
 450          'it' => '',
 451          'ja' => 'shift_jis',
 452          'jp' => 'shift_jis',
 453          'ka' => 'utf-8',
 454          'kl' => 'utf-8',
 455          'km' => 'utf-8',
 456          'ko' => 'euc-kr',
 457          'kr' => 'euc-kr',
 458          'lt' => 'windows-1257',
 459          'lv' => 'utf-8',
 460          'ms' => '',
 461          'my' => '',
 462          'nl' => '',
 463          'no' => '',
 464          'pl' => 'iso-8859-2',
 465          'pt' => '',
 466          'pt_BR' => '',
 467          'qc' => '',
 468          'ro' => 'iso-8859-2',
 469          'ru' => 'windows-1251',
 470          'se' => '',
 471          'si' => 'windows-1250',
 472          'sk' => 'windows-1250',
 473          'sl' => 'windows-1250',
 474          'sq' => 'utf-8',
 475          'sr' => 'utf-8',
 476          'sv' => '',
 477          'th' => 'iso-8859-11',
 478          'tr' => 'iso-8859-9',
 479          'ua' => 'windows-1251',
 480          'uk' => 'windows-1251',
 481          'vi' => 'utf-8',
 482          'vn' => 'utf-8',
 483          'zh' => 'big5',
 484      );
 485  
 486          // TYPO3 specific: Array with the iso names used for each system language in TYPO3:
 487          // Missing keys means: same as TYPO3
 488          // @deprecated since TYPO3 4.6, will be removed in TYPO3 6.0 - use t3lib_l10n_Locales::getIsoMapping()
 489      var $isoArray = array(
 490          'ba' => 'bs',
 491          'br' => 'pt_BR',
 492          'ch' => 'zh_CN',
 493          'cz' => 'cs',
 494          'dk' => 'da',
 495          'si' => 'sl',
 496          'se' => 'sv',
 497          'gl' => 'kl',
 498          'gr' => 'el',
 499          'hk' => 'zh_HK',
 500          'kr' => 'ko',
 501          'ua' => 'uk',
 502          'jp' => 'ja',
 503          'qc' => 'fr_CA',
 504          'vn' => 'vi',
 505          'ge' => 'ka',
 506          'ga' => 'gl',
 507      );
 508  
 509      /**
 510       * Default constructor.
 511       */
 512  	public function __construct() {
 513          $this->locales = t3lib_div::makeInstance('t3lib_l10n_Locales');
 514      }
 515  
 516      /**
 517       * Normalize - changes input character set to lowercase letters.
 518       *
 519       * @param    string        Input charset
 520       * @return    string        Normalized charset
 521       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
 522       */
 523  	function parse_charset($charset) {
 524          $charset = trim(strtolower($charset));
 525          if (isset($this->synonyms[$charset])) {
 526              $charset = $this->synonyms[$charset];
 527          }
 528  
 529          return $charset;
 530      }
 531  
 532      /**
 533       * Get the charset of a locale.
 534       *
 535       * ln            language
 536       * ln_CN         language / country
 537       * ln_CN.cs      language / country / charset
 538       * ln_CN.cs@mod  language / country / charset / modifier
 539       *
 540       * @param    string        Locale string
 541       * @return    string        Charset resolved for locale string
 542       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
 543       */
 544  	function get_locale_charset($locale) {
 545          $locale = strtolower($locale);
 546  
 547              // exact locale specific charset?
 548          if (isset($this->locale_to_charset[$locale])) {
 549              return $this->locale_to_charset[$locale];
 550          }
 551  
 552              // get modifier
 553          list($locale, $modifier) = explode('@', $locale);
 554  
 555              // locale contains charset: use it
 556          list($locale, $charset) = explode('.', $locale);
 557          if ($charset) {
 558              return $this->parse_charset($charset);
 559          }
 560  
 561              // modifier is 'euro' (after charset check, because of xx.utf-8@euro)
 562          if ($modifier == 'euro') {
 563              return 'iso-8859-15';
 564          }
 565  
 566              // get language
 567          list($language, $country) = explode('_', $locale);
 568          if (isset($this->lang_to_script[$language])) {
 569              $script = $this->lang_to_script[$language];
 570          }
 571  
 572          if (TYPO3_OS == 'WIN') {
 573              $cs = $this->script_to_charset_windows[$script] ? $this->script_to_charset_windows[$script] : 'windows-1252';
 574          } else {
 575              $cs = $this->script_to_charset_unix[$script] ? $this->script_to_charset_unix[$script] : 'utf-8';
 576          }
 577  
 578          return $cs;
 579      }
 580  
 581  
 582      /********************************************
 583       *
 584       * Charset Conversion functions
 585       *
 586       ********************************************/
 587  
 588      /**
 589       * Convert from one charset to another charset.
 590       *
 591       * @param    string        Input string
 592       * @param    string        From charset (the current charset of the string)
 593       * @param    string        To charset (the output charset wanted)
 594       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 595       * @return    string        Converted string
 596       * @see convArray()
 597       */
 598  	function conv($str, $fromCS, $toCS, $useEntityForNoChar = 0) {
 599          if ($fromCS == $toCS) {
 600              return $str;
 601          }
 602  
 603              // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
 604          if ($toCS == 'utf-8' || !$useEntityForNoChar) {
 605              switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
 606                  case 'mbstring':
 607                      $conv_str = mb_convert_encoding($str, $toCS, $fromCS);
 608                      if (FALSE !== $conv_str) {
 609                          return $conv_str;
 610                      } // returns FALSE for unsupported charsets
 611                      break;
 612  
 613                  case 'iconv':
 614                      $conv_str = iconv($fromCS, $toCS . '//TRANSLIT', $str);
 615                      if (FALSE !== $conv_str) {
 616                          return $conv_str;
 617                      }
 618                      break;
 619  
 620                  case 'recode':
 621                      $conv_str = recode_string($fromCS . '..' . $toCS, $str);
 622                      if (FALSE !== $conv_str) {
 623                          return $conv_str;
 624                      }
 625                      break;
 626              }
 627              // fallback to TYPO3 conversion
 628          }
 629  
 630          if ($fromCS != 'utf-8') {
 631              $str = $this->utf8_encode($str, $fromCS);
 632          }
 633          if ($toCS != 'utf-8') {
 634              $str = $this->utf8_decode($str, $toCS, $useEntityForNoChar);
 635          }
 636          return $str;
 637      }
 638  
 639      /**
 640       * Convert all elements in ARRAY with type string from one charset to another charset.
 641       * NOTICE: Array is passed by reference!
 642       *
 643       * @param    string        Input array, possibly multidimensional
 644       * @param    string        From charset (the current charset of the string)
 645       * @param    string        To charset (the output charset wanted)
 646       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 647       * @return    void
 648       * @see conv()
 649       */
 650  	function convArray(&$array, $fromCS, $toCS, $useEntityForNoChar = 0) {
 651          foreach ($array as $key => $value) {
 652              if (is_array($array[$key])) {
 653                  $this->convArray($array[$key], $fromCS, $toCS, $useEntityForNoChar);
 654              } elseif (is_string($array[$key])) {
 655                  $array[$key] = $this->conv($array[$key], $fromCS, $toCS, $useEntityForNoChar);
 656              }
 657          }
 658      }
 659  
 660      /**
 661       * Converts $str from $charset to UTF-8
 662       *
 663       * @param    string        String in local charset to convert to UTF-8
 664       * @param    string        Charset, lowercase. Must be found in csconvtbl/ folder.
 665       * @return    string        Output string, converted to UTF-8
 666       */
 667  	function utf8_encode($str, $charset) {
 668  
 669          if ($charset === 'utf-8') {
 670              return $str;
 671          }
 672  
 673              // Charset is case-insensitive.
 674          if ($this->initCharset($charset)) { // Parse conv. table if not already...
 675              $strLen = strlen($str);
 676              $outStr = '';
 677  
 678              for ($a = 0; $a < $strLen; $a++) { // Traverse each char in string.
 679                  $chr = substr($str, $a, 1);
 680                  $ord = ord($chr);
 681                  if (isset($this->twoByteSets[$charset])) { // If the charset has two bytes per char
 682                      $ord2 = ord($str{$a + 1});
 683                      $ord = $ord << 8 | $ord2; // assume big endian
 684  
 685                      if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 686                          $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
 687                      } else {
 688                          $outStr .= chr($this->noCharByteVal);
 689                      } // No char exists
 690                      $a++;
 691                  } elseif ($ord > 127) { // If char has value over 127 it's a multibyte char in UTF-8
 692                      if (isset($this->eucBasedSets[$charset])) { // EUC uses two-bytes above 127; we get both and advance pointer and make $ord a 16bit int.
 693                          if ($charset != 'shift_jis' || ($ord < 0xA0 || $ord > 0xDF)) { // Shift-JIS: chars between 160 and 223 are single byte
 694                              $a++;
 695                              $ord2 = ord(substr($str, $a, 1));
 696                              $ord = $ord * 256 + $ord2;
 697                          }
 698                      }
 699  
 700                      if (isset($this->parsedCharsets[$charset]['local'][$ord])) { // If the local char-number was found in parsed conv. table then we use that, otherwise 127 (no char?)
 701                          $outStr .= $this->parsedCharsets[$charset]['local'][$ord];
 702                      } else {
 703                          $outStr .= chr($this->noCharByteVal);
 704                      } // No char exists
 705                  } else {
 706                      $outStr .= $chr;
 707                  } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 708              }
 709              return $outStr;
 710          }
 711      }
 712  
 713      /**
 714       * Converts $str from UTF-8 to $charset
 715       *
 716       * @param    string        String in UTF-8 to convert to local charset
 717       * @param    string        Charset, lowercase. Must be found in csconvtbl/ folder.
 718       * @param    boolean        If set, then characters that are not available in the destination character set will be encoded as numeric entities
 719       * @return    string        Output string, converted to local charset
 720       */
 721  	function utf8_decode($str, $charset, $useEntityForNoChar = 0) {
 722  
 723          if ($charset === 'utf-8') {
 724              return $str;
 725          }
 726  
 727              // Charset is case-insensitive.
 728          if ($this->initCharset($charset)) { // Parse conv. table if not already...
 729              $strLen = strlen($str);
 730              $outStr = '';
 731              $buf = '';
 732              for ($a = 0, $i = 0; $a < $strLen; $a++, $i++) { // Traverse each char in UTF-8 string.
 733                  $chr = substr($str, $a, 1);
 734                  $ord = ord($chr);
 735                  if ($ord > 127) { // This means multibyte! (first byte!)
 736                      if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 737  
 738                          $buf = $chr; // Add first byte
 739                          for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
 740                              $ord = $ord << 1; // Shift it left and ...
 741                              if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 742                                  $a++; // Increase pointer...
 743                                  $buf .= substr($str, $a, 1); // ... and add the next char.
 744                              } else {
 745                                  break;
 746                              }
 747                          }
 748  
 749                          if (isset($this->parsedCharsets[$charset]['utf8'][$buf])) { // If the UTF-8 char-sequence is found then...
 750                              $mByte = $this->parsedCharsets[$charset]['utf8'][$buf]; // The local number
 751                              if ($mByte > 255) { // If the local number is greater than 255 we will need to split the byte (16bit word assumed) in two chars.
 752                                  $outStr .= chr(($mByte >> 8) & 255) . chr($mByte & 255);
 753                              } else {
 754                                  $outStr .= chr($mByte);
 755                              }
 756                          } elseif ($useEntityForNoChar) { // Create num entity:
 757                              $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
 758                          } else {
 759                              $outStr .= chr($this->noCharByteVal);
 760                          } // No char exists
 761                      } else {
 762                          $outStr .= chr($this->noCharByteVal);
 763                      } // No char exists (MIDDLE of MB sequence!)
 764                  } else {
 765                      $outStr .= $chr;
 766                  } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 767              }
 768              return $outStr;
 769          }
 770      }
 771  
 772      /**
 773       * Converts all chars > 127 to numeric entities.
 774       *
 775       * @param    string        Input string
 776       * @return    string        Output string
 777       */
 778  	function utf8_to_entities($str) {
 779          $strLen = strlen($str);
 780          $outStr = '';
 781          $buf = '';
 782          for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
 783              $chr = substr($str, $a, 1);
 784              $ord = ord($chr);
 785              if ($ord > 127) { // This means multibyte! (first byte!)
 786                  if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 787                      $buf = $chr; // Add first byte
 788                      for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
 789                          $ord = $ord << 1; // Shift it left and ...
 790                          if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 791                              $a++; // Increase pointer...
 792                              $buf .= substr($str, $a, 1); // ... and add the next char.
 793                          } else {
 794                              break;
 795                          }
 796                      }
 797  
 798                      $outStr .= '&#' . $this->utf8CharToUnumber($buf, 1) . ';';
 799                  } else {
 800                      $outStr .= chr($this->noCharByteVal);
 801                  } // No char exists (MIDDLE of MB sequence!)
 802              } else {
 803                  $outStr .= $chr;
 804              } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 805          }
 806  
 807          return $outStr;
 808      }
 809  
 810      /**
 811       * Converts numeric entities (UNICODE, eg. decimal (&#1234;) or hexadecimal (&#x1b;)) to UTF-8 multibyte chars
 812       *
 813       * @param    string        Input string, UTF-8
 814       * @param    boolean        If set, then all string-HTML entities (like &amp; or &pound; will be converted as well)
 815       * @return    string        Output string
 816       */
 817  	function entities_to_utf8($str, $alsoStdHtmlEnt = FALSE) {
 818          // Workaround for #39287: 3rd parameter for get_html_translation_table() was only added in PHP 5.3.4 and later
 819          // see http://php.net/manual/en/function.get-html-translation-table.php
 820          $applyPhpCompatibilityFix = version_compare(phpversion(), '5.3.4', '<');
 821  
 822          if ($alsoStdHtmlEnt) {
 823              if ($applyPhpCompatibilityFix === TRUE) {
 824                  $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT));
 825              } else {
 826                  $trans_tbl = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
 827              }
 828          }
 829  
 830          $token = md5(microtime());
 831          $parts = explode($token, preg_replace('/(&([#[:alnum:]]*);)/', $token . '$2}' . $token, $str));
 832          foreach ($parts as $k => $v) {
 833                  // only take every second element
 834              if ($k % 2 === 0) {
 835                  continue;
 836              }
 837  
 838              $position = 0;
 839              if (substr($v, $position, 1) == '#') { // Dec or hex entities:
 840                  $position++;
 841                  if (substr($v, $position, 1) == 'x') {
 842                      $v = hexdec(substr($v, ++$position));
 843                  } else {
 844                      $v = substr($v, $position);
 845                  }
 846                  $parts[$k] = $this->UnumberToChar($v);
 847              } elseif ($alsoStdHtmlEnt && isset($trans_tbl['&' . $v . ';'])) { // Other entities:
 848                  $v = $trans_tbl['&' . $v . ';'];
 849                  if ($applyPhpCompatibilityFix === TRUE) {
 850                      $v = $this->utf8_encode($v, 'iso-8859-1');
 851                  }
 852                  $parts[$k] = $v;
 853              } else { // No conversion:
 854                  $parts[$k] = '&' . $v . ';';
 855              }
 856          }
 857  
 858          return implode('', $parts);
 859      }
 860  
 861      /**
 862       * Converts all chars in the input UTF-8 string into integer numbers returned in an array
 863       *
 864       * @param    string        Input string, UTF-8
 865       * @param    boolean        If set, then all HTML entities (like &amp; or &pound; or &#123; or &#x3f5d;) will be detected as characters.
 866       * @param    boolean        If set, then instead of integer numbers the real UTF-8 char is returned.
 867       * @return    array        Output array with the char numbers
 868       */
 869  	function utf8_to_numberarray($str, $convEntities = 0, $retChar = 0) {
 870              // If entities must be registered as well...:
 871          if ($convEntities) {
 872              $str = $this->entities_to_utf8($str, 1);
 873          }
 874              // Do conversion:
 875          $strLen = strlen($str);
 876          $outArr = array();
 877          $buf = '';
 878          for ($a = 0; $a < $strLen; $a++) { // Traverse each char in UTF-8 string.
 879              $chr = substr($str, $a, 1);
 880              $ord = ord($chr);
 881              if ($ord > 127) { // This means multibyte! (first byte!)
 882                  if ($ord & 64) { // Since the first byte must have the 7th bit set we check that. Otherwise we might be in the middle of a byte sequence.
 883                      $buf = $chr; // Add first byte
 884                      for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
 885                          $ord = $ord << 1; // Shift it left and ...
 886                          if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 887                              $a++; // Increase pointer...
 888                              $buf .= substr($str, $a, 1); // ... and add the next char.
 889                          } else {
 890                              break;
 891                          }
 892                      }
 893  
 894                      $outArr[] = $retChar ? $buf : $this->utf8CharToUnumber($buf);
 895                  } else {
 896                      $outArr[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
 897                  } // No char exists (MIDDLE of MB sequence!)
 898              } else {
 899                  $outArr[] = $retChar ? chr($ord) : $ord;
 900              } // ... otherwise it's just ASCII 0-127 and one byte. Transparent
 901          }
 902  
 903          return $outArr;
 904      }
 905  
 906      /**
 907       * Converts a UNICODE number to a UTF-8 multibyte character
 908       * Algorithm based on script found at From: http://czyborra.com/utf/
 909       * Unit-tested by Kasper
 910       *
 911       * The binary representation of the character's integer value is thus simply spread across the bytes and the number of high bits set in the lead byte announces the number of bytes in the multibyte sequence:
 912       *
 913       *  bytes | bits | representation
 914       *      1 |    7 | 0vvvvvvv
 915       *      2 |   11 | 110vvvvv 10vvvvvv
 916       *      3 |   16 | 1110vvvv 10vvvvvv 10vvvvvv
 917       *      4 |   21 | 11110vvv 10vvvvvv 10vvvvvv 10vvvvvv
 918       *      5 |   26 | 111110vv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 919       *      6 |   31 | 1111110v 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv 10vvvvvv
 920       *
 921       * @param    integer        UNICODE integer
 922       * @return    string        UTF-8 multibyte character string
 923       * @see utf8CharToUnumber()
 924       */
 925  	function UnumberToChar($cbyte) {
 926          $str = '';
 927  
 928          if ($cbyte < 0x80) {
 929              $str .= chr($cbyte);
 930          } else {
 931              if ($cbyte < 0x800) {
 932                  $str .= chr(0xC0 | ($cbyte >> 6));
 933                  $str .= chr(0x80 | ($cbyte & 0x3F));
 934              } else {
 935                  if ($cbyte < 0x10000) {
 936                      $str .= chr(0xE0 | ($cbyte >> 12));
 937                      $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
 938                      $str .= chr(0x80 | ($cbyte & 0x3F));
 939                  } else {
 940                      if ($cbyte < 0x200000) {
 941                          $str .= chr(0xF0 | ($cbyte >> 18));
 942                          $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
 943                          $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
 944                          $str .= chr(0x80 | ($cbyte & 0x3F));
 945                      } else {
 946                          if ($cbyte < 0x4000000) {
 947                              $str .= chr(0xF8 | ($cbyte >> 24));
 948                              $str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
 949                              $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
 950                              $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
 951                              $str .= chr(0x80 | ($cbyte & 0x3F));
 952                          } else {
 953                              if ($cbyte < 0x80000000) {
 954                                  $str .= chr(0xFC | ($cbyte >> 30));
 955                                  $str .= chr(0x80 | (($cbyte >> 24) & 0x3F));
 956                                  $str .= chr(0x80 | (($cbyte >> 18) & 0x3F));
 957                                  $str .= chr(0x80 | (($cbyte >> 12) & 0x3F));
 958                                  $str .= chr(0x80 | (($cbyte >> 6) & 0x3F));
 959                                  $str .= chr(0x80 | ($cbyte & 0x3F));
 960                              } else { // Cannot express a 32-bit character in UTF-8
 961                                  $str .= chr($this->noCharByteVal);
 962                              }
 963                          }
 964                      }
 965                  }
 966              }
 967          }
 968          return $str;
 969      }
 970  
 971      /**
 972       * Converts a UTF-8 Multibyte character to a UNICODE number
 973       * Unit-tested by Kasper
 974       *
 975       * @param    string        UTF-8 multibyte character string
 976       * @param    boolean        If set, then a hex. number is returned.
 977       * @return    integer        UNICODE integer
 978       * @see UnumberToChar()
 979       */
 980  	function utf8CharToUnumber($str, $hex = 0) {
 981          $ord = ord(substr($str, 0, 1)); // First char
 982  
 983          if (($ord & 192) == 192) { // This verifyes that it IS a multi byte string
 984              $binBuf = '';
 985              for ($b = 0; $b < 8; $b++) { // for each byte in multibyte string...
 986                  $ord = $ord << 1; // Shift it left and ...
 987                  if ($ord & 128) { // ... and with 8th bit - if that is set, then there are still bytes in sequence.
 988                      $binBuf .= substr('00000000' . decbin(ord(substr($str, $b + 1, 1))), -6);
 989                  } else {
 990                      break;
 991                  }
 992              }
 993              $binBuf = substr('00000000' . decbin(ord(substr($str, 0, 1))), -(6 - $b)) . $binBuf;
 994  
 995              $int = bindec($binBuf);
 996          } else {
 997              $int = $ord;
 998          }
 999  
1000          return $hex ? 'x' . dechex($int) : $int;
1001      }
1002  
1003  
1004      /********************************************
1005       *
1006       * Init functions
1007       *
1008       ********************************************/
1009  
1010      /**
1011       * This will initialize a charset for use if it's defined in the PATH_t3lib.'csconvtbl/' folder
1012       * This function is automatically called by the conversion functions
1013       *
1014       * PLEASE SEE: http://www.unicode.org/Public/MAPPINGS/
1015       *
1016       * @param    string        The charset to be initialized. Use lowercase charset always (the charset must match exactly with a filename in csconvtbl/ folder ([charset].tbl)
1017       * @return    integer        Returns '1' if already loaded. Returns FALSE if charset conversion table was not found. Returns '2' if the charset conversion table was found and parsed.
1018       * @access private
1019       */
1020  	function initCharset($charset) {
1021              // Only process if the charset is not yet loaded:
1022          if (!is_array($this->parsedCharsets[$charset])) {
1023  
1024                  // Conversion table filename:
1025              $charsetConvTableFile = PATH_t3lib . 'csconvtbl/' . $charset . '.tbl';
1026  
1027                  // If the conversion table is found:
1028              if ($charset && t3lib_div::validPathStr($charsetConvTableFile) && @is_file($charsetConvTableFile)) {
1029                      // Cache file for charsets:
1030                      // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
1031                  $cacheFile = t3lib_div::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
1032                  if ($cacheFile && @is_file($cacheFile)) {
1033                      $this->parsedCharsets[$charset] = unserialize(t3lib_div::getUrl($cacheFile));
1034                  } else {
1035                          // Parse conversion table into lines:
1036                      $lines = t3lib_div::trimExplode(LF, t3lib_div::getUrl($charsetConvTableFile), 1);
1037                          // Initialize the internal variable holding the conv. table:
1038                      $this->parsedCharsets[$charset] = array('local' => array(), 'utf8' => array());
1039                          // traverse the lines:
1040                      $detectedType = '';
1041                      foreach ($lines as $value) {
1042                          if (trim($value) && substr($value, 0, 1) != '#') { // Comment line or blanks are ignored.
1043  
1044                                  // Detect type if not done yet: (Done on first real line)
1045                                  // The "whitespaced" type is on the syntax     "0x0A    0x000A    #LINE FEED"     while     "ms-token" is like         "B9 = U+00B9 : SUPERSCRIPT ONE"
1046                              if (!$detectedType) {
1047                                  $detectedType = preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value) ? 'whitespaced' : 'ms-token';
1048                              }
1049  
1050                              if ($detectedType == 'ms-token') {
1051                                  list($hexbyte, $utf8) = preg_split('/[=:]/', $value, 3);
1052                              } elseif ($detectedType == 'whitespaced') {
1053                                  $regA = array();
1054                                  preg_match('/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/', $value, $regA);
1055                                  $hexbyte = $regA[1];
1056                                  $utf8 = 'U+' . $regA[2];
1057                              }
1058                              $decval = hexdec(trim($hexbyte));
1059                              if ($decval > 127) {
1060                                  $utf8decval = hexdec(substr(trim($utf8), 2));
1061                                  $this->parsedCharsets[$charset]['local'][$decval] = $this->UnumberToChar($utf8decval);
1062                                  $this->parsedCharsets[$charset]['utf8'][$this->parsedCharsets[$charset]['local'][$decval]] = $decval;
1063                              }
1064                          }
1065                      }
1066                      if ($cacheFile) {
1067                          t3lib_div::writeFileToTypo3tempDir($cacheFile, serialize($this->parsedCharsets[$charset]));
1068                      }
1069                  }
1070                  return 2;
1071              } else {
1072                  return FALSE;
1073              }
1074          } else {
1075              return 1;
1076          }
1077      }
1078  
1079      /**
1080       * This function initializes all UTF-8 character data tables.
1081       *
1082       * PLEASE SEE: http://www.unicode.org/Public/UNIDATA/
1083       *
1084       * @param    string        Mode ("case", "ascii", ...)
1085       * @return    integer        Returns FALSE on error, a TRUE value on success: 1 table already loaded, 2, cached version, 3 table parsed (and cached).
1086       * @access private
1087       */
1088  	function initUnicodeData($mode = NULL) {
1089              // cache files
1090          $cacheFileCase = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1091          $cacheFileASCII = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1092  
1093              // Only process if the tables are not yet loaded
1094          switch ($mode) {
1095              case 'case':
1096                  if (is_array($this->caseFolding['utf-8'])) {
1097                      return 1;
1098                  }
1099  
1100                      // Use cached version if possible
1101                  if ($cacheFileCase && @is_file($cacheFileCase)) {
1102                      $this->caseFolding['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileCase));
1103                      return 2;
1104                  }
1105                  break;
1106  
1107              case 'ascii':
1108                  if (is_array($this->toASCII['utf-8'])) {
1109                      return 1;
1110                  }
1111  
1112                      // Use cached version if possible
1113                  if ($cacheFileASCII && @is_file($cacheFileASCII)) {
1114                      $this->toASCII['utf-8'] = unserialize(t3lib_div::getUrl($cacheFileASCII));
1115                      return 2;
1116                  }
1117                  break;
1118          }
1119  
1120              // process main Unicode data file
1121          $unicodeDataFile = PATH_t3lib . 'unidata/UnicodeData.txt';
1122          if (!(t3lib_div::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
1123              return FALSE;
1124          }
1125  
1126          $fh = fopen($unicodeDataFile, 'rb');
1127          if (!$fh) {
1128              return FALSE;
1129          }
1130  
1131              // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1132              // note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1133          $this->caseFolding['utf-8'] = array();
1134          $utf8CaseFolding =& $this->caseFolding['utf-8']; // a shorthand
1135          $utf8CaseFolding['toUpper'] = array();
1136          $utf8CaseFolding['toLower'] = array();
1137          $utf8CaseFolding['toTitle'] = array();
1138  
1139          $decomposition = array(); // array of temp. decompositions
1140          $mark = array(); // array of chars that are marks (eg. composing accents)
1141          $number = array(); // array of chars that are numbers (eg. digits)
1142          $omit = array(); // array of chars to be omitted (eg. Russian hard sign)
1143  
1144          while (!feof($fh)) {
1145              $line = fgets($fh, 4096);
1146                  // has a lot of info
1147              list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title,) = explode(';', rtrim($line));
1148  
1149              $ord = hexdec($char);
1150              if ($ord > 0xFFFF) {
1151                  break;
1152              } // only process the BMP
1153  
1154              $utf8_char = $this->UnumberToChar($ord);
1155  
1156              if ($upper) {
1157                  $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1158              }
1159              if ($lower) {
1160                  $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1161              }
1162                  // store "title" only when different from "upper" (only a few)
1163              if ($title && $title != $upper) {
1164                  $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1165              }
1166  
1167              switch ($cat{0}) {
1168                  case 'M': // mark (accent, umlaut, ...)
1169                      $mark["U+$char"] = 1;
1170                      break;
1171  
1172                  case 'N': // numeric value
1173                      if ($ord > 0x80 && $num != '') {
1174                          $number["U+$char"] = $num;
1175                      }
1176              }
1177  
1178                  // accented Latin letters without "official" decomposition
1179              $match = array();
1180              if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
1181                  $c = ord($match[2]);
1182                  if ($match[1] == 'SMALL') {
1183                      $c += 32;
1184                  }
1185  
1186                  $decomposition["U+$char"] = array(dechex($c));
1187                  continue;
1188              }
1189  
1190              $match = array();
1191              if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
1192                  switch ($match[1]) {
1193                      case '<circle>': // add parenthesis as circle replacement, eg (1)
1194                          $match[2] = '0028 ' . $match[2] . ' 0029';
1195                          break;
1196  
1197                      case '<square>': // add square brackets as square replacement, eg [1]
1198                          $match[2] = '005B ' . $match[2] . ' 005D';
1199                          break;
1200  
1201                      case '<compat>': // ignore multi char decompositions that start with a space
1202                          if (preg_match('/^0020 /', $match[2])) {
1203                              continue 2;
1204                          }
1205                          break;
1206  
1207                          // ignore Arabic and vertical layout presentation decomposition
1208                      case '<initial>':
1209                      case '<medial>':
1210                      case '<final>':
1211                      case '<isolated>':
1212                      case '<vertical>':
1213                          continue 2;
1214                  }
1215                  $decomposition["U+$char"] = explode(' ', $match[2]);
1216              }
1217          }
1218          fclose($fh);
1219  
1220              // process additional Unicode data for casing (allow folded characters to expand into a sequence)
1221          $specialCasingFile = PATH_t3lib . 'unidata/SpecialCasing.txt';
1222          if (t3lib_div::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
1223              $fh = fopen($specialCasingFile, 'rb');
1224              if ($fh) {
1225                  while (!feof($fh)) {
1226                      $line = fgets($fh, 4096);
1227                      if ($line{0} != '#' && trim($line) != '') {
1228  
1229                          list($char, $lower, $title, $upper, $cond) = t3lib_div::trimExplode(';', $line);
1230                          if ($cond == '' || $cond{0} == '#') {
1231                              $utf8_char = $this->UnumberToChar(hexdec($char));
1232                              if ($char != $lower) {
1233                                  $arr = explode(' ', $lower);
1234                                  for ($i = 0; isset($arr[$i]); $i++) {
1235                                      $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1236                                  }
1237                                  $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
1238                              }
1239                              if ($char != $title && $title != $upper) {
1240                                  $arr = explode(' ', $title);
1241                                  for ($i = 0; isset($arr[$i]); $i++) {
1242                                      $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1243                                  }
1244                                  $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
1245                              }
1246                              if ($char != $upper) {
1247                                  $arr = explode(' ', $upper);
1248                                  for ($i = 0; isset($arr[$i]); $i++) {
1249                                      $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1250                                  }
1251                                  $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
1252                              }
1253                          }
1254                      }
1255                  }
1256                  fclose($fh);
1257              }
1258          }
1259  
1260              // process custom decompositions
1261          $customTranslitFile = PATH_t3lib . 'unidata/Translit.txt';
1262          if (t3lib_div::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
1263              $fh = fopen($customTranslitFile, 'rb');
1264              if ($fh) {
1265                  while (!feof($fh)) {
1266                      $line = fgets($fh, 4096);
1267                      if ($line{0} != '#' && trim($line) != '') {
1268                          list($char, $translit) = t3lib_div::trimExplode(';', $line);
1269                          if (!$translit) {
1270                              $omit["U+$char"] = 1;
1271                          }
1272                          $decomposition["U+$char"] = explode(' ', $translit);
1273  
1274                      }
1275                  }
1276                  fclose($fh);
1277              }
1278          }
1279  
1280              // decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1281          foreach ($decomposition as $from => $to) {
1282              $code_decomp = array();
1283  
1284              while ($code_value = array_shift($to)) {
1285                  if (isset($decomposition["U+$code_value"])) { // do recursive decomposition
1286                      foreach (array_reverse($decomposition["U+$code_value"]) as $cv) {
1287                          array_unshift($to, $cv);
1288                      }
1289                  } elseif (!isset($mark["U+$code_value"])) { // remove mark
1290                      array_push($code_decomp, $code_value);
1291                  }
1292              }
1293              if (count($code_decomp) || isset($omit[$from])) {
1294                  $decomposition[$from] = $code_decomp;
1295              } else {
1296                  unset($decomposition[$from]);
1297              }
1298          }
1299  
1300              // create ascii only mapping
1301          $this->toASCII['utf-8'] = array();
1302          $ascii =& $this->toASCII['utf-8'];
1303  
1304          foreach ($decomposition as $from => $to) {
1305              $code_decomp = array();
1306              while ($code_value = array_shift($to)) {
1307                  $ord = hexdec($code_value);
1308                  if ($ord > 127) {
1309                      continue 2;
1310                  } // skip decompositions containing non-ASCII chars
1311                  else
1312                  {
1313                      array_push($code_decomp, chr($ord));
1314                  }
1315              }
1316              $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
1317          }
1318  
1319              // add numeric decompositions
1320          foreach ($number as $from => $to) {
1321              $utf8_char = $this->UnumberToChar(hexdec($from));
1322              if (!isset($ascii[$utf8_char])) {
1323                  $ascii[$utf8_char] = $to;
1324              }
1325          }
1326  
1327          if ($cacheFileCase) {
1328              t3lib_div::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1329          }
1330  
1331          if ($cacheFileASCII) {
1332              t3lib_div::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1333          }
1334  
1335          return 3;
1336      }
1337  
/**
 * Prepares the case folding table of a charset other than UTF-8.
 * The case folding functions call this on demand.
 *
 * The table is derived from the UTF-8 folding table: every character of the
 * charset is looked up in the UTF-8 table and the result is converted back
 * into the charset. The plain ASCII letters are added afterwards.
 *
 * @param string $charset Charset for which to initialize case folding.
 * @return integer FALSE on error, otherwise a TRUE value: 1 = table already loaded, 2 = cached version used, 3 = table built (and cached).
 * @access private
 */
function initCaseFolding($charset) {
    // Nothing to do once the table is in memory
    if (is_array($this->caseFolding[$charset])) {
        return 1;
    }

    // Prefer the serialized table from typo3temp/cs/ if there is one
    $tableFile = t3lib_div::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($tableFile && @is_file($tableFile)) {
        $this->caseFolding[$charset] = unserialize(t3lib_div::getUrl($tableFile));
        return 2;
    }

    // Both the charset <-> UTF-8 table and the UTF-8 case folding data are required
    if (!$this->initCharset($charset) || !$this->initUnicodeData('case')) {
        return FALSE;
    }

    $unmappable = chr($this->noCharByteVal);
    $directions = array('toUpper', 'toLower', 'toTitle');
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
        // Convert back into the charset (chr() of the numeric value would break multi-byte characters)
        $localChar = $this->utf8_decode($utf8Char, $charset);

        foreach ($directions as $direction) {
            $folded = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8Char], $charset);
            // Keep only mappings that can be represented in the charset
            if ($folded != '' && $folded != $unmappable) {
                $this->caseFolding[$charset][$direction][$localChar] = $folded;
            }
        }
    }

    // Add the ASCII letters: upper and lower case are 32 code points apart
    for ($lower = ord('a'), $upper = ord('A'); $lower <= ord('z'); $lower++, $upper++) {
        $this->caseFolding[$charset]['toUpper'][chr($lower)] = chr($lower - 32);
        $this->caseFolding[$charset]['toLower'][chr($upper)] = chr($upper + 32);
    }

    if ($tableFile) {
        t3lib_div::writeFileToTypo3tempDir($tableFile, serialize($this->caseFolding[$charset]));
    }

    return 3;
}
1407  
/**
 * Prepares the to-ASCII transliteration table of a charset other than UTF-8.
 * The ASCII transliteration functions call this on demand.
 *
 * The table is derived from the UTF-8 transliteration table by converting
 * every character of the charset that has a UTF-8 entry back into the charset.
 *
 * @param string $charset Charset for which to initialize conversion.
 * @return integer FALSE on error, otherwise a TRUE value: 1 = table already loaded, 2 = cached version used, 3 = table built (and cached).
 * @access private
 */
function initToASCII($charset) {
    // Nothing to do once the transliteration table is in memory
    if (is_array($this->toASCII[$charset])) {
        return 1;
    }

    // Prefer the serialized table from typo3temp/cs/ if there is one
    $tableFile = t3lib_div::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($tableFile && @is_file($tableFile)) {
        $this->toASCII[$charset] = unserialize(t3lib_div::getUrl($tableFile));
        return 2;
    }

    // Both the charset <-> UTF-8 table and the UTF-8 transliteration data are required
    if (!$this->initCharset($charset) || !$this->initUnicodeData('ascii')) {
        return FALSE;
    }

    foreach ($this->parsedCharsets[$charset]['local'] as $utf8Char) {
        // Convert back into the charset (chr() of the numeric value would break multi-byte characters)
        $localChar = $this->utf8_decode($utf8Char, $charset);

        if (!isset($this->toASCII['utf-8'][$utf8Char])) {
            continue;
        }
        $this->toASCII[$charset][$localChar] = $this->toASCII['utf-8'][$utf8Char];
    }

    if ($tableFile) {
        t3lib_div::writeFileToTypo3tempDir($tableFile, serialize($this->toASCII[$charset]));
    }

    return 3;
}
1455  
1456  
1457      /********************************************
1458       *
1459       * String operation functions
1460       *
1461       ********************************************/
1462  
/**
 * Returns a part of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * Depending on $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils'] the work is done by
 * mbstring, by iconv or by the built-in implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL or omitted means "up to the end of the string"
 * @return string The substring
 * @see substr(), mb_substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function substr($charset, $string, $start, $len = NULL) {
    if ($len === 0 || $string === '') {
        return '';
    }

    $backend = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';

    if ($backend == 'mbstring') {
        if ($len == NULL) {
            // The length cannot be omitted when the charset is passed as an
            // argument, so the internal encoding is switched temporarily.
            $enc = mb_internal_encoding();
            mb_internal_encoding($charset);
            $str = mb_substr($string, $start);
            mb_internal_encoding($enc);

            return $str;
        }
        return mb_substr($string, $start, $len, $charset);
    }

    if ($backend == 'iconv') {
        if ($len == NULL) {
            // Same workaround as for mbstring
            $enc = iconv_get_encoding('internal_encoding');
            iconv_set_encoding('internal_encoding', $charset);
            $str = iconv_substr($string, $start);
            iconv_set_encoding('internal_encoding', $enc);

            return $str;
        }
        return iconv_substr($string, $start, $len, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }

    // Fixed-width charsets: character positions scale to byte positions.
    // Everything else is treated as a single-byte encoding.
    $bytesPerChar = 1;
    if (!empty($this->twoByteSets[$charset])) {
        $bytesPerChar = 2;
    } elseif (!empty($this->fourByteSets[$charset])) {
        $bytesPerChar = 4;
    }

    // An omitted length must not be multiplied: NULL * n is 0, which would
    // yield an empty string instead of the remainder of the string.
    if ($len === NULL) {
        return substr($string, $start * $bytesPerChar);
    }
    return substr($string, $start * $bytesPerChar, $len * $bytesPerChar);
}
1519  
/**
 * Counts the characters of a string.
 * Unit-tested by Kasper (single byte charsets only)
 *
 * mbstring or iconv is used when configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']; otherwise the count is made by
 * the implementations of this class.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strlen($charset, $string) {
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];

    if ($backend == 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($backend == 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strlen($string, $charset);
    }
    if ($this->twoByteSets[$charset]) {
        // fixed width: two bytes per character
        return strlen($string) / 2;
    }
    if ($this->fourByteSets[$charset]) {
        // fixed width: four bytes per character
        return strlen($string) / 4;
    }

    // Anything else is handled as a single-byte encoding
    return strlen($string);
}
1547  
/**
 * Crops a string with the help of mb_strlen() and mb_substr().
 *
 * A positive length keeps the beginning of the string and appends the crop
 * signifier, a negative length keeps the end and prepends it. The string is
 * returned untouched when the length is 0 or the string is not longer than
 * the requested number of characters.
 *
 * @param string $charset The character set
 * @param string $string String to be cropped
 * @param integer $len Crop length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see mb_strlen(), mb_substr()
 */
protected function cropMbstring($charset, $string, $len, $crop = '') {
    if (intval($len) === 0) {
        return $string;
    }

    $charCount = mb_strlen($string, $charset);
    if ($charCount <= abs($len)) {
        return $string;
    }

    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $charCount, $charset);
}
1571  
/**
 * Truncates a string and pre-/appends a string.
 * Unit tested by Kasper
 *
 * A positive length keeps the beginning of the string and appends the crop
 * signifier, a negative length keeps the end and prepends it. The string is
 * returned unchanged when the length is 0 or reaches beyond the string.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len Length (in characters)
 * @param string $crop Crop signifier
 * @return string The shortened string
 * @see substr(), mb_strimwidth()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function crop($charset, $string, $len, $crop = '') {
    if (isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        && $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring'
    ) {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }

    if (intval($len) == 0) {
        return $string;
    }

    // Translate the character length into a byte position
    if ($charset == 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif (!empty($this->eucBasedSets[$charset])) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = FALSE;
        }
    }

    // $len outside actual string length
    if ($i === FALSE) {
        return $string;
    }
    $i = (int) $i;

    // Crop only if there really is something to cut off. isset() replaces the
    // former strlen($string{$i}) test: curly-brace offsets no longer exist in
    // PHP 8, and isset() does not warn about positions past the end.
    if ($len > 0) {
        if (isset($string[$i])) {
            return substr($string, 0, $i) . $crop;
        }
    } elseif ($i > 0 && isset($string[$i - 1])) {
        // $i > 0 keeps a negative offset from silently addressing the end of the string
        return $crop . substr($string, $i);
    }

    return $string;
}
1634  
/**
 * Cuts a string short at a given byte length without splitting a character.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function strtrunc($charset, $string, $len) {
    if ($len <= 0) {
        return '';
    }

    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset == 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }

    if ($this->twoByteSets[$charset]) {
        // never cut at an odd position
        $len -= $len % 2;
    } elseif ($this->fourByteSets[$charset]) {
        // realign to a position divisible by four
        $len -= $len % 4;
    }

    // Anything else is handled as a single-byte encoding
    return substr($string, 0, $len);
}
1667  
/**
 * Converts all characters of a string to upper or lower case.
 * Unlike strtolower() and strtoupper() this method is locale independent.
 * Note that the string length may change!
 * eg. lower case German "ß" (sharp S) becomes upper case "SS"
 * Unit-tested by Kasper
 * Real case folding is language dependent, this method ignores this fact.
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert case for
 * @param string $case Case keyword: "toLower" means lowercase conversion, anything else is uppercase (use "toUpper")
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 * @see strtolower(), strtoupper()
 */
function conv_case($charset, $string, $case) {
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] == 'mbstring') {
        return $case == 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }

    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }

    // Anything else is handled as a single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1701  
/**
 * Charset-aware counterpart of lcfirst()/ucfirst(): converts the case of the
 * first character only and leaves the rest of the string as it is.
 *
 * @param string $charset Character set of string
 * @param string $string Input string
 * @param string $case Case keyword as understood by conv_case()
 * @return string The string with its first character converted
 * @see t3lib_cs::conv_case()
 */
public function convCaseFirst($charset, $string, $case) {
    $convertedInitial = $this->conv_case(
        $charset,
        $this->substr($charset, $string, 0, 1),
        $case
    );

    return $convertedInitial . $this->substr($charset, $string, 1);
}
1717  
/**
 * Replaces special characters (like æøåÆØÅ, umlauts etc.) by their ASCII
 * equivalents, which usually consist of two characters (æ => ae etc.).
 *
 * @param string $charset Character set of string
 * @param string $string Input string to convert
 * @return string The converted string
 */
function specCharsToASCII($charset, $string) {
    if ($charset == 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }

    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }

    // Anything else is handled as a single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1737  
1738  
/**
 * Converts the language codes sent by the client (usually HTTP_ACCEPT_LANGUAGE)
 * into a TYPO3-readable language code.
 *
 * The codes are tried in descending order of their quality value. Each code is
 * first looked up as given and then, if that fails, without its country part.
 *
 * @param string $languageCodesList List of language codes, something like 'de,en-us;q=0.9,de-de;q=0.7,es-cl;q=0.6,en;q=0.4,es;q=0.3,zh;q=0.1'
 *        see http://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.4
 * @return string A preferred language that TYPO3 supports, or "default" if none found
 * @author Benjamin Mack (benni.typo3.org)
 */
public function getPreferredClientLanguage($languageCodesList) {
    $selectedLanguage = 'default';

    // Start with all languages whose TYPO3 code is the same as the ISO code
    $allLanguageCodes = array();
    foreach (array_keys($this->charSetArray) as $typo3Lang) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }

    // Languages whose TYPO3 code differs from the ISO code, or which need the
    // country part, overwrite the entry made above ("xx_YY" becomes "xx-YY")
    foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
    }

    // Turn the client codes into the keys, because they are looked up with isset() below
    $allLanguageCodes = array_flip($allLanguageCodes);

    // Collect the quality value of every requested language (1.0 if none is given)
    $sortedPreferredLanguages = array();
    foreach (t3lib_div::trimExplode(',', $languageCodesList) as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== FALSE) {
            list($preferredLanguage, $quality) = explode(';q=', $preferredLanguage, 2);
        }
        $sortedPreferredLanguages[$preferredLanguage] = $quality;
    }

    // Walk through the languages, highest quality first
    arsort($sortedPreferredLanguages, SORT_NUMERIC);
    foreach (array_keys($sortedPreferredLanguages) as $preferredLanguage) {
        if (isset($allLanguageCodes[$preferredLanguage])) {
            $selectedLanguage = $allLanguageCodes[$preferredLanguage];
            break;
        }

        // Retry without the country part. Only the first element is used, so a
        // code without "-" no longer triggers an undefined offset notice.
        $codeParts = explode('-', (string) $preferredLanguage, 2);
        $languagePart = $codeParts[0];
        if (isset($allLanguageCodes[$languagePart])) {
            $selectedLanguage = $allLanguageCodes[$languagePart];
            break;
        }
    }

    if (!$selectedLanguage || $selectedLanguage == 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1799  
1800  
1801      /********************************************
1802       *
1803       * Internal string operation functions
1804       *
1805       ********************************************/
1806  
/**
 * Maps all characters of a string in a single byte charset.
 *
 * The string is returned unchanged if the mode is unknown or the mapping
 * table of the charset cannot be initialized.
 *
 * @param string $str The string
 * @param string $charset The charset
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function sb_char_mapping($str, $charset, $mode, $opt = '') {
    switch ($mode) {
        case 'case':
            if (!$this->initCaseFolding($charset)) {
                return $str;
            } // do nothing
            $map =& $this->caseFolding[$charset][$opt];
            break;

        case 'ascii':
            if (!$this->initToASCII($charset)) {
                return $str;
            } // do nothing
            $map =& $this->toASCII[$charset];
            break;

        default:
            return $str;
    }

    // Walk the string byte by byte. The explicit length check replaces the
    // former strlen($str{$i}) test, as curly-brace offsets no longer exist in
    // PHP 8 and reading past the end raises a warning.
    $str = (string) $str;
    $byteLength = strlen($str);
    $out = '';
    for ($i = 0; $i < $byteLength; $i++) {
        $c = $str[$i];
        $out .= isset($map[$c]) ? $map[$c] : $c;
    }

    return $out;
}
1849  
1850  
1851      /********************************************
1852       *
1853       * Internal UTF-8 string operation functions
1854       *
1855       ********************************************/
1856  
/**
 * Returns a part of a UTF-8 string.
 * Unit-tested by Kasper and works 100% like substr() / mb_substr() for full range of $start/$len
 *
 * @param string $str UTF-8 string
 * @param integer $start Start position (character position)
 * @param integer $len Length (in characters); NULL or omitted means "up to the end of the string"
 * @return string The substring, FALSE if a positive $start lies outside the string
 * @see substr()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_substr($str, $start, $len = NULL) {
    // A length of 0 (integer or string) always gives an empty string. NULL is
    // excluded explicitly so that it is never handed to a string function.
    if ($len !== NULL && (string) $len === '0') {
        return '';
    }

    $byte_start = $this->utf8_char2byte_pos($str, $start);
    if ($byte_start === FALSE) {
        if ($start > 0) {
            return FALSE; // $start outside string length
        }
        // A start at or before the beginning of the string means "from the
        // first byte". Use an explicit 0 instead of passing FALSE to substr().
        $byte_start = 0;
    }

    $str = substr($str, $byte_start);

    if ($len == NULL) {
        return $str;
    }

    $byte_end = $this->utf8_char2byte_pos($str, $len);
    if ($byte_end === FALSE) {
        // $len outside actual string length: a negative length that exceeds
        // the string gives an empty string, a positive one the whole rest.
        return $len < 0 ? '' : $str;
    }

    return substr($str, 0, $byte_end);
}
1899  
/**
 * Counts the number of characters of a string in UTF-8.
 * Unit-tested by Kasper and works 100% like strlen() / mb_strlen()
 *
 * Every byte that is not a continuation byte (10xxxxxx) is counted, i.e.
 * single-byte characters and the starting bytes of multi-byte characters.
 *
 * @param string $str UTF-8 multibyte character string
 * @return integer The number of characters
 * @see strlen()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strlen($str) {
    $str = (string) $str;
    $byteLength = strlen($str);

    // The explicit length check replaces the former strlen($str{$i}) test, as
    // curly-brace offsets no longer exist in PHP 8.
    $n = 0;
    for ($i = 0; $i < $byteLength; $i++) {
        if ((ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }
    return $n;
}
1924  
/**
 * Truncates a string in UTF-8 short at a given byte length.
 * A multi-byte character that does not fit completely into the length is
 * dropped as a whole.
 *
 * @param string $str UTF-8 multibyte character string
 * @param integer $len The byte length
 * @return string The shortened string
 * @see mb_strcut()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strtrunc($str, $len) {
    $str = (string) $str;

    // No byte to inspect at position $len - 1: plain substr() is all that is left
    if ($len <= 0 || $len > strlen($str)) {
        return substr($str, 0, $len);
    }

    $i = $len - 1;
    if (ord($str[$i]) & 0x80) { // part of a multibyte sequence
        // Walk back over the continuation bytes (10xxxxxx) to the first byte.
        // Index 0 is included: a character starting there may still fit.
        while ($i >= 0 && (ord($str[$i]) & 0xC0) == 0x80) {
            $i--;
        }
        if ($i < 0) {
            return '';
        } // sanity check: nothing but continuation bytes

        // The number of leading 1-bits of the first byte is the number of bytes of the character
        for ($bc = 0, $mbs = ord($str[$i]); $mbs & 0x80; $mbs = $mbs << 1) {
            $bc++;
        }
        if ($bc + $i > $len) {
            // the character would be split: cut in front of it
            return substr($str, 0, $i);
        }
        // fallthru: multibyte char fits into length
    }
    return substr($str, 0, $len);
}
1955  
/**
 * Finds the position of the first occurrence of a string; both arguments are in UTF-8.
 *
 * mbstring or iconv is used when configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']; otherwise the search is done on
 * byte level and the result is translated into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param integer $offset Position to start the search
 * @return integer The character position, FALSE if the needle is not found or the offset lies beyond the string
 * @see strpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strpos($haystack, $needle, $offset = 0) {
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($backend == 'mbstring') {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($backend == 'iconv') {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }

    // Translate the character offset into a byte offset for strpos()
    $startByte = $this->utf8_char2byte_pos($haystack, $offset);
    if ($startByte === FALSE) {
        // offset beyond string length
        return FALSE;
    }

    $foundByte = strpos($haystack, $needle, $startByte);

    // FALSE means the needle was not found; otherwise translate back into a character position
    return $foundByte === FALSE
        ? FALSE
        : $this->utf8_byte2char_pos($haystack, $foundByte);
}
1985  
/**
 * Finds the position of the last occurrence of a char in a string; both arguments are in UTF-8.
 *
 * mbstring or iconv is used when configured in
 * $TYPO3_CONF_VARS['SYS']['t3lib_cs_utils']; otherwise the search is done on
 * byte level and the result is translated into a character position.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 character to search for (single character)
 * @return integer The character position, FALSE if the needle is not found
 * @see strrpos()
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_strrpos($haystack, $needle) {
    $backend = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';

    if ($backend == 'mbstring') {
        // The third parameter of mb_strrpos() is the offset and the encoding
        // comes fourth. Passing the encoding third is no longer accepted by
        // PHP 8.
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    } elseif ($backend == 'iconv') {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }

    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === FALSE) {
        return FALSE;
    } // needle not found

    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
2009  
/**
 * Translates a character position into an 'absolute' byte position.
 * Unit tested by Kasper.
 *
 * For a position >= 0 the string is scanned forwards from its first byte, for
 * a negative position backwards from its last byte. FALSE is returned when
 * the scan runs off either end of the string.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Character position (negative values start from the end)
 * @return integer Byte position, FALSE if the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char2byte_pos($str, $pos) {
    $str = (string) $str;
    $byteLength = strlen($str);

    $n = 0; // number of characters found
    $p = abs($pos); // number of characters wanted

    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $byteLength - 1;
        $d = -1;
    }

    // The explicit range check replaces the former strlen($str{$i}) test.
    // Curly-brace offsets no longer exist in PHP 8, and since PHP 7.1 a
    // negative offset addresses bytes from the end of the string instead of
    // being empty, so the backward scan would not stop at the first byte.
    for (; $i >= 0 && $i < $byteLength && $n < $p; $i += $d) {
        // count single-byte characters (0xxxxxxx) and multi-byte starting bytes (11xxxxxx)
        if ((ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $byteLength) {
        return FALSE;
    } // offset beyond string length

    if ($pos >= 0) {
        // skip trailing multi-byte data bytes
        while ($i < $byteLength && (ord($str[$i]) & 0xC0) == 0x80) {
            $i++;
        }
    } else {
        // correct offset: the loop stopped one byte in front of the character
        $i++;
    }

    return $i;
}
2058  
/**
 * Translates an 'absolute' byte position into a character position.
 * Unit tested by Kasper.
 *
 * The position directly behind the last byte is accepted and yields the
 * number of characters of the string.
 *
 * @param string $str UTF-8 string
 * @param integer $pos Byte position
 * @return integer Character position, FALSE if the string is empty or the position lies outside the string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_byte2char_pos($str, $pos) {
    $str = (string) $str;
    $pos = (int) $pos;
    $byteLength = strlen($str);

    // Reject positions outside the string up front. The former check tested
    // byte 0 after the loop and therefore only ever caught an empty string.
    if ($byteLength === 0 || $pos < 0 || $pos > $byteLength) {
        return FALSE;
    } // offset beyond string length

    $n = 0; // number of characters
    for ($i = $pos; $i > 0; $i--) {
        // Count single-byte characters (0xxxxxxx) and multi-byte starting
        // bytes (11xxxxxx). The position behind the last byte counts like the
        // start of a character.
        if ($i >= $byteLength || (ord($str[$i]) & 0xC0) != 0x80) {
            $n++;
        }
    }

    return $n;
}
2087  
/**
 * Maps all characters of an UTF-8 string.
 *
 * The string is returned unchanged if the mode is unknown or the Unicode data
 * for the mode cannot be initialized.
 *
 * @param string $str UTF-8 string
 * @param string $mode Mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
 * @param string $opt For mode 'case': conversion 'toLower' or 'toUpper'
 * @return string The converted string
 * @author Martin Kutschker <martin.t.kutschker@blackbox.net>
 */
function utf8_char_mapping($str, $mode, $opt = '') {
    if (!$this->initUnicodeData($mode)) {
        return $str;
    } // do nothing

    switch ($mode) {
        case 'case':
            $map =& $this->caseFolding['utf-8'][$opt];
            break;

        case 'ascii':
            $map =& $this->toASCII['utf-8'];
            break;

        default:
            return $str;
    }

    // The explicit length check replaces the former strlen($str{$i}) test, as
    // curly-brace offsets no longer exist in PHP 8.
    $str = (string) $str;
    $byteLength = strlen($str);
    $out = '';
    for ($i = 0; $i < $byteLength; $i++) {
        $c = ord($str[$i]);
        if (($c & 0xC0) == 0xC0) { // multi-byte starting byte (11xxxxxx)
            // the number of leading 1-bits is the number of bytes of the character
            for ($bc = 0; $c & 0x80; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte character (0xxxxxxx). A stray continuation byte
            // (10xxxxxx, malformed input) is passed through as it is instead
            // of emitting the previously mapped character a second time.
            $mbc = $str[$i];
        }

        $out .= isset($map[$mbc]) ? $map[$mbc] : $mbc;
    }

    return $out;
}
2139  
2140  
2141      /********************************************
2142       *
2143       * Internal EUC string operation functions
2144       *
2145       * Extended Unix Code:
2146       *  ASCII compatible 7bit single bytes chars
2147       *  8bit two byte chars
2148       *
2149       * Shift-JIS is treated as a special case.
2150       *
2151       ********************************************/
2152  
2153      /**
2154       * Cuts a string in the EUC charset family short at a given byte length.
2155       *
2156       * @param    string        EUC multibyte character string
2157       * @param    integer        the byte length
2158       * @param    string        the charset
2159       * @return    string        the shortened string
2160       * @see mb_strcut()
2161       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2162       */
2163  	function euc_strtrunc($str, $len, $charset) {
2164          $sjis = ($charset == 'shift_jis');
2165          for ($i = 0; strlen($str{$i}) && $i < $len; $i++) {
2166              $c = ord($str{$i});
2167              if ($sjis) {
2168                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2169                      $i++;
2170                  } // advance a double-byte char
2171              }
2172              else {
2173                  if ($c >= 0x80) {
2174                      $i++;
2175                  } // advance a double-byte char
2176              }
2177          }
2178          if (!strlen($str{$i})) {
2179              return $str;
2180          } // string shorter than supplied length
2181  
2182          if ($i > $len) {
2183              return substr($str, 0, $len - 1); // we ended on a first byte
2184          } else {
2185              return substr($str, 0, $len);
2186          }
2187      }
2188  
2189      /**
2190       * Returns a part of a string in the EUC charset family.
2191       *
2192       * @param    string        EUC multibyte character string
2193       * @param    integer        start position (character position)
2194       * @param    string        the charset
2195       * @param    integer        length (in characters)
2196       * @return    string        the substring
2197       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2198       */
2199  	function euc_substr($str, $start, $charset, $len = NULL) {
2200          $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
2201          if ($byte_start === FALSE) {
2202              return FALSE;
2203          } // $start outside string length
2204  
2205          $str = substr($str, $byte_start);
2206  
2207          if ($len != NULL) {
2208              $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
2209              if ($byte_end === FALSE) // $len outside actual string length
2210              {
2211                  return $str;
2212              }
2213              else
2214              {
2215                  return substr($str, 0, $byte_end);
2216              }
2217          }
2218          else    {
2219              return $str;
2220          }
2221      }
2222  
2223      /**
2224       * Counts the number of characters of a string in the EUC charset family.
2225       *
2226       * @param    string        EUC multibyte character string
2227       * @param    string        the charset
2228       * @return    integer        the number of characters
2229       * @see strlen()
2230       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2231       */
2232  	function euc_strlen($str, $charset) {
2233          $sjis = ($charset == 'shift_jis');
2234          $n = 0;
2235          for ($i = 0; strlen($str{$i}); $i++) {
2236              $c = ord($str{$i});
2237              if ($sjis) {
2238                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2239                      $i++;
2240                  } // advance a double-byte char
2241              }
2242              else {
2243                  if ($c >= 0x80) {
2244                      $i++;
2245                  } // advance a double-byte char
2246              }
2247  
2248              $n++;
2249          }
2250  
2251          return $n;
2252      }
2253  
2254      /**
2255       * Translates a character position into an 'absolute' byte position.
2256       *
2257       * @param    string        EUC multibyte character string
2258       * @param    integer        character position (negative values start from the end)
2259       * @param    string        the charset
2260       * @return    integer        byte position
2261       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2262       */
2263  	function euc_char2byte_pos($str, $pos, $charset) {
2264          $sjis = ($charset == 'shift_jis');
2265          $n = 0; // number of characters seen
2266          $p = abs($pos); // number of characters wanted
2267  
2268          if ($pos >= 0) {
2269              $i = 0;
2270              $d = 1;
2271          } else {
2272              $i = strlen($str) - 1;
2273              $d = -1;
2274          }
2275  
2276          for (; strlen($str{$i}) && $n < $p; $i += $d) {
2277              $c = ord($str{$i});
2278              if ($sjis) {
2279                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) {
2280                      $i += $d;
2281                  } // advance a double-byte char
2282              }
2283              else {
2284                  if ($c >= 0x80) {
2285                      $i += $d;
2286                  } // advance a double-byte char
2287              }
2288  
2289              $n++;
2290          }
2291          if (!strlen($str{$i})) {
2292              return FALSE;
2293          } // offset beyond string length
2294  
2295          if ($pos < 0) {
2296              $i++;
2297          } // correct offset
2298  
2299          return $i;
2300      }
2301  
2302      /**
2303       * Maps all characters of a string in the EUC charset family.
2304       *
2305       * @param    string        EUC multibyte character string
2306       * @param    string        the charset
2307       * @param    string        mode: 'case' (case folding) or 'ascii' (ASCII transliteration)
2308       * @param    string        'case': conversion 'toLower' or 'toUpper'
2309       * @return    string        the converted string
2310       * @author    Martin Kutschker <martin.t.kutschker@blackbox.net>
2311       */
2312  	function euc_char_mapping($str, $charset, $mode, $opt = '') {
2313          switch ($mode) {
2314              case 'case':
2315                  if (!$this->initCaseFolding($charset)) {
2316                      return $str;
2317                  } // do nothing
2318                  $map =& $this->caseFolding[$charset][$opt];
2319                  break;
2320  
2321              case 'ascii':
2322                  if (!$this->initToASCII($charset)) {
2323                      return $str;
2324                  } // do nothing
2325                  $map =& $this->toASCII[$charset];
2326                  break;
2327  
2328              default:
2329                  return $str;
2330          }
2331  
2332          $sjis = ($charset == 'shift_jis');
2333          $out = '';
2334          for ($i = 0; strlen($str{$i}); $i++) {
2335              $mbc = $str{$i};
2336              $c = ord($mbc);
2337  
2338              if ($sjis) {
2339                  if (($c >= 0x80 && $c < 0xA0) || ($c >= 0xE0)) { // a double-byte char
2340                      $mbc = substr($str, $i, 2);
2341                      $i++;
2342                  }
2343              }
2344              else {
2345                  if ($c >= 0x80) { // a double-byte char
2346                      $mbc = substr($str, $i, 2);
2347                      $i++;
2348                  }
2349              }
2350  
2351              if (isset($map[$mbc])) {
2352                  $out .= $map[$mbc];
2353              } else {
2354                  $out .= $mbc;
2355              }
2356          }
2357  
2358          return $out;
2359      }
2360  
2361  }
2362  
// Load the file registered under the XCLASS key of this class, if one is configured.
if (defined('TYPO3_MODE')) {
    if (isset($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php'])) {
        include_once($GLOBALS['TYPO3_CONF_VARS'][TYPO3_MODE]['XCLASS']['t3lib/class.t3lib_cs.php']);
    }
}
2366  
2367  ?>
PHP Cross Reference of Unnamed Project

/lib/typo3/ -> class.t3lib_cs.php (source)