TYPO3  7.6
CharsetConverter.php
Go to the documentation of this file.
1 <?php
2 namespace TYPO3\CMS\Core\Charset;
3 
4 /*
5  * This file is part of the TYPO3 CMS project.
6  *
7  * It is free software; you can redistribute it and/or modify it under
8  * the terms of the GNU General Public License, either version 2
9  * of the License, or any later version.
10  *
11  * For the full copyright and license information, please read the
12  * LICENSE.txt file that was distributed with this source code.
13  *
14  * The TYPO3 project - inspiring people to share!
15  */
16 
20 
54 {
/**
 * Locales helper; assigned in the constructor through GeneralUtility::makeInstance().
 *
 * @var Locales
 */
protected $locales;

/**
 * Byte value emitted in place of a character that cannot be converted (63 = ASCII "?").
 *
 * @var int
 */
public $noCharByteVal = 63;

/**
 * Loaded conversion tables, keyed by charset name. initCharset() fills each entry
 * with the sub-arrays 'local' (local code => UTF-8 char) and 'utf8' (UTF-8 char => local code).
 *
 * @var array
 */
public $parsedCharsets = [];

/**
 * Case folding tables; initUnicodeData() stores 'toUpper', 'toLower' and 'toTitle' under the key 'utf-8'.
 *
 * @var array
 */
public $caseFolding = [];

/**
 * To-ASCII conversion tables; initUnicodeData() stores its table under the key 'utf-8'.
 *
 * @var array
 */
public $toASCII = [];

/**
 * Charsets that utf8_encode() reads as two bytes per character (big endian).
 *
 * @var array
 */
public $twoByteSets = [
    'ucs-2' => 1
];

/**
 * Charsets using four bytes per character.
 *
 * @var array
 */
public $fourByteSets = [
    'ucs-4' => 1, // 4-byte Unicode
    'utf-32' => 1
];

/**
 * Charsets that utf8_encode() treats as EUC-like: bytes above 127 start a two-byte character.
 *
 * @var array
 */
public $eucBasedSets = [
    'gb2312' => 1, // Chinese, simplified.
    'big5' => 1, // Chinese, traditional.
    'euc-kr' => 1, // Korean
    'shift_jis' => 1
];
118 
/**
 * Alias => canonical charset name, as resolved by parse_charset().
 * Keys must be lowercase because parse_charset() lowercases its input before the lookup.
 *
 * The cp874 and cp1253-cp1258 aliases complete the "cp" code-page family next to
 * cp1250-cp1252; 'cp874' is also a value of $script_to_charset_windows.
 *
 * @var array
 */
public $synonyms = array(
    'us' => 'ascii',
    'us-ascii' => 'ascii',
    'cp819' => 'iso-8859-1',
    'ibm819' => 'iso-8859-1',
    'iso-ir-100' => 'iso-8859-1',
    'iso-ir-101' => 'iso-8859-2',
    'iso-ir-109' => 'iso-8859-3',
    'iso-ir-110' => 'iso-8859-4',
    'iso-ir-144' => 'iso-8859-5',
    'iso-ir-127' => 'iso-8859-6',
    'iso-ir-126' => 'iso-8859-7',
    'iso-ir-138' => 'iso-8859-8',
    'iso-ir-148' => 'iso-8859-9',
    'iso-ir-157' => 'iso-8859-10',
    'iso-ir-179' => 'iso-8859-13',
    'iso-ir-199' => 'iso-8859-14',
    'iso-ir-203' => 'iso-8859-15',
    'csisolatin1' => 'iso-8859-1',
    'csisolatin2' => 'iso-8859-2',
    'csisolatin3' => 'iso-8859-3',
    'csisolatin5' => 'iso-8859-9',
    'csisolatin8' => 'iso-8859-14',
    'csisolatin9' => 'iso-8859-15',
    'csisolatingreek' => 'iso-8859-7',
    'iso-celtic' => 'iso-8859-14',
    'latin1' => 'iso-8859-1',
    'latin2' => 'iso-8859-2',
    'latin3' => 'iso-8859-3',
    'latin5' => 'iso-8859-9',
    'latin6' => 'iso-8859-10',
    'latin8' => 'iso-8859-14',
    'latin9' => 'iso-8859-15',
    'l1' => 'iso-8859-1',
    'l2' => 'iso-8859-2',
    'l3' => 'iso-8859-3',
    'l5' => 'iso-8859-9',
    'l6' => 'iso-8859-10',
    'l8' => 'iso-8859-14',
    'l9' => 'iso-8859-15',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'tis-620' => 'iso-8859-11',
    'win874' => 'windows-874',
    'win1250' => 'windows-1250',
    'win1251' => 'windows-1251',
    'win1252' => 'windows-1252',
    'win1253' => 'windows-1253',
    'win1254' => 'windows-1254',
    'win1255' => 'windows-1255',
    'win1256' => 'windows-1256',
    'win1257' => 'windows-1257',
    'win1258' => 'windows-1258',
    'cp874' => 'windows-874',
    'cp1250' => 'windows-1250',
    'cp1251' => 'windows-1251',
    'cp1252' => 'windows-1252',
    'cp1253' => 'windows-1253',
    'cp1254' => 'windows-1254',
    'cp1255' => 'windows-1255',
    'cp1256' => 'windows-1256',
    'cp1257' => 'windows-1257',
    'cp1258' => 'windows-1258',
    'ms-ee' => 'windows-1250',
    'ms-ansi' => 'windows-1252',
    'ms-greek' => 'windows-1253',
    'ms-turk' => 'windows-1254',
    'winbaltrim' => 'windows-1257',
    'koi-8ru' => 'koi-8r',
    'koi8r' => 'koi-8r',
    'cp878' => 'koi-8r',
    'mac' => 'macroman',
    'macintosh' => 'macroman',
    'euc-cn' => 'gb2312',
    'x-euc-cn' => 'gb2312',
    'euccn' => 'gb2312',
    'cp936' => 'gb2312',
    'big-5' => 'big5',
    'cp950' => 'big5',
    'eucjp' => 'euc-jp',
    'sjis' => 'shift_jis',
    'shift-jis' => 'shift_jis',
    'cp932' => 'shift_jis',
    'cp949' => 'euc-kr',
    'utf7' => 'utf-7',
    'utf8' => 'utf-8',
    'utf16' => 'utf-16',
    'utf32' => 'utf-32',
    'ucs2' => 'ucs-2',
    'ucs4' => 'ucs-4'
);
209 
/**
 * Language identifier => script family, used by get_locale_charset() to pick a charset.
 * Contains ISO 639-1 codes, MS language codes and English language names, all lowercase.
 *
 * Corrections over the previous table (all additive, legacy keys are kept):
 * - 'swedish' and 'thai' added; the misspelled 'svedish' and 'that' remain for backward compatibility
 * - 'bulgarian' now maps to 'cyrillic', matching 'bg' and 'bgr'
 * - ISO 639-1 codes 'tr' (Turkish) and 'el' (Greek) added next to 'trk' and 'gr'
 *
 * @var array
 */
public $lang_to_script = array(
    // iso-639-1 language codes, see http://www.loc.gov/standards/iso639-2/php/code_list.php
    'af' => 'west_european', // Afrikaans
    'ar' => 'arabic',
    'bg' => 'cyrillic', // Bulgarian
    'bs' => 'east_european', // Bosnian
    'cs' => 'east_european', // Czech
    'da' => 'west_european', // Danish
    'de' => 'west_european', // German
    'el' => 'greek', // Greek (ISO 639-1 code)
    'es' => 'west_european', // Spanish
    'et' => 'estonian',
    'eo' => 'unicode', // Esperanto
    'eu' => 'west_european', // Basque
    'fa' => 'arabic', // Persian
    'fi' => 'west_european', // Finnish
    'fo' => 'west_european', // Faroese
    'fr' => 'west_european', // French
    'ga' => 'west_european', // Irish
    'gl' => 'west_european', // Galician
    'gr' => 'greek',
    'he' => 'hebrew', // Hebrew (since 1998)
    'hi' => 'unicode', // Hindi
    'hr' => 'east_european', // Croatian
    'hu' => 'east_european', // Hungarian
    'iw' => 'hebrew', // Hebrew (til 1998)
    'is' => 'west_european', // Icelandic
    'it' => 'west_european', // Italian
    'ja' => 'japanese',
    'ka' => 'unicode', // Georgian
    'kl' => 'west_european', // Greenlandic
    'km' => 'unicode', // Khmer
    'ko' => 'korean',
    'lt' => 'lithuanian',
    'lv' => 'west_european', // Latvian/Lettish
    'nl' => 'west_european', // Dutch
    'no' => 'west_european', // Norwegian
    'nb' => 'west_european', // Norwegian Bokmal
    'nn' => 'west_european', // Norwegian Nynorsk
    'pl' => 'east_european', // Polish
    'pt' => 'west_european', // Portuguese
    'ro' => 'east_european', // Romanian
    'ru' => 'cyrillic', // Russian
    'sk' => 'east_european', // Slovak
    'sl' => 'east_european', // Slovenian
    'sr' => 'cyrillic', // Serbian
    'sv' => 'west_european', // Swedish
    'sq' => 'albanian', // Albanian
    'th' => 'thai',
    'tr' => 'turkish', // Turkish
    'uk' => 'cyrillic', // Ukrainian
    'vi' => 'vietnamese',
    'zh' => 'chinese',

    // MS language codes, see http://msdn.microsoft.com/library/default.asp?url=/library/en-us/vclib/html/_crt_language_strings.asp
    // http://msdn.microsoft.com/library/default.asp?url=/library/en-us/wceinternational5/html/wce50conLanguageIdentifiersandLocales.asp
    'afk' => 'west_european', // Afrikaans
    'ara' => 'arabic',
    'bgr' => 'cyrillic', // Bulgarian
    'cat' => 'west_european', // Catalan
    'chs' => 'simpl_chinese',
    'cht' => 'trad_chinese',
    'csy' => 'east_european', // Czech
    'dan' => 'west_european', // Danish
    'deu' => 'west_european', // German
    'dea' => 'west_european', // German (Austrian)
    'des' => 'west_european', // German (Swiss)
    'ena' => 'west_european', // English (Australian)
    'enc' => 'west_european', // English (Canadian)
    'eng' => 'west_european', // English
    'enz' => 'west_european', // English (New Zealand)
    'enu' => 'west_european', // English (United States)
    'euq' => 'west_european', // Basque
    'fos' => 'west_european', // Faroese
    'far' => 'arabic', // Persian
    'fin' => 'west_european', // Finnish
    'fra' => 'west_european', // French
    'frb' => 'west_european', // French (Belgian)
    'frc' => 'west_european', // French (Canadian)
    'frs' => 'west_european', // French (Swiss)
    'geo' => 'unicode', // Georgian
    'glg' => 'west_european', // Galician
    'ell' => 'greek',
    'heb' => 'hebrew',
    'hin' => 'unicode', // Hindi
    'hun' => 'east_european', // Hungarian
    'isl' => 'west_european', // Icelandic
    'ita' => 'west_european', // Italian
    'its' => 'west_european', // Italian (Swiss)
    'jpn' => 'japanese',
    'khm' => 'unicode', // Khmer
    'kor' => 'korean',
    'lth' => 'lithuanian',
    'lvi' => 'west_european', // Latvian/Lettish
    'msl' => 'west_european', // Malay
    'nlb' => 'west_european', // Dutch (Belgian)
    'nld' => 'west_european', // Dutch
    'nor' => 'west_european', // Norwegian (bokmal)
    'non' => 'west_european', // Norwegian (nynorsk)
    'plk' => 'east_european', // Polish
    'ptg' => 'west_european', // Portuguese
    'ptb' => 'west_european', // Portuguese (Brazil)
    'rom' => 'east_european', // Romanian
    'rus' => 'cyrillic', // Russian
    'slv' => 'east_european', // Slovenian
    'sky' => 'east_european', // Slovak
    'srl' => 'east_european', // Serbian (Latin)
    'srb' => 'cyrillic', // Serbian (Cyrillic)
    'esp' => 'west_european', // Spanish (trad. sort)
    'esm' => 'west_european', // Spanish (Mexican)
    'esn' => 'west_european', // Spanish (internat. sort)
    'sve' => 'west_european', // Swedish
    'sqi' => 'albanian', // Albanian
    'tha' => 'thai',
    'trk' => 'turkish',
    'ukr' => 'cyrillic', // Ukrainian

    // English language names
    'afrikaans' => 'west_european',
    'albanian' => 'albanian',
    'arabic' => 'arabic',
    'basque' => 'west_european',
    'bosnian' => 'east_european',
    'bulgarian' => 'cyrillic',
    'catalan' => 'west_european',
    'croatian' => 'east_european',
    'czech' => 'east_european',
    'danish' => 'west_european',
    'dutch' => 'west_european',
    'english' => 'west_european',
    'esperanto' => 'unicode',
    'estonian' => 'estonian',
    'faroese' => 'west_european',
    'farsi' => 'arabic',
    'finnish' => 'west_european',
    'french' => 'west_european',
    'galician' => 'west_european',
    'georgian' => 'unicode',
    'german' => 'west_european',
    'greek' => 'greek',
    'greenlandic' => 'west_european',
    'hebrew' => 'hebrew',
    'hindi' => 'unicode',
    'hungarian' => 'east_european',
    'icelandic' => 'west_european',
    'italian' => 'west_european',
    'khmer' => 'unicode',
    'latvian' => 'west_european',
    'lettish' => 'west_european',
    'lithuanian' => 'lithuanian',
    'malay' => 'west_european',
    'norwegian' => 'west_european',
    'persian' => 'arabic',
    'polish' => 'east_european',
    'portuguese' => 'west_european',
    'russian' => 'cyrillic',
    'romanian' => 'east_european',
    'serbian' => 'cyrillic',
    'slovak' => 'east_european',
    'slovenian' => 'east_european',
    'spanish' => 'west_european',
    'swedish' => 'west_european',
    'svedish' => 'west_european', // legacy misspelling of "swedish", kept for backward compatibility
    'thai' => 'thai',
    'that' => 'thai', // legacy misspelling of "thai", kept for backward compatibility
    'turkish' => 'turkish',
    'ukrainian' => 'cyrillic'
);
379 
/**
 * Script family => charset, used by get_locale_charset() when TYPO3_OS is not 'WIN'.
 * An empty value makes get_locale_charset() fall back to 'utf-8'.
 *
 * @var array
 */
public $script_to_charset_unix = [
    'west_european' => 'iso-8859-1',
    'estonian' => 'iso-8859-1',
    'east_european' => 'iso-8859-2',
    'baltic' => 'iso-8859-4',
    'cyrillic' => 'iso-8859-5',
    'arabic' => 'iso-8859-6',
    'greek' => 'iso-8859-7',
    'hebrew' => 'iso-8859-8',
    'turkish' => 'iso-8859-9',
    'thai' => 'iso-8859-11', // = TIS-620
    'lithuanian' => 'iso-8859-13',
    'chinese' => 'gb2312', // = euc-cn
    'japanese' => 'euc-jp',
    'korean' => 'euc-kr',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'vietnamese' => '',
    'unicode' => 'utf-8',
    'albanian' => 'utf-8'
];
406 
/**
 * Script family => charset, used by get_locale_charset() when TYPO3_OS is 'WIN'.
 * A missing or empty value makes get_locale_charset() fall back to 'windows-1252'.
 *
 * The property declaration line was missing in front of these entries; the name is
 * the one get_locale_charset() reads.
 *
 * @var array
 */
public $script_to_charset_windows = array(
    'east_european' => 'windows-1250',
    'cyrillic' => 'windows-1251',
    'west_european' => 'windows-1252',
    'greek' => 'windows-1253',
    'turkish' => 'windows-1254',
    'hebrew' => 'windows-1255',
    'arabic' => 'windows-1256',
    'baltic' => 'windows-1257',
    'estonian' => 'windows-1257',
    'lithuanian' => 'windows-1257',
    'vietnamese' => 'windows-1258',
    'thai' => 'cp874',
    'korean' => 'cp949',
    'chinese' => 'gb2312',
    'japanese' => 'shift_jis',
    'simpl_chinese' => 'gb2312',
    'trad_chinese' => 'big5',
    'albanian' => 'windows-1250',
    'unicode' => 'utf-8'
);
433 
/**
 * Locale => charset for locales whose charset cannot be derived from the language alone.
 * get_locale_charset() consults this table first, after lowercasing the locale, so every
 * key must be lowercase. The former key 'sr@Latn' could never match for that reason.
 *
 * @var array
 */
public $locale_to_charset = array(
    'japanese.euc' => 'euc-jp',
    'ja_jp.ujis' => 'euc-jp',
    'korean.euc' => 'euc-kr',
    'sr@latn' => 'iso-8859-2',
    'zh_cn' => 'gb2312',
    'zh_hk' => 'big5',
    'zh_tw' => 'big5'
);
448 
/**
 * Two-letter (or xx_YY) key => charset name; an empty string means no charset is listed for the key.
 * NOTE(review): not read by any method visible in this part of the file; the keys look like
 * TYPO3 language keys - confirm against the callers.
 *
 * @var array
 */
public $charSetArray = [
    'af' => '',
    'ar' => 'iso-8859-6',
    'ba' => 'iso-8859-2',
    'bg' => 'windows-1251',
    'br' => '',
    'ca' => 'iso-8859-15',
    'ch' => 'gb2312',
    'cs' => 'windows-1250',
    'cz' => 'windows-1250',
    'da' => '',
    'de' => '',
    'dk' => '',
    'el' => 'iso-8859-7',
    'eo' => 'utf-8',
    'es' => '',
    'et' => 'iso-8859-4',
    'eu' => '',
    'fa' => 'utf-8',
    'fi' => '',
    'fo' => 'utf-8',
    'fr' => '',
    'fr_CA' => '',
    'ga' => '',
    'ge' => 'utf-8',
    'gl' => '',
    'gr' => 'iso-8859-7',
    'he' => 'utf-8',
    'hi' => 'utf-8',
    'hk' => 'big5',
    'hr' => 'windows-1250',
    'hu' => 'iso-8859-2',
    'is' => 'utf-8',
    'it' => '',
    'ja' => 'shift_jis',
    'jp' => 'shift_jis',
    'ka' => 'utf-8',
    'kl' => 'utf-8',
    'km' => 'utf-8',
    'ko' => 'euc-kr',
    'kr' => 'euc-kr',
    'lt' => 'windows-1257',
    'lv' => 'utf-8',
    'ms' => '',
    'my' => '',
    'nl' => '',
    'no' => '',
    'pl' => 'iso-8859-2',
    'pt' => '',
    'pt_BR' => '',
    'qc' => '',
    'ro' => 'iso-8859-2',
    'ru' => 'windows-1251',
    'se' => '',
    'si' => 'windows-1250',
    'sk' => 'windows-1250',
    'sl' => 'windows-1250',
    'sq' => 'utf-8',
    'sr' => 'utf-8',
    'sv' => '',
    'th' => 'iso-8859-11',
    'tr' => 'iso-8859-9',
    'ua' => 'windows-1251',
    'uk' => 'windows-1251',
    'vi' => 'utf-8',
    'vn' => 'utf-8',
    'zh' => 'big5'
];
523 
/**
 * Obtains the Locales helper through GeneralUtility::makeInstance() and keeps it in $this->locales.
 */
public function __construct()
{
    $localesHelper = GeneralUtility::makeInstance(Locales::class);
    $this->locales = $localesHelper;
}
531 
/**
 * Normalizes a charset name: lowercases it, trims surrounding whitespace and
 * replaces a known alias from $this->synonyms with its canonical name.
 *
 * @param string $charset Charset name as supplied by the caller
 * @return string Normalized charset name
 */
public function parse_charset($charset)
{
    $normalized = trim(strtolower($charset));

    return isset($this->synonyms[$normalized]) ? $this->synonyms[$normalized] : $normalized;
}
546 
/**
 * Determines the charset belonging to a locale string such as "de_DE.UTF-8@euro".
 *
 * Resolution order:
 * 1. exact match of the lowercased locale in $this->locale_to_charset
 * 2. a charset embedded in the locale (the part after "."), normalized by parse_charset()
 * 3. the "@euro" modifier, which yields 'iso-8859-15'
 * 4. the script of the language part, mapped through the Windows or Unix script table
 *    depending on TYPO3_OS; unknown languages or scripts yield 'windows-1252' / 'utf-8'
 *
 * Locales without "@" or "." and languages missing from $this->lang_to_script are
 * handled without raising undefined offset/variable/index notices.
 *
 * @param string $locale Locale string
 * @return string Charset resolved for the locale
 */
public function get_locale_charset($locale)
{
    $locale = strtolower($locale);
    // Exact locale specific charset?
    if (isset($this->locale_to_charset[$locale])) {
        return $this->locale_to_charset[$locale];
    }
    // Split off the modifier; array_pad() supplies '' when there is no "@" part
    list($locale, $modifier) = array_pad(explode('@', $locale), 2, '');
    // Locale contains charset: use it
    list($locale, $charset) = array_pad(explode('.', $locale), 2, '');
    if ($charset) {
        return $this->parse_charset($charset);
    }
    // Modifier is 'euro' (after charset check, because of xx.utf-8@euro)
    if ($modifier === 'euro') {
        return 'iso-8859-15';
    }
    // Get language and its script; '' never matches a script table key
    list($language) = explode('_', $locale);
    $script = isset($this->lang_to_script[$language]) ? $this->lang_to_script[$language] : '';
    if (TYPO3_OS === 'WIN') {
        $scriptToCharset = $this->script_to_charset_windows;
        $fallbackCharset = 'windows-1252';
    } else {
        $scriptToCharset = $this->script_to_charset_unix;
        $fallbackCharset = 'utf-8';
    }
    // Empty table values (e.g. 'vietnamese' on Unix) also fall back
    return !empty($scriptToCharset[$script]) ? $scriptToCharset[$script] : $fallbackCharset;
}
588 
589  /********************************************
590  *
591  * Charset Conversion functions
592  *
593  ********************************************/
/**
 * Converts a string from one charset to another.
 *
 * When the target is UTF-8 or no entity fallback is requested, the PHP extension named in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod'] ('mbstring', 'iconv' or 'recode')
 * is tried first. If it is not configured or returns FALSE, the string is converted with the
 * internal tables: source charset -> UTF-8 -> target charset.
 *
 * @param string $inputString String to convert
 * @param string $fromCharset Source charset; the comparisons below expect the lowercase name
 * @param string $toCharset Target charset; the comparisons below expect the lowercase name
 * @param bool $useEntityForNoChar If set, characters missing in the target charset become numeric entities
 * @return string Converted string
 */
public function conv($inputString, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    if ($fromCharset === $toCharset) {
        return $inputString;
    }
    // PHP-libs don't support fallback to SGML entities, but UTF-8 handles everything
    $nativeConversionAllowed = $toCharset === 'utf-8' || !$useEntityForNoChar;
    if ($nativeConversionAllowed) {
        // FALSE means "no library result"; the table based conversion below takes over
        $nativeResult = false;
        switch ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_convMethod']) {
            case 'mbstring':
                $nativeResult = mb_convert_encoding($inputString, $toCharset, $fromCharset);
                break;
            case 'iconv':
                $nativeResult = iconv($fromCharset, $toCharset . '//TRANSLIT', $inputString);
                break;
            case 'recode':
                $nativeResult = recode_string($fromCharset . '..' . $toCharset, $inputString);
                break;
        }
        if ($nativeResult !== false) {
            return $nativeResult;
        }
    }
    $utf8String = $fromCharset !== 'utf-8'
        ? $this->utf8_encode($inputString, $fromCharset)
        : $inputString;

    return $toCharset !== 'utf-8'
        ? $this->utf8_decode($utf8String, $toCharset, $useEntityForNoChar)
        : $utf8String;
}
641 
/**
 * Converts every string value of an array in place, descending into nested arrays.
 * Values that are neither strings nor arrays are left untouched.
 *
 * @param array $array Array whose string values are converted (passed by reference)
 * @param string $fromCharset Source charset, passed on to conv()
 * @param string $toCharset Target charset, passed on to conv()
 * @param bool $useEntityForNoChar Passed on to conv()
 * @return void
 */
public function convArray(&$array, $fromCharset, $toCharset, $useEntityForNoChar = false)
{
    foreach (array_keys($array) as $index) {
        if (is_array($array[$index])) {
            $this->convArray($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
            continue;
        }
        if (is_string($array[$index])) {
            $array[$index] = $this->conv($array[$index], $fromCharset, $toCharset, $useEntityForNoChar);
        }
    }
}
663 
/**
 * Converts a string from a local charset to UTF-8 using the parsed conversion table.
 *
 * Bytes up to 127 are copied unchanged (except in two-byte charsets, where every
 * character is looked up). Characters missing from the table are replaced by
 * chr($this->noCharByteVal).
 *
 * A truncated trailing character (odd length in a two-byte charset, lead byte at the
 * very end of an EUC-like string) is completed with a zero low byte instead of reading
 * past the end of the string.
 *
 * @param string $str String in the local charset
 * @param string $charset Charset name as used for the conversion table; 'utf-8' returns $str unchanged
 * @return string|null UTF-8 string, or NULL when no conversion table could be initialized
 */
public function utf8_encode($str, $charset)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        // No table available: callers have always received NULL in this case
        return null;
    }
    $localToUtf8 = isset($this->parsedCharsets[$charset]['local'])
        ? $this->parsedCharsets[$charset]['local']
        : array();
    // Loop-invariant lookups, hoisted out of the per-byte loop
    $isTwoByteSet = isset($this->twoByteSets[$charset]);
    $isEucBasedSet = isset($this->eucBasedSets[$charset]);
    $noChar = chr($this->noCharByteVal);
    $strLen = strlen($str);
    $outStr = '';
    // Traverse each char in string
    for ($a = 0; $a < $strLen; $a++) {
        $chr = substr($str, $a, 1);
        $ord = ord($chr);
        if ($isTwoByteSet) {
            // Two bytes per char, assume big endian; a missing low byte counts as 0
            $a++;
            $lowByte = $a < $strLen ? ord($str[$a]) : 0;
            $ord = ($ord << 8) | $lowByte;
            $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
        } elseif ($ord > 127) {
            // EUC uses two bytes above 127; get both, advance the pointer and make $ord a 16 bit int.
            // Shift-JIS: chars between 160 and 223 are single byte
            if ($isEucBasedSet && ($charset !== 'shift_jis' || $ord < 160 || $ord > 223)) {
                $a++;
                $lowByte = $a < $strLen ? ord($str[$a]) : 0;
                $ord = $ord * 256 + $lowByte;
            }
            $outStr .= isset($localToUtf8[$ord]) ? $localToUtf8[$ord] : $noChar;
        } else {
            $outStr .= $chr;
        }
    }
    return $outStr;
}
722 
/**
 * Converts a UTF-8 string to a local charset using the parsed conversion table.
 *
 * Bytes up to 127 are copied unchanged. A multibyte sequence is looked up in the
 * 'utf8' part of the table; local codes above 255 are written as two bytes. A sequence
 * missing from the table becomes a numeric entity when $useEntityForNoChar is set,
 * otherwise chr($this->noCharByteVal). A byte above 127 without bit 64 set (a stray
 * continuation byte) also becomes chr($this->noCharByteVal).
 *
 * @param string $str UTF-8 string
 * @param string $charset Charset name as used for the conversion table; 'utf-8' returns $str unchanged
 * @param bool $useEntityForNoChar If set, unknown characters are written as "&#x...;" entities
 * @return string|null String in the local charset, or NULL when no conversion table could be initialized
 */
public function utf8_decode($str, $charset, $useEntityForNoChar = false)
{
    if ($charset === 'utf-8') {
        return $str;
    }
    // Parse conv. table if not already
    if (!$this->initCharset($charset)) {
        return null;
    }
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        $pos++;
        // Plain 7 bit character
        if ($code <= 127) {
            $result .= $byte;
            continue;
        }
        // A first byte must have bit 64 set; otherwise we are in the middle of a byte sequence
        if (($code & 64) === 0) {
            $result .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the sequence: every further leading 1-bit of the first byte announces one more byte
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (($code & 128) === 0) {
                break;
            }
            $sequence .= substr($str, $pos, 1);
            $pos++;
        }
        if (isset($this->parsedCharsets[$charset]['utf8'][$sequence])) {
            $localCode = $this->parsedCharsets[$charset]['utf8'][$sequence];
            // Codes above 255 are split into two bytes (16 bit word assumed)
            $result .= $localCode > 255
                ? chr(($localCode >> 8) & 255) . chr($localCode & 255)
                : chr($localCode);
        } elseif ($useEntityForNoChar) {
            $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
        } else {
            $result .= chr($this->noCharByteVal);
        }
    }
    return $result;
}
790 
/**
 * Replaces every multibyte UTF-8 character by a hexadecimal numeric entity ("&#x...;").
 * Bytes up to 127 are copied unchanged; a byte above 127 without bit 64 set (a stray
 * continuation byte) becomes chr($this->noCharByteVal).
 *
 * @param string $str UTF-8 string
 * @return string String in which all characters above 127 are expressed as entities
 */
public function utf8_to_entities($str)
{
    $length = strlen($str);
    $result = '';
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        $pos++;
        // Plain 7 bit character
        if ($code <= 127) {
            $result .= $byte;
            continue;
        }
        // A first byte must have bit 64 set; otherwise we are in the middle of a byte sequence
        if (($code & 64) === 0) {
            $result .= chr($this->noCharByteVal);
            continue;
        }
        // Collect the sequence: every further leading 1-bit of the first byte announces one more byte
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (($code & 128) === 0) {
                break;
            }
            $sequence .= substr($str, $pos, 1);
            $pos++;
        }
        $result .= '&#' . $this->utf8CharToUnumber($sequence, 1) . ';';
    }
    return $result;
}
834 
/**
 * Converts numeric entities (decimal "&#233;" and hexadecimal "&#xE9;" / "&#XE9;") to UTF-8
 * characters, and optionally the named HTML entities known to get_html_translation_table().
 *
 * Entities that are neither well-formed numeric references nor known names are returned
 * unchanged. Previously a malformed numeric reference such as "&#foo;", or one written with
 * an uppercase "X", was replaced by a NUL byte.
 *
 * @param string $str Input string, treated as UTF-8
 * @param bool $alsoStdHtmlEnt If set, named entities like "&amp;" or "&eacute;" are converted as well
 * @return string String with the entities replaced by UTF-8 characters
 */
public function entities_to_utf8($str, $alsoStdHtmlEnt = false)
{
    $namedEntities = array();
    if ($alsoStdHtmlEnt) {
        // Flipped so that the entity (e.g. "&eacute;") is the key and the character the value
        $namedEntities = array_flip(get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8'));
    }
    return preg_replace_callback(
        '/&([#[:alnum:]]*);/',
        function (array $match) use ($namedEntities) {
            list($entity, $name) = $match;
            $digits = array();
            // Hexadecimal reference; HTML allows both "x" and "X"
            if (preg_match('/^#[xX]([0-9a-fA-F]+)$/', $name, $digits)) {
                return $this->UnumberToChar(hexdec($digits[1]));
            }
            // Decimal reference; "+ 0" yields int or float so oversized values keep their magnitude
            if (preg_match('/^#([0-9]+)$/', $name, $digits)) {
                return $this->UnumberToChar($digits[1] + 0);
            }
            if (isset($namedEntities[$entity])) {
                return $namedEntities[$entity];
            }
            // No conversion
            return $entity;
        },
        $str
    );
}
875 
/**
 * Splits a UTF-8 string into an array with one entry per character: either the
 * Unicode number or, when $retChar is set, the character itself.
 * A byte above 127 without bit 64 set (a stray continuation byte) is reported as
 * $this->noCharByteVal (or the matching character).
 *
 * @param string $str UTF-8 string
 * @param bool $convEntities If set, entities (numeric and named) are converted to UTF-8 first
 * @param bool $retChar If set, characters are returned instead of Unicode numbers
 * @return array Unicode numbers or characters, in string order
 */
public function utf8_to_numberarray($str, $convEntities = false, $retChar = false)
{
    // If entities must be registered as well...:
    if ($convEntities) {
        $str = $this->entities_to_utf8($str, true);
    }
    $length = strlen($str);
    $items = array();
    $pos = 0;
    while ($pos < $length) {
        $byte = substr($str, $pos, 1);
        $code = ord($byte);
        $pos++;
        // Plain 7 bit character
        if ($code <= 127) {
            $items[] = $retChar ? $byte : $code;
            continue;
        }
        // A first byte must have bit 64 set; otherwise we are in the middle of a byte sequence
        if (($code & 64) === 0) {
            $items[] = $retChar ? chr($this->noCharByteVal) : $this->noCharByteVal;
            continue;
        }
        // Collect the sequence: every further leading 1-bit of the first byte announces one more byte
        $sequence = $byte;
        for ($n = 0; $n < 8; $n++) {
            $code = $code << 1;
            if (($code & 128) === 0) {
                break;
            }
            $sequence .= substr($str, $pos, 1);
            $pos++;
        }
        $items[] = $retChar ? $sequence : $this->utf8CharToUnumber($sequence);
    }
    return $items;
}
926 
/**
 * Converts a Unicode number to its UTF-8 byte sequence.
 * Numbers below 2^31 are encoded with one to six bytes; anything larger yields
 * chr($this->noCharByteVal).
 *
 * @param int $unicodeInteger Unicode number
 * @return string UTF-8 byte sequence for the number
 */
public function UnumberToChar($unicodeInteger)
{
    if ($unicodeInteger < 128) {
        return chr($unicodeInteger);
    }
    // Pick the number of continuation bytes and the marker bits of the first byte
    if ($unicodeInteger < 2048) {
        $trailingBytes = 1;
        $leadMarker = 192;
    } elseif ($unicodeInteger < 65536) {
        $trailingBytes = 2;
        $leadMarker = 224;
    } elseif ($unicodeInteger < 2097152) {
        $trailingBytes = 3;
        $leadMarker = 240;
    } elseif ($unicodeInteger < 67108864) {
        $trailingBytes = 4;
        $leadMarker = 248;
    } elseif ($unicodeInteger < 2147483648) {
        $trailingBytes = 5;
        $leadMarker = 252;
    } else {
        // Cannot express a 32-bit character in UTF-8
        return chr($this->noCharByteVal);
    }
    // First byte: marker bits plus the highest bits of the number
    $utf8 = chr($leadMarker | ($unicodeInteger >> (6 * $trailingBytes)));
    // Continuation bytes: 10xxxxxx, six bits each, most significant group first
    for ($shift = 6 * ($trailingBytes - 1); $shift >= 0; $shift -= 6) {
        $utf8 .= chr(128 | (($unicodeInteger >> $shift) & 63));
    }
    return $utf8;
}
983 
/**
 * Converts a single UTF-8 character (byte sequence) to its Unicode number.
 *
 * The number of continuation bytes is taken from the leading 1-bits of the first byte.
 * Continuation bytes missing from $str count as 0. An empty string yields 0 instead of
 * triggering an uninitialized string offset.
 *
 * The value is assembled with integer arithmetic; the former binary-string approach
 * produced garbage for the invalid lead bytes 0xFE/0xFF because substr(..., -0)
 * returns the whole string.
 *
 * @param string $str UTF-8 byte sequence of one character
 * @param bool $hex If set, the number is returned as "x" followed by lowercase hex digits
 * @return int|string Unicode number, or its "x..." hexadecimal form when $hex is set
 */
public function utf8CharToUnumber($str, $hex = false)
{
    $str = (string)$str;
    // First char
    $ord = $str === '' ? 0 : ord($str[0]);
    // This verifies that it IS a multi byte string
    if (($ord & 192) === 192) {
        // Each leading 1-bit after the first one announces a continuation byte
        $trailingBytes = 0;
        for ($shifted = $ord << 1; $trailingBytes < 8 && ($shifted & 128); $shifted = $shifted << 1) {
            $trailingBytes++;
        }
        // The first byte contributes its lowest (6 - n) bits; none for invalid lead bytes
        $leadBits = 6 - $trailingBytes;
        $int = $leadBits > 0 ? $ord & ((1 << $leadBits) - 1) : 0;
        // Every continuation byte contributes its lowest six bits
        for ($i = 1; $i <= $trailingBytes; $i++) {
            $nextOrd = isset($str[$i]) ? ord($str[$i]) : 0;
            $int = ($int << 6) | ($nextOrd & 63);
        }
    } else {
        $int = $ord;
    }
    return $hex ? 'x' . dechex($int) : $int;
}
1018 
1019  /********************************************
1020  *
1021  * Init functions
1022  *
1023  ********************************************/
/**
 * Loads the conversion table of a charset into $this->parsedCharsets[$charset].
 *
 * The table is read from the serialized cache file in typo3temp/cs/ when one exists,
 * otherwise it is parsed from Resources/Private/Charsets/csconvtbl/<charset>.tbl of the
 * core extension and then written to the cache. Only local codes above 127 are stored.
 *
 * Fixes over the previous version:
 * - no undefined index notice when a charset is requested for the first time
 * - a cache file that does not unserialize to a table is ignored and rebuilt, instead of
 *   storing FALSE as the table while reporting success
 * - table lines that do not match the detected syntax are skipped
 *
 * @param string $charset Charset name, used as file name of the conversion table
 * @return int|bool 1 if the table was already loaded, 2 if it was loaded by this call, FALSE if no table file exists
 */
public function initCharset($charset)
{
    // Only process if the charset is not yet loaded:
    if (isset($this->parsedCharsets[$charset]) && is_array($this->parsedCharsets[$charset])) {
        return 1;
    }
    // Conversion table filename:
    $charsetConvTableFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/csconvtbl/' . $charset . '.tbl';
    if (!$charset || !GeneralUtility::validPathStr($charsetConvTableFile) || !@is_file($charsetConvTableFile)) {
        return false;
    }
    // Cache file for charsets:
    // Caching brought parsing time for gb2312 down from 2400 ms to 150 ms. For other charsets we are talking 11 ms down to zero.
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/charset_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        // The cache is written by this method itself; unserialize() must never be pointed at untrusted data
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        if (is_array($cachedTable) && isset($cachedTable['local'], $cachedTable['utf8'])) {
            $this->parsedCharsets[$charset] = $cachedTable;
            return 2;
        }
        // Unusable cache content: fall through, parse the table and rewrite the cache
    }
    // Parse conversion table into lines:
    $lines = GeneralUtility::trimExplode(LF, GeneralUtility::getUrl($charsetConvTableFile), true);
    $table = array('local' => array(), 'utf8' => array());
    // The "whitespaced" type is on the syntax "0x0A 0x000A #LINE FEED" while "ms-token" is like "B9 = U+00B9 : SUPERSCRIPT ONE"
    $whitespacedPattern = '/[[:space:]]*0x([[:alnum:]]*)[[:space:]]+0x([[:alnum:]]*)[[:space:]]+/';
    $detectedType = '';
    foreach ($lines as $value) {
        // Comment line or blanks are ignored.
        if (!trim($value) || $value[0] === '#') {
            continue;
        }
        // Detect type if not done yet: (Done on first real line)
        if (!$detectedType) {
            $detectedType = preg_match($whitespacedPattern, $value) ? 'whitespaced' : 'ms-token';
        }
        if ($detectedType === 'ms-token') {
            $parts = preg_split('/[=:]/', $value, 3);
            if (count($parts) < 2) {
                // Malformed line
                continue;
            }
            list($hexbyte, $utf8) = $parts;
        } else {
            $regA = array();
            if (!preg_match($whitespacedPattern, $value, $regA)) {
                // Malformed line
                continue;
            }
            $hexbyte = $regA[1];
            $utf8 = 'U+' . $regA[2];
        }
        $decval = hexdec(trim($hexbyte));
        if ($decval > 127) {
            // Strip the "U+" prefix, then convert the code point to its UTF-8 character
            $utf8Char = $this->UnumberToChar(hexdec(substr(trim($utf8), 2)));
            $table['local'][$decval] = $utf8Char;
            $table['utf8'][$utf8Char] = $decval;
        }
    }
    $this->parsedCharsets[$charset] = $table;
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($table));
    }
    return 2;
}
1090 
1100  public function initUnicodeData($mode = null)
1101  {
1102  // Cache files
1103  $cacheFileCase = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_utf-8.tbl');
1104  $cacheFileASCII = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_utf-8.tbl');
1105  // Only process if the tables are not yet loaded
1106  switch ($mode) {
1107  case 'case':
1108  if (is_array($this->caseFolding['utf-8'])) {
1109  return 1;
1110  }
1111  // Use cached version if possible
1112  if ($cacheFileCase && @is_file($cacheFileCase)) {
1113  $this->caseFolding['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileCase));
1114  return 2;
1115  }
1116  break;
1117  case 'ascii':
1118  if (is_array($this->toASCII['utf-8'])) {
1119  return 1;
1120  }
1121  // Use cached version if possible
1122  if ($cacheFileASCII && @is_file($cacheFileASCII)) {
1123  $this->toASCII['utf-8'] = unserialize(GeneralUtility::getUrl($cacheFileASCII));
1124  return 2;
1125  }
1126  break;
1127  }
1128  // Process main Unicode data file
1129  $unicodeDataFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/UnicodeData.txt';
1130  if (!(GeneralUtility::validPathStr($unicodeDataFile) && @is_file($unicodeDataFile))) {
1131  return false;
1132  }
1133  $fh = fopen($unicodeDataFile, 'rb');
1134  if (!$fh) {
1135  return false;
1136  }
1137  // key = utf8 char (single codepoint), value = utf8 string (codepoint sequence)
1138  // Note: we use the UTF-8 characters here and not the Unicode numbers to avoid conversion roundtrip in utf8_strtolower/-upper)
1139  $this->caseFolding['utf-8'] = array();
1140  $utf8CaseFolding = &$this->caseFolding['utf-8'];
1141  // a shorthand
1142  $utf8CaseFolding['toUpper'] = array();
1143  $utf8CaseFolding['toLower'] = array();
1144  $utf8CaseFolding['toTitle'] = array();
1145  // Array of temp. decompositions
1146  $decomposition = array();
1147  // Array of chars that are marks (eg. composing accents)
1148  $mark = array();
1149  // Array of chars that are numbers (eg. digits)
1150  $number = array();
1151  // Array of chars to be omitted (eg. Russian hard sign)
1152  $omit = array();
1153  while (!feof($fh)) {
1154  $line = fgets($fh, 4096);
1155  // Has a lot of info
1156  list($char, $name, $cat, , , $decomp, , , $num, , , , $upper, $lower, $title, ) = explode(';', rtrim($line));
1157  $ord = hexdec($char);
1158  if ($ord > 65535) {
1159  // Only process the BMP
1160  break;
1161  }
1162  $utf8_char = $this->UnumberToChar($ord);
1163  if ($upper) {
1164  $utf8CaseFolding['toUpper'][$utf8_char] = $this->UnumberToChar(hexdec($upper));
1165  }
1166  if ($lower) {
1167  $utf8CaseFolding['toLower'][$utf8_char] = $this->UnumberToChar(hexdec($lower));
1168  }
1169  // Store "title" only when different from "upper" (only a few)
1170  if ($title && $title !== $upper) {
1171  $utf8CaseFolding['toTitle'][$utf8_char] = $this->UnumberToChar(hexdec($title));
1172  }
1173  switch ($cat[0]) {
1174  case 'M':
1175  // mark (accent, umlaut, ...)
1176  $mark['U+' . $char] = 1;
1177  break;
1178  case 'N':
1179  // numeric value
1180  if ($ord > 128 && $num !== '') {
1181  $number['U+' . $char] = $num;
1182  }
1183  }
1184  // Accented Latin letters without "official" decomposition
1185  $match = array();
1186  if (preg_match('/^LATIN (SMALL|CAPITAL) LETTER ([A-Z]) WITH/', $name, $match) && !$decomp) {
1187  $c = ord($match[2]);
1188  if ($match[1] === 'SMALL') {
1189  $c += 32;
1190  }
1191  $decomposition['U+' . $char] = array(dechex($c));
1192  continue;
1193  }
1194  $match = array();
1195  if (preg_match('/(<.*>)? *(.+)/', $decomp, $match)) {
1196  switch ($match[1]) {
1197  case '<circle>':
1198  // add parenthesis as circle replacement, eg (1)
1199  $match[2] = '0028 ' . $match[2] . ' 0029';
1200  break;
1201  case '<square>':
1202  // add square brackets as square replacement, eg [1]
1203  $match[2] = '005B ' . $match[2] . ' 005D';
1204  break;
1205  case '<compat>':
1206  // ignore multi char decompositions that start with a space
1207  if (preg_match('/^0020 /', $match[2])) {
1208  continue 2;
1209  }
1210  break;
1211  case '<initial>':
1212  case '<medial>':
1213  case '<final>':
1214  case '<isolated>':
1215  case '<vertical>':
1216  continue 2;
1217  }
1218  $decomposition['U+' . $char] = explode(' ', $match[2]);
1219  }
1220  }
1221  fclose($fh);
1222  // Process additional Unicode data for casing (allow folded characters to expand into a sequence)
1223  $specialCasingFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/SpecialCasing.txt';
1224  if (GeneralUtility::validPathStr($specialCasingFile) && @is_file($specialCasingFile)) {
1225  $fh = fopen($specialCasingFile, 'rb');
1226  if ($fh) {
1227  while (!feof($fh)) {
1228  $line = fgets($fh, 4096);
1229  if ($line[0] !== '#' && trim($line) !== '') {
1230  list($char, $lower, $title, $upper, $cond) = GeneralUtility::trimExplode(';', $line);
1231  if ($cond === '' || $cond[0] === '#') {
1232  $utf8_char = $this->UnumberToChar(hexdec($char));
1233  if ($char !== $lower) {
1234  $arr = explode(' ', $lower);
1235  for ($i = 0; isset($arr[$i]); $i++) {
1236  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1237  }
1238  $utf8CaseFolding['toLower'][$utf8_char] = implode('', $arr);
1239  }
1240  if ($char !== $title && $title !== $upper) {
1241  $arr = explode(' ', $title);
1242  for ($i = 0; isset($arr[$i]); $i++) {
1243  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1244  }
1245  $utf8CaseFolding['toTitle'][$utf8_char] = implode('', $arr);
1246  }
1247  if ($char !== $upper) {
1248  $arr = explode(' ', $upper);
1249  for ($i = 0; isset($arr[$i]); $i++) {
1250  $arr[$i] = $this->UnumberToChar(hexdec($arr[$i]));
1251  }
1252  $utf8CaseFolding['toUpper'][$utf8_char] = implode('', $arr);
1253  }
1254  }
1255  }
1256  }
1257  fclose($fh);
1258  }
1259  }
1260  // Process custom decompositions
1261  $customTranslitFile = ExtensionManagementUtility::extPath('core') . 'Resources/Private/Charsets/unidata/Translit.txt';
1262  if (GeneralUtility::validPathStr($customTranslitFile) && @is_file($customTranslitFile)) {
1263  $fh = fopen($customTranslitFile, 'rb');
1264  if ($fh) {
1265  while (!feof($fh)) {
1266  $line = fgets($fh, 4096);
1267  if ($line[0] !== '#' && trim($line) !== '') {
1268  list($char, $translit) = GeneralUtility::trimExplode(';', $line);
1269  if (!$translit) {
1270  $omit['U+' . $char] = 1;
1271  }
1272  $decomposition['U+' . $char] = explode(' ', $translit);
1273  }
1274  }
1275  fclose($fh);
1276  }
1277  }
1278  // Decompose and remove marks; inspired by unac (Loic Dachary <loic@senga.org>)
1279  foreach ($decomposition as $from => $to) {
1280  $code_decomp = array();
1281  while ($code_value = array_shift($to)) {
1282  // Do recursive decomposition
1283  if (isset($decomposition['U+' . $code_value])) {
1284  foreach (array_reverse($decomposition['U+' . $code_value]) as $cv) {
1285  array_unshift($to, $cv);
1286  }
1287  } elseif (!isset($mark[('U+' . $code_value)])) {
1288  // remove mark
1289  array_push($code_decomp, $code_value);
1290  }
1291  }
1292  if (!empty($code_decomp) || isset($omit[$from])) {
1293  $decomposition[$from] = $code_decomp;
1294  } else {
1295  unset($decomposition[$from]);
1296  }
1297  }
1298  // Create ascii only mapping
1299  $this->toASCII['utf-8'] = array();
1300  $ascii = &$this->toASCII['utf-8'];
1301  foreach ($decomposition as $from => $to) {
1302  $code_decomp = array();
1303  while ($code_value = array_shift($to)) {
1304  $ord = hexdec($code_value);
1305  if ($ord > 127) {
1306  continue 2;
1307  } else {
1308  // Skip decompositions containing non-ASCII chars
1309  array_push($code_decomp, chr($ord));
1310  }
1311  }
1312  $ascii[$this->UnumberToChar(hexdec($from))] = join('', $code_decomp);
1313  }
1314  // Add numeric decompositions
1315  foreach ($number as $from => $to) {
1316  $utf8_char = $this->UnumberToChar(hexdec($from));
1317  if (!isset($ascii[$utf8_char])) {
1318  $ascii[$utf8_char] = $to;
1319  }
1320  }
1321  if ($cacheFileCase) {
1322  GeneralUtility::writeFileToTypo3tempDir($cacheFileCase, serialize($utf8CaseFolding));
1323  }
1324  if ($cacheFileASCII) {
1325  GeneralUtility::writeFileToTypo3tempDir($cacheFileASCII, serialize($ascii));
1326  }
1327  return 3;
1328  }
1329 
/**
 * Initializes the case folding tables ("toUpper", "toLower", "toTitle") of a charset.
 *
 * The table is taken from $this->caseFolding if already present, otherwise from the
 * cache file typo3temp/cs/cscase_<charset>.tbl, otherwise it is built by converting
 * the UTF-8 case folding data back to the charset via utf8_decode().
 *
 * @param string $charset Charset for which the case folding table is needed
 * @return int|bool FALSE if the charset or the Unicode data could not be initialized, 1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was built (and cached)
 */
public function initCaseFolding($charset)
{
    // Only process if the case table is not yet loaded
    if (isset($this->caseFolding[$charset]) && is_array($this->caseFolding[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/cscase_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        // A cache file that does not unserialize to an array is ignored and the table is rebuilt below
        if (is_array($cachedTable)) {
            $this->caseFolding[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8 case folding is used as the base conversion table
    if (!$this->initUnicodeData('case')) {
        return false;
    }
    $this->caseFolding[$charset] = array();
    $nochar = chr($this->noCharByteVal);
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
        $c = $this->utf8_decode($utf8, $charset);
        foreach (array('toUpper', 'toLower', 'toTitle') as $direction) {
            // Most characters have no mapping in a given direction; skip them without touching undefined indexes
            if (!isset($this->caseFolding['utf-8'][$direction][$utf8])) {
                continue;
            }
            $cc = $this->utf8_decode($this->caseFolding['utf-8'][$direction][$utf8], $charset);
            // Keep only mappings that are representable in the target charset
            if ($cc !== '' && $cc !== $nochar) {
                $this->caseFolding[$charset][$direction][$c] = $cc;
            }
        }
    }
    // Add the ASCII case table
    for ($i = ord('a'), $end = ord('z'); $i <= $end; $i++) {
        $this->caseFolding[$charset]['toUpper'][chr($i)] = chr($i - 32);
    }
    for ($i = ord('A'), $end = ord('Z'); $i <= $end; $i++) {
        $this->caseFolding[$charset]['toLower'][chr($i)] = chr($i + 32);
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->caseFolding[$charset]));
    }
    return 3;
}
1391 
/**
 * Initializes the to-ASCII transliteration table of a charset.
 *
 * The table is taken from $this->toASCII if already present, otherwise from the
 * cache file typo3temp/cs/csascii_<charset>.tbl, otherwise it is built from the
 * UTF-8 to-ASCII table by converting each source character back to the charset.
 *
 * @param string $charset Charset for which the to-ASCII table is needed
 * @return int|bool FALSE if the charset or the Unicode data could not be initialized, 1 if the table was already loaded, 2 if it was read from the cache file, 3 if it was built (and cached)
 */
public function initToASCII($charset)
{
    // Only process if the table is not yet loaded
    if (isset($this->toASCII[$charset]) && is_array($this->toASCII[$charset])) {
        return 1;
    }
    // Use cached version if possible
    $cacheFile = GeneralUtility::getFileAbsFileName('typo3temp/cs/csascii_' . $charset . '.tbl');
    if ($cacheFile && @is_file($cacheFile)) {
        $cachedTable = unserialize(GeneralUtility::getUrl($cacheFile));
        // A cache file that does not unserialize to an array is ignored and the table is rebuilt below
        if (is_array($cachedTable)) {
            $this->toASCII[$charset] = $cachedTable;
            return 2;
        }
    }
    // Init UTF-8 conversion for this charset
    if (!$this->initCharset($charset)) {
        return false;
    }
    // UTF-8/ASCII transliteration is used as the base conversion table
    if (!$this->initUnicodeData('ascii')) {
        return false;
    }
    // Always create the table, so that a charset without any mapping still counts as
    // "loaded" and an array (not an undefined value) is written to the cache file
    $this->toASCII[$charset] = array();
    foreach ($this->parsedCharsets[$charset]['local'] as $utf8) {
        if (isset($this->toASCII['utf-8'][$utf8])) {
            // Reconvert to charset (don't use chr() of numeric value, might be multi-byte)
            $c = $this->utf8_decode($utf8, $charset);
            $this->toASCII[$charset][$c] = $this->toASCII['utf-8'][$utf8];
        }
    }
    if ($cacheFile) {
        GeneralUtility::writeFileToTypo3tempDir($cacheFile, serialize($this->toASCII[$charset]));
    }
    return 3;
}
1432 
1433  /********************************************
1434  *
1435  * String operation functions
1436  *
1437  ********************************************/
/**
 * Returns a part of a string, counted in characters of the given charset.
 *
 * Dispatches to mbstring or iconv when configured in
 * $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'], otherwise to the internal
 * UTF-8 / EUC implementations or to byte arithmetic for fixed-width charsets.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $start Start position (character position)
 * @param int|NULL $len Length (in characters); NULL means "up to the end of the string"
 * @return string|bool The substring (the internal implementations may return FALSE if $start is outside the string)
 */
public function substr($charset, $string, $start, $len = null)
{
    if ($len === 0 || $string === '') {
        return '';
    }
    $utils = isset($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'])
        ? $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils']
        : '';
    if ($utils === 'mbstring') {
        if ($len !== null) {
            return mb_substr($string, $start, $len, $charset);
        }
        // $len cannot be omitted when a charset is passed, so switch the internal encoding temporarily
        $enc = mb_internal_encoding();
        mb_internal_encoding($charset);
        $str = mb_substr($string, $start);
        mb_internal_encoding($enc);
        return $str;
    }
    if ($utils === 'iconv') {
        if ($len !== null) {
            return iconv_substr($string, $start, $len, $charset);
        }
        // $len cannot be omitted when a charset is passed, so switch the internal encoding temporarily
        $enc = iconv_get_encoding('internal_encoding');
        iconv_set_encoding('internal_encoding', $charset);
        $str = iconv_substr($string, $start);
        iconv_set_encoding('internal_encoding', $enc);
        return $str;
    }
    if ($charset === 'utf-8') {
        return $this->utf8_substr($string, $start, $len);
    }
    if (!empty($this->eucBasedSets[$charset])) {
        return $this->euc_substr($string, $start, $charset, $len);
    }
    if (!empty($this->twoByteSets[$charset])) {
        // NULL must not be multiplied: NULL * 2 is 0 and would yield an empty string
        return $len === null ? substr($string, $start * 2) : substr($string, $start * 2, $len * 2);
    }
    if (!empty($this->fourByteSets[$charset])) {
        return $len === null ? substr($string, $start * 4) : substr($string, $start * 4, $len * 4);
    }
    // Treat everything else as single-byte encoding
    return $len === null ? substr($string, $start) : substr($string, $start, $len);
}
1492 
/**
 * Counts the characters of a string in the given charset.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @return int|float Number of characters (byte length divided by the character width for two-/four-byte charsets)
 */
public function strlen($charset, $string)
{
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($backend === 'mbstring') {
        return mb_strlen($string, $charset);
    }
    if ($backend === 'iconv') {
        return iconv_strlen($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strlen($string);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strlen($string, $charset);
    }
    if ($this->twoByteSets[$charset]) {
        return strlen($string) / 2;
    }
    if ($this->fourByteSets[$charset]) {
        return strlen($string) / 4;
    }
    // Anything else is handled as a single-byte encoding
    return strlen($string);
}
1520 
/**
 * Crops a string to a number of characters using the mbstring functions.
 *
 * @param string $charset The character set
 * @param string $string String to crop
 * @param int $len Positive: keep that many characters from the start; negative: keep that many from the end
 * @param string $crop String appended (positive $len) or prepended (negative $len) when the string was cropped
 * @return string The cropped string, or the unchanged string if $len is 0 or the string is not longer than abs($len)
 */
protected function cropMbstring($charset, $string, $len, $crop = '')
{
    if ((int)$len === 0) {
        return $string;
    }
    $characterCount = mb_strlen($string, $charset);
    if ($characterCount <= abs($len)) {
        return $string;
    }
    return $len > 0
        ? mb_substr($string, 0, $len, $charset) . $crop
        : $crop . mb_substr($string, $len, $characterCount, $charset);
}
1543 
/**
 * Crops a string to a number of characters and marks the cut with $crop.
 *
 * @param string $charset The character set
 * @param string $string String to crop
 * @param int $len Positive: keep that many characters from the start; negative: keep that many from the end
 * @param string $crop String appended (positive $len) or prepended (negative $len) when the string was cropped
 * @return string The cropped string, or the unchanged string if nothing had to be cut
 */
public function crop($charset, $string, $len, $crop = '')
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $this->cropMbstring($charset, $string, $len, $crop);
    }
    if ((int)$len === 0) {
        return $string;
    }
    // Translate the character count into a byte position
    if ($charset === 'utf-8') {
        $i = $this->utf8_char2byte_pos($string, $len);
    } elseif ($this->eucBasedSets[$charset]) {
        $i = $this->euc_char2byte_pos($string, $len, $charset);
    } elseif ($len > 0) {
        $i = $len;
    } else {
        $i = strlen($string) + $len;
        if ($i <= 0) {
            $i = false;
        }
    }
    // $len outside actual string length
    if ($i === false) {
        return $string;
    }
    if ($len > 0) {
        return isset($string[$i]) ? substr($string, 0, $i) . $crop : $string;
    }
    return isset($string[$i - 1]) ? $crop . substr($string, $i) : $string;
}
1593 
/**
 * Truncates a string to at most $len bytes without cutting a character in half.
 *
 * @param string $charset The character set
 * @param string $string Character string
 * @param int $len Maximum length in bytes
 * @return string The shortened string; empty if $len is zero or negative
 */
public function strtrunc($charset, $string, $len)
{
    if ($len <= 0) {
        return '';
    }
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return mb_strcut($string, 0, $len, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_strtrunc($string, $len);
    }
    if ($this->eucBasedSets[$charset]) {
        return $this->euc_strtrunc($string, $len, $charset);
    }
    if ($this->twoByteSets[$charset]) {
        // Realign to an even byte position
        $len -= $len % 2;
    } elseif ($this->fourByteSets[$charset]) {
        // Realign to a position divisible by four
        $len -= $len % 4;
    }
    // Single-byte encodings need no realignment
    return substr($string, 0, $len);
}
1626 
/**
 * Converts the case of a string in the given charset.
 *
 * With mbstring enabled, 'toLower' lowercases and every other $case value uppercases.
 * Otherwise $case is used as the name of the case folding table.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param string $case Case keyword, e.g. "toLower" or "toUpper"
 * @return string The converted string
 */
public function conv_case($charset, $string, $case)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        return $case === 'toLower'
            ? mb_strtolower($string, $charset)
            : mb_strtoupper($string, $charset);
    }
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'case', $case);
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'case', $case);
    }
    // Anything else is handled as a single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'case', $case);
}
1659 
/**
 * Converts the case of the first character of a string and leaves the rest untouched.
 *
 * @param string $charset The character set
 * @param string $string Input string
 * @param string $case Case keyword passed on to conv_case()
 * @return string The string with its first character converted
 */
public function convCaseFirst($charset, $string, $case)
{
    $convertedFirst = $this->conv_case($charset, $this->substr($charset, $string, 0, 1), $case);
    return $convertedFirst . $this->substr($charset, $string, 1);
}
1676 
/**
 * Transliterates special characters of a string to their ASCII replacements.
 *
 * @param string $charset The character set of $string
 * @param string $string Input string
 * @return string The string after applying the to-ASCII mapping of the charset
 */
public function specCharsToASCII($charset, $string)
{
    if ($charset === 'utf-8') {
        return $this->utf8_char_mapping($string, 'ascii');
    }
    if (isset($this->eucBasedSets[$charset])) {
        return $this->euc_char_mapping($string, $charset, 'ascii');
    }
    // Anything else is handled as a single-byte encoding
    return $this->sb_char_mapping($string, $charset, 'ascii');
}
1696 
/**
 * Picks the TYPO3 language key that best matches a list of preferred language codes.
 *
 * @param string $languageCodesList Comma-separated language codes, each optionally followed by ";q=<quality>"
 * @return string The matching TYPO3 language key, or "default" if nothing matches or the match is "en"
 */
public function getPreferredClientLanguage($languageCodesList)
{
    $selectedLanguage = 'default';
    // Languages whose TYPO3 key equals the ISO code
    $allLanguageCodes = array();
    foreach (array_keys($this->charSetArray) as $typo3Lang) {
        $allLanguageCodes[$typo3Lang] = $typo3Lang;
    }
    // Languages whose TYPO3 key differs from the ISO code or needs the country part;
    // these overwrite the entries set above
    foreach ($this->locales->getIsoMapping() as $typo3Lang => $isoLang) {
        $allLanguageCodes[$typo3Lang] = str_replace('_', '-', $isoLang);
    }
    // Flip to "ISO code => TYPO3 key", so the lookups below can use isset()
    $allLanguageCodes = array_flip($allLanguageCodes);
    // Collect "language code => quality"
    $sortedPreferredLanguages = array();
    foreach (GeneralUtility::trimExplode(',', $languageCodesList) as $preferredLanguage) {
        $quality = 1.0;
        if (strpos($preferredLanguage, ';q=') !== false) {
            $parts = explode(';q=', $preferredLanguage);
            $preferredLanguage = $parts[0];
            $quality = $parts[1];
        }
        $sortedPreferredLanguages[$preferredLanguage] = $quality;
    }
    // Highest quality first
    arsort($sortedPreferredLanguages, SORT_NUMERIC);
    foreach ($sortedPreferredLanguages as $preferredLanguage => $quality) {
        // Try the full code first, then the code without its country part
        $codeParts = explode('-', $preferredLanguage);
        foreach (array($preferredLanguage, $codeParts[0]) as $candidate) {
            if (isset($allLanguageCodes[$candidate])) {
                $selectedLanguage = $allLanguageCodes[$candidate];
                break 2;
            }
        }
    }
    if (!$selectedLanguage || $selectedLanguage === 'en') {
        $selectedLanguage = 'default';
    }
    return $selectedLanguage;
}
1750 
1751  /********************************************
1752  *
1753  * Internal string operation functions
1754  *
1755  ********************************************/
/**
 * Maps every byte of a single-byte encoded string through a conversion table.
 *
 * @param string $str Input string
 * @param string $charset The character set
 * @param string $mode "case" (case folding) or "ascii" (to-ASCII transliteration)
 * @param string $opt For mode "case": name of the case folding table, e.g. "toLower" or "toUpper"
 * @return string The mapped string; the unchanged input if the table cannot be initialized or $mode is unknown
 */
public function sb_char_mapping($str, $charset, $mode, $opt = '')
{
    if ($mode == 'case') {
        if (!$this->initCaseFolding($charset)) {
            return $str;
        }
        $table = &$this->caseFolding[$charset][$opt];
    } elseif ($mode == 'ascii') {
        if (!$this->initToASCII($charset)) {
            return $str;
        }
        $table = &$this->toASCII[$charset];
    } else {
        return $str;
    }
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = $str[$offset];
        // Bytes without a table entry are copied unchanged
        $result .= isset($table[$byte]) ? $table[$byte] : $byte;
        $offset++;
    }
    return $result;
}
1796 
1797  /********************************************
1798  *
1799  * Internal UTF-8 string operation functions
1800  *
1801  ********************************************/
/**
 * Returns a part of a UTF-8 string, counted in characters.
 *
 * @param string $str UTF-8 string
 * @param int $start Start position (character position)
 * @param int|NULL $len Length in characters; NULL returns everything from $start on
 * @return string|bool The substring, or FALSE if a positive $start lies outside the string
 */
public function utf8_substr($str, $start, $len = null)
{
    if ((string)$len === '0') {
        return '';
    }
    $startByte = $this->utf8_char2byte_pos($str, $start);
    if ($startByte === false && $start > 0) {
        // $start outside string length
        return false;
    }
    $tail = substr($str, $startByte);
    if ($len == null) {
        return $tail;
    }
    $endByte = $this->utf8_char2byte_pos($tail, $len);
    if ($endByte !== false) {
        return substr($tail, 0, $endByte);
    }
    // $len outside actual string length: a negative length that exceeds the
    // string gives an empty string, a positive one gives the whole tail
    return $len < 0 ? '' : $tail;
}
1838 
/**
 * Counts the characters of a UTF-8 string.
 *
 * Every byte that is not a continuation byte (10xxxxxx) is counted as one character.
 *
 * @param string $str UTF-8 string
 * @return int Number of characters
 */
public function utf8_strlen($str)
{
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count, continuation bytes do not
        if ((ord($str[$offset]) & 192) !== 128) {
            $characters++;
        }
        $offset++;
    }
    return $characters;
}
1862 
/**
 * Truncates a UTF-8 string to at most $len bytes without cutting a multi-byte character.
 *
 * @param string $str UTF-8 string
 * @param int $len Maximum length in bytes
 * @return string The shortened string; empty if $len is zero or negative
 */
public function utf8_strtrunc($str, $len)
{
    $len = (int)$len;
    if ($len <= 0) {
        return '';
    }
    if ($len > strlen($str)) {
        // Nothing to cut; also avoids reading an offset behind the end of the string
        return $str;
    }
    $i = $len - 1;
    // Last kept byte is part of a multi-byte sequence
    if (ord($str[$i]) & 128) {
        // Walk back to the byte that starts the sequence. Index 0 must be included,
        // otherwise a complete character at the very beginning would be thrown away.
        for (; $i >= 0 && !(ord($str[$i]) & 64); $i--) {
        }
        if ($i < 0) {
            // Only continuation bytes found: no valid character fits
            return '';
        }
        // Number of bytes of the sequence = number of leading 1-bits of its first byte
        for ($bc = 0, $mbs = ord($str[$i]); $mbs & 128; $mbs = $mbs << 1) {
            $bc++;
        }
        if ($bc + $i > $len) {
            // The sequence does not fit completely, so cut right before it
            return substr($str, 0, $i);
        }
    }
    return substr($str, 0, $len);
}
1892 
/**
 * Finds the character position of the first occurrence of a string in a UTF-8 string.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @param int $offset Character position to start the search at
 * @return int|bool Character position of the match, or FALSE if not found or $offset is outside the string
 */
public function utf8_strpos($haystack, $needle, $offset = 0)
{
    $backend = $GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'];
    if ($backend === 'mbstring') {
        return mb_strpos($haystack, $needle, $offset, 'utf-8');
    }
    if ($backend === 'iconv') {
        return iconv_strpos($haystack, $needle, $offset, 'utf-8');
    }
    $startByte = $this->utf8_char2byte_pos($haystack, $offset);
    if ($startByte === false) {
        // Offset beyond string length
        return false;
    }
    $matchByte = strpos($haystack, $needle, $startByte);
    // FALSE here means the needle was not found
    return $matchByte === false ? false : $this->utf8_byte2char_pos($haystack, $matchByte);
}
1921 
/**
 * Finds the character position of the last occurrence of a string in a UTF-8 string.
 *
 * @param string $haystack UTF-8 string to search in
 * @param string $needle UTF-8 string to search for
 * @return int|bool Character position of the match, or FALSE if not found
 */
public function utf8_strrpos($haystack, $needle)
{
    if ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'mbstring') {
        // The third argument of mb_strrpos() is the offset; the encoding is the fourth
        return mb_strrpos($haystack, $needle, 0, 'utf-8');
    } elseif ($GLOBALS['TYPO3_CONF_VARS']['SYS']['t3lib_cs_utils'] === 'iconv') {
        return iconv_strrpos($haystack, $needle, 'utf-8');
    }
    $byte_pos = strrpos($haystack, $needle);
    if ($byte_pos === false) {
        // Needle not found
        return false;
    }
    return $this->utf8_byte2char_pos($haystack, $byte_pos);
}
1944 
/**
 * Translates a character position into a byte position of a UTF-8 string.
 *
 * A negative $pos is counted in characters from the end of the string.
 *
 * @param string $str UTF-8 string
 * @param int $pos Character position (negative values start from the end)
 * @return int|bool Byte position, or FALSE if the scan runs off either end of the string
 */
public function utf8_char2byte_pos($str, $pos)
{
    $length = strlen($str);
    // Number of characters found
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    // Explicit bounds instead of isset($str[$i]): since PHP 7.1 a negative string
    // offset addresses bytes from the end, so isset() no longer detects underflow
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        // Single-byte (0xxxxxxx) and multi-byte starting bytes (11xxxxxx) count as a character
        if ((ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    if ($i < 0 || $i >= $length) {
        // Offset beyond string length
        return false;
    }
    if ($pos >= 0) {
        // Skip trailing multi-byte data bytes, but never read behind the end of the string
        while ($i < $length && (ord($str[$i]) & 192) === 128) {
            $i++;
        }
    } else {
        // Correct offset
        $i++;
    }
    return $i;
}
1991 
/**
 * Translates a byte position into a character position of a UTF-8 string.
 *
 * Multi-byte characters are counted only once.
 *
 * @param string $str UTF-8 string
 * @param int $pos Byte position
 * @return int|bool Character position, or FALSE if the string is empty or $pos lies outside of it
 */
public function utf8_byte2char_pos($str, $pos)
{
    $pos = (int)$pos;
    $length = strlen($str);
    // Reject positions outside the string up front; checking after the loop is too
    // late, because the loop index has always reached 0 by then
    if ($length === 0 || $pos < 0 || $pos > $length) {
        return false;
    }
    // Number of characters
    $n = 0;
    for ($i = $pos; $i > 0; $i--) {
        // $pos === $length points right behind the last byte and counts as a character
        // boundary; otherwise count single-byte (0xxxxxxx) and starting bytes (11xxxxxx)
        if ($i === $length || (ord($str[$i]) & 192) !== 128) {
            $n++;
        }
    }
    return $n;
}
2020 
/**
 * Maps every character of a UTF-8 string through a conversion table.
 *
 * @param string $str UTF-8 string
 * @param string $mode "case" (case folding) or "ascii" (to-ASCII transliteration)
 * @param string $opt For mode "case": name of the case folding table, e.g. "toLower" or "toUpper"
 * @return string The mapped string; the unchanged input if the Unicode data cannot be initialized or $mode is unknown
 */
public function utf8_char_mapping($str, $mode, $opt = '')
{
    if (!$this->initUnicodeData($mode)) {
        // Do nothing
        return $str;
    }
    $out = '';
    switch ($mode) {
        case 'case':
            $map = &$this->caseFolding['utf-8'][$opt];
            break;
        case 'ascii':
            $map = &$this->toASCII['utf-8'];
            break;
        default:
            return $str;
    }
    for ($i = 0; isset($str[$i]); $i++) {
        $c = ord($str[$i]);
        if (($c & 192) === 192) {
            // Multi-byte starting byte (11xxxxxx): the number of leading 1-bits is the byte count
            for ($bc = 0; $c & 128; $c = $c << 1) {
                $bc++;
            }
            $mbc = substr($str, $i, $bc);
            $i += $bc - 1;
        } else {
            // Single-byte (0xxxxxxx), or a stray continuation byte of malformed input.
            // The latter is passed through as is instead of repeating the previous character.
            $mbc = $str[$i];
        }
        if (isset($map[$mbc])) {
            $out .= $map[$mbc];
        } else {
            $out .= $mbc;
        }
    }
    return $out;
}
2068 
2069  /********************************************
2070  *
2071  * Internal EUC string operation functions
2072  *
2073  * Extended Unix Code:
2074  * ASCII compatible 7bit single bytes chars
2075  * 8bit two byte chars
2076  *
2077  * Shift-JIS is treated as a special case.
2078  *
2079  ********************************************/
/**
 * Truncates an EUC / Shift-JIS string to at most $len bytes without cutting a double-byte character.
 *
 * @param string $str EUC multi-byte string
 * @param int $len Maximum length in bytes
 * @param string $charset The charset; "shift_jis" gets its own lead-byte ranges
 * @return string The shortened string; empty if $len is zero or negative
 */
public function euc_strtrunc($str, $len, $charset)
{
    if ($len <= 0) {
        return '';
    }
    // String not longer than the supplied length. This must be decided on the byte
    // length: the scan below may overshoot the end of the string when the last
    // double-byte character straddles $len, which must not be read as "fits".
    if ($len >= strlen($str)) {
        return $str;
    }
    $shiftJis = $charset === 'shift_jis';
    for ($i = 0; $i < $len; $i++) {
        $c = ord($str[$i]);
        $isLeadByte = $shiftJis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128;
        if ($isLeadByte) {
            // Skip the second byte of the double-byte character
            $i++;
        }
    }
    if ($i > $len) {
        // We ended on a first byte, so drop it
        return substr($str, 0, $len - 1);
    }
    return substr($str, 0, $len);
}
2115 
/**
 * Returns a part of an EUC / Shift-JIS string, counted in characters.
 *
 * @param string $str EUC multi-byte string
 * @param int $start Start position (character position)
 * @param string $charset The charset
 * @param int|NULL $len Length in characters; NULL returns everything from $start on
 * @return string|bool The substring, or FALSE if $start cannot be translated into a byte position
 */
public function euc_substr($str, $start, $charset, $len = null)
{
    // A length of zero is a valid request for an empty string; without this guard the
    // loose "!= null" comparison below would treat it like an omitted length
    if ((string)$len === '0') {
        return '';
    }
    $byte_start = $this->euc_char2byte_pos($str, $start, $charset);
    if ($byte_start === false) {
        // $start outside string length
        return false;
    }
    $str = substr($str, $byte_start);
    if ($len != null) {
        $byte_end = $this->euc_char2byte_pos($str, $len, $charset);
        // $len outside actual string length
        if ($byte_end === false) {
            return $str;
        }
        return substr($str, 0, $byte_end);
    }
    return $str;
}
2145 
/**
 * Counts the characters of an EUC / Shift-JIS string.
 *
 * @param string $str EUC multi-byte string
 * @param string $charset The charset; "shift_jis" gets its own lead-byte ranges
 * @return int Number of characters
 */
public function euc_strlen($str, $charset)
{
    $isShiftJis = $charset === 'shift_jis';
    $characters = 0;
    $offset = 0;
    while (isset($str[$offset])) {
        $byte = ord($str[$offset]);
        $isLeadByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        // A lead byte consumes its second byte as well
        $offset += $isLeadByte ? 2 : 1;
        $characters++;
    }
    return $characters;
}
2173 
/**
 * Translates a character position into a byte position of an EUC / Shift-JIS string.
 *
 * A negative $pos is counted in characters from the end of the string.
 *
 * @param string $str EUC multi-byte string
 * @param int $pos Character position (negative values start from the end)
 * @param string $charset The charset; "shift_jis" gets its own lead-byte ranges
 * @return int|bool Byte position, or FALSE if the scan runs off either end of the string
 */
public function euc_char2byte_pos($str, $pos, $charset)
{
    $sjis = $charset === 'shift_jis';
    $length = strlen($str);
    // Number of characters seen
    $n = 0;
    // Number of characters wanted
    $p = abs($pos);
    if ($pos >= 0) {
        $i = 0;
        $d = 1;
    } else {
        $i = $length - 1;
        $d = -1;
    }
    // Explicit bounds instead of isset($str[$i]): since PHP 7.1 a negative string
    // offset addresses bytes from the end, so isset() no longer detects underflow
    for (; $i >= 0 && $i < $length && $n < $p; $i += $d) {
        $c = ord($str[$i]);
        $isDoubleByte = $sjis ? ($c >= 128 && $c < 160 || $c >= 224) : $c >= 128;
        if ($isDoubleByte) {
            // Step over the other byte of the double-byte character
            $i += $d;
        }
        $n++;
    }
    if ($i < 0 || $i >= $length) {
        // Offset beyond string length
        return false;
    }
    if ($pos < 0) {
        // Correct offset
        $i++;
    }
    return $i;
}
2219 
/**
 * Maps every character of an EUC / Shift-JIS string through a conversion table.
 *
 * @param string $str EUC multi-byte string
 * @param string $charset The charset; "shift_jis" gets its own lead-byte ranges
 * @param string $mode "case" (case folding) or "ascii" (to-ASCII transliteration)
 * @param string $opt For mode "case": name of the case folding table, e.g. "toLower" or "toUpper"
 * @return string The mapped string; the unchanged input if the table cannot be initialized or $mode is unknown
 */
public function euc_char_mapping($str, $charset, $mode, $opt = '')
{
    if ($mode == 'case') {
        if (!$this->initCaseFolding($charset)) {
            return $str;
        }
        $table = &$this->caseFolding[$charset][$opt];
    } elseif ($mode == 'ascii') {
        if (!$this->initToASCII($charset)) {
            return $str;
        }
        $table = &$this->toASCII[$charset];
    } else {
        return $str;
    }
    $isShiftJis = $charset === 'shift_jis';
    $result = '';
    $offset = 0;
    while (isset($str[$offset])) {
        $character = $str[$offset];
        $byte = ord($character);
        $isLeadByte = $isShiftJis
            ? ($byte >= 128 && $byte < 160 || $byte >= 224)
            : $byte >= 128;
        if ($isLeadByte) {
            // A double-byte char: take both bytes as one unit
            $character = substr($str, $offset, 2);
            $offset++;
        }
        // Characters without a table entry are copied unchanged
        $result .= isset($table[$character]) ? $table[$character] : $character;
        $offset++;
    }
    return $result;
}
2275 }