3 * Class for manipulating Unicode data
7 * Copyright (c) 2007 Geoffrey Sneddon
9 * Permission is hereby granted, free of charge, to any person obtaining a copy
10 * of this software and associated documentation files (the "Software"), to deal
11 * in the Software without restriction, including without limitation the rights
12 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 * copies of the Software, and to permit persons to whom the Software is
14 * furnished to do so, subject to the following conditions:
16 * The above copyright notice and this permission notice shall be included in
17 * all copies or substantial portions of the Software.
19 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
29 * @copyright 2007 Geoffrey Sneddon
30 * @author Geoffrey Sneddon
31 * @license http://www.opensource.org/licenses/mit-license.php The MIT License
42 * Contains the raw unicode data that we're working from
44 * @var string UTF-32BE binary string on PHP < 6, otherwise a unicode string
49 * Object should be created with some Unicode::from_*() method, therefore
52 private function __construct()
57 * Prepare the object for serialisation
59 * If we're on PHP6, convert the Unicode::$data to a UTF-32BE binary string
60 * before serialising the object to allow for the object to be unserialised
61 * on older PHP versions without affecting functionality
63 public function __sleep()
65 if (version_compare(phpversion(), '6', '>=') && is_unicode($this->data))
67 $this->data = unicode_encode($this->data, 'UTF-32BE');
73 * Check the object is valid when being unserialised
75 * To prepare the object for use after being unserialised, we need to check
76 * that it is valid, and to convert Unicode::$data to a unicode string on
77 * PHP6. If Unicode::$data is not a string, a warning will be thrown. The
78 * validity of the UTF-32BE Unicode::$data is also checked, and the string
79 * is corrected if it is invalid.
81 public function __wakeup()
83 if (!isset($this->data))
85 trigger_error('Unicode::__wakeup() expects the serialised object to have a $data property, none exists', E_USER_WARNING);
88 elseif (!is_string($this->data))
90 trigger_error('Unicode::__wakeup() expects Unicode::$data to be string, ' . get_type($this->data) . ' given', E_USER_WARNING);
93 elseif (version_compare(phpversion(), '6', '>=') && is_binary($this->data))
95 $this->data = self::call_unicode_func('unicode_decode', $this->data, 'UTF-32BE');
97 elseif (version_compare(phpversion(), '6', '<'))
99 $this->data = Unicode::from_utf32be($this->data)->to_utf32be();
104 * Call a function given by the first parameter in our own unicode setup
106 * @see call_user_func()
107 * @see Unicode::call_unicode_func_array()
108 * @param callback $function
109 * @param mixed $parameter,...
112 private static function call_unicode_func($function)
114 $param_arr = func_get_args();
115 unset($param_arr[0]);
116 return self::call_unicode_func_array($function, $param_arr);
120 * Call a function given by the first parameter with an array of parameters
121 * in our own unicode setup
123 * @see call_user_func_array()
124 * @see Unicode::call_unicode_func()
125 * @param callback $function
126 * @param array $param_arr
129 private static function call_unicode_func_array($function, $param_arr)
131 // Save the current unicode environment settings
132 $substr_char = unicode_get_subst_char();
133 $from_mode = unicode_get_error_mode(FROM_UNICODE);
134 $to_mode = unicode_get_error_mode(TO_UNICODE);
136 // Set our own unicode environment settings
137 unicode_set_subst_char("\uFFFD");
138 unicode_set_error_mode(FROM_UNICODE, U_CONV_ERROR_SUBST);
139 unicode_set_error_mode(TO_UNICODE, U_CONV_ERROR_SUBST);
141 // Actually call the function
142 $return = call_user_func_array($function, $param_arr);
144 // Return everything to its prior state
145 unicode_set_subst_char($substr_char);
146 unicode_set_error_mode(FROM_UNICODE, $from_mode);
147 unicode_set_error_mode(TO_UNICODE, $to_mode);
149 // Finally return what the function returned
154 * Check the given codepoint is a valid character
156 * @param int $codepoint
159 private static function valid_unicode_codepoint($codepoint)
161 // Outside of Unicode codespace
163 || $codepoint > 0x10FFFF
165 || $codepoint >= 0xD800 && $codepoint <= 0xDFFF
167 || ($codepoint & 0xFFFE) === 0xFFFE
168 || $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF)
179 * Create a new Unicode object from an array of codepoints
181 * @param array $array
184 public static function from_codepoint_array($array)
186 // Check given parameter is an array
187 if (!is_array($string))
189 trigger_error('Unicode::from_codepoint_array() expects parameter 1 to be array, ' . get_type($string) . ' given', E_USER_WARNING);
193 // Get U+FFFD as a binary string (which is slightly hard on PHP 6)
194 static $replacement_character;
195 if (!$replacement_character)
197 if (version_compare(phpversion(), '6', '>='))
199 $replacement_character = unicode_encode("\uFFFD", 'UTF-32');
203 $replacement_character = "\x00\x00\xFF\xFD";
208 $unicode = new Unicode;
210 // Strip any leading BOM (as otherwise we change the meaning of the new sequence, which is illegal)
211 if (isset($array[0]) && $array[0] === 0xFFFD)
213 array_splice($array, 1);
216 // Iterate through each and every codepoint
217 foreach ($array as $codepoint)
219 // If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
220 if (!self::valid_unicode_codepoint($codepoint))
222 $unicode->data .= $replacement_character;
224 // Otherwise, append it to Unicode::$data
227 $unicode->data .= pack('N', $codepoint);
231 // If we're on PHP6, convert it to a unicode string and store that
232 if (version_compare(phpversion(), '6', '>='))
234 $unicode->data = unicode_decode($string, 'UTF-32BE');
241 * Create an array of codepoints from the object
245 public function to_codepoint_array()
247 if (version_compare(phpversion(), '6', '>=') && is_unicode($this->data))
249 $data = unicode_encode($this->data, 'UTF-32BE');
255 return array_values(unpack('N*', $data));
259 * Create a new Unicode object from a UTF-8 encoded string
261 * @param string $string
264 public static function from_utf8($string)
266 // Check given parameter is a string
267 if (!is_string($string))
269 trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
274 $unicode = new Unicode;
276 // If we're on PHP6, we'll just get a unicode string and store that
277 if (version_compare(phpversion(), '6', '>='))
279 if (is_unicode($string))
281 $unicode->data = $string;
285 $unicode->data = self::call_unicode_func('unicode_decode', $string, 'UTF-8');
288 // Otherwise, we need to decode the UTF-8 string
291 // Set the data to an empty string, and remaining bytes in the current sequence to zero
295 // Iterate through each and every byte
296 for ($i = 0, $len = strlen($string); $i < $len; $i++)
298 $value = ord($string[$i]);
300 // If we're the first byte of sequence:
303 // One byte sequence:
309 // Two byte sequence:
310 elseif (($value & 0xE0) === 0xC0)
312 $character = ($value & 0x1F) << 6;
316 // Three byte sequence:
317 elseif (($value & 0xF0) === 0xE0)
319 $character = ($value & 0x0F) << 12;
323 // Four byte sequence:
324 elseif (($value & 0xF8) === 0xF0)
326 $character = ($value & 0x07) << 18;
338 // Continuation byte:
341 // Check that the byte is valid, then add it to the character:
342 if (($value & 0xC0) === 0x80)
345 $character |= ($value & 0x3F) << ($remaining * 6);
347 // If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
357 // If we've reached the end of the current byte sequence, append it to Unicode::$data
360 // If the character is illegal replace it with U+FFFD REPLACEMENT CHARACTER
361 if ($length > 1 && $character <= 0x7F
362 || $length > 2 && $character <= 0x7FF
363 || $length > 3 && $character <= 0xFFFF
364 || !self::valid_unicode_codepoint($character))
369 $unicode->data .= pack('N', $character);
373 // Strip any U+FEFF BYTE ORDER MARK (as otherwise we change the meaning of the new sequence, which is illegal)
374 if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
376 $unicode->data = substr($unicode->data, 4);
379 // If we've reached the end of the string but not the end of a character sequence, append a U+FFFD REPLACEMENT CHARACTER
382 $unicode->data .= "\x00\x00\xFF\xFD";
389 * Create a UTF-8 binary string from the object
393 public function to_utf8()
395 if (version_compare(phpversion(), '6', '>=') && is_unicode($this->data))
397 return unicode_encode($this->data, 'UTF-8');
399 elseif (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-8', 'UTF-32BE')))
403 elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-8', $this->data)))
409 $codepoints = unpack('N*', $this->data);
411 foreach ($codepoints as $codepoint)
413 $return .= self::codepoint_to_utf8($codepoint);
420 * Convert a unicode codepoint to a UTF-8 character sequence
422 * @param int $codepoint
425 private static function codepoint_to_utf8($codepoint)
427 // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
430 // If we haven't already got it cached, go cache it
431 if (!isset($cache[$codepoint]))
433 // On PHP6, we can use its own unicode support
434 if (version_compare(phpversion(), '6', '>='))
436 $cache[$codepoint] = unicode_encode(self::call_unicode_func('chr', $codepoint), 'UTF-8');
438 // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
439 elseif (!self::valid_unicode_codepoint($codepoint))
441 $cache[$codepoint] = "\xEF\xBF\xBD";
443 // One byte sequence:
444 elseif ($codepoint <= 0x7F)
446 $cache[$codepoint] = chr($codepoint);
448 // Two byte sequence:
449 elseif ($codepoint <= 0x7FF)
451 $cache[$codepoint] = chr(0xC0 | ($codepoint >> 6)) . chr(0x80 | ($codepoint & 0x3F));
453 // Three byte sequence:
454 elseif ($codepoint <= 0xFFFF)
456 $cache[$codepoint] = chr(0xE0 | ($codepoint >> 12)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
458 // Four byte sequence:
461 $cache[$codepoint] = chr(0xF0 | ($codepoint >> 18)) . chr(0x80 | (($codepoint >> 12) & 0x3F)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
464 return $cache[$codepoint];
468 * Create a new Unicode object from a UTF-16 encoded string
470 * @param string $string
473 public static function from_utf16($string)
475 // Check given parameter is a string
476 if (!is_string($string))
478 trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
483 $unicode = new Unicode;
485 // If we're on PHP6, we'll just get a unicode string and store that
486 if (version_compare(phpversion(), '6', '>='))
488 if (is_unicode($string))
490 $unicode->data = $string;
494 $unicode->data = self::call_unicode_func('unicode_decode', $string, 'UTF-16');
497 // Otherwise, we need to decode the UTF-16 string
500 // Set the data to an empty string and surrogate to false
504 // See if the string is of a valid length (as UTF-16 is in two byte sequences, it must be divisible by two)
505 $valid_length = (($len = strlen($string)) % 2) ? false : true;
507 // If it is of an invalid length, trim all the invalid bytes at the end (we'll replace them with a U+FFFD REPLACEMENT CHARACTER later)
510 $string = substr($string, 0, floor($len / 2) * 2);
513 // If the string starts with a UTF-16LE BOM, it is UTF-16LE, so decode it as such
514 if (substr($string, 0, 2) === "\xFF\xFE")
516 $words = array_values(unpack('v*', $string));
518 // Otherwise, it is UTF-16BE, so decode it as such
521 $words = array_values(unpack('n*', $string));
524 // Iterate through each and every word
525 for ($i = 0, $word_count = count($words); $i < $word_count; $i++)
527 // If we're the first word of sequence:
530 // One word sequence:
531 if (self::valid_unicode_codepoint($words[$i]))
533 $unicode->data .= pack('N', $words[$i]);
535 // Two word sequence:
536 elseif ($words[$i] >= 0xD800 && $words[$i] <= 0xDFFF)
538 $character = ($words[$i] & 0x3FF) << 10;
544 $unicode->data .= pack('N', 0xFFFD);
550 // Surrogates are only ever two words, so we can say we've reached the end with certainty
553 // Check that the word is valid, then add it to the character:
554 if ($words[$i] >= 0xDC00 && $words[$i] <= 0xDFFF)
556 $character |= $words[$i] & 0x3FF;
557 if (self::valid_unicode_codepoint($character))
559 $unicode->data .= pack('N', $character);
563 $unicode->data .= pack('N', 0xFFFD);
566 // If it is invalid, count the sequence as invalid and reprocess the current word as a first word:
569 $unicode->data .= pack('N', 0xFFFD);
575 // If we've reached the end of the string but not the end of a surrogate pair, append a U+FFFD REPLACEMENT CHARACTER
578 $unicode->data .= "\x00\x00\xFF\xFD";
581 // If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
584 $unicode->data .= "\x00\x00\xFF\xFD";
587 // Strip any U+FEFF BYTE ORDER MARK (as otherwise we change the meaning of the new sequence, which is illegal)
588 if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
590 $unicode->data = substr($unicode->data, 4);
597 * Create a new Unicode object from a UTF-16BE encoded string
599 * @param string $string
602 public static function from_utf16be($string)
604 // Check given parameter is a string
605 if (!is_string($string))
607 trigger_error('Unicode::from_utf16be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
611 // Add BOM before calling Unicode::from_utf16()
612 if ((version_compare(phpversion(), '6', '<') || is_binary($string)))
614 // Get U+FEFF as a binary string (which is slightly hard on PHP 6)
618 if (version_compare(phpversion(), '6', '>='))
620 $bom = unicode_encode("\uFEFF", 'UTF-16BE');
627 $string = $bom . $string;
629 return self::from_utf16($string);
633 * Create a new Unicode object from a UTF-16LE encoded string
635 * @param string $string
638 public static function from_utf16le($string)
640 // Check given parameter is a string
641 if (!is_string($string))
643 trigger_error('Unicode::from_utf16le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
647 // Add BOM before calling Unicode::from_utf16()
648 if ((version_compare(phpversion(), '6', '<') || is_binary($string)))
650 // Get U+FEFF as a binary string (which is slightly hard on PHP 6)
654 if (version_compare(phpversion(), '6', '>='))
656 $bom = unicode_encode("\uFEFF", 'UTF-16LE');
663 $string = $bom . $string;
665 return self::from_utf16($string);
669 * Create a UTF-16 binary string from the object
673 public function to_utf16()
675 if (version_compare(phpversion(), '6', '>='))
677 return unicode_encode("\uFEFF", 'UTF-16BE') . $this->to_utf16be();
681 return "\xFE\xFF" . $this->to_utf16be();
686 * Create a UTF-16BE binary string from the object
690 public function to_utf16be()
692 if (version_compare(phpversion(), '6', '>=') && is_unicode($this->data))
694 return unicode_encode($this->data, 'UTF-16BE');
696 elseif (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16BE', 'UTF-32BE')))
700 elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16BE', $this->data)))
706 $codepoints = unpack('N*', $this->data);
708 foreach ($codepoints as $codepoint)
710 $return .= self::codepoint_to_utf16be($codepoint);
717 * Create a UTF-16LE binary string from the object
721 public function to_utf16le()
723 if (version_compare(phpversion(), '6', '>=') && is_unicode($this->data))
725 return unicode_encode($this->data, 'UTF-16LE');
727 elseif (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16LE', 'UTF-32BE')))
731 elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16LE', $this->data)))
737 $codepoints = unpack('N*', $this->data);
739 foreach ($codepoints as $codepoint)
741 $return .= self::codepoint_to_utf16le($codepoint);
748 * Convert a unicode codepoint to a UTF-16 character sequence
750 * @param int $codepoint
753 private static function codepoint_to_utf16($codepoint)
755 return self::codepoint_to_utf16be($codepoint);
759 * Convert a unicode codepoint to a UTF-16BE character sequence
761 * @param int $codepoint
764 private static function codepoint_to_utf16be($codepoint)
766 // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
769 // If we haven't already got it cached, go cache it
770 if (!isset($cache[$codepoint]))
772 // On PHP6, we can use its own unicode support
773 if (version_compare(phpversion(), '6', '>='))
775 $cache[$codepoint] = unicode_encode(self::call_unicode_func('chr', $codepoint), 'UTF-16BE');
777 // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
778 elseif (!self::valid_unicode_codepoint($codepoint))
780 $cache[$codepoint] = "\xFF\xFD";
782 // Without a surrogate:
783 elseif ($codepoint < 0x10000)
785 $cache[$codepoint] = pack('n', $codepoint);
790 $surrogate_code_point = $codepoint - 0x10000;
791 $cache[$codepoint] = pack('n*', ($codepoint >> 10) | 0xD800, ($codepoint & 0x03FF) | 0xDC00);
794 return $cache[$codepoint];
798 * Convert a unicode codepoint to a UTF-16LE character sequence
800 * @param int $codepoint
803 private static function codepoint_to_utf16le($codepoint)
805 // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
808 // If we haven't already got it cached, go cache it
809 if (!isset($cache[$codepoint]))
811 // On PHP6, we can use its own unicode support
812 if (version_compare(phpversion(), '6', '>='))
814 $cache[$codepoint] = unicode_encode(self::call_unicode_func('chr', $codepoint), 'UTF-16LE');
816 // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
817 elseif (!self::valid_unicode_codepoint($codepoint))
819 $cache[$codepoint] = "\xFD\xFF";
821 // Without a surrogate:
822 elseif ($codepoint < 0x10000)
824 $cache[$codepoint] = pack('v', $codepoint);
829 $surrogate_code_point = $codepoint - 0x10000;
830 $cache[$codepoint] = pack('v*', ($codepoint >> 10) | 0xD800, ($codepoint & 0x03FF) | 0xDC00);
833 return $cache[$codepoint];
837 * Create a new Unicode object from a UTF-32 encoded string
839 * @param string $string
842 public static function from_utf32($string)
844 // Check given parameter is a string
845 if (!is_string($string))
847 trigger_error('Unicode::from_utf32() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
852 $unicode = new Unicode;
854 // If we're on PHP6, we'll just get a unicode string and store that
855 if (version_compare(phpversion(), '6', '>='))
857 if (is_unicode($string))
859 $unicode->data = $string;
863 $unicode->data = self::call_unicode_func('unicode_decode', $string, 'UTF-32');
866 // Otherwise, we need to decode the UTF-32 string
869 // Set the data to an empty string
872 // See if the string is of a valid length (as UTF-32 is in