1.1 --- a/unicode.php Wed Jun 08 21:23:13 2011 +0100
1.2 +++ b/unicode.php Wed Jun 08 21:25:21 2011 +0100
1.3 @@ -38,808 +38,808 @@
1.4 */
1.5 class Unicode
1.6 {
1.7 - /**
1.8 - * Contains the raw unicode data that we're working from
1.9 - *
1.10 - * @var string UTF-32BE binary string on PHP < 6, otherwise a unicode string
1.11 - */
1.12 - private $data;
1.13 -
1.14 - /**
1.15 - * Object should be created with some Unicode::from_*() method, therefore
1.16 - * this is private
1.17 - */
1.18 - private function __construct()
1.19 - {
1.20 - }
1.21 -
1.22 - /**
1.23 - * Prepare the object for serialisation
1.24 - */
1.25 - public function __sleep()
1.26 - {
1.27 - return array('data');
1.28 - }
1.29 -
1.30 - /**
1.31 - * Check the object is valid when being unserialised
1.32 - *
1.33 - * To prepare the object for use after being unserialised, we need to check
1.34 - * that it is valid. If Unicode::$data is not a string, a warning will be thrown. The
1.35 - * validity of the UTF-32BE Unicode::$data is also checked, and the string
1.36 - * is corrected if it is invalid.
1.37 - */
1.38 - public function __wakeup()
1.39 - {
1.40 - if (!isset($this->data))
1.41 - {
1.42 - trigger_error('Unicode::__wakeup() expects the serialised object to have a $data property, none exists', E_USER_WARNING);
1.43 - $this->data = '';
1.44 - }
1.45 - elseif (!is_string($this->data))
1.46 - {
1.47 - trigger_error('Unicode::__wakeup() expects Unicode::$data to be string, ' . get_type($this->data) . ' given', E_USER_WARNING);
1.48 - $this->data = '';
1.49 - }
1.50 - else
1.51 - {
1.52 - $this->data = Unicode::from_utf32be($this->data)->to_utf32be();
1.53 - }
1.54 - }
1.55 -
1.56 - /**
1.57 - * Check the given codepoint is a valid character
1.58 - *
1.59 - * @param int $codepoint
1.60 - * @return bool
1.61 - */
1.62 - private static function valid_unicode_codepoint($codepoint)
1.63 - {
1.64 - // Outside of Unicode codespace
1.65 - if ($codepoint < 0
1.66 - || $codepoint > 0x10FFFF
1.67 - // UTF-16 Surrogates
1.68 - || $codepoint >= 0xD800 && $codepoint <= 0xDFFF
1.69 - // Noncharacters
1.70 - || ($codepoint & 0xFFFE) === 0xFFFE
1.71 - || $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF)
1.72 - {
1.73 - return false;
1.74 - }
1.75 - else
1.76 - {
1.77 - return true;
1.78 - }
1.79 - }
1.80 -
1.81 - /**
1.82 - * Create a new Unicode object from an array of codepoints
1.83 - *
1.84 - * @param array $array
1.85 - * @return Unicode
1.86 - */
1.87 - public static function from_codepoint_array($array)
1.88 - {
1.89 - // Check given parameter is an array
1.90 - if (!is_array($string))
1.91 - {
1.92 - trigger_error('Unicode::from_codepoint_array() expects parameter 1 to be array, ' . get_type($string) . ' given', E_USER_WARNING);
1.93 - return false;
1.94 - }
1.95 -
1.96 - // Get U+FFFD as a binary string
1.97 - static $replacement_character = "\x00\x00\xFF\xFD";
1.98 -
1.99 - // Create new object
1.100 - $unicode = new Unicode;
1.101 -
1.102 - // Strip any leading BOM (as otherwise we chage the meaing of the new sequence, which is illegal)
1.103 - if (isset($array[0]) && $array[0] === 0xFFFD)
1.104 - {
1.105 - array_splice($array, 1);
1.106 - }
1.107 -
1.108 - // Iterate through each and every codepoint
1.109 - foreach ($array as $codepoint)
1.110 - {
1.111 - // If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
1.112 - if (!self::valid_unicode_codepoint($codepoint))
1.113 - {
1.114 - $unicode->data .= $replacement_character;
1.115 - }
1.116 - // Otherwise, append it to Unicode::$data
1.117 - else
1.118 - {
1.119 - $unicode->data .= pack('N', $codepoint);
1.120 - }
1.121 - }
1.122 -
1.123 - return $unicode;
1.124 - }
1.125 -
1.126 - /**
1.127 - * Create an array of codepoints from the object
1.128 - *
1.129 - * @return string
1.130 - */
1.131 - public function to_codepoint_array()
1.132 - {
1.133 - $data = $this->data;
1.134 - return array_values(unpack('N*', $data));
1.135 - }
1.136 -
1.137 - /**
1.138 - * Create a new Unicode object from a UTF-8 encoded string
1.139 - *
1.140 - * @param string $string
1.141 - * @return Unicode
1.142 - */
1.143 - public static function from_utf8($string)
1.144 - {
1.145 - // Check given parameter is a string
1.146 - if (!is_string($string))
1.147 - {
1.148 - trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.149 - return false;
1.150 - }
1.151 -
1.152 - // Create new object
1.153 - $unicode = new Unicode;
1.154 -
1.155 - // Set the data to an empty string, and remaining bytes in the current sequence to zero
1.156 - $unicode->data = '';
1.157 - $remaining = 0;
1.158 -
1.159 - // Iterate through each and every byte
1.160 - for ($i = 0, $len = strlen($string); $i < $len; $i++)
1.161 - {
1.162 - $value = ord($string[$i]);
1.163 -
1.164 - // If we're the first byte of sequence:
1.165 - if (!$remaining)
1.166 - {
1.167 - // One byte sequence:
1.168 - if ($value <= 0x7F)
1.169 - {
1.170 - $character = $value;
1.171 - $length = 1;
1.172 - }
1.173 - // Two byte sequence:
1.174 - elseif (($value & 0xE0) === 0xC0)
1.175 - {
1.176 - $character = ($value & 0x1F) << 6;
1.177 - $length = 2;
1.178 - $remaining = 1;
1.179 - }
1.180 - // Three byte sequence:
1.181 - elseif (($value & 0xF0) === 0xE0)
1.182 - {
1.183 - $character = ($value & 0x0F) << 12;
1.184 - $length = 3;
1.185 - $remaining = 2;
1.186 - }
1.187 - // Four byte sequence:
1.188 - elseif (($value & 0xF8) === 0xF0)
1.189 - {
1.190 - $character = ($value & 0x07) << 18;
1.191 - $length = 4;
1.192 - $remaining = 3;
1.193 - }
1.194 - // Invalid byte:
1.195 - else
1.196 - {
1.197 - $character = 0xFFFD;
1.198 - $length = 3;
1.199 - $remaining = 0;
1.200 - }
1.201 - }
1.202 - // Continuation byte:
1.203 - else
1.204 - {
1.205 - // Check that the byte is valid, then add it to the character:
1.206 - if (($value & 0xC0) === 0x80)
1.207 - {
1.208 - $remaining--;
1.209 - $character |= ($value & 0x3F) << ($remaining * 6);
1.210 - }
1.211 - // If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
1.212 - else
1.213 - {
1.214 - $character = 0xFFFD;
1.215 - $length = 3;
1.216 - $remaining = 0;
1.217 - $i--;
1.218 - }
1.219 - }
1.220 -
1.221 - // If we've reached the end of the current byte sequence, append it to Unicode::$data
1.222 - if (!$remaining)
1.223 - {
1.224 - // If the character is illegal replace it with U+FFFD REPLACEMENT CHARACTER
1.225 - if ($length > 1 && $character <= 0x7F
1.226 - || $length > 2 && $character <= 0x7FF
1.227 - || $length > 3 && $character <= 0xFFFF
1.228 - || !self::valid_unicode_codepoint($character))
1.229 - {
1.230 - $character = 0xFFFD;
1.231 - }
1.232 -
1.233 - $unicode->data .= pack('N', $character);
1.234 - }
1.235 - }
1.236 -
1.237 - // Strip any U+FEFF BYTE ORDER MARK (as otherwise we chage the meaing of the new sequence, which is illegal)
1.238 - if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
1.239 - {
1.240 - $unicode->data = substr($unicode->data, 4);
1.241 - }
1.242 -
1.243 - // If we've reached the end of the string but not the end of a character sequence, append a U+FFFD REPLACEMENT CHARACTE
1.244 - if ($remaining > 0)
1.245 - {
1.246 - $unicode->data .= "\x00\x00\xFF\xFD";
1.247 - }
1.248 - return $unicode;
1.249 - }
1.250 -
1.251 - /**
1.252 - * Create a UTF-8 binary string from the object
1.253 - *
1.254 - * @return string
1.255 - */
1.256 - public function to_utf8()
1.257 - {
1.258 - if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-8', 'UTF-32BE')))
1.259 - {
1.260 - return $return;
1.261 - }
1.262 - elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-8', $this->data)))
1.263 - {
1.264 - return $return;
1.265 - }
1.266 - else
1.267 - {
1.268 - $codepoints = unpack('N*', $this->data);
1.269 - $return = '';
1.270 - foreach ($codepoints as $codepoint)
1.271 - {
1.272 - $return .= self::codepoint_to_utf8($codepoint);
1.273 - }
1.274 - return $return;
1.275 - }
1.276 - }
1.277 -
1.278 - /**
1.279 - * Convert a unicode codepoint to a UTF-8 character sequence
1.280 - *
1.281 - * @param int $codepoint
1.282 - * @return string
1.283 - */
1.284 - private static function codepoint_to_utf8($codepoint)
1.285 - {
1.286 - // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
1.287 - static $cache;
1.288 -
1.289 - // If we haven't already got it cached, go cache it
1.290 - if (!isset($cache[$codepoint]))
1.291 - {
1.292 - // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
1.293 - if (!self::valid_unicode_codepoint($codepoint))
1.294 - {
1.295 - $cache[$codepoint] = "\xEF\xBF\xBD";
1.296 - }
1.297 - // One byte sequence:
1.298 - elseif ($codepoint <= 0x7F)
1.299 - {
1.300 - $cache[$codepoint] = chr($codepoint);
1.301 - }
1.302 - // Two byte sequence:
1.303 - elseif ($codepoint <= 0x7FF)
1.304 - {
1.305 - $cache[$codepoint] = chr(0xC0 | ($codepoint >> 6)) . chr(0x80 | ($codepoint & 0x3F));
1.306 - }
1.307 - // Three byte sequence:
1.308 - elseif ($codepoint <= 0xFFFF)
1.309 - {
1.310 - $cache[$codepoint] = chr(0xE0 | ($codepoint >> 12)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
1.311 - }
1.312 - // Four byte sequence:
1.313 - else
1.314 - {
1.315 - $cache[$codepoint] = chr(0xF0 | ($codepoint >> 18)) . chr(0x80 | (($codepoint >> 12) & 0x3F)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
1.316 - }
1.317 - }
1.318 - return $cache[$codepoint];
1.319 - }
1.320 -
1.321 - /**
1.322 - * Create a new Unicode object from a UTF-16 encoded string
1.323 - *
1.324 - * @param string $string
1.325 - * @return Unicode
1.326 - */
1.327 - public static function from_utf16($string)
1.328 - {
1.329 - // Check given parameter is a string
1.330 - if (!is_string($string))
1.331 - {
1.332 - trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.333 - return false;
1.334 - }
1.335 -
1.336 - // Create new object
1.337 - $unicode = new Unicode;
1.338 -
1.339 - // Set the data to an empty string and surrogate to false
1.340 - $unicode->data = '';
1.341 - $surrogate = false;
1.342 -
1.343 - // See if the string is of a valid length (as UTF-16 is in two byte sequences, it must be divisible by two)
1.344 - $valid_length = (($len = strlen($string)) % 2) ? false : true;
1.345 -
1.346 - // If it is of an invalid length, trim all the invalid bytes at the end (we'll replace them with a U+FFFD REPLACEMENT CHARACTER later)
1.347 - if (!$valid_length)
1.348 - {
1.349 - $string = substr($string, 0, floor($len / 2) * 2);
1.350 - }
1.351 -
1.352 - // If the string starts with a UTF-16LE BOM, it is UTF-16LE, so decode it as such
1.353 - if (substr($string, 0, 2) === "\xFF\xFE")
1.354 - {
1.355 - $words = array_values(unpack('v*', $string));
1.356 - }
1.357 - // Otherwise, it is UTF-16BE, so decode it as such
1.358 - else
1.359 - {
1.360 - $words = array_values(unpack('n*', $string));
1.361 - }
1.362 -
1.363 - // Iterate through each and every word
1.364 - for ($i = 0, $word_count = count($words); $i < $word_count; $i++)
1.365 - {
1.366 - // If we're the first word of sequence:
1.367 - if (!$surrogate)
1.368 - {
1.369 - // One word sequence:
1.370 - if (self::valid_unicode_codepoint($words[$i]))
1.371 - {
1.372 - $unicode->data .= pack('N', $words[$i]);
1.373 - }
1.374 - // Two word sequence:
1.375 - elseif ($words[$i] >= 0xD800 && $words[$i] <= 0xDFFF)
1.376 - {
1.377 - $character = ($words[$i] & 0x3FF) << 10;
1.378 - $surrogate = true;
1.379 - }
1.380 - // Invalid word:
1.381 - else
1.382 - {
1.383 - $unicode->data .= pack('N', 0xFFFD);
1.384 - }
1.385 - }
1.386 - // Second word:
1.387 - else
1.388 - {
1.389 - // Surrogates are only ever two words, so we can say we've reached the end with certainty
1.390 - $surrogate = false;
1.391 -
1.392 - // Check that the word is valid, then add it to the character:
1.393 - if ($words[$i] >= 0xDC00 && $words[$i] <= 0xDFFF)
1.394 - {
1.395 - $character |= $words[$i] & 0x3FF;
1.396 - if (self::valid_unicode_codepoint($character))
1.397 - {
1.398 - $unicode->data .= pack('N', $character);
1.399 - }
1.400 - else
1.401 - {
1.402 - $unicode->data .= pack('N', 0xFFFD);
1.403 - }
1.404 - }
1.405 - // If it is invalid, count the sequence as invalid and reprocess the current word as a first word:
1.406 - else
1.407 - {
1.408 - $unicode->data .= pack('N', 0xFFFD);
1.409 - $i--;
1.410 - }
1.411 - }
1.412 - }
1.413 -
1.414 - // If we've reached the end of the string but not the end of a surrogate pair, append a U+FFFD REPLACEMENT CHARACTER
1.415 - if ($surrogate)
1.416 - {
1.417 - $unicode->data .= "\x00\x00\xFF\xFD";
1.418 - }
1.419 -
1.420 - // If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
1.421 - if (!$valid_length)
1.422 - {
1.423 - $unicode->data .= "\x00\x00\xFF\xFD";
1.424 - }
1.425 -
1.426 - // Strip any U+FEFF BYTE ORDER MARK (as otherwise we chage the meaing of the new sequence, which is illegal)
1.427 - if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
1.428 - {
1.429 - $unicode->data = substr($unicode->data, 4);
1.430 - }
1.431 - return $unicode;
1.432 - }
1.433 -
1.434 - /**
1.435 - * Create a new Unicode object from a UTF-16BE encoded string
1.436 - *
1.437 - * @param string $string
1.438 - * @return Unicode
1.439 - */
1.440 - public static function from_utf16be($string)
1.441 - {
1.442 - // Check given parameter is a string
1.443 - if (!is_string($string))
1.444 - {
1.445 - trigger_error('Unicode::from_utf16be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.446 - return false;
1.447 - }
1.448 -
1.449 - // Add BOM before calling Unicode::from_utf16()
1.450 - return self::from_utf16("\xFE\xFF" . $string);
1.451 - }
1.452 -
1.453 - /**
1.454 - * Create a new Unicode object from a UTF-16LE encoded string
1.455 - *
1.456 - * @param string $string
1.457 - * @return Unicode
1.458 - */
1.459 - public static function from_utf16le($string)
1.460 - {
1.461 - // Check given parameter is a string
1.462 - if (!is_string($string))
1.463 - {
1.464 - trigger_error('Unicode::from_utf16le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.465 - return false;
1.466 - }
1.467 -
1.468 - // Add BOM before calling Unicode::from_utf16()
1.469 - return self::from_utf16("\xFF\xFE" . $string);
1.470 - }
1.471 -
1.472 - /**
1.473 - * Create a UTF-16 binary string from the object
1.474 - *
1.475 - * @return string
1.476 - */
1.477 - public function to_utf16()
1.478 - {
1.479 - return "\xFE\xFF" . $this->to_utf16be();
1.480 - }
1.481 -
1.482 - /**
1.483 - * Create a UTF-16BE binary string from the object
1.484 - *
1.485 - * @return string
1.486 - */
1.487 - public function to_utf16be()
1.488 - {
1.489 - if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16BE', 'UTF-32BE')))
1.490 - {
1.491 - return $return;
1.492 - }
1.493 - elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16BE', $this->data)))
1.494 - {
1.495 - return $return;
1.496 - }
1.497 - else
1.498 - {
1.499 - $codepoints = unpack('N*', $this->data);
1.500 - $return = '';
1.501 - foreach ($codepoints as $codepoint)
1.502 - {
1.503 - $return .= self::codepoint_to_utf16be($codepoint);
1.504 - }
1.505 - return $return;
1.506 - }
1.507 - }
1.508 -
1.509 - /**
1.510 - * Create a UTF-16LE binary string from the object
1.511 - *
1.512 - * @return string
1.513 - */
1.514 - public function to_utf16le()
1.515 - {
1.516 - if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16LE', 'UTF-32BE')))
1.517 - {
1.518 - return $return;
1.519 - }
1.520 - elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16LE', $this->data)))
1.521 - {
1.522 - return $return;
1.523 - }
1.524 - else
1.525 - {
1.526 - $codepoints = unpack('N*', $this->data);
1.527 - $return = '';
1.528 - foreach ($codepoints as $codepoint)
1.529 - {
1.530 - $return .= self::codepoint_to_utf16le($codepoint);
1.531 - }
1.532 - return $return;
1.533 - }
1.534 - }
1.535 -
1.536 - /**
1.537 - * Convert a unicode codepoint to a UTF-16 character sequence
1.538 - *
1.539 - * @param int $codepoint
1.540 - * @return string
1.541 - */
1.542 - private static function codepoint_to_utf16($codepoint)
1.543 - {
1.544 - return self::codepoint_to_utf16be($codepoint);
1.545 - }
1.546 -
1.547 - /**
1.548 - * Convert a unicode codepoint to a UTF-16BE character sequence
1.549 - *
1.550 - * @param int $codepoint
1.551 - * @return string
1.552 - */
1.553 - private static function codepoint_to_utf16be($codepoint)
1.554 - {
1.555 - // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
1.556 - static $cache;
1.557 -
1.558 - // If we haven't already got it cached, go cache it
1.559 - if (!isset($cache[$codepoint]))
1.560 - {
1.561 - // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
1.562 - if (!self::valid_unicode_codepoint($codepoint))
1.563 - {
1.564 - $cache[$codepoint] = "\xFF\xFD";
1.565 - }
1.566 - // Without a surrogate:
1.567 - elseif ($codepoint < 0x10000)
1.568 - {
1.569 - $cache[$codepoint] = pack('n', $codepoint);
1.570 - }
1.571 - // With a surrogate
1.572 - else
1.573 - {
1.574 - $surrogate_code_point = $codepoint - 0x10000;
1.575 - $cache[$codepoint] = pack('n*', ($codepoint >> 10) | 0xD800, ($codepoint & 0x03FF) | 0xDC00);
1.576 - }
1.577 - }
1.578 - return $cache[$codepoint];
1.579 - }
1.580 -
1.581 - /**
1.582 - * Convert a unicode codepoint to a UTF-16LE character sequence
1.583 - *
1.584 - * @param int $codepoint
1.585 - * @return string
1.586 - */
1.587 - private static function codepoint_to_utf16le($codepoint)
1.588 - {
1.589 - // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
1.590 - static $cache;
1.591 -
1.592 - // If we haven't already got it cached, go cache it
1.593 - if (!isset($cache[$codepoint]))
1.594 - {
1.595 - // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
1.596 - if (!self::valid_unicode_codepoint($codepoint))
1.597 - {
1.598 - $cache[$codepoint] = "\xFD\xFF";
1.599 - }
1.600 - // Without a surrogate:
1.601 - elseif ($codepoint < 0x10000)
1.602 - {
1.603 - $cache[$codepoint] = pack('v', $codepoint);
1.604 - }
1.605 - // With a surrogate
1.606 - else
1.607 - {
1.608 - $surrogate_code_point = $codepoint - 0x10000;
1.609 - $cache[$codepoint] = pack('v*', ($codepoint >> 10) | 0xD800, ($codepoint & 0x03FF) | 0xDC00);
1.610 - }
1.611 - }
1.612 - return $cache[$codepoint];
1.613 - }
1.614 -
1.615 - /**
1.616 - * Create a new Unicode object from a UTF-32 encoded string
1.617 - *
1.618 - * @param string $string
1.619 - * @return Unicode
1.620 - */
1.621 - public static function from_utf32($string)
1.622 - {
1.623 - // Check given parameter is a string
1.624 - if (!is_string($string))
1.625 - {
1.626 - trigger_error('Unicode::from_utf32() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.627 - return false;
1.628 - }
1.629 -
1.630 - // Create new object
1.631 - $unicode = new Unicode;
1.632 -
1.633 - // Set the data to an empty string
1.634 - $unicode->data = '';
1.635 -
1.636 - // See if the string is of a valid length (as UTF-32 is in four byte sequences, it must be divisible by four)
1.637 - $valid_length = (($len = strlen($string)) % 4) ? false : true;
1.638 -
1.639 - // If it is of an invalid length, trim all the invalid bytes at the end (we'll replace them with a U+FFFD REPLACEMENT CHARACTER later)
1.640 - if (!$valid_length)
1.641 - {
1.642 - $string = substr($string, 0, floor($len / 4) * 4);
1.643 - }
1.644 -
1.645 - // If the string starts with a UTF-32LE BOM, it is UTF-32LE, so decode it as such
1.646 - if (substr($string, 0, 4) === "\xFF\xFE\x00\x00")
1.647 - {
1.648 - $codepoints = unpack('V*', $string);
1.649 - }
1.650 - // Otherwise, it is UTF-32BE, so decode it as such
1.651 - else
1.652 - {
1.653 - $codepoints = unpack('N*', $string);
1.654 - }
1.655 -
1.656 - // Iterate through each and every codepoint
1.657 - foreach ($codepoints as $codepoint)
1.658 - {
1.659 - // If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
1.660 - if (!self::valid_unicode_codepoint($codepoint))
1.661 - {
1.662 - $unicode->data .= "\x00\x00\xFF\xFD";
1.663 - }
1.664 - // Otherwise, append it to Unicode::$data
1.665 - else
1.666 - {
1.667 - $unicode->data .= pack('N', $codepoint);
1.668 - }
1.669 - }
1.670 -
1.671 - // If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
1.672 - if (!$valid_length)
1.673 - {
1.674 - $unicode->data .= "\x00\x00\xFF\xFD";
1.675 - }
1.676 -
1.677 - // Strip any leading BOM (as otherwise we chage the meaing of the new sequence, which is illegal)
1.678 - if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
1.679 - {
1.680 - $unicode->data = substr($unicode->data, 4);
1.681 - }
1.682 -
1.683 - return $unicode;
1.684 - }
1.685 -
1.686 - /**
1.687 - * Create a new Unicode object from a UTF-32BE encoded string
1.688 - *
1.689 - * @param string $string
1.690 - * @return Unicode
1.691 - */
1.692 - public static function from_utf32be($string)
1.693 - {
1.694 - // Check given parameter is a string
1.695 - if (!is_string($string))
1.696 - {
1.697 - trigger_error('Unicode::from_utf32be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.698 - return false;
1.699 - }
1.700 -
1.701 - // Add BOM before calling Unicode::from_utf32()
1.702 - return self::from_utf32("\x00\x00\xFE\xFF" . $string);
1.703 - }
1.704 -
1.705 - /**
1.706 - * Create a new Unicode object from a UTF-32LE encoded string
1.707 - *
1.708 - * @param string $string
1.709 - * @return Unicode
1.710 - */
1.711 - public static function from_utf32le($string)
1.712 - {
1.713 - // Check given parameter is a string
1.714 - if (!is_string($string))
1.715 - {
1.716 - trigger_error('Unicode::from_utf32le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.717 - return false;
1.718 - }
1.719 -
1.720 - // Add BOM before calling Unicode::from_utf32()
1.721 - return self::from_utf32("\xFF\xFE\x00\x00" . $string);
1.722 - }
1.723 -
1.724 - /**
1.725 - * Create a UTF-32 binary string from the object
1.726 - *
1.727 - * @return string
1.728 - */
1.729 - public function to_utf32()
1.730 - {
1.731 - return "\x00\x00\xFE\xFF" . $this->to_utf32be();
1.732 - }
1.733 -
1.734 - /**
1.735 - * Create a UTF-32BE binary string from the object
1.736 - *
1.737 - * @return string
1.738 - */
1.739 - public function to_utf32be()
1.740 - {
1.741 - return $this->data;
1.742 - }
1.743 -
1.744 - /**
1.745 - * Create a UTF-32LE binary string from the object
1.746 - *
1.747 - * @return string
1.748 - */
1.749 - public function to_utf32le()
1.750 - {
1.751 - if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-32LE', 'UTF-32BE')))
1.752 - {
1.753 - return $return;
1.754 - }
1.755 - elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-32LE', $this->data)))
1.756 - {
1.757 - return $return;
1.758 - }
1.759 - else
1.760 - {
1.761 - return call_user_func_array('pack', array_merge(array('V*'), unpack('N*', $this->data)));
1.762 - }
1.763 - }
1.764 -
1.765 - /**
1.766 - * Convert a unicode codepoint to a UTF-32 character sequence
1.767 - *
1.768 - * @param int $codepoint
1.769 - * @return string
1.770 - */
1.771 - private static function codepoint_to_utf32($codepoint)
1.772 - {
1.773 - return self::codepoint_to_utf32be($codepoint);
1.774 - }
1.775 -
1.776 - /**
1.777 - * Convert a unicode codepoint to a UTF-32BE character sequence
1.778 - *
1.779 - * @param int $codepoint
1.780 - * @return string
1.781 - */
1.782 - private static function codepoint_to_utf32be($codepoint)
1.783 - {
1.784 - if (self::valid_unicode_codepoint($codepoint))
1.785 - {
1.786 - return pack('N', $codepoint);
1.787 - }
1.788 - else
1.789 - {
1.790 - return "\x00\x00\xFF\xFD";
1.791 - }
1.792 - }
1.793 -
1.794 - /**
1.795 - * Convert a unicode codepoint to a UTF-32LE character sequence
1.796 - *
1.797 - * @param int $codepoint
1.798 - * @return string
1.799 - */
1.800 - private static function codepoint_to_utf32le($codepoint)
1.801 - {
1.802 - if (self::valid_unicode_codepoint($codepoint))
1.803 - {
1.804 - return pack('V', $codepoint);
1.805 - }
1.806 - else
1.807 - {
1.808 - return "\xFD\xFF\x00\x00";
1.809 - }
1.810 - }
1.811 + /**
1.812 + * Contains the raw unicode data that we're working from
1.813 + *
1.814 + * @var string UTF-32BE binary string on PHP < 6, otherwise a unicode string
1.815 + */
1.816 + private $data;
1.817 +
1.818 + /**
1.819 + * Private so that instances can only be built through the static
1.820 + * Unicode::from_*() factory methods; does no initialisation itself
1.821 + */
1.822 + private function __construct()
1.823 + {
1.824 + }
1.825 +
1.826 + /**
1.827 + * Prepare the object for serialisation: only the $data property is kept
1.828 + */
1.829 + public function __sleep()
1.830 + {
1.831 + return array('data');
1.832 + }
1.833 +
1.834 + /**
1.835 + * Validate and repair the object after it has been unserialised
1.836 + *
1.837 + * If Unicode::$data is missing or is not a string, an E_USER_WARNING is
1.838 + * raised and the data is reset to an empty string. Otherwise the data is
1.839 + * re-decoded through Unicode::from_utf32be(), so that any invalid UTF-32BE
1.840 + * content is corrected before the object is used.
1.841 + */
1.842 + public function __wakeup()
1.843 + {
1.844 + if (!isset($this->data))
1.845 + {
1.846 + trigger_error('Unicode::__wakeup() expects the serialised object to have a $data property, none exists', E_USER_WARNING);
1.847 + $this->data = '';
1.848 + }
1.849 + elseif (!is_string($this->data))
1.850 + {
1.851 + trigger_error('Unicode::__wakeup() expects Unicode::$data to be string, ' . gettype($this->data) . ' given', E_USER_WARNING);
1.852 + $this->data = '';
1.853 + }
1.854 + else
1.855 + {
1.856 + $this->data = Unicode::from_utf32be($this->data)->to_utf32be();
1.857 + }
1.858 + }
1.859 +
1.860 + /**
1.861 + * Tell whether a codepoint may be stored as a character
1.862 + *
1.863 + * @param int $codepoint
1.864 + * @return bool False for out-of-range values, surrogates and noncharacters
1.865 + */
1.866 + private static function valid_unicode_codepoint($codepoint)
1.867 + {
1.868 + // Outside of the Unicode codespace (U+0000..U+10FFFF)
1.869 + if ($codepoint < 0 || $codepoint > 0x10FFFF)
1.870 + {
1.871 + return false;
1.872 + }
1.873 + // UTF-16 surrogates
1.874 + if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)
1.875 + {
1.876 + return false;
1.877 + }
1.878 + // Noncharacters: codepoints whose low 16 bits are FFFE or FFFF ...
1.879 + $ends_plane = ($codepoint & 0xFFFE) === 0xFFFE;
1.880 + // ... and the U+FDD0..U+FDEF range
1.881 + $in_fdd0_range = $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF;
1.882 + return !($ends_plane || $in_fdd0_range);
1.883 + }
1.884 +
1.885 + /**
1.886 + * Create a new Unicode object from an array of codepoints
1.887 + *
1.888 + * @param array $array Integer codepoints, in order
1.889 + * @return Unicode|false False (after an E_USER_WARNING) if $array is not an array
1.890 + */
1.891 + public static function from_codepoint_array($array)
1.892 + {
1.893 + // Check given parameter is an array
1.894 + if (!is_array($array))
1.895 + {
1.896 + trigger_error('Unicode::from_codepoint_array() expects parameter 1 to be array, ' . gettype($array) . ' given', E_USER_WARNING);
1.897 + return false;
1.898 + }
1.899 +
1.900 + // Get U+FFFD as a binary string
1.901 + static $replacement_character = "\x00\x00\xFF\xFD";
1.902 +
1.903 + // Create new object, starting from an empty string so an empty array yields '' rather than null
1.904 + $unicode = new Unicode;
1.905 + $unicode->data = '';
1.906 + // Strip a leading U+FEFF BYTE ORDER MARK (as otherwise we change the meaning of the new sequence, which is illegal)
1.907 + if (reset($array) === 0xFEFF)
1.908 + {
1.909 + array_shift($array);
1.910 + }
1.911 +
1.912 + // Iterate through each and every codepoint
1.913 + foreach ($array as $codepoint)
1.914 + {
1.915 + // If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
1.916 + if (!self::valid_unicode_codepoint($codepoint))
1.917 + {
1.918 + $unicode->data .= $replacement_character;
1.919 + }
1.920 + // Otherwise, append it to Unicode::$data as a big-endian 32-bit word
1.921 + else
1.922 + {
1.923 + $unicode->data .= pack('N', $codepoint);
1.924 + }
1.925 + }
1.926 +
1.927 + return $unicode;
1.928 + }
1.929 +
1.930 + /**
1.931 + * Decode the internal UTF-32BE data into a zero-indexed list of codepoints
1.932 + *
1.933 + * @return array Integer codepoints, in order
1.934 + */
1.935 + public function to_codepoint_array()
1.936 + {
1.937 + $codepoints = unpack('N*', $this->data);
1.938 + return array_values($codepoints);
1.939 + }
1.940 +
1.941 + /**
1.942 + * Create a new Unicode object from a UTF-8 encoded string
1.943 + *
1.944 + * @param string $string UTF-8 bytes; ill-formed sequences are replaced with U+FFFD
1.945 + * @return Unicode|false False (after an E_USER_WARNING) if $string is not a string
1.946 + */
1.947 + public static function from_utf8($string)
1.948 + {
1.949 + // Check given parameter is a string
1.950 + if (!is_string($string))
1.951 + {
1.952 + trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . gettype($string) . ' given', E_USER_WARNING);
1.953 + return false;
1.954 + }
1.955 +
1.956 + // Create new object
1.957 + $unicode = new Unicode;
1.958 +
1.959 + // Set the data to an empty string, and remaining bytes in the current sequence to zero
1.960 + $unicode->data = '';
1.961 + $remaining = 0;
1.962 +
1.963 + // Iterate through each and every byte
1.964 + for ($i = 0, $len = strlen($string); $i < $len; $i++)
1.965 + {
1.966 + $value = ord($string[$i]);
1.967 +
1.968 + // If we're the first byte of sequence:
1.969 + if (!$remaining)
1.970 + {
1.971 + // One byte sequence:
1.972 + if ($value <= 0x7F)
1.973 + {
1.974 + $character = $value;
1.975 + $length = 1;
1.976 + }
1.977 + // Two byte sequence:
1.978 + elseif (($value & 0xE0) === 0xC0)
1.979 + {
1.980 + $character = ($value & 0x1F) << 6;
1.981 + $length = 2;
1.982 + $remaining = 1;
1.983 + }
1.984 + // Three byte sequence:
1.985 + elseif (($value & 0xF0) === 0xE0)
1.986 + {
1.987 + $character = ($value & 0x0F) << 12;
1.988 + $length = 3;
1.989 + $remaining = 2;
1.990 + }
1.991 + // Four byte sequence:
1.992 + elseif (($value & 0xF8) === 0xF0)
1.993 + {
1.994 + $character = ($value & 0x07) << 18;
1.995 + $length = 4;
1.996 + $remaining = 3;
1.997 + }
1.998 + // Invalid byte: emit U+FFFD; length 3 keeps it clear of the overlong checks below
1.999 + else
1.1000 + {
1.1001 + $character = 0xFFFD;
1.1002 + $length = 3;
1.1003 + $remaining = 0;
1.1004 + }
1.1005 + }
1.1006 + // Continuation byte:
1.1007 + else
1.1008 + {
1.1009 + // Check that the byte is valid, then add its six payload bits to the character:
1.1010 + if (($value & 0xC0) === 0x80)
1.1011 + {
1.1012 + $remaining--;
1.1013 + $character |= ($value & 0x3F) << ($remaining * 6);
1.1014 + }
1.1015 + // If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
1.1016 + else
1.1017 + {
1.1018 + $character = 0xFFFD;
1.1019 + $length = 3;
1.1020 + $remaining = 0;
1.1021 + $i--;
1.1022 + }
1.1023 + }
1.1024 +
1.1025 + // If we've reached the end of the current byte sequence, append it to Unicode::$data
1.1026 + if (!$remaining)
1.1027 + {
1.1028 + // If the character is overlong or otherwise illegal, replace it with U+FFFD REPLACEMENT CHARACTER
1.1029 + if ($length > 1 && $character <= 0x7F
1.1030 + || $length > 2 && $character <= 0x7FF
1.1031 + || $length > 3 && $character <= 0xFFFF
1.1032 + || !self::valid_unicode_codepoint($character))
1.1033 + {
1.1034 + $character = 0xFFFD;
1.1035 + }
1.1036 +
1.1037 + $unicode->data .= pack('N', $character);
1.1038 + }
1.1039 + }
1.1040 +
1.1041 + // Strip any leading U+FEFF BYTE ORDER MARK (as otherwise we change the meaning of the new sequence, which is illegal)
1.1042 + if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
1.1043 + {
1.1044 + $unicode->data = substr($unicode->data, 4);
1.1045 + }
1.1046 +
1.1047 + // If we've reached the end of the string but not the end of a character sequence, append a U+FFFD REPLACEMENT CHARACTER
1.1048 + if ($remaining > 0)
1.1049 + {
1.1050 + $unicode->data .= "\x00\x00\xFF\xFD";
1.1051 + }
1.1052 + return $unicode;
1.1053 + }
1.1054 +
1.1055 + /**
1.1056 + * Encode the object's data as a UTF-8 binary string
1.1057 + *
1.1058 + * @return string
1.1059 + */
1.1060 + public function to_utf8()
1.1061 + {
1.1062 + // Try mbstring first; a falsy result falls through to the next strategy
1.1063 + if (extension_loaded('mbstring') && ($utf8 = @mb_convert_encoding($this->data, 'UTF-8', 'UTF-32BE')))
1.1064 + {
1.1065 + return $utf8;
1.1066 + }
1.1067 + // Then iconv, with the same fall-through on a falsy result
1.1068 + if (extension_loaded('iconv') && ($utf8 = @iconv('UTF-32BE', 'UTF-8', $this->data)))
1.1069 + {
1.1070 + return $utf8;
1.1071 + }
1.1072 +
1.1073 + // Last resort: encode each codepoint in pure PHP
1.1074 + $utf8 = '';
1.1075 + foreach (unpack('N*', $this->data) as $codepoint)
1.1076 + {
1.1077 + $utf8 .= self::codepoint_to_utf8($codepoint);
1.1078 + }
1.1079 + return $utf8;
1.1080 + }
1.1081 +
1.1082 + /**
1.1083 + * Encode a single codepoint as a UTF-8 byte sequence, memoising the result
1.1084 + *
1.1085 + * @param int $codepoint
1.1086 + * @return string
1.1087 + */
1.1088 + private static function codepoint_to_utf8($codepoint)
1.1089 + {
1.1090 + // Results are remembered per codepoint across calls, so repeats are a single array lookup
1.1091 + static $cache;
1.1092 + if (isset($cache[$codepoint]))
1.1093 + {
1.1094 + return $cache[$codepoint];
1.1095 + }
1.1096 +
1.1097 + // Anything valid_unicode_codepoint() rejects is encoded as U+FFFD REPLACEMENT CHARACTER
1.1098 + if (!self::valid_unicode_codepoint($codepoint))
1.1099 + {
1.1100 + $bytes = "\xEF\xBF\xBD";
1.1101 + }
1.1102 + // Up to U+007F: the codepoint is the byte
1.1103 + elseif ($codepoint <= 0x7F)
1.1104 + {
1.1105 + $bytes = chr($codepoint);
1.1106 + }
1.1107 + // Up to U+07FF: lead byte 110xxxxx plus one continuation byte
1.1108 + elseif ($codepoint <= 0x7FF)
1.1109 + {
1.1110 + $bytes = pack('C*', 0xC0 | ($codepoint >> 6), 0x80 | ($codepoint & 0x3F));
1.1111 + }
1.1112 + // Up to U+FFFF: lead byte 1110xxxx plus two continuation bytes
1.1113 + elseif ($codepoint <= 0xFFFF)
1.1114 + {
1.1115 + $bytes = pack('C*', 0xE0 | ($codepoint >> 12), 0x80 | (($codepoint >> 6) & 0x3F), 0x80 | ($codepoint & 0x3F));
1.1116 + }
1.1117 + // Beyond U+FFFF: lead byte 11110xxx plus three continuation bytes
1.1118 + else
1.1119 + {
1.1120 + $bytes = pack('C*', 0xF0 | ($codepoint >> 18), 0x80 | (($codepoint >> 12) & 0x3F), 0x80 | (($codepoint >> 6) & 0x3F), 0x80 | ($codepoint & 0x3F));
1.1121 + }
1.1122 + return $cache[$codepoint] = $bytes;
1.1123 + }
1.1124 +
1.1125 + /**
1.1126 + * Create a new Unicode object from a UTF-16 encoded string, read as little-endian
1.1127 + * when it starts with the bytes FF FE and as big-endian otherwise
1.1128 + * @param string $string
1.1129 + * @return Unicode|false false, after an E_USER_WARNING, if $string is not a string
1.1130 + */
1.1131 + public static function from_utf16($string)
1.1132 + {
1.1133 + // Check given parameter is a string
1.1134 + if (!is_string($string))
1.1135 + {
1.1136 + trigger_error('Unicode::from_utf16() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.1137 + return false;
1.1138 + }
1.1139 +
1.1140 + // Create new object
1.1141 + $unicode = new Unicode;
1.1142 +
1.1143 + // Start with no data and no surrogate pair in progress
1.1144 + $unicode->data = '';
1.1145 + $surrogate = false;
1.1146 +
1.1147 + // See if the string is of a valid length (as UTF-16 is in two byte sequences, it must be divisible by two)
1.1148 + $valid_length = (($len = strlen($string)) % 2) === 0;
1.1149 +
1.1150 + // If it is of an invalid length, drop the single stray byte at the end (we'll replace it with a U+FFFD REPLACEMENT CHARACTER later)
1.1151 + if (!$valid_length)
1.1152 + {
1.1153 + $string = substr($string, 0, $len - 1);
1.1154 + }
1.1155 +
1.1156 + // If the string starts with a UTF-16LE BOM, it is UTF-16LE, so decode it as such
1.1157 + if (substr($string, 0, 2) === "\xFF\xFE")
1.1158 + {
1.1159 + $words = array_values(unpack('v*', $string));
1.1160 + }
1.1161 + // Otherwise, it is UTF-16BE, so decode it as such
1.1162 + else
1.1163 + {
1.1164 + $words = array_values(unpack('n*', $string));
1.1165 + }
1.1166 +
1.1167 + // Iterate through each and every word
1.1168 + for ($i = 0, $word_count = count($words); $i < $word_count; $i++)
1.1169 + {
1.1170 + // If we're the first word of sequence:
1.1171 + if (!$surrogate)
1.1172 + {
1.1173 + // One word sequence:
1.1174 + if (self::valid_unicode_codepoint($words[$i]))
1.1175 + {
1.1176 + $unicode->data .= pack('N', $words[$i]);
1.1177 + }
1.1178 + // High (leading) surrogate, i.e. the first half of a two word sequence:
1.1179 + elseif ($words[$i] >= 0xD800 && $words[$i] <= 0xDBFF)
1.1180 + {
1.1181 + $character = 0x10000 + (($words[$i] & 0x3FF) << 10);
1.1182 + $surrogate = true;
1.1183 + }
1.1184 + // Invalid word (this includes a low surrogate with no high surrogate before it):
1.1185 + else
1.1186 + {
1.1187 + $unicode->data .= pack('N', 0xFFFD);
1.1188 + }
1.1189 + }
1.1190 + // Second word:
1.1191 + else
1.1192 + {
1.1193 + // Surrogates are only ever two words, so we can say we've reached the end with certainty
1.1194 + $surrogate = false;
1.1195 +
1.1196 + // Check that the word is a low (trailing) surrogate, then add its ten bits to the character:
1.1197 + if ($words[$i] >= 0xDC00 && $words[$i] <= 0xDFFF)
1.1198 + {
1.1199 + $character |= $words[$i] & 0x3FF;
1.1200 + if (self::valid_unicode_codepoint($character))
1.1201 + {
1.1202 + $unicode->data .= pack('N', $character);
1.1203 + }
1.1204 + else
1.1205 + {
1.1206 + $unicode->data .= pack('N', 0xFFFD);
1.1207 + }
1.1208 + }
1.1209 + // If it is invalid, count the sequence as invalid and reprocess the current word as a first word:
1.1210 + else
1.1211 + {
1.1212 + $unicode->data .= pack('N', 0xFFFD);
1.1213 + $i--;
1.1214 + }
1.1215 + }
1.1216 + }
1.1217 +
1.1218 + // If we've reached the end of the string but not the end of a surrogate pair, append a U+FFFD REPLACEMENT CHARACTER
1.1219 + if ($surrogate)
1.1220 + {
1.1221 + $unicode->data .= "\x00\x00\xFF\xFD";
1.1222 + }
1.1223 +
1.1224 + // If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
1.1225 + if (!$valid_length)
1.1226 + {
1.1227 + $unicode->data .= "\x00\x00\xFF\xFD";
1.1228 + }
1.1229 +
1.1230 + // Strip any leading U+FEFF BYTE ORDER MARK (as otherwise we change the meaning of the new sequence, which is illegal)
1.1231 + if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
1.1232 + {
1.1233 + $unicode->data = substr($unicode->data, 4);
1.1234 + }
1.1235 + return $unicode;
1.1236 + }
1.1237 +
1.1238 + /**
1.1239 + * Build a Unicode object from a string that is known to be UTF-16BE
1.1240 + *
1.1241 + * @param string $string
1.1242 + * @return Unicode|false false, after an E_USER_WARNING, if $string is not a string
1.1243 + */
1.1244 + public static function from_utf16be($string)
1.1245 + {
1.1246 + // Strings are handed to Unicode::from_utf16() behind a big-endian BOM
1.1247 + if (is_string($string))
1.1248 + {
1.1249 + return self::from_utf16("\xFE\xFF" . $string);
1.1250 + }
1.1251 +
1.1252 + // Anything else is refused with a warning
1.1253 + trigger_error('Unicode::from_utf16be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.1254 + return false;
1.1255 + }
1.1256 +
1.1257 + /**
1.1258 + * Build a Unicode object from a string that is known to be UTF-16LE
1.1259 + *
1.1260 + * @param string $string
1.1261 + * @return Unicode|false false, after an E_USER_WARNING, if $string is not a string
1.1262 + */
1.1263 + public static function from_utf16le($string)
1.1264 + {
1.1265 + // Strings are handed to Unicode::from_utf16() behind a little-endian BOM
1.1266 + if (is_string($string))
1.1267 + {
1.1268 + return self::from_utf16("\xFF\xFE" . $string);
1.1269 + }
1.1270 +
1.1271 + // Anything else is refused with a warning
1.1272 + trigger_error('Unicode::from_utf16le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.1273 + return false;
1.1274 + }
1.1275 +
1.1276 + /**
1.1277 + * Encode the stored text as UTF-16: a big-endian BOM followed by the UTF-16BE form
1.1278 + * @return string
1.1279 + */
1.1280 + public function to_utf16()
1.1281 + {
1.1282 + $big_endian = $this->to_utf16be();
1.1283 + return "\xFE\xFF" . $big_endian;
1.1284 + }
1.1285 +
1.1286 + /**
1.1287 + * Encode the stored text as a UTF-16BE binary string (no BOM is added)
1.1288 + *
1.1289 + * @return string
1.1290 + */
1.1291 + public function to_utf16be()
1.1292 + {
1.1293 + // Prefer mbstring, then iconv; a falsy result from either falls through to the next option
1.1294 + if (extension_loaded('mbstring') && ($utf16 = @mb_convert_encoding($this->data, 'UTF-16BE', 'UTF-32BE')))
1.1295 + {
1.1296 + return $utf16;
1.1297 + }
1.1298 + if (extension_loaded('iconv') && ($utf16 = @iconv('UTF-32BE', 'UTF-16BE', $this->data)))
1.1299 + {
1.1300 + return $utf16;
1.1301 + }
1.1302 +
1.1303 + // Pure PHP fallback: read the data as 32-bit big-endian integers and encode them one at a time
1.1304 + $utf16 = '';
1.1305 + foreach (unpack('N*', $this->data) as $value)
1.1306 + {
1.1307 + $utf16 .= self::codepoint_to_utf16be($value);
1.1308 + }
1.1309 +
1.1310 + return $utf16;
1.1311 + }
1.1312 +
1.1313 + /**
1.1314 + * Encode the stored text as a UTF-16LE binary string (no BOM is added)
1.1315 + *
1.1316 + * @return string
1.1317 + */
1.1318 + public function to_utf16le()
1.1319 + {
1.1320 + // Prefer mbstring, then iconv; a falsy result from either falls through to the next option
1.1321 + if (extension_loaded('mbstring') && ($utf16 = @mb_convert_encoding($this->data, 'UTF-16LE', 'UTF-32BE')))
1.1322 + {
1.1323 + return $utf16;
1.1324 + }
1.1325 + if (extension_loaded('iconv') && ($utf16 = @iconv('UTF-32BE', 'UTF-16LE', $this->data)))
1.1326 + {
1.1327 + return $utf16;
1.1328 + }
1.1329 +
1.1330 + // Pure PHP fallback: read the data as 32-bit big-endian integers and encode them one at a time
1.1331 + $utf16 = '';
1.1332 + foreach (unpack('N*', $this->data) as $value)
1.1333 + {
1.1334 + $utf16 .= self::codepoint_to_utf16le($value);
1.1335 + }
1.1336 +
1.1337 + return $utf16;
1.1338 + }
1.1339 +
1.1340 + /**
1.1341 + * Encode a single codepoint as UTF-16; this is the big-endian encoder under another name
1.1342 + * @param int $codepoint
1.1343 + * @return string
1.1344 + */
1.1345 + private static function codepoint_to_utf16($codepoint)
1.1346 + {
1.1347 + $big_endian = self::codepoint_to_utf16be($codepoint);
1.1348 + return $big_endian;
1.1349 + }
1.1350 +
1.1351 + /**
1.1352 + * Convert a unicode codepoint to a UTF-16BE character sequence
1.1353 + *
1.1354 + * @param int $codepoint
1.1355 + * @return string two bytes, or four bytes (a surrogate pair) for codepoints above U+FFFF
1.1356 + */
1.1357 + private static function codepoint_to_utf16be($codepoint)
1.1358 + {
1.1359 + // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
1.1360 + static $cache;
1.1361 +
1.1362 + // If we haven't already got it cached, go cache it
1.1363 + if (!isset($cache[$codepoint]))
1.1364 + {
1.1365 + // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
1.1366 + if (!self::valid_unicode_codepoint($codepoint))
1.1367 + {
1.1368 + $cache[$codepoint] = "\xFF\xFD";
1.1369 + }
1.1370 + // Without a surrogate:
1.1371 + elseif ($codepoint < 0x10000)
1.1372 + {
1.1373 + $cache[$codepoint] = pack('n', $codepoint);
1.1374 + }
1.1375 + // With a surrogate pair: the 20-bit offset from U+10000 is split into two 10-bit halves
1.1376 + else
1.1377 + {
1.1378 + $surrogate_code_point = $codepoint - 0x10000;
1.1379 + $cache[$codepoint] = pack('n*', ($surrogate_code_point >> 10) | 0xD800, ($surrogate_code_point & 0x03FF) | 0xDC00);
1.1380 + }
1.1381 + }
1.1382 + return $cache[$codepoint];
1.1383 + }
1.1384 +
1.1385 + /**
1.1386 + * Convert a unicode codepoint to a UTF-16LE character sequence
1.1387 + *
1.1388 + * @param int $codepoint
1.1389 + * @return string two bytes, or four bytes (a surrogate pair) for codepoints above U+FFFF
1.1390 + */
1.1391 + private static function codepoint_to_utf16le($codepoint)
1.1392 + {
1.1393 + // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
1.1394 + static $cache;
1.1395 +
1.1396 + // If we haven't already got it cached, go cache it
1.1397 + if (!isset($cache[$codepoint]))
1.1398 + {
1.1399 + // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
1.1400 + if (!self::valid_unicode_codepoint($codepoint))
1.1401 + {
1.1402 + $cache[$codepoint] = "\xFD\xFF";
1.1403 + }
1.1404 + // Without a surrogate:
1.1405 + elseif ($codepoint < 0x10000)
1.1406 + {
1.1407 + $cache[$codepoint] = pack('v', $codepoint);
1.1408 + }
1.1409 + // With a surrogate pair: the 20-bit offset from U+10000 is split into two 10-bit halves
1.1410 + else
1.1411 + {
1.1412 + $surrogate_code_point = $codepoint - 0x10000;
1.1413 + $cache[$codepoint] = pack('v*', ($surrogate_code_point >> 10) | 0xD800, ($surrogate_code_point & 0x03FF) | 0xDC00);
1.1414 + }
1.1415 + }
1.1416 + return $cache[$codepoint];
1.1417 + }
1.1418 +
1.1419 + /**
1.1420 + * Build a Unicode object from a UTF-32 string, little-endian if it opens with FF FE 00 00, else big-endian
1.1421 + *
1.1422 + * @param string $string
1.1423 + * @return Unicode|false false, after an E_USER_WARNING, if $string is not a string
1.1424 + */
1.1425 + public static function from_utf32($string)
1.1426 + {
1.1427 + // Only strings can be decoded; warn and bail out on anything else
1.1428 + if (!is_string($string))
1.1429 + {
1.1430 + trigger_error('Unicode::from_utf32() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.1431 + return false;
1.1432 + }
1.1433 +
1.1434 + // UTF-32 code units are four bytes wide, so note whether stray bytes trail the last whole unit
1.1435 + $byte_count = strlen($string);
1.1436 + $has_stray_bytes = ($byte_count % 4) ? true : false;
1.1437 +
1.1438 + // Cut the stray bytes off now; one U+FFFD REPLACEMENT CHARACTER stands in for them further down
1.1439 + if ($has_stray_bytes)
1.1440 + {
1.1441 + $string = substr($string, 0, floor($byte_count / 4) * 4);
1.1442 + }
1.1443 +
1.1444 + // A leading UTF-32LE BOM means the units are little-endian
1.1445 + if (substr($string, 0, 4) === "\xFF\xFE\x00\x00")
1.1446 + {
1.1447 + $unpack_format = 'V*';
1.1448 + }
1.1449 + // Everything else is read as big-endian
1.1450 + else
1.1451 + {
1.1452 + $unpack_format = 'N*';
1.1453 + }
1.1454 +
1.1455 + // Re-encode the input unit by unit as UTF-32BE
1.1456 + $utf32be = '';
1.1457 + foreach (unpack($unpack_format, $string) as $unit)
1.1458 + {
1.1459 + // Valid codepoints are kept as they are
1.1460 + if (self::valid_unicode_codepoint($unit))
1.1461 + {
1.1462 + $utf32be .= pack('N', $unit);
1.1463 + }
1.1464 + // Invalid ones become U+FFFD REPLACEMENT CHARACTER
1.1465 + else
1.1466 + {
1.1467 + $utf32be .= "\x00\x00\xFF\xFD";
1.1468 + }
1.1469 + }
1.1470 +
1.1471 + // The stand-in for the stray bytes that were cut off earlier
1.1472 + if ($has_stray_bytes)
1.1473 + {
1.1474 + $utf32be .= "\x00\x00\xFF\xFD";
1.1475 + }
1.1476 +
1.1477 + // A leading U+FEFF BYTE ORDER MARK is not part of the text, so drop it
1.1478 + if (substr($utf32be, 0, 4) === "\x00\x00\xFE\xFF")
1.1479 + {
1.1480 + $utf32be = substr($utf32be, 4);
1.1481 + }
1.1482 +
1.1483 + // Wrap the result in a new object
1.1484 + $unicode = new Unicode;
1.1485 + $unicode->data = $utf32be;
1.1486 +
1.1487 + return $unicode;
1.1488 + }
1.1489 +
1.1490 + /**
1.1491 + * Build a Unicode object from a string that is known to be UTF-32BE
1.1492 + *
1.1493 + * @param string $string
1.1494 + * @return Unicode|false false, after an E_USER_WARNING, if $string is not a string
1.1495 + */
1.1496 + public static function from_utf32be($string)
1.1497 + {
1.1498 + // Strings are handed to Unicode::from_utf32() behind a big-endian BOM
1.1499 + if (is_string($string))
1.1500 + {
1.1501 + return self::from_utf32("\x00\x00\xFE\xFF" . $string);
1.1502 + }
1.1503 +
1.1504 + // Anything else is refused with a warning
1.1505 + trigger_error('Unicode::from_utf32be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.1506 + return false;
1.1507 + }
1.1508 +
1.1509 + /**
1.1510 + * Build a Unicode object from a string that is known to be UTF-32LE
1.1511 + *
1.1512 + * @param string $string
1.1513 + * @return Unicode|false false, after an E_USER_WARNING, if $string is not a string
1.1514 + */
1.1515 + public static function from_utf32le($string)
1.1516 + {
1.1517 + // Strings are handed to Unicode::from_utf32() behind a little-endian BOM
1.1518 + if (is_string($string))
1.1519 + {
1.1520 + return self::from_utf32("\xFF\xFE\x00\x00" . $string);
1.1521 + }
1.1522 +
1.1523 + // Anything else is refused with a warning
1.1524 + trigger_error('Unicode::from_utf32le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
1.1525 + return false;
1.1526 + }
1.1527 +
1.1528 + /**
1.1529 + * Encode the stored text as UTF-32: a big-endian BOM followed by the UTF-32BE form
1.1530 + * @return string
1.1531 + */
1.1532 + public function to_utf32()
1.1533 + {
1.1534 + $big_endian = $this->to_utf32be();
1.1535 + return "\x00\x00\xFE\xFF" . $big_endian;
1.1536 + }
1.1537 +
1.1538 + /**
1.1539 + * Return the stored data unchanged as the UTF-32BE binary string
1.1540 + * @return string
1.1541 + */
1.1542 + public function to_utf32be()
1.1543 + {
1.1544 + $stored = $this->data;
1.1545 + return $stored;
1.1546 + }
1.1547 +
1.1548 + /**
1.1549 + * Encode the stored text as a UTF-32LE binary string (no BOM is added)
1.1550 + *
1.1551 + * @return string
1.1552 + */
1.1553 + public function to_utf32le()
1.1554 + {
1.1555 + // Prefer mbstring, then iconv; a falsy result from either falls through to the next option
1.1556 + if (extension_loaded('mbstring') && ($utf32 = @mb_convert_encoding($this->data, 'UTF-32LE', 'UTF-32BE')))
1.1557 + {
1.1558 + return $utf32;
1.1559 + }
1.1560 + if (extension_loaded('iconv') && ($utf32 = @iconv('UTF-32BE', 'UTF-32LE', $this->data)))
1.1561 + {
1.1562 + return $utf32;
1.1563 + }
1.1564 + // Pure PHP fallback: unpack the big-endian integers and pack them all again as little-endian
1.1565 + $arguments = array_merge(array('V*'), unpack('N*', $this->data));
1.1566 + return call_user_func_array('pack', $arguments);
1.1567 + }
1.1568 +
1.1569 + /**
1.1570 + * Encode a single codepoint as UTF-32; this is the big-endian encoder under another name
1.1571 + * @param int $codepoint
1.1572 + * @return string
1.1573 + */
1.1574 + private static function codepoint_to_utf32($codepoint)
1.1575 + {
1.1576 + $big_endian = self::codepoint_to_utf32be($codepoint);
1.1577 + return $big_endian;
1.1578 + }
1.1579 +
1.1580 + /**
1.1581 + * Encode a single codepoint as four UTF-32BE bytes
1.1582 + *
1.1583 + * @param int $codepoint
1.1584 + * @return string
1.1585 + */
1.1586 + private static function codepoint_to_utf32be($codepoint)
1.1587 + {
1.1588 + // Anything valid_unicode_codepoint() rejects is encoded as U+FFFD REPLACEMENT CHARACTER
1.1589 + if (!self::valid_unicode_codepoint($codepoint))
1.1590 + {
1.1591 + return "\x00\x00\xFF\xFD";
1.1592 + }
1.1593 +
1.1594 + // A valid codepoint is simply its own value as a 32-bit big-endian integer
1.1595 + return pack('N', $codepoint);
1.1596 + }
1.1597 +
1.1598 + /**
1.1599 + * Encode a single codepoint as four UTF-32LE bytes
1.1600 + *
1.1601 + * @param int $codepoint
1.1602 + * @return string
1.1603 + */
1.1604 + private static function codepoint_to_utf32le($codepoint)
1.1605 + {
1.1606 + // Anything valid_unicode_codepoint() rejects is encoded as U+FFFD REPLACEMENT CHARACTER
1.1607 + if (!self::valid_unicode_codepoint($codepoint))
1.1608 + {
1.1609 + return "\xFD\xFF\x00\x00";
1.1610 + }
1.1611 +
1.1612 + // A valid codepoint is simply its own value as a 32-bit little-endian integer
1.1613 + return pack('V', $codepoint);
1.1614 + }
1.1615 }