Get rid of tabs default tip
author Geoffrey Sneddon <geoffers@gmail.com>
Wed Jun 08 21:25:21 2011 +0100 (11 months ago)
changeset 83 660f4ee63e08
parent 82 cc70c2839bc1
Get rid of tabs
unicode.php
     1.1 --- a/unicode.php	Wed Jun 08 21:23:13 2011 +0100
     1.2 +++ b/unicode.php	Wed Jun 08 21:25:21 2011 +0100
     1.3 @@ -38,808 +38,808 @@
     1.4   */
     1.5  class Unicode
     1.6  {
     1.7 -	/**
     1.8 -	 * Contains the raw unicode data that we're working from
     1.9 -	 *
    1.10 -	 * @var string UTF-32BE binary string on PHP < 6, otherwise a unicode string
    1.11 -	 */
    1.12 -	private $data;
    1.13 -	
    1.14 -	/**
    1.15 -	 * Object should be created with some Unicode::from_*() method, therefore
    1.16 -	 * this is private
    1.17 -	 */
    1.18 -	private function __construct()
    1.19 -	{
    1.20 -	}
    1.21 -	
    1.22 -	/**
    1.23 -	 * Prepare the object for serialisation
    1.24 -	 */
    1.25 -	public function __sleep()
    1.26 -	{
    1.27 -		return array('data');
    1.28 -	}
    1.29 -	
    1.30 -	/**
    1.31 -	 * Check the object is valid when being unserialised
    1.32 -	 *
    1.33 -	 * To prepare the object for use after being unserialised, we need to check
    1.34 -	 * that it is valid. If Unicode::$data is not a string, a warning will be thrown. The
    1.35 -	 * validity of the UTF-32BE Unicode::$data is also checked, and the string
    1.36 -	 * is corrected if it is invalid.
    1.37 -	 */
    1.38 -	public function __wakeup()
    1.39 -	{
    1.40 -		if (!isset($this->data))
    1.41 -		{
    1.42 -			trigger_error('Unicode::__wakeup() expects the serialised object to have a $data property, none exists', E_USER_WARNING);
    1.43 -			$this->data = '';
    1.44 -		}
    1.45 -		elseif (!is_string($this->data))
    1.46 -		{
    1.47 -			trigger_error('Unicode::__wakeup() expects Unicode::$data to be string, ' . get_type($this->data) . ' given', E_USER_WARNING);
    1.48 -			$this->data = '';
    1.49 -		}
    1.50 -		else
    1.51 -		{
    1.52 -			$this->data = Unicode::from_utf32be($this->data)->to_utf32be();
    1.53 -		}
    1.54 -	}
    1.55 -		
    1.56 -	/**
    1.57 -	 * Check the given codepoint is a valid character
    1.58 -	 *
    1.59 -	 * @param int $codepoint
    1.60 -	 * @return bool
    1.61 -	 */
    1.62 -	private static function valid_unicode_codepoint($codepoint)
    1.63 -	{
    1.64 -		// Outside of Unicode codespace
    1.65 -		if ($codepoint < 0
    1.66 -			|| $codepoint > 0x10FFFF
    1.67 -			// UTF-16 Surrogates
    1.68 -			|| $codepoint >= 0xD800 && $codepoint <= 0xDFFF
    1.69 -			// Noncharacters
    1.70 -			|| ($codepoint & 0xFFFE) === 0xFFFE
    1.71 -			|| $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF)
    1.72 -		{
    1.73 -			return false;
    1.74 -		}
    1.75 -		else
    1.76 -		{
    1.77 -			return true;
    1.78 -		}
    1.79 -	}
    1.80 -	
    1.81 -	/**
    1.82 -	 * Create a new Unicode object from an array of codepoints
    1.83 -	 *
    1.84 -	 * @param array $array
    1.85 -	 * @return Unicode
    1.86 -	 */
    1.87 -	public static function from_codepoint_array($array)
    1.88 -	{
    1.89 -		// Check given parameter is an array
    1.90 -		if (!is_array($string))
    1.91 -		{
    1.92 -			trigger_error('Unicode::from_codepoint_array() expects parameter 1 to be array, ' . get_type($string) . ' given', E_USER_WARNING);
    1.93 -			return false;
    1.94 -		}
    1.95 -		
    1.96 -		// Get U+FFFD as a binary string
    1.97 -		static $replacement_character = "\x00\x00\xFF\xFD";
    1.98 -		
    1.99 -		// Create new object
   1.100 -		$unicode = new Unicode;
   1.101 -		
   1.102 -		// Strip any leading BOM (as otherwise we chage the meaing of the new sequence, which is illegal)
   1.103 -		if (isset($array[0]) && $array[0] === 0xFFFD)
   1.104 -		{
   1.105 -			array_splice($array, 1);
   1.106 -		}
   1.107 -		
   1.108 -		// Iterate through each and every codepoint
   1.109 -		foreach ($array as $codepoint)
   1.110 -		{
   1.111 -			// If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
   1.112 -			if (!self::valid_unicode_codepoint($codepoint))
   1.113 -			{
   1.114 -				$unicode->data .= $replacement_character;
   1.115 -			}
   1.116 -			// Otherwise, append it to Unicode::$data
   1.117 -			else
   1.118 -			{
   1.119 -				$unicode->data .= pack('N', $codepoint);
   1.120 -			}
   1.121 -		}
   1.122 -		
   1.123 -		return $unicode;
   1.124 -	}
   1.125 -	
   1.126 -	/**
   1.127 -	 * Create an array of codepoints from the object
   1.128 -	 *
   1.129 -	 * @return string
   1.130 -	 */
   1.131 -	public function to_codepoint_array()
   1.132 -	{
   1.133 -		$data = $this->data;
   1.134 -		return array_values(unpack('N*', $data));
   1.135 -	}
   1.136 -	
   1.137 -	/**
   1.138 -	 * Create a new Unicode object from a UTF-8 encoded string
   1.139 -	 *
   1.140 -	 * @param string $string
   1.141 -	 * @return Unicode
   1.142 -	 */
   1.143 -	public static function from_utf8($string)
   1.144 -	{
   1.145 -		// Check given parameter is a string
   1.146 -		if (!is_string($string))
   1.147 -		{
   1.148 -			trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   1.149 -			return false;
   1.150 -		}
   1.151 -		
   1.152 -		// Create new object
   1.153 -		$unicode = new Unicode;
   1.154 -		
   1.155 -		// Set the data to an empty string, and remaining bytes in the current sequence to zero
   1.156 -		$unicode->data = '';
   1.157 -		$remaining = 0;
   1.158 -		
   1.159 -		// Iterate through each and every byte
   1.160 -		for ($i = 0, $len = strlen($string); $i < $len; $i++)
   1.161 -		{
   1.162 -			$value = ord($string[$i]);
   1.163 -			
   1.164 -			// If we're the first byte of sequence:
   1.165 -			if (!$remaining)
   1.166 -			{
   1.167 -				// One byte sequence:
   1.168 -				if ($value <= 0x7F)
   1.169 -				{
   1.170 -					$character = $value;
   1.171 -					$length = 1;
   1.172 -				}
   1.173 -				// Two byte sequence:
   1.174 -				elseif (($value & 0xE0) === 0xC0)
   1.175 -				{
   1.176 -					$character = ($value & 0x1F) << 6;
   1.177 -					$length = 2;
   1.178 -					$remaining = 1;
   1.179 -				}
   1.180 -				// Three byte sequence:
   1.181 -				elseif (($value & 0xF0) === 0xE0)
   1.182 -				{
   1.183 -					$character = ($value & 0x0F) << 12;
   1.184 -					$length = 3;
   1.185 -					$remaining = 2;
   1.186 -				}
   1.187 -				// Four byte sequence:
   1.188 -				elseif (($value & 0xF8) === 0xF0)
   1.189 -				{
   1.190 -					$character = ($value & 0x07) << 18;
   1.191 -					$length = 4;
   1.192 -					$remaining = 3;
   1.193 -				}
   1.194 -				// Invalid byte:
   1.195 -				else
   1.196 -				{
   1.197 -					$character = 0xFFFD;
   1.198 -					$length = 3;
   1.199 -					$remaining = 0;
   1.200 -				}
   1.201 -			}
   1.202 -			// Continuation byte:
   1.203 -			else
   1.204 -			{
   1.205 -				// Check that the byte is valid, then add it to the character:
   1.206 -				if (($value & 0xC0) === 0x80)
   1.207 -				{
   1.208 -					$remaining--;
   1.209 -					$character |= ($value & 0x3F) << ($remaining * 6);
   1.210 -				}
   1.211 -				// If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
   1.212 -				else
   1.213 -				{
   1.214 -					$character = 0xFFFD;
   1.215 -					$length = 3;
   1.216 -					$remaining = 0;
   1.217 -					$i--;
   1.218 -				}
   1.219 -			}
   1.220 -			
   1.221 -			// If we've reached the end of the current byte sequence, append it to Unicode::$data
   1.222 -			if (!$remaining)
   1.223 -			{
   1.224 -				// If the character is illegal replace it with U+FFFD REPLACEMENT CHARACTER
   1.225 -				if ($length > 1 && $character <= 0x7F
   1.226 -					|| $length > 2 && $character <= 0x7FF
   1.227 -					|| $length > 3 && $character <= 0xFFFF
   1.228 -					|| !self::valid_unicode_codepoint($character))
   1.229 -				{
   1.230 -					$character = 0xFFFD;
   1.231 -				}
   1.232 -				
   1.233 -				$unicode->data .= pack('N', $character);
   1.234 -			}
   1.235 -		}
   1.236 -		
   1.237 -		// Strip any U+FEFF BYTE ORDER MARK (as otherwise we chage the meaing of the new sequence, which is illegal)
   1.238 -		if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
   1.239 -		{
   1.240 -			$unicode->data = substr($unicode->data, 4);
   1.241 -		}
   1.242 -		
   1.243 -		// If we've reached the end of the string but not the end of a character sequence, append a U+FFFD REPLACEMENT CHARACTE
   1.244 -		if ($remaining > 0)
   1.245 -		{
   1.246 -			$unicode->data .= "\x00\x00\xFF\xFD";
   1.247 -		}
   1.248 -		return $unicode;
   1.249 -	}
   1.250 -	
   1.251 -	/**
   1.252 -	 * Create a UTF-8 binary string from the object
   1.253 -	 *
   1.254 -	 * @return string
   1.255 -	 */
   1.256 -	public function to_utf8()
   1.257 -	{
   1.258 -		if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-8', 'UTF-32BE')))
   1.259 -		{
   1.260 -			return $return;
   1.261 -		}
   1.262 -		elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-8', $this->data)))
   1.263 -		{
   1.264 -			return $return;
   1.265 -		}
   1.266 -		else
   1.267 -		{
   1.268 -			$codepoints = unpack('N*', $this->data);
   1.269 -			$return = '';
   1.270 -			foreach ($codepoints as $codepoint)
   1.271 -			{
   1.272 -				$return .= self::codepoint_to_utf8($codepoint);
   1.273 -			}
   1.274 -			return $return;
   1.275 -		}
   1.276 -	}
   1.277 -	
   1.278 -	/**
   1.279 -	 * Convert a unicode codepoint to a UTF-8 character sequence
   1.280 -	 *
   1.281 -	 * @param int $codepoint
   1.282 -	 * @return string
   1.283 -	 */
   1.284 -	private static function codepoint_to_utf8($codepoint)
   1.285 -	{
   1.286 -		// Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
   1.287 -		static $cache;
   1.288 -		
   1.289 -		// If we haven't already got it cached, go cache it
   1.290 -		if (!isset($cache[$codepoint]))
   1.291 -		{
   1.292 -			// If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
   1.293 -			if (!self::valid_unicode_codepoint($codepoint))
   1.294 -			{
   1.295 -				$cache[$codepoint] = "\xEF\xBF\xBD";
   1.296 -			}
   1.297 -			// One byte sequence:
   1.298 -			elseif ($codepoint <= 0x7F)
   1.299 -			{
   1.300 -				$cache[$codepoint] = chr($codepoint);
   1.301 -			}
   1.302 -			// Two byte sequence:
   1.303 -			elseif ($codepoint <= 0x7FF)
   1.304 -			{
   1.305 -				$cache[$codepoint] = chr(0xC0 | ($codepoint >> 6)) . chr(0x80 | ($codepoint & 0x3F));
   1.306 -			}
   1.307 -			// Three byte sequence:
   1.308 -			elseif ($codepoint <= 0xFFFF)
   1.309 -			{
   1.310 -				$cache[$codepoint] = chr(0xE0 | ($codepoint >> 12)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
   1.311 -			}
   1.312 -			// Four byte sequence:
   1.313 -			else
   1.314 -			{
   1.315 -				$cache[$codepoint] = chr(0xF0 | ($codepoint >> 18)) . chr(0x80 | (($codepoint >> 12) & 0x3F)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
   1.316 -			}
   1.317 -		}
   1.318 -		return $cache[$codepoint];
   1.319 -	}
   1.320 -	
   1.321 -	/**
   1.322 -	 * Create a new Unicode object from a UTF-16 encoded string
   1.323 -	 *
   1.324 -	 * @param string $string
   1.325 -	 * @return Unicode
   1.326 -	 */
   1.327 -	public static function from_utf16($string)
   1.328 -	{
   1.329 -		// Check given parameter is a string
   1.330 -		if (!is_string($string))
   1.331 -		{
   1.332 -			trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   1.333 -			return false;
   1.334 -		}
   1.335 -		
   1.336 -		// Create new object
   1.337 -		$unicode = new Unicode;
   1.338 -		
   1.339 -		// Set the data to an empty string and surrogate to false
   1.340 -		$unicode->data = '';
   1.341 -		$surrogate = false;
   1.342 -		
   1.343 -		// See if the string is of a valid length (as UTF-16 is in two byte sequences, it must be divisible by two)
   1.344 -		$valid_length = (($len = strlen($string)) % 2) ? false : true;
   1.345 -		
   1.346 -		// If it is of an invalid length, trim all the invalid bytes at the end (we'll replace them with a U+FFFD REPLACEMENT CHARACTER later)
   1.347 -		if (!$valid_length)
   1.348 -		{
   1.349 -			$string = substr($string, 0, floor($len / 2) * 2);
   1.350 -		}
   1.351 -		
   1.352 -		// If the string starts with a UTF-16LE BOM, it is UTF-16LE, so decode it as such
   1.353 -		if (substr($string, 0, 2) === "\xFF\xFE")
   1.354 -		{
   1.355 -			$words = array_values(unpack('v*', $string));
   1.356 -		}
   1.357 -		// Otherwise, it is UTF-16BE, so decode it as such
   1.358 -		else
   1.359 -		{
   1.360 -			$words = array_values(unpack('n*', $string));
   1.361 -		}
   1.362 -		
   1.363 -		// Iterate through each and every word
   1.364 -		for ($i = 0, $word_count = count($words); $i < $word_count; $i++)
   1.365 -		{
   1.366 -			// If we're the first word of sequence:
   1.367 -			if (!$surrogate)
   1.368 -			{
   1.369 -				// One word sequence:
   1.370 -				if (self::valid_unicode_codepoint($words[$i]))
   1.371 -				{
   1.372 -					$unicode->data .= pack('N', $words[$i]);
   1.373 -				}
   1.374 -				// Two word sequence:
   1.375 -				elseif ($words[$i] >= 0xD800 && $words[$i] <= 0xDFFF)
   1.376 -				{
   1.377 -					$character = ($words[$i] & 0x3FF) << 10;
   1.378 -					$surrogate = true;
   1.379 -				}
   1.380 -				// Invalid word:
   1.381 -				else
   1.382 -				{
   1.383 -					$unicode->data .= pack('N', 0xFFFD);
   1.384 -				}
   1.385 -			}
   1.386 -			// Second word:
   1.387 -			else
   1.388 -			{
   1.389 -				// Surrogates are only ever two words, so we can say we've reached the end with certainty
   1.390 -				$surrogate = false;
   1.391 -				
   1.392 -				// Check that the word is valid, then add it to the character:
   1.393 -				if ($words[$i] >= 0xDC00 && $words[$i] <= 0xDFFF)
   1.394 -				{
   1.395 -					$character |= $words[$i] & 0x3FF;
   1.396 -					if (self::valid_unicode_codepoint($character))
   1.397 -					{
   1.398 -						$unicode->data .= pack('N', $character);
   1.399 -					}
   1.400 -					else
   1.401 -					{
   1.402 -						$unicode->data .= pack('N', 0xFFFD);
   1.403 -					}
   1.404 -				}
   1.405 -				// If it is invalid, count the sequence as invalid and reprocess the current word as a first word:
   1.406 -				else
   1.407 -				{
   1.408 -					$unicode->data .= pack('N', 0xFFFD);
   1.409 -					$i--;
   1.410 -				}
   1.411 -			}
   1.412 -		}
   1.413 -		
   1.414 -		// If we've reached the end of the string but not the end of a surrogate pair, append a U+FFFD REPLACEMENT CHARACTER
   1.415 -		if ($surrogate)
   1.416 -		{
   1.417 -			$unicode->data .= "\x00\x00\xFF\xFD";
   1.418 -		}
   1.419 -		
   1.420 -		// If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
   1.421 -		if (!$valid_length)
   1.422 -		{
   1.423 -			$unicode->data .= "\x00\x00\xFF\xFD";
   1.424 -		}
   1.425 -		
   1.426 -		// Strip any U+FEFF BYTE ORDER MARK (as otherwise we chage the meaing of the new sequence, which is illegal)
   1.427 -		if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
   1.428 -		{
   1.429 -			$unicode->data = substr($unicode->data, 4);
   1.430 -		}
   1.431 -		return $unicode;
   1.432 -	}
   1.433 -	
   1.434 -	/**
   1.435 -	 * Create a new Unicode object from a UTF-16BE encoded string
   1.436 -	 *
   1.437 -	 * @param string $string
   1.438 -	 * @return Unicode
   1.439 -	 */
   1.440 -	public static function from_utf16be($string)
   1.441 -	{
   1.442 -		// Check given parameter is a string
   1.443 -		if (!is_string($string))
   1.444 -		{
   1.445 -			trigger_error('Unicode::from_utf16be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   1.446 -			return false;
   1.447 -		}
   1.448 -		
   1.449 -		// Add BOM before calling Unicode::from_utf16()
   1.450 -		return self::from_utf16("\xFE\xFF" . $string);
   1.451 -	}
   1.452 -	
   1.453 -	/**
   1.454 -	 * Create a new Unicode object from a UTF-16LE encoded string
   1.455 -	 *
   1.456 -	 * @param string $string
   1.457 -	 * @return Unicode
   1.458 -	 */
   1.459 -	public static function from_utf16le($string)
   1.460 -	{
   1.461 -		// Check given parameter is a string
   1.462 -		if (!is_string($string))
   1.463 -		{
   1.464 -			trigger_error('Unicode::from_utf16le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   1.465 -			return false;
   1.466 -		}
   1.467 -		
   1.468 -		// Add BOM before calling Unicode::from_utf16()
   1.469 -		return self::from_utf16("\xFF\xFE" . $string);
   1.470 -	}
   1.471 -	
   1.472 -	/**
   1.473 -	 * Create a UTF-16 binary string from the object
   1.474 -	 *
   1.475 -	 * @return string
   1.476 -	 */
   1.477 -	public function to_utf16()
   1.478 -	{
   1.479 -		return "\xFE\xFF" . $this->to_utf16be();
   1.480 -	}
   1.481 -	
   1.482 -	/**
   1.483 -	 * Create a UTF-16BE binary string from the object
   1.484 -	 *
   1.485 -	 * @return string
   1.486 -	 */
   1.487 -	public function to_utf16be()
   1.488 -	{
   1.489 -		if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16BE', 'UTF-32BE')))
   1.490 -		{
   1.491 -			return $return;
   1.492 -		}
   1.493 -		elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16BE', $this->data)))
   1.494 -		{
   1.495 -			return $return;
   1.496 -		}
   1.497 -		else
   1.498 -		{
   1.499 -			$codepoints = unpack('N*', $this->data);
   1.500 -			$return = '';
   1.501 -			foreach ($codepoints as $codepoint)
   1.502 -			{
   1.503 -				$return .= self::codepoint_to_utf16be($codepoint);
   1.504 -			}
   1.505 -			return $return;
   1.506 -		}
   1.507 -	}
   1.508 -	
   1.509 -	/**
   1.510 -	 * Create a UTF-16LE binary string from the object
   1.511 -	 *
   1.512 -	 * @return string
   1.513 -	 */
   1.514 -	public function to_utf16le()
   1.515 -	{
   1.516 -		if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16LE', 'UTF-32BE')))
   1.517 -		{
   1.518 -			return $return;
   1.519 -		}
   1.520 -		elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16LE', $this->data)))
   1.521 -		{
   1.522 -			return $return;
   1.523 -		}
   1.524 -		else
   1.525 -		{
   1.526 -			$codepoints = unpack('N*', $this->data);
   1.527 -			$return = '';
   1.528 -			foreach ($codepoints as $codepoint)
   1.529 -			{
   1.530 -				$return .= self::codepoint_to_utf16le($codepoint);
   1.531 -			}
   1.532 -			return $return;
   1.533 -		}
   1.534 -	}
   1.535 -	
   1.536 -	/**
   1.537 -	 * Convert a unicode codepoint to a UTF-16 character sequence
   1.538 -	 *
   1.539 -	 * @param int $codepoint
   1.540 -	 * @return string
   1.541 -	 */
   1.542 -	private static function codepoint_to_utf16($codepoint)
   1.543 -	{
   1.544 -		return self::codepoint_to_utf16be($codepoint);
   1.545 -	}
   1.546 -	
   1.547 -	/**
   1.548 -	 * Convert a unicode codepoint to a UTF-16BE character sequence
   1.549 -	 *
   1.550 -	 * @param int $codepoint
   1.551 -	 * @return string
   1.552 -	 */
   1.553 -	private static function codepoint_to_utf16be($codepoint)
   1.554 -	{
   1.555 -		// Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
   1.556 -		static $cache;
   1.557 -		
   1.558 -		// If we haven't already got it cached, go cache it
   1.559 -		if (!isset($cache[$codepoint]))
   1.560 -		{
   1.561 -			// If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
   1.562 -			if (!self::valid_unicode_codepoint($codepoint))
   1.563 -			{
   1.564 -				$cache[$codepoint] = "\xFF\xFD";
   1.565 -			}
   1.566 -			// Without a surrogate:
   1.567 -			elseif ($codepoint < 0x10000)
   1.568 -			{
   1.569 -				$cache[$codepoint] = pack('n', $codepoint);
   1.570 -			}
   1.571 -			// With a surrogate
   1.572 -			else
   1.573 -			{
   1.574 -				$surrogate_code_point = $codepoint - 0x10000;
   1.575 -				$cache[$codepoint] = pack('n*', ($codepoint >> 10) | 0xD800, ($codepoint & 0x03FF) | 0xDC00);
   1.576 -			}
   1.577 -		}
   1.578 -		return $cache[$codepoint];
   1.579 -	}
   1.580 -	
   1.581 -	/**
   1.582 -	 * Convert a unicode codepoint to a UTF-16LE character sequence
   1.583 -	 *
   1.584 -	 * @param int $codepoint
   1.585 -	 * @return string
   1.586 -	 */
   1.587 -	private static function codepoint_to_utf16le($codepoint)
   1.588 -	{
   1.589 -		// Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
   1.590 -		static $cache;
   1.591 -		
   1.592 -		// If we haven't already got it cached, go cache it
   1.593 -		if (!isset($cache[$codepoint]))
   1.594 -		{
   1.595 -			// If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
   1.596 -			if (!self::valid_unicode_codepoint($codepoint))
   1.597 -			{
   1.598 -				$cache[$codepoint] = "\xFD\xFF";
   1.599 -			}
   1.600 -			// Without a surrogate:
   1.601 -			elseif ($codepoint < 0x10000)
   1.602 -			{
   1.603 -				$cache[$codepoint] = pack('v', $codepoint);
   1.604 -			}
   1.605 -			// With a surrogate
   1.606 -			else
   1.607 -			{
   1.608 -				$surrogate_code_point = $codepoint - 0x10000;
   1.609 -				$cache[$codepoint] = pack('v*', ($codepoint >> 10) | 0xD800, ($codepoint & 0x03FF) | 0xDC00);
   1.610 -			}
   1.611 -		}
   1.612 -		return $cache[$codepoint];
   1.613 -	}
   1.614 -	
   1.615 -	/**
   1.616 -	 * Create a new Unicode object from a UTF-32 encoded string
   1.617 -	 *
   1.618 -	 * @param string $string
   1.619 -	 * @return Unicode
   1.620 -	 */
   1.621 -	public static function from_utf32($string)
   1.622 -	{
   1.623 -		// Check given parameter is a string
   1.624 -		if (!is_string($string))
   1.625 -		{
   1.626 -			trigger_error('Unicode::from_utf32() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   1.627 -			return false;
   1.628 -		}
   1.629 -		
   1.630 -		// Create new object
   1.631 -		$unicode = new Unicode;
   1.632 -		
   1.633 -		// Set the data to an empty string
   1.634 -		$unicode->data = '';
   1.635 -		
   1.636 -		// See if the string is of a valid length (as UTF-32 is in four byte sequences, it must be divisible by four)
   1.637 -		$valid_length = (($len = strlen($string)) % 4) ? false : true;
   1.638 -		
   1.639 -		// If it is of an invalid length, trim all the invalid bytes at the end (we'll replace them with a U+FFFD REPLACEMENT CHARACTER later)
   1.640 -		if (!$valid_length)
   1.641 -		{
   1.642 -			$string = substr($string, 0, floor($len / 4) * 4);
   1.643 -		}
   1.644 -		
   1.645 -		// If the string starts with a UTF-32LE BOM, it is UTF-32LE, so decode it as such
   1.646 -		if (substr($string, 0, 4) === "\xFF\xFE\x00\x00")
   1.647 -		{
   1.648 -			$codepoints = unpack('V*', $string);
   1.649 -		}
   1.650 -		// Otherwise, it is UTF-32BE, so decode it as such
   1.651 -		else
   1.652 -		{
   1.653 -			$codepoints = unpack('N*', $string);
   1.654 -		}
   1.655 -		
   1.656 -		// Iterate through each and every codepoint
   1.657 -		foreach ($codepoints as $codepoint)
   1.658 -		{
   1.659 -			// If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
   1.660 -			if (!self::valid_unicode_codepoint($codepoint))
   1.661 -			{
   1.662 -				$unicode->data .= "\x00\x00\xFF\xFD";
   1.663 -			}
   1.664 -			// Otherwise, append it to Unicode::$data
   1.665 -			else
   1.666 -			{
   1.667 -				$unicode->data .= pack('N', $codepoint);
   1.668 -			}
   1.669 -		}
   1.670 -		
   1.671 -		// If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
   1.672 -		if (!$valid_length)
   1.673 -		{
   1.674 -			$unicode->data .= "\x00\x00\xFF\xFD";
   1.675 -		}
   1.676 -		
   1.677 -		// Strip any leading BOM (as otherwise we chage the meaing of the new sequence, which is illegal)
   1.678 -		if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
   1.679 -		{
   1.680 -			$unicode->data = substr($unicode->data, 4);
   1.681 -		}
   1.682 -		
   1.683 -		return $unicode;
   1.684 -	}
   1.685 -	
   1.686 -	/**
   1.687 -	 * Create a new Unicode object from a UTF-32BE encoded string
   1.688 -	 *
   1.689 -	 * @param string $string
   1.690 -	 * @return Unicode
   1.691 -	 */
   1.692 -	public static function from_utf32be($string)
   1.693 -	{
   1.694 -		// Check given parameter is a string
   1.695 -		if (!is_string($string))
   1.696 -		{
   1.697 -			trigger_error('Unicode::from_utf32be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   1.698 -			return false;
   1.699 -		}
   1.700 -		
   1.701 -		// Add BOM before calling Unicode::from_utf32()
   1.702 -		return self::from_utf32("\x00\x00\xFE\xFF" . $string);
   1.703 -	}
   1.704 -	
   1.705 -	/**
   1.706 -	 * Create a new Unicode object from a UTF-32LE encoded string
   1.707 -	 *
   1.708 -	 * @param string $string
   1.709 -	 * @return Unicode
   1.710 -	 */
   1.711 -	public static function from_utf32le($string)
   1.712 -	{
   1.713 -		// Check given parameter is a string
   1.714 -		if (!is_string($string))
   1.715 -		{
   1.716 -			trigger_error('Unicode::from_utf32le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   1.717 -			return false;
   1.718 -		}
   1.719 -		
   1.720 -		// Add BOM before calling Unicode::from_utf32()
   1.721 -		return self::from_utf32("\xFF\xFE\x00\x00" . $string);
   1.722 -	}
   1.723 -	
   1.724 -	/**
   1.725 -	 * Create a UTF-32 binary string from the object
   1.726 -	 *
   1.727 -	 * @return string
   1.728 -	 */
   1.729 -	public function to_utf32()
   1.730 -	{
   1.731 -		return "\x00\x00\xFE\xFF" . $this->to_utf32be();
   1.732 -	}
   1.733 -	
   1.734 -	/**
   1.735 -	 * Create a UTF-32BE binary string from the object
   1.736 -	 *
   1.737 -	 * @return string
   1.738 -	 */
   1.739 -	public function to_utf32be()
   1.740 -	{
   1.741 -		return $this->data;
   1.742 -	}
   1.743 -	
   1.744 -	/**
   1.745 -	 * Create a UTF-32LE binary string from the object
   1.746 -	 *
   1.747 -	 * @return string
   1.748 -	 */
   1.749 -	public function to_utf32le()
   1.750 -	{
   1.751 -		if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-32LE', 'UTF-32BE')))
   1.752 -		{
   1.753 -			return $return;
   1.754 -		}
   1.755 -		elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-32LE', $this->data)))
   1.756 -		{
   1.757 -			return $return;
   1.758 -		}
   1.759 -		else
   1.760 -		{
   1.761 -			return call_user_func_array('pack', array_merge(array('V*'), unpack('N*', $this->data)));
   1.762 -		}
   1.763 -	}
   1.764 -	
   1.765 -	/**
   1.766 -	 * Convert a unicode codepoint to a UTF-32 character sequence
   1.767 -	 *
   1.768 -	 * @param int $codepoint
   1.769 -	 * @return string
   1.770 -	 */
   1.771 -	private static function codepoint_to_utf32($codepoint)
   1.772 -	{
   1.773 -		return self::codepoint_to_utf32be($codepoint);
   1.774 -	}
   1.775 -	
   1.776 -	/**
   1.777 -	 * Convert a unicode codepoint to a UTF-32BE character sequence
   1.778 -	 *
   1.779 -	 * @param int $codepoint
   1.780 -	 * @return string
   1.781 -	 */
   1.782 -	private static function codepoint_to_utf32be($codepoint)
   1.783 -	{
   1.784 -		if (self::valid_unicode_codepoint($codepoint))
   1.785 -		{
   1.786 -			return pack('N', $codepoint);
   1.787 -		}
   1.788 -		else
   1.789 -		{
   1.790 -			return "\x00\x00\xFF\xFD";
   1.791 -		}
   1.792 -	}
   1.793 -	
   1.794 -	/**
   1.795 -	 * Convert a unicode codepoint to a UTF-32LE character sequence
   1.796 -	 *
   1.797 -	 * @param int $codepoint
   1.798 -	 * @return string
   1.799 -	 */
   1.800 -	private static function codepoint_to_utf32le($codepoint)
   1.801 -	{
   1.802 -		if (self::valid_unicode_codepoint($codepoint))
   1.803 -		{
   1.804 -			return pack('V', $codepoint);
   1.805 -		}
   1.806 -		else
   1.807 -		{
   1.808 -			return "\xFD\xFF\x00\x00";
   1.809 -		}
   1.810 -	}
   1.811 +    /**
   1.812 +     * Contains the raw unicode data that we're working from
   1.813 +     *
   1.814 +     * @var string UTF-32BE binary string on PHP < 6, otherwise a unicode string
   1.815 +     */
   1.816 +    private $data;
   1.817 +    
   1.818 +    /**
   1.819 +     * Object should be created with some Unicode::from_*() method, therefore
   1.820 +     * this is private
   1.821 +     */
   1.822 +    private function __construct()
   1.823 +    {
   1.824 +    }
   1.825 +    
   1.826 +    /**
   1.827 +     * Prepare the object for serialisation
   1.828 +     */
   1.829 +    public function __sleep()
   1.830 +    {
   1.831 +        return array('data');
   1.832 +    }
   1.833 +    
   1.834 +    /**
   1.835 +     * Check the object is valid when being unserialised
   1.836 +     *
   1.837 +     * To prepare the object for use after being unserialised, we need to check
   1.838 +     * that it is valid. If Unicode::$data is not a string, a warning will be thrown. The
   1.839 +     * validity of the UTF-32BE Unicode::$data is also checked, and the string
   1.840 +     * is corrected if it is invalid.
   1.841 +     */
   1.842 +    public function __wakeup()
   1.843 +    {
   1.844 +        if (!isset($this->data))
   1.845 +        {
   1.846 +            trigger_error('Unicode::__wakeup() expects the serialised object to have a $data property, none exists', E_USER_WARNING);
   1.847 +            $this->data = '';
   1.848 +        }
   1.849 +        elseif (!is_string($this->data))
   1.850 +        {
   1.851 +            trigger_error('Unicode::__wakeup() expects Unicode::$data to be string, ' . get_type($this->data) . ' given', E_USER_WARNING);
   1.852 +            $this->data = '';
   1.853 +        }
   1.854 +        else
   1.855 +        {
   1.856 +            $this->data = Unicode::from_utf32be($this->data)->to_utf32be();
   1.857 +        }
   1.858 +    }
   1.859 +        
   1.860 +    /**
   1.861 +     * Check the given codepoint is a valid character
   1.862 +     *
   1.863 +     * @param int $codepoint
   1.864 +     * @return bool
   1.865 +     */
   1.866 +    private static function valid_unicode_codepoint($codepoint)
   1.867 +    {
   1.868 +        // Outside of Unicode codespace
   1.869 +        if ($codepoint < 0
   1.870 +            || $codepoint > 0x10FFFF
   1.871 +            // UTF-16 Surrogates
   1.872 +            || $codepoint >= 0xD800 && $codepoint <= 0xDFFF
   1.873 +            // Noncharacters
   1.874 +            || ($codepoint & 0xFFFE) === 0xFFFE
   1.875 +            || $codepoint >= 0xFDD0 && $codepoint <= 0xFDEF)
   1.876 +        {
   1.877 +            return false;
   1.878 +        }
   1.879 +        else
   1.880 +        {
   1.881 +            return true;
   1.882 +        }
   1.883 +    }
   1.884 +    
   1.885 +    /**
   1.886 +     * Create a new Unicode object from an array of codepoints
   1.887 +     *
   1.888 +     * @param array $array
   1.889 +     * @return Unicode
   1.890 +     */
   1.891 +    public static function from_codepoint_array($array)
   1.892 +    {
   1.893 +        // Check given parameter is an array
   1.894 +        if (!is_array($string))
   1.895 +        {
   1.896 +            trigger_error('Unicode::from_codepoint_array() expects parameter 1 to be array, ' . get_type($string) . ' given', E_USER_WARNING);
   1.897 +            return false;
   1.898 +        }
   1.899 +        
   1.900 +        // Get U+FFFD as a binary string
   1.901 +        static $replacement_character = "\x00\x00\xFF\xFD";
   1.902 +        
   1.903 +        // Create new object
   1.904 +        $unicode = new Unicode;
   1.905 +        
   1.906 +        // Strip any leading BOM (as otherwise we chage the meaing of the new sequence, which is illegal)
   1.907 +        if (isset($array[0]) && $array[0] === 0xFFFD)
   1.908 +        {
   1.909 +            array_splice($array, 1);
   1.910 +        }
   1.911 +        
   1.912 +        // Iterate through each and every codepoint
   1.913 +        foreach ($array as $codepoint)
   1.914 +        {
   1.915 +            // If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
   1.916 +            if (!self::valid_unicode_codepoint($codepoint))
   1.917 +            {
   1.918 +                $unicode->data .= $replacement_character;
   1.919 +            }
   1.920 +            // Otherwise, append it to Unicode::$data
   1.921 +            else
   1.922 +            {
   1.923 +                $unicode->data .= pack('N', $codepoint);
   1.924 +            }
   1.925 +        }
   1.926 +        
   1.927 +        return $unicode;
   1.928 +    }
   1.929 +    
   1.930 +    /**
   1.931 +     * Create an array of codepoints from the object
   1.932 +     *
   1.933 +     * @return string
   1.934 +     */
   1.935 +    public function to_codepoint_array()
   1.936 +    {
   1.937 +        $data = $this->data;
   1.938 +        return array_values(unpack('N*', $data));
   1.939 +    }
   1.940 +    
   1.941 +    /**
   1.942 +     * Create a new Unicode object from a UTF-8 encoded string
   1.943 +     *
   1.944 +     * @param string $string
   1.945 +     * @return Unicode
   1.946 +     */
   1.947 +    public static function from_utf8($string)
   1.948 +    {
   1.949 +        // Check given parameter is a string
   1.950 +        if (!is_string($string))
   1.951 +        {
   1.952 +            trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   1.953 +            return false;
   1.954 +        }
   1.955 +        
   1.956 +        // Create new object
   1.957 +        $unicode = new Unicode;
   1.958 +        
   1.959 +        // Set the data to an empty string, and remaining bytes in the current sequence to zero
   1.960 +        $unicode->data = '';
   1.961 +        $remaining = 0;
   1.962 +        
   1.963 +        // Iterate through each and every byte
   1.964 +        for ($i = 0, $len = strlen($string); $i < $len; $i++)
   1.965 +        {
   1.966 +            $value = ord($string[$i]);
   1.967 +            
   1.968 +            // If we're the first byte of sequence:
   1.969 +            if (!$remaining)
   1.970 +            {
   1.971 +                // One byte sequence:
   1.972 +                if ($value <= 0x7F)
   1.973 +                {
   1.974 +                    $character = $value;
   1.975 +                    $length = 1;
   1.976 +                }
   1.977 +                // Two byte sequence:
   1.978 +                elseif (($value & 0xE0) === 0xC0)
   1.979 +                {
   1.980 +                    $character = ($value & 0x1F) << 6;
   1.981 +                    $length = 2;
   1.982 +                    $remaining = 1;
   1.983 +                }
   1.984 +                // Three byte sequence:
   1.985 +                elseif (($value & 0xF0) === 0xE0)
   1.986 +                {
   1.987 +                    $character = ($value & 0x0F) << 12;
   1.988 +                    $length = 3;
   1.989 +                    $remaining = 2;
   1.990 +                }
   1.991 +                // Four byte sequence:
   1.992 +                elseif (($value & 0xF8) === 0xF0)
   1.993 +                {
   1.994 +                    $character = ($value & 0x07) << 18;
   1.995 +                    $length = 4;
   1.996 +                    $remaining = 3;
   1.997 +                }
   1.998 +                // Invalid byte:
   1.999 +                else
  1.1000 +                {
  1.1001 +                    $character = 0xFFFD;
  1.1002 +                    $length = 3;
  1.1003 +                    $remaining = 0;
  1.1004 +                }
  1.1005 +            }
  1.1006 +            // Continuation byte:
  1.1007 +            else
  1.1008 +            {
  1.1009 +                // Check that the byte is valid, then add it to the character:
  1.1010 +                if (($value & 0xC0) === 0x80)
  1.1011 +                {
  1.1012 +                    $remaining--;
  1.1013 +                    $character |= ($value & 0x3F) << ($remaining * 6);
  1.1014 +                }
  1.1015 +                // If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
  1.1016 +                else
  1.1017 +                {
  1.1018 +                    $character = 0xFFFD;
  1.1019 +                    $length = 3;
  1.1020 +                    $remaining = 0;
  1.1021 +                    $i--;
  1.1022 +                }
  1.1023 +            }
  1.1024 +            
  1.1025 +            // If we've reached the end of the current byte sequence, append it to Unicode::$data
  1.1026 +            if (!$remaining)
  1.1027 +            {
  1.1028 +                // If the character is illegal replace it with U+FFFD REPLACEMENT CHARACTER
  1.1029 +                if ($length > 1 && $character <= 0x7F
  1.1030 +                    || $length > 2 && $character <= 0x7FF
  1.1031 +                    || $length > 3 && $character <= 0xFFFF
  1.1032 +                    || !self::valid_unicode_codepoint($character))
  1.1033 +                {
  1.1034 +                    $character = 0xFFFD;
  1.1035 +                }
  1.1036 +                
  1.1037 +                $unicode->data .= pack('N', $character);
  1.1038 +            }
  1.1039 +        }
  1.1040 +        
  1.1041 +        // Strip any U+FEFF BYTE ORDER MARK (as otherwise we chage the meaing of the new sequence, which is illegal)
  1.1042 +        if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
  1.1043 +        {
  1.1044 +            $unicode->data = substr($unicode->data, 4);
  1.1045 +        }
  1.1046 +        
  1.1047 +        // If we've reached the end of the string but not the end of a character sequence, append a U+FFFD REPLACEMENT CHARACTE
  1.1048 +        if ($remaining > 0)
  1.1049 +        {
  1.1050 +            $unicode->data .= "\x00\x00\xFF\xFD";
  1.1051 +        }
  1.1052 +        return $unicode;
  1.1053 +    }
  1.1054 +    
  1.1055 +    /**
  1.1056 +     * Create a UTF-8 binary string from the object
  1.1057 +     *
  1.1058 +     * @return string
  1.1059 +     */
  1.1060 +    public function to_utf8()
  1.1061 +    {
  1.1062 +        if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-8', 'UTF-32BE')))
  1.1063 +        {
  1.1064 +            return $return;
  1.1065 +        }
  1.1066 +        elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-8', $this->data)))
  1.1067 +        {
  1.1068 +            return $return;
  1.1069 +        }
  1.1070 +        else
  1.1071 +        {
  1.1072 +            $codepoints = unpack('N*', $this->data);
  1.1073 +            $return = '';
  1.1074 +            foreach ($codepoints as $codepoint)
  1.1075 +            {
  1.1076 +                $return .= self::codepoint_to_utf8($codepoint);
  1.1077 +            }
  1.1078 +            return $return;
  1.1079 +        }
  1.1080 +    }
  1.1081 +    
  1.1082 +    /**
  1.1083 +     * Convert a unicode codepoint to a UTF-8 character sequence
  1.1084 +     *
  1.1085 +     * @param int $codepoint
  1.1086 +     * @return string
  1.1087 +     */
  1.1088 +    private static function codepoint_to_utf8($codepoint)
  1.1089 +    {
  1.1090 +        // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
  1.1091 +        static $cache;
  1.1092 +        
  1.1093 +        // If we haven't already got it cached, go cache it
  1.1094 +        if (!isset($cache[$codepoint]))
  1.1095 +        {
  1.1096 +            // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
  1.1097 +            if (!self::valid_unicode_codepoint($codepoint))
  1.1098 +            {
  1.1099 +                $cache[$codepoint] = "\xEF\xBF\xBD";
  1.1100 +            }
  1.1101 +            // One byte sequence:
  1.1102 +            elseif ($codepoint <= 0x7F)
  1.1103 +            {
  1.1104 +                $cache[$codepoint] = chr($codepoint);
  1.1105 +            }
  1.1106 +            // Two byte sequence:
  1.1107 +            elseif ($codepoint <= 0x7FF)
  1.1108 +            {
  1.1109 +                $cache[$codepoint] = chr(0xC0 | ($codepoint >> 6)) . chr(0x80 | ($codepoint & 0x3F));
  1.1110 +            }
  1.1111 +            // Three byte sequence:
  1.1112 +            elseif ($codepoint <= 0xFFFF)
  1.1113 +            {
  1.1114 +                $cache[$codepoint] = chr(0xE0 | ($codepoint >> 12)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
  1.1115 +            }
  1.1116 +            // Four byte sequence:
  1.1117 +            else
  1.1118 +            {
  1.1119 +                $cache[$codepoint] = chr(0xF0 | ($codepoint >> 18)) . chr(0x80 | (($codepoint >> 12) & 0x3F)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
  1.1120 +            }
  1.1121 +        }
  1.1122 +        return $cache[$codepoint];
  1.1123 +    }
  1.1124 +    
  1.1125 +    /**
  1.1126 +     * Create a new Unicode object from a UTF-16 encoded string
  1.1127 +     *
  1.1128 +     * @param string $string
  1.1129 +     * @return Unicode
  1.1130 +     */
  1.1131 +    public static function from_utf16($string)
  1.1132 +    {
  1.1133 +        // Check given parameter is a string
  1.1134 +        if (!is_string($string))
  1.1135 +        {
  1.1136 +            trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
  1.1137 +            return false;
  1.1138 +        }
  1.1139 +        
  1.1140 +        // Create new object
  1.1141 +        $unicode = new Unicode;
  1.1142 +        
  1.1143 +        // Set the data to an empty string and surrogate to false
  1.1144 +        $unicode->data = '';
  1.1145 +        $surrogate = false;
  1.1146 +        
  1.1147 +        // See if the string is of a valid length (as UTF-16 is in two byte sequences, it must be divisible by two)
  1.1148 +        $valid_length = (($len = strlen($string)) % 2) ? false : true;
  1.1149 +        
  1.1150 +        // If it is of an invalid length, trim all the invalid bytes at the end (we'll replace them with a U+FFFD REPLACEMENT CHARACTER later)
  1.1151 +        if (!$valid_length)
  1.1152 +        {
  1.1153 +            $string = substr($string, 0, floor($len / 2) * 2);
  1.1154 +        }
  1.1155 +        
  1.1156 +        // If the string starts with a UTF-16LE BOM, it is UTF-16LE, so decode it as such
  1.1157 +        if (substr($string, 0, 2) === "\xFF\xFE")
  1.1158 +        {
  1.1159 +            $words = array_values(unpack('v*', $string));
  1.1160 +        }
  1.1161 +        // Otherwise, it is UTF-16BE, so decode it as such
  1.1162 +        else
  1.1163 +        {
  1.1164 +            $words = array_values(unpack('n*', $string));
  1.1165 +        }
  1.1166 +        
  1.1167 +        // Iterate through each and every word
  1.1168 +        for ($i = 0, $word_count = count($words); $i < $word_count; $i++)
  1.1169 +        {
  1.1170 +            // If we're the first word of sequence:
  1.1171 +            if (!$surrogate)
  1.1172 +            {
  1.1173 +                // One word sequence:
  1.1174 +                if (self::valid_unicode_codepoint($words[$i]))
  1.1175 +                {
  1.1176 +                    $unicode->data .= pack('N', $words[$i]);
  1.1177 +                }
  1.1178 +                // Two word sequence:
  1.1179 +                elseif ($words[$i] >= 0xD800 && $words[$i] <= 0xDFFF)
  1.1180 +                {
  1.1181 +                    $character = ($words[$i] & 0x3FF) << 10;
  1.1182 +                    $surrogate = true;
  1.1183 +                }
  1.1184 +                // Invalid word:
  1.1185 +                else
  1.1186 +                {
  1.1187 +                    $unicode->data .= pack('N', 0xFFFD);
  1.1188 +                }
  1.1189 +            }
  1.1190 +            // Second word:
  1.1191 +            else
  1.1192 +            {
  1.1193 +                // Surrogates are only ever two words, so we can say we've reached the end with certainty
  1.1194 +                $surrogate = false;
  1.1195 +                
  1.1196 +                // Check that the word is valid, then add it to the character:
  1.1197 +                if ($words[$i] >= 0xDC00 && $words[$i] <= 0xDFFF)
  1.1198 +                {
  1.1199 +                    $character |= $words[$i] & 0x3FF;
  1.1200 +                    if (self::valid_unicode_codepoint($character))
  1.1201 +                    {
  1.1202 +                        $unicode->data .= pack('N', $character);
  1.1203 +                    }
  1.1204 +                    else
  1.1205 +                    {
  1.1206 +                        $unicode->data .= pack('N', 0xFFFD);
  1.1207 +                    }
  1.1208 +                }
  1.1209 +                // If it is invalid, count the sequence as invalid and reprocess the current word as a first word:
  1.1210 +                else
  1.1211 +                {
  1.1212 +                    $unicode->data .= pack('N', 0xFFFD);
  1.1213 +                    $i--;
  1.1214 +                }
  1.1215 +            }
  1.1216 +        }
  1.1217 +        
  1.1218 +        // If we've reached the end of the string but not the end of a surrogate pair, append a U+FFFD REPLACEMENT CHARACTER
  1.1219 +        if ($surrogate)
  1.1220 +        {
  1.1221 +            $unicode->data .= "\x00\x00\xFF\xFD";
  1.1222 +        }
  1.1223 +        
  1.1224 +        // If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
  1.1225 +        if (!$valid_length)
  1.1226 +        {
  1.1227 +            $unicode->data .= "\x00\x00\xFF\xFD";
  1.1228 +        }
  1.1229 +        
  1.1230 +        // Strip any U+FEFF BYTE ORDER MARK (as otherwise we chage the meaing of the new sequence, which is illegal)
  1.1231 +        if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
  1.1232 +        {
  1.1233 +            $unicode->data = substr($unicode->data, 4);
  1.1234 +        }
  1.1235 +        return $unicode;
  1.1236 +    }
  1.1237 +    
  1.1238 +    /**
  1.1239 +     * Create a new Unicode object from a UTF-16BE encoded string
  1.1240 +     *
  1.1241 +     * @param string $string
  1.1242 +     * @return Unicode
  1.1243 +     */
  1.1244 +    public static function from_utf16be($string)
  1.1245 +    {
  1.1246 +        // Check given parameter is a string
  1.1247 +        if (!is_string($string))
  1.1248 +        {
  1.1249 +            trigger_error('Unicode::from_utf16be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
  1.1250 +            return false;
  1.1251 +        }
  1.1252 +        
  1.1253 +        // Add BOM before calling Unicode::from_utf16()
  1.1254 +        return self::from_utf16("\xFE\xFF" . $string);
  1.1255 +    }
  1.1256 +    
  1.1257 +    /**
  1.1258 +     * Create a new Unicode object from a UTF-16LE encoded string
  1.1259 +     *
  1.1260 +     * @param string $string
  1.1261 +     * @return Unicode
  1.1262 +     */
  1.1263 +    public static function from_utf16le($string)
  1.1264 +    {
  1.1265 +        // Check given parameter is a string
  1.1266 +        if (!is_string($string))
  1.1267 +        {
  1.1268 +            trigger_error('Unicode::from_utf16le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
  1.1269 +            return false;
  1.1270 +        }
  1.1271 +        
  1.1272 +        // Add BOM before calling Unicode::from_utf16()
  1.1273 +        return self::from_utf16("\xFF\xFE" . $string);
  1.1274 +    }
  1.1275 +    
  1.1276 +    /**
  1.1277 +     * Create a UTF-16 binary string from the object
  1.1278 +     *
  1.1279 +     * @return string
  1.1280 +     */
  1.1281 +    public function to_utf16()
  1.1282 +    {
  1.1283 +        return "\xFE\xFF" . $this->to_utf16be();
  1.1284 +    }
  1.1285 +    
  1.1286 +    /**
  1.1287 +     * Create a UTF-16BE binary string from the object
  1.1288 +     *
  1.1289 +     * @return string
  1.1290 +     */
  1.1291 +    public function to_utf16be()
  1.1292 +    {
  1.1293 +        if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16BE', 'UTF-32BE')))
  1.1294 +        {
  1.1295 +            return $return;
  1.1296 +        }
  1.1297 +        elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16BE', $this->data)))
  1.1298 +        {
  1.1299 +            return $return;
  1.1300 +        }
  1.1301 +        else
  1.1302 +        {
  1.1303 +            $codepoints = unpack('N*', $this->data);
  1.1304 +            $return = '';
  1.1305 +            foreach ($codepoints as $codepoint)
  1.1306 +            {
  1.1307 +                $return .= self::codepoint_to_utf16be($codepoint);
  1.1308 +            }
  1.1309 +            return $return;
  1.1310 +        }
  1.1311 +    }
  1.1312 +    
  1.1313 +    /**
  1.1314 +     * Create a UTF-16LE binary string from the object
  1.1315 +     *
  1.1316 +     * @return string
  1.1317 +     */
  1.1318 +    public function to_utf16le()
  1.1319 +    {
  1.1320 +        if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16LE', 'UTF-32BE')))
  1.1321 +        {
  1.1322 +            return $return;
  1.1323 +        }
  1.1324 +        elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16LE', $this->data)))
  1.1325 +        {
  1.1326 +            return $return;
  1.1327 +        }
  1.1328 +        else
  1.1329 +        {
  1.1330 +            $codepoints = unpack('N*', $this->data);
  1.1331 +            $return = '';
  1.1332 +            foreach ($codepoints as $codepoint)
  1.1333 +            {
  1.1334 +                $return .= self::codepoint_to_utf16le($codepoint);
  1.1335 +            }
  1.1336 +            return $return;
  1.1337 +        }
  1.1338 +    }
  1.1339 +    
  1.1340 +    /**
  1.1341 +     * Convert a unicode codepoint to a UTF-16 character sequence
  1.1342 +     *
  1.1343 +     * @param int $codepoint
  1.1344 +     * @return string
  1.1345 +     */
  1.1346 +    private static function codepoint_to_utf16($codepoint)
  1.1347 +    {
  1.1348 +        return self::codepoint_to_utf16be($codepoint);
  1.1349 +    }
  1.1350 +    
  1.1351 +    /**
  1.1352 +     * Convert a unicode codepoint to a UTF-16BE character sequence
  1.1353 +     *
  1.1354 +     * @param int $codepoint
  1.1355 +     * @return string
  1.1356 +     */
  1.1357 +    private static function codepoint_to_utf16be($codepoint)
  1.1358 +    {
  1.1359 +        // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
  1.1360 +        static $cache;
  1.1361 +        
  1.1362 +        // If we haven't already got it cached, go cache it
  1.1363 +        if (!isset($cache[$codepoint]))
  1.1364 +        {
  1.1365 +            // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
  1.1366 +            if (!self::valid_unicode_codepoint($codepoint))
  1.1367 +            {
  1.1368 +                $cache[$codepoint] = "\xFF\xFD";
  1.1369 +            }
  1.1370 +            // Without a surrogate:
  1.1371 +            elseif ($codepoint < 0x10000)
  1.1372 +            {
  1.1373 +                $cache[$codepoint] = pack('n', $codepoint);
  1.1374 +            }
  1.1375 +            // With a surrogate
  1.1376 +            else
  1.1377 +            {
  1.1378 +                $surrogate_code_point = $codepoint - 0x10000;
  1.1379 +                $cache[$codepoint] = pack('n*', ($codepoint >> 10) | 0xD800, ($codepoint & 0x03FF) | 0xDC00);
  1.1380 +            }
  1.1381 +        }
  1.1382 +        return $cache[$codepoint];
  1.1383 +    }
  1.1384 +    
  1.1385 +    /**
  1.1386 +     * Convert a unicode codepoint to a UTF-16LE character sequence
  1.1387 +     *
  1.1388 +     * @param int $codepoint
  1.1389 +     * @return string
  1.1390 +     */
  1.1391 +    private static function codepoint_to_utf16le($codepoint)
  1.1392 +    {
  1.1393 +        // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
  1.1394 +        static $cache;
  1.1395 +        
  1.1396 +        // If we haven't already got it cached, go cache it
  1.1397 +        if (!isset($cache[$codepoint]))
  1.1398 +        {
  1.1399 +            // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
  1.1400 +            if (!self::valid_unicode_codepoint($codepoint))
  1.1401 +            {
  1.1402 +                $cache[$codepoint] = "\xFD\xFF";
  1.1403 +            }
  1.1404 +            // Without a surrogate:
  1.1405 +            elseif ($codepoint < 0x10000)
  1.1406 +            {
  1.1407 +                $cache[$codepoint] = pack('v', $codepoint);
  1.1408 +            }
  1.1409 +            // With a surrogate
  1.1410 +            else
  1.1411 +            {
  1.1412 +                $surrogate_code_point = $codepoint - 0x10000;
  1.1413 +                $cache[$codepoint] = pack('v*', ($codepoint >> 10) | 0xD800, ($codepoint & 0x03FF) | 0xDC00);
  1.1414 +            }
  1.1415 +        }
  1.1416 +        return $cache[$codepoint];
  1.1417 +    }
  1.1418 +    
  1.1419 +    /**
  1.1420 +     * Create a new Unicode object from a UTF-32 encoded string
  1.1421 +     *
  1.1422 +     * @param string $string
  1.1423 +     * @return Unicode
  1.1424 +     */
  1.1425 +    public static function from_utf32($string)
  1.1426 +    {
  1.1427 +        // Check given parameter is a string
  1.1428 +        if (!is_string($string))
  1.1429 +        {
  1.1430 +            trigger_error('Unicode::from_utf32() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
  1.1431 +            return false;
  1.1432 +        }
  1.1433 +        
  1.1434 +        // Create new object
  1.1435 +        $unicode = new Unicode;
  1.1436 +        
  1.1437 +        // Set the data to an empty string
  1.1438 +        $unicode->data = '';
  1.1439 +        
  1.1440 +        // See if the string is of a valid length (as UTF-32 is in four byte sequences, it must be divisible by four)
  1.1441 +        $valid_length = (($len = strlen($string)) % 4) ? false : true;
  1.1442 +        
  1.1443 +        // If it is of an invalid length, trim all the invalid bytes at the end (we'll replace them with a U+FFFD REPLACEMENT CHARACTER later)
  1.1444 +        if (!$valid_length)
  1.1445 +        {
  1.1446 +            $string = substr($string, 0, floor($len / 4) * 4);
  1.1447 +        }
  1.1448 +        
  1.1449 +        // If the string starts with a UTF-32LE BOM, it is UTF-32LE, so decode it as such
  1.1450 +        if (substr($string, 0, 4) === "\xFF\xFE\x00\x00")
  1.1451 +        {
  1.1452 +            $codepoints = unpack('V*', $string);
  1.1453 +        }
  1.1454 +        // Otherwise, it is UTF-32BE, so decode it as such
  1.1455 +        else
  1.1456 +        {
  1.1457 +            $codepoints = unpack('N*', $string);
  1.1458 +        }
  1.1459 +        
  1.1460 +        // Iterate through each and every codepoint
  1.1461 +        foreach ($codepoints as $codepoint)
  1.1462 +        {
  1.1463 +            // If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
  1.1464 +            if (!self::valid_unicode_codepoint($codepoint))
  1.1465 +            {
  1.1466 +                $unicode->data .= "\x00\x00\xFF\xFD";
  1.1467 +            }
  1.1468 +            // Otherwise, append it to Unicode::$data
  1.1469 +            else
  1.1470 +            {
  1.1471 +                $unicode->data .= pack('N', $codepoint);
  1.1472 +            }
  1.1473 +        }
  1.1474 +        
  1.1475 +        // If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
  1.1476 +        if (!$valid_length)
  1.1477 +        {
  1.1478 +            $unicode->data .= "\x00\x00\xFF\xFD";
  1.1479 +        }
  1.1480 +        
  1.1481 +        // Strip any leading BOM (as otherwise we chage the meaing of the new sequence, which is illegal)
  1.1482 +        if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
  1.1483 +        {
  1.1484 +            $unicode->data = substr($unicode->data, 4);
  1.1485 +        }
  1.1486 +        
  1.1487 +        return $unicode;
  1.1488 +    }
  1.1489 +    
  1.1490 +    /**
  1.1491 +     * Create a new Unicode object from a UTF-32BE encoded string
  1.1492 +     *
  1.1493 +     * @param string $string
  1.1494 +     * @return Unicode
  1.1495 +     */
  1.1496 +    public static function from_utf32be($string)
  1.1497 +    {
  1.1498 +        // Check given parameter is a string
  1.1499 +        if (!is_string($string))
  1.1500 +        {
  1.1501 +            trigger_error('Unicode::from_utf32be() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
  1.1502 +            return false;
  1.1503 +        }
  1.1504 +        
  1.1505 +        // Add BOM before calling Unicode::from_utf32()
  1.1506 +        return self::from_utf32("\x00\x00\xFE\xFF" . $string);
  1.1507 +    }
  1.1508 +    
  1.1509 +    /**
  1.1510 +     * Create a new Unicode object from a UTF-32LE encoded string
  1.1511 +     *
  1.1512 +     * @param string $string
  1.1513 +     * @return Unicode
  1.1514 +     */
  1.1515 +    public static function from_utf32le($string)
  1.1516 +    {
  1.1517 +        // Check given parameter is a string
  1.1518 +        if (!is_string($string))
  1.1519 +        {
  1.1520 +            trigger_error('Unicode::from_utf32le() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
  1.1521 +            return false;
  1.1522 +        }
  1.1523 +        
  1.1524 +        // Add BOM before calling Unicode::from_utf32()
  1.1525 +        return self::from_utf32("\xFF\xFE\x00\x00" . $string);
  1.1526 +    }
  1.1527 +    
  1.1528 +    /**
  1.1529 +     * Create a UTF-32 binary string from the object
  1.1530 +     *
  1.1531 +     * @return string
  1.1532 +     */
  1.1533 +    public function to_utf32()
  1.1534 +    {
  1.1535 +        return "\x00\x00\xFE\xFF" . $this->to_utf32be();
  1.1536 +    }
  1.1537 +    
  1.1538 +    /**
  1.1539 +     * Create a UTF-32BE binary string from the object
  1.1540 +     *
  1.1541 +     * @return string
  1.1542 +     */
  1.1543 +    public function to_utf32be()
  1.1544 +    {
  1.1545 +        return $this->data;
  1.1546 +    }
  1.1547 +    
  1.1548 +    /**
  1.1549 +     * Create a UTF-32LE binary string from the object
  1.1550 +     *
  1.1551 +     * @return string
  1.1552 +     */
  1.1553 +    public function to_utf32le()
  1.1554 +    {
  1.1555 +        if (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-32LE', 'UTF-32BE')))
  1.1556 +        {
  1.1557 +            return $return;
  1.1558 +        }
  1.1559 +        elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-32LE', $this->data)))
  1.1560 +        {
  1.1561 +            return $return;
  1.1562 +        }
  1.1563 +        else
  1.1564 +        {
  1.1565 +            return call_user_func_array('pack', array_merge(array('V*'), unpack('N*', $this->data)));
  1.1566 +        }
  1.1567 +    }
  1.1568 +    
  1.1569 +    /**
  1.1570 +     * Convert a unicode codepoint to a UTF-32 character sequence
  1.1571 +     *
  1.1572 +     * @param int $codepoint
  1.1573 +     * @return string
  1.1574 +     */
  1.1575 +    private static function codepoint_to_utf32($codepoint)
  1.1576 +    {
  1.1577 +        return self::codepoint_to_utf32be($codepoint);
  1.1578 +    }
  1.1579 +    
  1.1580 +    /**
  1.1581 +     * Convert a unicode codepoint to a UTF-32BE character sequence
  1.1582 +     *
  1.1583 +     * @param int $codepoint
  1.1584 +     * @return string
  1.1585 +     */
  1.1586 +    private static function codepoint_to_utf32be($codepoint)
  1.1587 +    {
  1.1588 +        if (self::valid_unicode_codepoint($codepoint))
  1.1589 +        {
  1.1590 +            return pack('N', $codepoint);
  1.1591 +        }
  1.1592 +        else
  1.1593 +        {
  1.1594 +            return "\x00\x00\xFF\xFD";
  1.1595 +        }
  1.1596 +    }
  1.1597 +    
  1.1598 +    /**
  1.1599 +     * Convert a unicode codepoint to a UTF-32LE character sequence
  1.1600 +     *
  1.1601 +     * @param int $codepoint
  1.1602 +     * @return string
  1.1603 +     */
  1.1604 +    private static function codepoint_to_utf32le($codepoint)
  1.1605 +    {
  1.1606 +        if (self::valid_unicode_codepoint($codepoint))
  1.1607 +        {
  1.1608 +            return pack('V', $codepoint);
  1.1609 +        }
  1.1610 +        else
  1.1611 +        {
  1.1612 +            return "\xFD\xFF\x00\x00";
  1.1613 +        }
  1.1614 +    }
  1.1615  }