unicode.php
author Geoffrey Sneddon <geoffers@gmail.com>
Fri May 30 12:38:43 2008 +0100 (3 months ago)
changeset 81 076ec20b0f39
parent 803bbe2bd8802a
permissions -rw-r--r--
Bye-bye unicode.semantics
     1 <?php
     2 /**
     3  * Class for manipulating Unicode data
     4  *
     5  * The MIT License
     6  *
     7  * Copyright (c) 2007 Geoffrey Sneddon
     8  * 
     9  * Permission is hereby granted, free of charge, to any person obtaining a copy
    10  * of this software and associated documentation files (the "Software"), to deal
    11  * in the Software without restriction, including without limitation the rights
    12  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
    13  * copies of the Software, and to permit persons to whom the Software is
    14  * furnished to do so, subject to the following conditions:
    15  * 
    16  * The above copyright notice and this permission notice shall be included in
    17  * all copies or substantial portions of the Software.
    18  * 
    19  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
    20  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
    21  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
    22  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
    23  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
    24  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
    25  * THE SOFTWARE.
    26  *
    27  * @package Unicode
    28  * @version 0.3-dev
    29  * @copyright 2007 Geoffrey Sneddon
    30  * @author Geoffrey Sneddon
    31  * @license http://www.opensource.org/licenses/mit-license.php The MIT License
    32 */
    33 
    34 /**
    35  * Unicode
    36  *
    37  * @package Unicode
    38  */
    39 class Unicode
    40 {
    41 	/**
    42 	 * Contains the raw unicode data that we're working from
    43 	 *
    44 	 * @var string UTF-32BE binary string on PHP < 6, otherwise a unicode string
    45 	 */
    46 	private $data;
    47 	
    /**
     * Private constructor
     *
     * Instances are meant to be obtained through the static Unicode::from_*()
     * factory methods, so direct construction from outside the class is
     * disallowed
     */
    private function __construct()
    {
        // Nothing to initialise here: each factory fills in Unicode::$data itself
    }
    55 	
    /**
     * Prepare the object for serialisation
     *
     * When the native unicode string API (planned for PHP 6) is available and
     * Unicode::$data is a unicode string, it is converted to a UTF-32BE binary
     * string first, so that the serialised object can be unserialised on PHP
     * versions without that API without affecting functionality
     *
     * @return array Names of the properties to serialise
     */
    public function __sleep()
    {
        // Detect the unicode API by feature rather than by version number:
        // PHP 6 was never released, and a ">= 6" version test also matches
        // PHP 7+, where is_unicode()/unicode_encode() are undefined
        if (function_exists('is_unicode') && function_exists('unicode_encode') && is_unicode($this->data))
        {
            $this->data = unicode_encode($this->data, 'UTF-32BE');
        }
        return array('data');
    }
    71 	
    /**
     * Check the object is valid when being unserialised
     *
     * Unicode::$data must exist and be a string; if it is missing or of any
     * other type an E_USER_WARNING is raised and it is reset to an empty
     * string. When the native unicode string API (planned for PHP 6) is
     * available, binary data is decoded from UTF-32BE into a unicode string.
     * Otherwise the data is round-tripped through Unicode::from_utf32be() and
     * Unicode::to_utf32be() (defined elsewhere in this class), which is
     * expected to correct invalid UTF-32BE.
     */
    public function __wakeup()
    {
        if (!isset($this->data))
        {
            trigger_error('Unicode::__wakeup() expects the serialised object to have a $data property, none exists', E_USER_WARNING);
            $this->data = '';
        }
        elseif (!is_string($this->data))
        {
            // gettype() is the built-in; there is no get_type() function
            trigger_error('Unicode::__wakeup() expects Unicode::$data to be string, ' . gettype($this->data) . ' given', E_USER_WARNING);
            $this->data = '';
        }
        // Detect the unicode API by feature: a ">= 6" version test also
        // matches PHP 7+, where these functions do not exist
        elseif (function_exists('unicode_decode') && function_exists('is_binary'))
        {
            // Data that is already a unicode string is left untouched
            if (is_binary($this->data))
            {
                $this->data = self::call_unicode_func('unicode_decode', $this->data, 'UTF-32BE');
            }
        }
        else
        {
            $this->data = Unicode::from_utf32be($this->data)->to_utf32be();
        }
    }
   102 	
   /**
    * Invoke a callback inside our own unicode conversion settings
    *
    * Variadic convenience wrapper: every argument after the callback is
    * forwarded to it through Unicode::call_unicode_func_array()
    *
    * @see call_user_func()
    * @see Unicode::call_unicode_func_array()
    * @param callback $function
    * @param mixed $parameter,...
    * @return mixed Whatever the callback returned
    */
   private static function call_unicode_func($function)
   {
       // Collect everything except the callback itself (argument 0), keeping
       // the original argument positions as keys
       $param_arr = array();
       foreach (func_get_args() as $position => $argument)
       {
           if ($position !== 0)
           {
               $param_arr[$position] = $argument;
           }
       }
       return self::call_unicode_func_array($function, $param_arr);
   }
   118 	
   /**
    * Invoke a callback, with its parameters given as an array, inside our
    * own unicode conversion settings
    *
    * The current substitution character and both conversion error modes are
    * remembered, replaced by "substitute with U+FFFD" settings for the
    * duration of the call, and put back afterwards
    *
    * @see call_user_func_array()
    * @see Unicode::call_unicode_func()
    * @param callback $function
    * @param array $param_arr
    * @return mixed Whatever the callback returned
    */
   private static function call_unicode_func_array($function, $param_arr)
   {
       // Remember the unicode environment as we found it
       $previous_subst_char = unicode_get_subst_char();
       $previous_from_mode = unicode_get_error_mode(FROM_UNICODE);
       $previous_to_mode = unicode_get_error_mode(TO_UNICODE);

       // Switch to our own unicode environment
       unicode_set_subst_char("\uFFFD");
       unicode_set_error_mode(FROM_UNICODE, U_CONV_ERROR_SUBST);
       unicode_set_error_mode(TO_UNICODE, U_CONV_ERROR_SUBST);

       // Run the callback under those settings
       $result = call_user_func_array($function, $param_arr);

       // Put the environment back the way it was
       unicode_set_subst_char($previous_subst_char);
       unicode_set_error_mode(FROM_UNICODE, $previous_from_mode);
       unicode_set_error_mode(TO_UNICODE, $previous_to_mode);

       return $result;
   }
   152 	
   /**
    * Tell whether the given codepoint is a valid character
    *
    * A codepoint is rejected when it lies outside U+0000..U+10FFFF, is a
    * UTF-16 surrogate (U+D800..U+DFFF), or is a noncharacter (the last two
    * codepoints of every plane, or U+FDD0..U+FDEF)
    *
    * @param int $codepoint
    * @return bool
    */
   private static function valid_unicode_codepoint($codepoint)
   {
       // Outside of the Unicode codespace
       if ($codepoint < 0 || $codepoint > 0x10FFFF)
       {
           return false;
       }

       // UTF-16 surrogates
       if ($codepoint >= 0xD800 && $codepoint <= 0xDFFF)
       {
           return false;
       }

       // Noncharacters ending in FFFE or FFFF, in any plane
       if (($codepoint & 0xFFFE) === 0xFFFE)
       {
           return false;
       }

       // Noncharacters in the U+FDD0..U+FDEF block
       if ($codepoint >= 0xFDD0 && $codepoint <= 0xFDEF)
       {
           return false;
       }

       return true;
   }
   177 	
   /**
    * Create a new Unicode object from an array of codepoints
    *
    * A leading U+FEFF BYTE ORDER MARK is dropped, and every codepoint that
    * is not a valid character is replaced with U+FFFD REPLACEMENT CHARACTER.
    *
    * @param array $array Integer codepoints
    * @return Unicode|bool The new object, or false (after raising an
    *                      E_USER_WARNING) if $array is not an array
    */
   public static function from_codepoint_array($array)
   {
       // Check given parameter is an array
       if (!is_array($array))
       {
           trigger_error('Unicode::from_codepoint_array() expects parameter 1 to be array, ' . gettype($array) . ' given', E_USER_WARNING);
           return false;
       }

       // Detect the native unicode API (planned for PHP 6) by feature: a
       // ">= 6" version test also matches PHP 7+, where it does not exist
       $native_unicode = function_exists('unicode_encode') && function_exists('unicode_decode');

       // Get U+FFFD as a UTF-32BE binary string (which is slightly hard with
       // native unicode strings, hence the explicit encode)
       static $replacement_character;
       if (!$replacement_character)
       {
           if ($native_unicode)
           {
               // 'UTF-32BE' rather than 'UTF-32', so the result matches the
               // byte order of the pack('N') output it is mixed with below
               $replacement_character = unicode_encode("\uFFFD", 'UTF-32BE');
           }
           else
           {
               $replacement_character = "\x00\x00\xFF\xFD";
           }
       }

       // Create new object, starting from an empty string so that an empty
       // array yields empty data
       $unicode = new Unicode;
       $unicode->data = '';

       // Strip any leading BOM (as otherwise we change the meaning of the new sequence, which is illegal)
       if (isset($array[0]) && $array[0] === 0xFEFF)
       {
           array_splice($array, 0, 1);
       }

       // Iterate through each and every codepoint
       foreach ($array as $codepoint)
       {
           // If the codepoint is an invalid character replace it with a U+FFFD REPLACEMENT CHARACTER
           if (!self::valid_unicode_codepoint($codepoint))
           {
               $unicode->data .= $replacement_character;
           }
           // Otherwise, append it to Unicode::$data as a big-endian 32-bit value
           else
           {
               $unicode->data .= pack('N', $codepoint);
           }
       }

       // With native unicode strings, convert what we built and store that
       if ($native_unicode)
       {
           $unicode->data = unicode_decode($unicode->data, 'UTF-32BE');
       }

       return $unicode;
   }
   239 	
   /**
    * Create an array of codepoints from the object
    *
    * @return array Integer codepoints, indexed from zero
    */
   public function to_codepoint_array()
   {
       // Detect the native unicode API (planned for PHP 6) by feature: a
       // ">= 6" version test also matches PHP 7+, where it does not exist
       if (function_exists('is_unicode') && function_exists('unicode_encode') && is_unicode($this->data))
       {
           $data = unicode_encode($this->data, 'UTF-32BE');
       }
       else
       {
           $data = $this->data;
       }

       // unpack() numbers its results from 1, so reindex from 0
       return array_values(unpack('N*', $data));
   }
   257 	
   /**
    * Create a new Unicode object from a UTF-8 encoded string
    *
    * Without native unicode strings the input is decoded byte by byte into
    * UTF-32BE. Invalid bytes, overlong encodings, invalid codepoints and
    * truncated sequences each become U+FFFD REPLACEMENT CHARACTER, and a
    * leading U+FEFF BYTE ORDER MARK is stripped.
    *
    * @param string $string
    * @return Unicode|bool The new object, or false (after raising an
    *                      E_USER_WARNING) if $string is not a string
    */
   public static function from_utf8($string)
   {
       // Check given parameter is a string
       if (!is_string($string))
       {
           trigger_error('Unicode::from_utf8() expects parameter 1 to be string, ' . gettype($string) . ' given', E_USER_WARNING);
           return false;
       }

       // Create new object
       $unicode = new Unicode;

       // With the native unicode API (planned for PHP 6) we just get a unicode
       // string and store that. Detected by feature, because a ">= 6" version
       // test also matches PHP 7+, where the API does not exist
       if (function_exists('is_unicode') && function_exists('unicode_decode'))
       {
           if (is_unicode($string))
           {
               $unicode->data = $string;
           }
           else
           {
               $unicode->data = self::call_unicode_func('unicode_decode', $string, 'UTF-8');
           }
       }
       // Otherwise, we need to decode the UTF-8 string
       else
       {
           // Set the data to an empty string, and remaining bytes in the current sequence to zero
           $unicode->data = '';
           $remaining = 0;

           // Iterate through each and every byte
           for ($i = 0, $len = strlen($string); $i < $len; $i++)
           {
               $value = ord($string[$i]);

               // If we're the first byte of sequence:
               if (!$remaining)
               {
                   // One byte sequence:
                   if ($value <= 0x7F)
                   {
                       $character = $value;
                       $length = 1;
                   }
                   // Two byte sequence:
                   elseif (($value & 0xE0) === 0xC0)
                   {
                       $character = ($value & 0x1F) << 6;
                       $length = 2;
                       $remaining = 1;
                   }
                   // Three byte sequence:
                   elseif (($value & 0xF0) === 0xE0)
                   {
                       $character = ($value & 0x0F) << 12;
                       $length = 3;
                       $remaining = 2;
                   }
                   // Four byte sequence:
                   elseif (($value & 0xF8) === 0xF0)
                   {
                       $character = ($value & 0x07) << 18;
                       $length = 4;
                       $remaining = 3;
                   }
                   // Invalid byte (length 3 is what a real U+FFFD takes, so it passes the overlong check below):
                   else
                   {
                       $character = 0xFFFD;
                       $length = 3;
                       $remaining = 0;
                   }
               }
               // Continuation byte:
               else
               {
                   // Check that the byte is valid, then add it to the character:
                   if (($value & 0xC0) === 0x80)
                   {
                       $remaining--;
                       $character |= ($value & 0x3F) << ($remaining * 6);
                   }
                   // If it is invalid, count the sequence as invalid and reprocess the current byte as the start of a sequence:
                   else
                   {
                       $character = 0xFFFD;
                       $length = 3;
                       $remaining = 0;
                       $i--;
                   }
               }

               // If we've reached the end of the current byte sequence, append it to Unicode::$data
               if (!$remaining)
               {
                   // If the character is illegal (overlong encoding or invalid codepoint) replace it with U+FFFD REPLACEMENT CHARACTER
                   if ($length > 1 && $character <= 0x7F
                       || $length > 2 && $character <= 0x7FF
                       || $length > 3 && $character <= 0xFFFF
                       || !self::valid_unicode_codepoint($character))
                   {
                       $character = 0xFFFD;
                   }

                   $unicode->data .= pack('N', $character);
               }
           }

           // Strip any U+FEFF BYTE ORDER MARK (as otherwise we change the meaning of the new sequence, which is illegal)
           if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
           {
               $unicode->data = substr($unicode->data, 4);
           }

           // If we've reached the end of the string but not the end of a character sequence, append a U+FFFD REPLACEMENT CHARACTER
           if ($remaining > 0)
           {
               $unicode->data .= "\x00\x00\xFF\xFD";
           }
       }
       return $unicode;
   }
   387 	
   /**
    * Create a UTF-8 binary string from the object
    *
    * Uses, in order of preference: the native unicode API (planned for
    * PHP 6), mbstring, iconv, and finally a pure PHP conversion of each
    * codepoint. A converter that fails or returns an empty result is skipped
    * in favour of the next one.
    *
    * @return string
    */
   public function to_utf8()
   {
       // Detect the native unicode API by feature: a ">= 6" version test also
       // matches PHP 7+, where is_unicode()/unicode_encode() do not exist
       if (function_exists('is_unicode') && function_exists('unicode_encode') && is_unicode($this->data))
       {
           return unicode_encode($this->data, 'UTF-8');
       }
       elseif (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-8', 'UTF-32BE')))
       {
           return $return;
       }
       elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-8', $this->data)))
       {
           return $return;
       }
       else
       {
           // Last resort: split the UTF-32BE data into codepoints and encode each ourselves
           $codepoints = unpack('N*', $this->data);
           $return = '';
           foreach ($codepoints as $codepoint)
           {
               $return .= self::codepoint_to_utf8($codepoint);
           }
           return $return;
       }
   }
   418 	
   /**
    * Convert a unicode codepoint to a UTF-8 character sequence
    *
    * Results are memoised per codepoint. Without the native unicode API an
    * invalid codepoint is encoded as U+FFFD REPLACEMENT CHARACTER.
    *
    * @param int $codepoint
    * @return string
    */
   private static function codepoint_to_utf8($codepoint)
   {
       // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
       static $cache = array();

       // If we haven't already got it cached, go cache it
       if (!isset($cache[$codepoint]))
       {
           // With the native unicode API (planned for PHP 6) we can use its own
           // unicode support. Detected by feature, because a ">= 6" version
           // test also matches PHP 7+, where unicode_encode() does not exist
           if (function_exists('unicode_encode'))
           {
               $cache[$codepoint] = unicode_encode(self::call_unicode_func('chr', $codepoint), 'UTF-8');
           }
           // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
           elseif (!self::valid_unicode_codepoint($codepoint))
           {
               $cache[$codepoint] = "\xEF\xBF\xBD";
           }
           // One byte sequence:
           elseif ($codepoint <= 0x7F)
           {
               $cache[$codepoint] = chr($codepoint);
           }
           // Two byte sequence:
           elseif ($codepoint <= 0x7FF)
           {
               $cache[$codepoint] = chr(0xC0 | ($codepoint >> 6)) . chr(0x80 | ($codepoint & 0x3F));
           }
           // Three byte sequence:
           elseif ($codepoint <= 0xFFFF)
           {
               $cache[$codepoint] = chr(0xE0 | ($codepoint >> 12)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
           }
           // Four byte sequence:
           else
           {
               $cache[$codepoint] = chr(0xF0 | ($codepoint >> 18)) . chr(0x80 | (($codepoint >> 12) & 0x3F)) . chr(0x80 | (($codepoint >> 6) & 0x3F)) . chr(0x80 | ($codepoint & 0x3F));
           }
       }
       return $cache[$codepoint];
   }
   466 	
   /**
    * Create a new Unicode object from a UTF-16 encoded string
    *
    * Without native unicode strings the input is decoded word by word into
    * UTF-32BE. It is read as little-endian when it starts with a UTF-16LE
    * BOM and as big-endian otherwise. Unpaired surrogates, invalid
    * codepoints and a trailing odd byte each become U+FFFD REPLACEMENT
    * CHARACTER, and a leading U+FEFF BYTE ORDER MARK is stripped.
    *
    * @param string $string
    * @return Unicode|bool The new object, or false (after raising an
    *                      E_USER_WARNING) if $string is not a string
    */
   public static function from_utf16($string)
   {
       // Check given parameter is a string
       if (!is_string($string))
       {
           trigger_error('Unicode::from_utf16() expects parameter 1 to be string, ' . gettype($string) . ' given', E_USER_WARNING);
           return false;
       }

       // Create new object
       $unicode = new Unicode;

       // With the native unicode API (planned for PHP 6) we just get a unicode
       // string and store that. Detected by feature, because a ">= 6" version
       // test also matches PHP 7+, where the API does not exist
       if (function_exists('is_unicode') && function_exists('unicode_decode'))
       {
           if (is_unicode($string))
           {
               $unicode->data = $string;
           }
           else
           {
               $unicode->data = self::call_unicode_func('unicode_decode', $string, 'UTF-16');
           }
       }
       // Otherwise, we need to decode the UTF-16 string
       else
       {
           // Set the data to an empty string and surrogate to false
           $unicode->data = '';
           $surrogate = false;

           // See if the string is of a valid length (as UTF-16 is in two byte sequences, it must be divisible by two)
           $len = strlen($string);
           $valid_length = ($len % 2 === 0);

           // If it is of an invalid length, trim the stray byte at the end (we'll replace it with a U+FFFD REPLACEMENT CHARACTER later)
           if (!$valid_length)
           {
               $string = substr($string, 0, $len - 1);
           }

           // Nothing to unpack for an empty string
           if ($string === '' || $string === false)
           {
               $words = array();
           }
           // If the string starts with a UTF-16LE BOM, it is UTF-16LE, so decode it as such
           elseif (substr($string, 0, 2) === "\xFF\xFE")
           {
               $words = array_values(unpack('v*', $string));
           }
           // Otherwise, it is UTF-16BE, so decode it as such
           else
           {
               $words = array_values(unpack('n*', $string));
           }

           // Iterate through each and every word
           for ($i = 0, $word_count = count($words); $i < $word_count; $i++)
           {
               $word = $words[$i];

               // If we're the first word of sequence:
               if (!$surrogate)
               {
                   // One word sequence:
                   if (self::valid_unicode_codepoint($word))
                   {
                       $unicode->data .= pack('N', $word);
                   }
                   // Two word sequence: only a high surrogate (U+D800..U+DBFF) may start a pair
                   elseif ($word >= 0xD800 && $word <= 0xDBFF)
                   {
                       // Supplementary codepoints start at U+10000, so that offset is added to the ten high bits
                       $character = 0x10000 + (($word & 0x3FF) << 10);
                       $surrogate = true;
                   }
                   // Invalid word (including a low surrogate with no preceding high surrogate):
                   else
                   {
                       $unicode->data .= pack('N', 0xFFFD);
                   }
               }
               // Second word:
               else
               {
                   // Surrogates are only ever two words, so we can say we've reached the end with certainty
                   $surrogate = false;

                   // Check that the word is a low surrogate, then add its ten low bits to the character:
                   if ($word >= 0xDC00 && $word <= 0xDFFF)
                   {
                       $character |= $word & 0x3FF;
                       if (self::valid_unicode_codepoint($character))
                       {
                           $unicode->data .= pack('N', $character);
                       }
                       else
                       {
                           $unicode->data .= pack('N', 0xFFFD);
                       }
                   }
                   // If it is invalid, count the sequence as invalid and reprocess the current word as a first word:
                   else
                   {
                       $unicode->data .= pack('N', 0xFFFD);
                       $i--;
                   }
               }
           }

           // If we've reached the end of the string but not the end of a surrogate pair, append a U+FFFD REPLACEMENT CHARACTER
           if ($surrogate)
           {
               $unicode->data .= "\x00\x00\xFF\xFD";
           }

           // If it was of an invalid length, append a U+FFFD REPLACEMENT CHARACTER
           if (!$valid_length)
           {
               $unicode->data .= "\x00\x00\xFF\xFD";
           }

           // Strip any U+FEFF BYTE ORDER MARK (as otherwise we change the meaning of the new sequence, which is illegal)
           if (substr($unicode->data, 0, 4) === "\x00\x00\xFE\xFF")
           {
               $unicode->data = substr($unicode->data, 4);
           }
       }
       return $unicode;
   }
   595 	
   /**
    * Create a new Unicode object from a UTF-16BE encoded string
    *
    * A big-endian BOM is prepended to binary input so that
    * Unicode::from_utf16() decodes it with the right byte order.
    *
    * @param string $string
    * @return Unicode|bool The new object, or false (after raising an
    *                      E_USER_WARNING) if $string is not a string
    */
   public static function from_utf16be($string)
   {
       // Check given parameter is a string
       if (!is_string($string))
       {
           trigger_error('Unicode::from_utf16be() expects parameter 1 to be string, ' . gettype($string) . ' given', E_USER_WARNING);
           return false;
       }

       // Add BOM before calling Unicode::from_utf16(). Every string is binary
       // unless the native unicode API (planned for PHP 6) exists; that is
       // detected by feature, because a ">= 6" version test also matches
       // PHP 7+, where is_binary()/unicode_encode() do not exist
       if (!function_exists('is_binary') || is_binary($string))
       {
           // Get U+FEFF as a binary string (which is slightly hard with native unicode strings)
           static $bom;
           if (!$bom)
           {
               if (function_exists('unicode_encode'))
               {
                   $bom = unicode_encode("\uFEFF", 'UTF-16BE');
               }
               else
               {
                   $bom = "\xFE\xFF";
               }
           }
           $string = $bom . $string;
       }
       return self::from_utf16($string);
   }
   631 	
   /**
    * Create a new Unicode object from a UTF-16LE encoded string
    *
    * A little-endian BOM is prepended to binary input so that
    * Unicode::from_utf16() decodes it with the right byte order.
    *
    * @param string $string
    * @return Unicode|bool The new object, or false (after raising an
    *                      E_USER_WARNING) if $string is not a string
    */
   public static function from_utf16le($string)
   {
       // Check given parameter is a string
       if (!is_string($string))
       {
           trigger_error('Unicode::from_utf16le() expects parameter 1 to be string, ' . gettype($string) . ' given', E_USER_WARNING);
           return false;
       }

       // Add BOM before calling Unicode::from_utf16(). Every string is binary
       // unless the native unicode API (planned for PHP 6) exists; that is
       // detected by feature, because a ">= 6" version test also matches
       // PHP 7+, where is_binary()/unicode_encode() do not exist
       if (!function_exists('is_binary') || is_binary($string))
       {
           // Get U+FEFF as a binary string (which is slightly hard with native unicode strings)
           static $bom;
           if (!$bom)
           {
               if (function_exists('unicode_encode'))
               {
                   $bom = unicode_encode("\uFEFF", 'UTF-16LE');
               }
               else
               {
                   $bom = "\xFF\xFE";
               }
           }
           $string = $bom . $string;
       }
       return self::from_utf16($string);
   }
   667 	
   /**
    * Create a UTF-16 binary string from the object
    *
    * The result is the big-endian form from Unicode::to_utf16be(), preceded
    * by a U+FEFF BYTE ORDER MARK
    *
    * @return string
    */
   public function to_utf16()
   {
       // Detect the native unicode API (planned for PHP 6) by feature: a
       // ">= 6" version test also matches PHP 7+, where unicode_encode()
       // does not exist
       if (function_exists('unicode_encode'))
       {
           return unicode_encode("\uFEFF", 'UTF-16BE') . $this->to_utf16be();
       }
       else
       {
           return "\xFE\xFF" . $this->to_utf16be();
       }
   }
   684 	
   /**
    * Create a UTF-16BE binary string from the object
    *
    * Uses, in order of preference: the native unicode API (planned for
    * PHP 6), mbstring, iconv, and finally a pure PHP conversion of each
    * codepoint. A converter that fails or returns an empty result is skipped
    * in favour of the next one.
    *
    * @return string
    */
   public function to_utf16be()
   {
       // Detect the native unicode API by feature: a ">= 6" version test also
       // matches PHP 7+, where is_unicode()/unicode_encode() do not exist
       if (function_exists('is_unicode') && function_exists('unicode_encode') && is_unicode($this->data))
       {
           return unicode_encode($this->data, 'UTF-16BE');
       }
       elseif (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16BE', 'UTF-32BE')))
       {
           return $return;
       }
       elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16BE', $this->data)))
       {
           return $return;
       }
       else
       {
           // Last resort: split the UTF-32BE data into codepoints and encode each ourselves
           $codepoints = unpack('N*', $this->data);
           $return = '';
           foreach ($codepoints as $codepoint)
           {
               $return .= self::codepoint_to_utf16be($codepoint);
           }
           return $return;
       }
   }
   715 	
   /**
    * Create a UTF-16LE binary string from the object
    *
    * Uses, in order of preference: the native unicode API (planned for
    * PHP 6), mbstring, iconv, and finally a pure PHP conversion of each
    * codepoint. A converter that fails or returns an empty result is skipped
    * in favour of the next one.
    *
    * @return string
    */
   public function to_utf16le()
   {
       // Detect the native unicode API by feature: a ">= 6" version test also
       // matches PHP 7+, where is_unicode()/unicode_encode() do not exist
       if (function_exists('is_unicode') && function_exists('unicode_encode') && is_unicode($this->data))
       {
           return unicode_encode($this->data, 'UTF-16LE');
       }
       elseif (extension_loaded('mbstring') && ($return = @mb_convert_encoding($this->data, 'UTF-16LE', 'UTF-32BE')))
       {
           return $return;
       }
       elseif (extension_loaded('iconv') && ($return = @iconv('UTF-32BE', 'UTF-16LE', $this->data)))
       {
           return $return;
       }
       else
       {
           // Last resort: split the UTF-32BE data into codepoints and encode each ourselves
           $codepoints = unpack('N*', $this->data);
           $return = '';
           foreach ($codepoints as $codepoint)
           {
               $return .= self::codepoint_to_utf16le($codepoint);
           }
           return $return;
       }
   }
   746 	
   /**
    * Convert a unicode codepoint to a UTF-16 character sequence
    *
    * Delegates to Unicode::codepoint_to_utf16be(), so the sequence returned
    * is whatever that method produces for the codepoint
    *
    * @see Unicode::codepoint_to_utf16be()
    * @param int $codepoint
    * @return string
    */
   private static function codepoint_to_utf16($codepoint)
   {
       $sequence = self::codepoint_to_utf16be($codepoint);
       return $sequence;
   }
   757 	
   /**
    * Convert a unicode codepoint to a UTF-16BE character sequence
    *
    * Results are memoised per codepoint. Without the native unicode API an
    * invalid codepoint is encoded as U+FFFD REPLACEMENT CHARACTER, and
    * codepoints from U+10000 upwards are encoded as a surrogate pair.
    *
    * @param int $codepoint
    * @return string
    */
   private static function codepoint_to_utf16be($codepoint)
   {
       // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
       static $cache = array();

       // If we haven't already got it cached, go cache it
       if (!isset($cache[$codepoint]))
       {
           // With the native unicode API (planned for PHP 6) we can use its own
           // unicode support. Detected by feature, because a ">= 6" version
           // test also matches PHP 7+, where unicode_encode() does not exist
           if (function_exists('unicode_encode'))
           {
               $cache[$codepoint] = unicode_encode(self::call_unicode_func('chr', $codepoint), 'UTF-16BE');
           }
           // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
           elseif (!self::valid_unicode_codepoint($codepoint))
           {
               $cache[$codepoint] = "\xFF\xFD";
           }
           // Without a surrogate:
           elseif ($codepoint < 0x10000)
           {
               $cache[$codepoint] = pack('n', $codepoint);
           }
           // With a surrogate
           else
           {
               // Both surrogates carry bits of the offset from U+10000, not of
               // the codepoint itself: ten high bits, then ten low bits
               $surrogate_code_point = $codepoint - 0x10000;
               $cache[$codepoint] = pack('n*', ($surrogate_code_point >> 10) | 0xD800, ($surrogate_code_point & 0x03FF) | 0xDC00);
           }
       }
       return $cache[$codepoint];
   }
   796 	
   /**
    * Convert a unicode codepoint to a UTF-16LE character sequence
    *
    * Results are memoised per codepoint. Without the native unicode API an
    * invalid codepoint is encoded as U+FFFD REPLACEMENT CHARACTER, and
    * codepoints from U+10000 upwards are encoded as a surrogate pair.
    *
    * @param int $codepoint
    * @return string
    */
   private static function codepoint_to_utf16le($codepoint)
   {
       // Keep a cache of all the codepoints we have already converted (this is actually quicker even with such simple code)
       static $cache = array();

       // If we haven't already got it cached, go cache it
       if (!isset($cache[$codepoint]))
       {
           // With the native unicode API (planned for PHP 6) we can use its own
           // unicode support. Detected by feature, because a ">= 6" version
           // test also matches PHP 7+, where unicode_encode() does not exist
           if (function_exists('unicode_encode'))
           {
               $cache[$codepoint] = unicode_encode(self::call_unicode_func('chr', $codepoint), 'UTF-16LE');
           }
           // If the codepoint is invalid, just store it as U+FFFD REPLACEMENT CHARACTER
           elseif (!self::valid_unicode_codepoint($codepoint))
           {
               $cache[$codepoint] = "\xFD\xFF";
           }
           // Without a surrogate:
           elseif ($codepoint < 0x10000)
           {
               $cache[$codepoint] = pack('v', $codepoint);
           }
           // With a surrogate
           else
           {
               // Both surrogates carry bits of the offset from U+10000, not of
               // the codepoint itself: ten high bits, then ten low bits
               $surrogate_code_point = $codepoint - 0x10000;
               $cache[$codepoint] = pack('v*', ($surrogate_code_point >> 10) | 0xD800, ($surrogate_code_point & 0x03FF) | 0xDC00);
           }
       }
       return $cache[$codepoint];
   }
   835 	
   836 	/**
   837 	 * Create a new Unicode object from a UTF-32 encoded string
   838 	 *
   839 	 * @param string $string
   840 	 * @return Unicode
   841 	 */
   842 	public static function from_utf32($string)
   843 	{
   844 		// Check given parameter is a string
   845 		if (!is_string($string))
   846 		{
   847 			trigger_error('Unicode::from_utf32() expects parameter 1 to be string, ' . get_type($string) . ' given', E_USER_WARNING);
   848 			return false;
   849 		}
   850 		
   851 		// Create new object
   852 		$unicode = new Unicode;
   853 		
   854 		// If we're on PHP6, we'll just get a unicode string and store that
   855 		if (version_compare(phpversion(), '6', '>='))
   856 		{
   857 			if (is_unicode($string))
   858 			{
   859 				$unicode->data = $string;
   860 			}
   861 			else
   862 			{
   863 				$unicode->data = self::call_unicode_func('unicode_decode', $string, 'UTF-32');
   864 			}
   865 		}
   866 		// Otherwise, we need to decode the UTF-32 string
   867 		else
   868 		{
   869 			// Set the data to an empty string
   870 			$unicode->data = '';
   871 			
   872 			// See if the string is of a valid length (as UTF-32 is in