208 lines
7.9 KiB
PHP
208 lines
7.9 KiB
PHP
<?php
|
|
// CubicleSoft PHP UTF8 (Unicode) functions.
|
|
// (C) 2014 CubicleSoft. All Rights Reserved.
|
|
|
|
class UTF8
|
|
{
|
|
// Removes invalid characters from the data string.
|
|
// http://www.w3.org/International/questions/qa-forms-utf-8
|
|
public static function MakeValid($data)
|
|
{
|
|
$result = "";
|
|
$x = 0;
|
|
$y = strlen($data);
|
|
while ($x < $y)
|
|
{
|
|
$tempchr = ord($data[$x]);
|
|
if ($y - $x > 1) $tempchr2 = ord($data[$x + 1]);
|
|
else $tempchr2 = 0x00;
|
|
if ($y - $x > 2) $tempchr3 = ord($data[$x + 2]);
|
|
else $tempchr3 = 0x00;
|
|
if ($y - $x > 3) $tempchr4 = ord($data[$x + 3]);
|
|
else $tempchr4 = 0x00;
|
|
if ($tempchr == 0x09 || $tempchr == 0x0A || $tempchr == 0x0D || ($tempchr >= 0x20 && $tempchr <= 0x7E))
|
|
{
|
|
// ASCII minus control and special characters.
|
|
$result .= chr($tempchr);
|
|
$x++;
|
|
}
|
|
else if (($tempchr >= 0xC2 && $tempchr <= 0xDF) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF))
|
|
{
|
|
// Non-overlong (2 bytes).
|
|
$result .= chr($tempchr);
|
|
$result .= chr($tempchr2);
|
|
$x += 2;
|
|
}
|
|
else if ($tempchr == 0xE0 && ($tempchr2 >= 0xA0 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF))
|
|
{
|
|
// Non-overlong (3 bytes).
|
|
$result .= chr($tempchr);
|
|
$result .= chr($tempchr2);
|
|
$result .= chr($tempchr3);
|
|
$x += 3;
|
|
}
|
|
else if ((($tempchr >= 0xE1 && $tempchr <= 0xEC) || $tempchr == 0xEE || $tempchr == 0xEF) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF))
|
|
{
|
|
// Normal/straight (3 bytes).
|
|
$result .= chr($tempchr);
|
|
$result .= chr($tempchr2);
|
|
$result .= chr($tempchr3);
|
|
$x += 3;
|
|
}
|
|
else if ($tempchr == 0xED && ($tempchr2 >= 0x80 && $tempchr2 <= 0x9F) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF))
|
|
{
|
|
// Non-surrogates (3 bytes).
|
|
$result .= chr($tempchr);
|
|
$result .= chr($tempchr2);
|
|
$result .= chr($tempchr3);
|
|
$x += 3;
|
|
}
|
|
else if ($tempchr == 0xF0 && ($tempchr2 >= 0x90 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF))
|
|
{
|
|
// Planes 1-3 (4 bytes).
|
|
$result .= chr($tempchr);
|
|
$result .= chr($tempchr2);
|
|
$result .= chr($tempchr3);
|
|
$result .= chr($tempchr4);
|
|
$x += 4;
|
|
}
|
|
else if (($tempchr >= 0xF1 && $tempchr <= 0xF3) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF))
|
|
{
|
|
// Planes 4-15 (4 bytes).
|
|
$result .= chr($tempchr);
|
|
$result .= chr($tempchr2);
|
|
$result .= chr($tempchr3);
|
|
$result .= chr($tempchr4);
|
|
$x += 4;
|
|
}
|
|
else if ($tempchr == 0xF4 && ($tempchr2 >= 0x80 && $tempchr2 <= 0x8F) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF))
|
|
{
|
|
// Plane 16 (4 bytes).
|
|
$result .= chr($tempchr);
|
|
$result .= chr($tempchr2);
|
|
$result .= chr($tempchr3);
|
|
$result .= chr($tempchr4);
|
|
$x += 4;
|
|
}
|
|
else $x++;
|
|
}
|
|
|
|
return $result;
|
|
}
|
|
|
|
public static function IsValid($data)
|
|
{
|
|
$x = 0;
|
|
$y = strlen($data);
|
|
while ($x < $y)
|
|
{
|
|
$tempchr = ord($data[$x]);
|
|
if ($y - $x > 1) $tempchr2 = ord($data[$x + 1]);
|
|
else $tempchr2 = 0x00;
|
|
if ($y - $x > 2) $tempchr3 = ord($data[$x + 2]);
|
|
else $tempchr3 = 0x00;
|
|
if ($y - $x > 3) $tempchr4 = ord($data[$x + 3]);
|
|
else $tempchr4 = 0x00;
|
|
if ($tempchr == 0x09 || $tempchr == 0x0A || $tempchr == 0x0D || ($tempchr >= 0x20 && $tempchr <= 0x7E)) $x++;
|
|
else if (($tempchr >= 0xC2 && $tempchr <= 0xDF) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF)) $x += 2;
|
|
else if ($tempchr == 0xE0 && ($tempchr2 >= 0xA0 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF)) $x += 3;
|
|
else if ((($tempchr >= 0xE1 && $tempchr <= 0xEC) || $tempchr == 0xEE || $tempchr == 0xEF) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF)) $x += 3;
|
|
else if ($tempchr == 0xED && ($tempchr2 >= 0x80 && $tempchr2 <= 0x9F) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF)) $x += 3;
|
|
else if ($tempchr == 0xF0 && ($tempchr2 >= 0x90 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF)) $x += 4;
|
|
else if (($tempchr >= 0xF1 && $tempchr <= 0xF3) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF)) $x += 4;
|
|
else if ($tempchr == 0xF4 && ($tempchr2 >= 0x80 && $tempchr2 <= 0x8F) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF)) $x += 4;
|
|
else return false;
|
|
}
|
|
|
|
return true;
|
|
}
|
|
|
|
// Locates the next UTF8 character in a UTF8 string.
|
|
// Set Pos and Size to 0 to start at the beginning.
|
|
// Returns false at the end of the string or bad UTF8 character. Otherwise, returns true.
|
|
public static function NextChrPos(&$data, $datalen, &$pos, &$size)
|
|
{
|
|
$pos += $size;
|
|
$size = 0;
|
|
$x = $pos;
|
|
$y = $datalen;
|
|
if ($x >= $y) return false;
|
|
|
|
$tempchr = ord($data[$x]);
|
|
if ($y - $x > 1) $tempchr2 = ord($data[$x + 1]);
|
|
else $tempchr2 = 0x00;
|
|
if ($y - $x > 2) $tempchr3 = ord($data[$x + 2]);
|
|
else $tempchr3 = 0x00;
|
|
if ($y - $x > 3) $tempchr4 = ord($data[$x + 3]);
|
|
else $tempchr4 = 0x00;
|
|
if ($tempchr == 0x09 || $tempchr == 0x0A || $tempchr == 0x0D || ($tempchr >= 0x20 && $tempchr <= 0x7E)) $size = 1;
|
|
else if (($tempchr >= 0xC2 && $tempchr <= 0xDF) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF)) $size = 2;
|
|
else if ($tempchr == 0xE0 && ($tempchr2 >= 0xA0 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF)) $size = 3;
|
|
else if ((($tempchr >= 0xE1 && $tempchr <= 0xEC) || $tempchr == 0xEE || $tempchr == 0xEF) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF)) $size = 3;
|
|
else if ($tempchr == 0xED && ($tempchr2 >= 0x80 && $tempchr2 <= 0x9F) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF)) $size = 3;
|
|
else if ($tempchr == 0xF0 && ($tempchr2 >= 0x90 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF)) $size = 4;
|
|
else if (($tempchr >= 0xF1 && $tempchr <= 0xF3) && ($tempchr2 >= 0x80 && $tempchr2 <= 0xBF) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF)) $size = 4;
|
|
else if ($tempchr == 0xF4 && ($tempchr2 >= 0x80 && $tempchr2 <= 0x8F) && ($tempchr3 >= 0x80 && $tempchr3 <= 0xBF) && ($tempchr4 >= 0x80 && $tempchr4 <= 0xBF)) $size = 4;
|
|
else return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// Determines if a UTF8 string can also be viewed as ASCII.
|
|
public static function IsASCII($data)
|
|
{
|
|
$pos = 0;
|
|
$size = 0;
|
|
$y = strlen($data);
|
|
while (self::NextChrPos($data, $y, $pos, $size) && $size == 1) {}
|
|
if ($pos < $y || $size > 1) return false;
|
|
|
|
return true;
|
|
}
|
|
|
|
// Returns the number of characters in a UTF8 string.
|
|
public static function strlen($data)
|
|
{
|
|
$num = 0;
|
|
$pos = 0;
|
|
$size = 0;
|
|
$y = strlen($data);
|
|
while (self::NextChrPos($data, $y, $pos, $size)) $num++;
|
|
|
|
return $num;
|
|
}
|
|
|
|
// Converts a UTF8 string to ASCII and drops bad UTF8 and non-ASCII characters in the process.
|
|
public static function ConvertToASCII($data)
|
|
{
|
|
$result = "";
|
|
|
|
$pos = 0;
|
|
$size = 0;
|
|
$y = strlen($data);
|
|
while ($pos < $y)
|
|
{
|
|
if (self::NextChrPos($data, $y, $pos, $size) && $size == 1) $result .= $data[$pos];
|
|
else if (!$size) $size = 1;
|
|
}
|
|
|
|
return $result;
|
|
}
|
|
|
|
// Converts UTF8 characters in a string to HTML entities.
|
|
public static function ConvertToHTML($data)
|
|
{
|
|
return preg_replace_callback('/([\xC0-\xF7]{1,1}[\x80-\xBF]+)/', 'UTF8::ConvertToHTML__Callback', $data);
|
|
}
|
|
|
|
private static function ConvertToHTML__Callback($data)
|
|
{
|
|
$data = $data[1];
|
|
$num = 0;
|
|
$data = str_split(strrev(chr((ord(substr($data, 0, 1)) % 252 % 248 % 240 % 224 % 192) + 128) . substr($data, 1)));
|
|
foreach ($data as $k => $v) $num += (ord($v) % 128) * pow(64, $k);
|
|
|
|
return "&#" . $num . ";";
|
|
}
|
|
}
|
|
?>
|