1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
<?php
namespace dokuwiki\Utf8;
/**
* Methods to convert from and to UTF-8 strings
*/
class Conversion
{
    /**
     * Encodes UTF-8 characters to HTML entities
     *
     * Code points below 0x80 are copied through literally unless $all is set.
     * Any other code point below 0x100 becomes a decimal entity and every
     * code point from 0x100 upwards becomes a hexadecimal entity.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     * @author <vpribish at shopping dot com>
     * @link   http://php.net/manual/en/function.utf8-decode.php
     *
     * @param string $str UTF-8 encoded string
     * @param bool $all Also encode ASCII characters (code points below 0x80) as numeric entities
     * @return string
     */
    public static function toHtml($str, $all = false)
    {
        $ret = '';
        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp < 0x80 && !$all) {
                // plain ASCII stays as it is
                $ret .= chr($cp);
            } elseif ($cp < 0x100) {
                $ret .= "&#$cp;";
            } else {
                $ret .= '&#x' . dechex($cp) . ';';
            }
        }
        return $ret;
    }

    /**
     * Decodes HTML entities to UTF-8 characters
     *
     * By default only numeric entities (decimal "&#38;" or hexadecimal
     * "&#x26;") are decoded. When $entities is true, named entities such as
     * "&amp;" or "&lt;" are decoded in the same single pass. Decoding both
     * kinds in one pass avoids the double decoding that happens when two
     * separate decoders are chained. For $s = "&amp;#38;&#38;amp;#38;":
     *
     *   unhtmlspecialchars(\dokuwiki\Utf8\Conversion::fromHtml($s)) -> "&#38;&#38;"
     *   \dokuwiki\Utf8\Conversion::fromHtml(unhtmlspecialchars($s)) -> "&&amp;#38;"
     *   what it should be                                           -> "&#38;&amp;#38;"
     *
     * Text that looks like an entity but cannot be decoded (unknown name,
     * malformed number) is left untouched.
     *
     * @author Tom N Harris <tnharris@whoopdedo.org>
     *
     * @param string $str UTF-8 encoded string
     * @param bool $entities decode named entities in addition to numeric ones
     * @return string UTF-8 encoded string with numeric (and named) entities replaced.
     */
    public static function fromHtml($str, $entities = false)
    {
        if (!$entities) {
            // match layout: [2] = optional hex marker, [3] = digits
            return preg_replace_callback(
                '/(&#([Xx])?([0-9A-Za-z]+);)/m',
                [__CLASS__, 'decodeNumericEntity'],
                $str
            );
        }

        // match layout: [1] = optional '#', [2] = optional hex marker, [3] = digits or name
        return preg_replace_callback(
            '/&(#)?([Xx])?([0-9A-Za-z]+);/m',
            [__CLASS__, 'decodeAnyEntity'],
            $str
        );
    }

    /**
     * Decodes any HTML entity to its correct UTF-8 char equivalent
     *
     * Numeric entities are handed to decodeNumericEntity(); named entities
     * are looked up in the HTML 4.01 entity table. Unknown names are returned
     * unchanged.
     *
     * @param string[] $ent Match array as produced by the any-entity pattern in fromHtml()
     * @return string|false
     */
    protected static function decodeAnyEntity($ent)
    {
        // named entity lookup table ("&name;" => UTF-8 character), built on first use
        static $table = null;
        if ($table === null) {
            // Ask for the table explicitly in UTF-8. Without an explicit
            // encoding PHP uses default_charset, and taking ord() of a
            // multi-byte table entry would only look at its first byte.
            $table = array_flip(
                get_html_translation_table(HTML_ENTITIES, ENT_QUOTES | ENT_HTML401, 'UTF-8')
            );
        }

        if ($ent[1] === '#') {
            return self::decodeNumericEntity($ent);
        }

        return isset($table[$ent[0]]) ? $table[$ent[0]] : $ent[0];
    }

    /**
     * Decodes numeric HTML entities to their correct UTF-8 characters
     *
     * The patterns in fromHtml() accept any alphanumeric run as "digits", so
     * the digits are validated here. A malformed entity such as "&#foo;" or
     * "&#xZZ;" is returned unchanged instead of being decoded from whatever
     * intval()/hexdec() would make of it.
     *
     * @param string[] $ent Match array: [0] full match, [2] hex marker or '', [3] digits
     * @return string|false the result of Unicode::toUtf8() or the untouched entity text
     */
    protected static function decodeNumericEntity($ent)
    {
        $digits = $ent[3];

        if ($ent[2] === 'x' || $ent[2] === 'X') {
            if (!preg_match('/^[0-9A-Fa-f]+$/D', $digits)) {
                return $ent[0];
            }
            $cp = hexdec($digits);
        } else {
            if (!preg_match('/^[0-9]+$/D', $digits)) {
                return $ent[0];
            }
            $cp = intval($digits);
        }

        return Unicode::toUtf8([$cp]);
    }

    /**
     * UTF-8 to UTF-16BE conversion.
     *
     * Uses mb_convert_encoding() when UTF8_MBSTRING is set. Otherwise the
     * string is converted code point by code point; code points above U+FFFF
     * are written as a surrogate pair.
     *
     * @param string $str UTF-8 encoded string
     * @param bool $bom prepend the big endian byte order mark (FE FF)
     * @return string
     */
    public static function toUtf16be($str, $bom = false)
    {
        $out = $bom ? "\xFE\xFF" : '';
        if (UTF8_MBSTRING) {
            return $out . mb_convert_encoding($str, 'UTF-16BE', 'UTF-8');
        }

        foreach (Unicode::fromUtf8($str) as $cp) {
            if ($cp > 0xFFFF) {
                // pack('n') only keeps 16 bits, so split into high and low surrogate
                $cp -= 0x10000;
                $out .= pack('n2', 0xD800 | ($cp >> 10), 0xDC00 | ($cp & 0x3FF));
            } else {
                $out .= pack('n', $cp);
            }
        }
        return $out;
    }

    /**
     * UTF-16BE to UTF-8 conversion.
     *
     * The input is read as a sequence of big endian 16 bit units. A high
     * surrogate directly followed by a low surrogate is combined into a
     * single code point; unpaired surrogates are passed on unchanged. A
     * leading byte order mark is not treated specially.
     *
     * @param string $str UTF-16BE encoded string
     * @return false|string
     */
    public static function fromUtf16be($str)
    {
        $units = unpack('n*', $str);
        if (!is_array($units)) {
            return false;
        }

        // unpack() numbers its results from 1, reindex for look-ahead
        $units = array_values($units);
        $count = count($units);

        $codepoints = [];
        for ($i = 0; $i < $count; $i++) {
            $unit = $units[$i];
            if (
                $unit >= 0xD800 && $unit <= 0xDBFF &&
                $i + 1 < $count &&
                $units[$i + 1] >= 0xDC00 && $units[$i + 1] <= 0xDFFF
            ) {
                $unit = 0x10000 + (($unit - 0xD800) << 10) + ($units[$i + 1] - 0xDC00);
                $i++; // the low surrogate has been consumed
            }
            $codepoints[] = $unit;
        }

        return Unicode::toUtf8($codepoints);
    }
}
|