1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199
|
// Copyright (c) 2014-2020 Dr. Colin Hirsch and Daniel Frey
// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/
#ifndef TAO_JSON_PEGTL_CONTRIB_UNESCAPE_HPP
#define TAO_JSON_PEGTL_CONTRIB_UNESCAPE_HPP
#include <cassert>
#include <cstddef>
#include <string>
#include "../ascii.hpp"
#include "../config.hpp"
#include "../parse_error.hpp"
namespace TAO_JSON_PEGTL_NAMESPACE::unescape
{
// Utility functions for the unescape actions.
// Appends the UTF-8 encoding of the code point utf32 to string.
// Returns true when something was appended, and false, leaving
// string untouched, when utf32 is a UTF-16 surrogate
// (0xd800 - 0xdfff) or is larger than 0x10ffff.
[[nodiscard]] inline bool utf8_append_utf32( std::string& string, const unsigned utf32 )
{
   // Sort out everything that can not be encoded before doing any work.
   const bool surrogate = ( utf32 >= 0xd800 ) && ( utf32 <= 0xdfff );
   if( surrogate || ( utf32 > 0x10ffff ) ) {
      return false;
   }
   // Assemble the 1 to 4 bytes in a scratch buffer and append them in one go.
   char bytes[ 4 ];
   std::string::size_type count = 0;
   if( utf32 < 0x80 ) {
      bytes[ count++ ] = static_cast< char >( utf32 );
   }
   else if( utf32 < 0x800 ) {
      bytes[ count++ ] = static_cast< char >( 0xc0 | ( utf32 >> 6 ) );
      bytes[ count++ ] = static_cast< char >( 0x80 | ( utf32 & 0x3f ) );
   }
   else if( utf32 < 0x10000 ) {
      bytes[ count++ ] = static_cast< char >( 0xe0 | ( utf32 >> 12 ) );
      bytes[ count++ ] = static_cast< char >( 0x80 | ( ( utf32 >> 6 ) & 0x3f ) );
      bytes[ count++ ] = static_cast< char >( 0x80 | ( utf32 & 0x3f ) );
   }
   else {
      bytes[ count++ ] = static_cast< char >( 0xf0 | ( utf32 >> 18 ) );
      bytes[ count++ ] = static_cast< char >( 0x80 | ( ( utf32 >> 12 ) & 0x3f ) );
      bytes[ count++ ] = static_cast< char >( 0x80 | ( ( utf32 >> 6 ) & 0x3f ) );
      bytes[ count++ ] = static_cast< char >( 0x80 | ( utf32 & 0x3f ) );
   }
   string.append( bytes, count );
   return true;
}
// Returns the numeric value (0 - 15) of the hexadecimal digit c, converted to I.
// This function MUST only be called for characters matching TAO_JSON_PEGTL_NAMESPACE::ascii::xdigit!
// Any other character results in a std::runtime_error being thrown.
template< typename I >
[[nodiscard]] I unhex_char( const char c )
{
   if( ( c >= '0' ) && ( c <= '9' ) ) {
      return I( c - '0' );
   }
   if( ( c >= 'a' ) && ( c <= 'f' ) ) {
      return I( c - 'a' + 10 );
   }
   if( ( c >= 'A' ) && ( c <= 'F' ) ) {
      return I( c - 'A' + 10 );
   }
   throw std::runtime_error( "invalid character in unhex" );  // LCOV_EXCL_LINE
}
// Interprets the characters in [ begin, end ) as a big-endian
// sequence of hexadecimal digits and returns the accumulated
// value as I; an empty range yields 0. Every character is
// converted with unhex_char< I >, so the same precondition
// (only hexadecimal digits) applies to the whole range.
template< typename I >
[[nodiscard]] I unhex_string( const char* begin, const char* end )
{
   I value = 0;
   for( const char* p = begin; p != end; ++p ) {
      // Make room for the next nibble, then add it.
      value <<= 4;
      value += unhex_char< I >( *p );
   }
   return value;
}
// Actions for common unescape situations.
// Action that copies the complete matched input to the end of s
// without any transformation.
struct append_all
{
   template< typename ActionInput >
   static void apply( const ActionInput& in, std::string& s )
   {
      const auto first = in.begin();
      const auto count = in.size();
      s.append( first, count );
   }
};
// This action MUST be called for a character matching T which MUST be TAO_JSON_PEGTL_NAMESPACE::one< ... >.
// The matched character is looked up in the character list of T, and the
// character at the same position in Rs... is appended to s; both lists
// must therefore have the same length, which is checked at compile time.
template< typename T, char... Rs >
struct unescape_c
{
   template< typename ActionInput >
   static void apply( const ActionInput& in, std::string& s )
   {
      assert( in.size() == 1 );
      // A null pointer of type T is only used to let apply_one deduce the
      // characters Qs... of one< Qs... >; it is never dereferenced.
      const T* const rule_tag = nullptr;
      s.push_back( apply_one( in, rule_tag ) );
   }

   // Unpacks the escaped characters Qs... from the rule type and hands
   // both character lists to apply_two for the actual lookup.
   template< typename ActionInput, char... Qs >
   [[nodiscard]] static char apply_one( const ActionInput& in, const one< Qs... >* /*unused*/ )
   {
      static_assert( sizeof...( Qs ) == sizeof...( Rs ), "size mismatch between escaped characters and their mappings" );
      return apply_two( in, { Qs... }, { Rs... } );
   }

   // Returns the element of r at the position where the first input
   // character is found in q; throws a parse_error when it is not in q.
   template< typename ActionInput >
   [[nodiscard]] static char apply_two( const ActionInput& in, const std::initializer_list< char >& q, const std::initializer_list< char >& r )
   {
      const char escaped = *in.begin();
      auto mapped = r.begin();
      for( auto candidate = q.begin(); candidate != q.end(); ++candidate, ++mapped ) {
         if( *candidate == escaped ) {
            return *mapped;
         }
      }
      throw parse_error( "invalid character in unescape", in );  // LCOV_EXCL_LINE
   }
};
// See src/example/pegtl/unescape.cpp for why the following two actions
// skip the first input character. They also MUST be called
// with non-empty matched inputs!
// Action that decodes the hexadecimal digits following the first
// input character as a single code point and appends its UTF-8
// encoding to s. Throws a parse_error when utf8_append_utf32
// refuses the decoded value.
struct unescape_u
{
   template< typename ActionInput >
   static void apply( const ActionInput& in, std::string& s )
   {
      assert( !in.empty() );  // First character MUST be present, usually 'u' or 'U'.
      const auto code_point = unhex_string< unsigned >( in.begin() + 1, in.end() );
      const bool appended = utf8_append_utf32( s, code_point );
      if( appended ) {
         return;
      }
      throw parse_error( "invalid escaped unicode code point", in );
   }
};
// Action that decodes the hexadecimal digits following the first
// input character into a single char and appends that char to s.
struct unescape_x
{
   template< typename ActionInput >
   static void apply( const ActionInput& in, std::string& s )
   {
      assert( !in.empty() );  // First character MUST be present, usually 'x'.
      const char decoded = unhex_string< char >( in.begin() + 1, in.end() );
      s.push_back( decoded );
   }
};
// The unescape_j action is similar to unescape_u, however unlike
// unescape_u it
// (a) assumes exactly 4 hexdigits per escape sequence,
// (b) accepts multiple consecutive escaped 16-bit values.
// When applied to more than one escape sequence, unescape_j
// translates UTF-16 surrogate pairs in the input into a single
// UTF-8 sequence in s, as required for JSON by RFC 8259.
// A surrogate that is not consumed as part of a high/low pair is
// passed to utf8_append_utf32 on its own, which rejects it, and a
// parse_error is thrown.
struct unescape_j
{
   template< typename ActionInput >
   static void apply( const ActionInput& in, std::string& s )
   {
      assert( ( ( in.size() + 1 ) % 6 ) == 0 );  // Expects multiple "\\u1234", starting with the first "u".

      // The input is walked by offset instead of by advancing a pointer in
      // steps of 6: neither the loop increment nor the look-ahead for a
      // second escape then ever forms a pointer beyond in.end().
      const char* const data = in.begin();
      const std::size_t size = in.size();
      constexpr std::size_t stride = 6;  // Distance between the first hexdigits of two consecutive escapes.

      for( std::size_t i = 1; i < size; i += stride ) {
         const auto c = unhex_string< unsigned >( data + i, data + i + 4 );
         // A high surrogate is only combined when another escape follows ...
         if( ( 0xd800 <= c ) && ( c <= 0xdbff ) && ( i + stride < size ) ) {
            const auto d = unhex_string< unsigned >( data + i + stride, data + i + stride + 4 );
            // ... and that escape is a low surrogate.
            if( ( 0xdc00 <= d ) && ( d <= 0xdfff ) ) {
               i += stride;  // The pair consumes two escapes.
               // The combined value is always within 0x10000 - 0x10ffff, so appending can not fail.
               (void)utf8_append_utf32( s, ( ( ( c & 0x03ff ) << 10 ) | ( d & 0x03ff ) ) + 0x10000 );
               continue;
            }
         }
         if( !utf8_append_utf32( s, c ) ) {
            throw parse_error( "invalid escaped unicode code point", in );
         }
      }
   }
};
} // namespace TAO_JSON_PEGTL_NAMESPACE::unescape
#endif
|