File: unescape.hpp

package info (click to toggle)
tao-json 0.0%2Bgit20200604.f357d72-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye
  • size: 8,900 kB
  • sloc: cpp: 48,129; makefile: 55
file content (199 lines) | stat: -rw-r--r-- 6,689 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
// Copyright (c) 2014-2020 Dr. Colin Hirsch and Daniel Frey
// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/

#ifndef TAO_JSON_PEGTL_CONTRIB_UNESCAPE_HPP
#define TAO_JSON_PEGTL_CONTRIB_UNESCAPE_HPP

#include <cassert>
#include <string>

#include "../ascii.hpp"
#include "../config.hpp"
#include "../parse_error.hpp"

namespace TAO_JSON_PEGTL_NAMESPACE::unescape
{
   // Utility functions for the unescape actions.

   // Encodes the code point utf32 as UTF-8 and appends the resulting 1 to 4
   // bytes to string. Returns true on success. Returns false, without
   // touching string, when utf32 is a UTF-16 surrogate (0xd800 - 0xdfff)
   // or larger than 0x10ffff.
   [[nodiscard]] inline bool utf8_append_utf32( std::string& string, const unsigned utf32 )
   {
      // Reject everything that has no UTF-8 representation up front.
      const bool is_surrogate = ( utf32 >= 0xd800 ) && ( utf32 <= 0xdfff );
      if( is_surrogate || ( utf32 > 0x10ffff ) ) {
         return false;
      }
      if( utf32 < 0x80 ) {
         // 7 bits: the code point is its own encoding.
         string.push_back( static_cast< char >( utf32 ) );
      }
      else if( utf32 < 0x800 ) {
         // 11 bits: 110xxxxx 10xxxxxx
         const char bytes[] = { static_cast< char >( 0xc0 | ( utf32 >> 6 ) ),
                                static_cast< char >( 0x80 | ( utf32 & 0x3f ) ) };
         string.append( bytes, sizeof( bytes ) );
      }
      else if( utf32 < 0x10000 ) {
         // 16 bits: 1110xxxx 10xxxxxx 10xxxxxx
         const char bytes[] = { static_cast< char >( 0xe0 | ( utf32 >> 12 ) ),
                                static_cast< char >( 0x80 | ( ( utf32 >> 6 ) & 0x3f ) ),
                                static_cast< char >( 0x80 | ( utf32 & 0x3f ) ) };
         string.append( bytes, sizeof( bytes ) );
      }
      else {
         // 21 bits: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
         const char bytes[] = { static_cast< char >( 0xf0 | ( utf32 >> 18 ) ),
                                static_cast< char >( 0x80 | ( ( utf32 >> 12 ) & 0x3f ) ),
                                static_cast< char >( 0x80 | ( ( utf32 >> 6 ) & 0x3f ) ),
                                static_cast< char >( 0x80 | ( utf32 & 0x3f ) ) };
         string.append( bytes, sizeof( bytes ) );
      }
      return true;
   }

   // Returns the numeric value (0 - 15) of the hexadecimal digit c, converted to I.
   // Upper- and lower-case letters are accepted alike.
   // This function MUST only be called for characters matching TAO_JSON_PEGTL_NAMESPACE::ascii::xdigit!
   // For any other character a std::runtime_error is thrown.
   template< typename I >
   [[nodiscard]] I unhex_char( const char c )
   {
      if( ( '0' <= c ) && ( c <= '9' ) ) {
         return I( c - '0' );
      }
      if( ( 'a' <= c ) && ( c <= 'f' ) ) {
         return I( c - 'a' + 10 );
      }
      if( ( 'A' <= c ) && ( c <= 'F' ) ) {
         return I( c - 'A' + 10 );
      }
      throw std::runtime_error( "invalid character in unhex" );  // LCOV_EXCL_LINE
   }

   // Reads the characters in [ begin, end ) as hexadecimal digits, most
   // significant digit first, and returns the accumulated value as I.
   // An empty range yields 0. Each character is converted with unhex_char,
   // so the restriction to hexadecimal digits applies to the whole range.
   // Digits that do not fit into I are not detected.
   template< typename I >
   [[nodiscard]] I unhex_string( const char* begin, const char* end )
   {
      I result = 0;
      for( const char* digit = begin; digit != end; ++digit ) {
         // Make room for the next nibble, then add it.
         result <<= 4;
         result += unhex_char< I >( *digit );
      }
      return result;
   }

   // Actions for common unescape situations.

   // Action that appends the matched input to s without any modification.
   struct append_all
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         const auto first = in.begin();
         const auto count = in.size();
         s.append( first, count );
      }
   };

   // Action that appends the replacement for a single escaped character to s.
   // This action MUST be called for a character matching T which MUST be TAO_JSON_PEGTL_NAMESPACE::one< ... >.
   // The characters of T and the characters Rs... correspond by position:
   // the N-th character of T is replaced by the N-th character of Rs.
   template< typename T, char... Rs >
   struct unescape_c
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         assert( in.size() == 1 );
         // The null pointer only transports the type T so that apply_one
         // can deduce the characters of one< ... >; it is never dereferenced.
         const T* const tag = nullptr;
         s += apply_one( in, tag );
      }

      // Unpacks the characters Qs... of T and passes them, together with
      // Rs..., on to apply_two.
      template< typename ActionInput, char... Qs >
      [[nodiscard]] static char apply_one( const ActionInput& in, const one< Qs... >* /*unused*/ )
      {
         static_assert( sizeof...( Qs ) == sizeof...( Rs ), "size mismatch between escaped characters and their mappings" );
         return apply_two( in, { Qs... }, { Rs... } );
      }

      // Looks up the first character of in within q and returns the
      // character of r at the same position; throws a parse_error when
      // the character does not occur in q.
      template< typename ActionInput >
      [[nodiscard]] static char apply_two( const ActionInput& in, const std::initializer_list< char >& q, const std::initializer_list< char >& r )
      {
         const char escaped = *in.begin();
         for( const char* candidate = q.begin(); candidate != q.end(); ++candidate ) {
            if( *candidate == escaped ) {
               return *( r.begin() + ( candidate - q.begin() ) );
            }
         }
         throw parse_error( "invalid character in unescape", in );  // LCOV_EXCL_LINE
      }
   };

   // See src/example/pegtl/unescape.cpp for why the following two actions
   // skip the first input character. They also MUST be called
   // with non-empty matched inputs!

   // Action that reads everything after the first input character as the
   // hexadecimal digits of one code point and appends that code point to
   // s as UTF-8. Throws a parse_error when utf8_append_utf32 rejects the
   // value.
   struct unescape_u
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         assert( !in.empty() );  // First character MUST be present, usually 'u' or 'U'.
         const unsigned code_point = unhex_string< unsigned >( in.begin() + 1, in.end() );
         if( utf8_append_utf32( s, code_point ) ) {
            return;
         }
         throw parse_error( "invalid escaped unicode code point", in );
      }
   };

   // Action that reads everything after the first input character as
   // hexadecimal digits and appends the resulting value to s as one char.
   struct unescape_x
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         assert( !in.empty() );  // First character MUST be present, usually 'x'.
         const char value = unhex_string< char >( in.begin() + 1, in.end() );
         s.push_back( value );
      }
   };

   // The unescape_j action is similar to unescape_u, however unlike
   // unescape_u it
   // (a) assumes exactly 4 hexdigits per escape sequence,
   // (b) accepts multiple consecutive escaped 16-bit values.
   // When applied to more than one escape sequence, unescape_j
   // translates UTF-16 surrogate pairs in the input into a single
   // UTF-8 sequence in s, as required for JSON by RFC 8259.
   //
   // The matched input MUST have the form "u1234", optionally followed
   // by any number of "\u1234", i.e. the backslash of the first escape
   // sequence is not part of the input. A value that can not be encoded,
   // like a surrogate that is not part of a high/low pair, results in a
   // parse_error.

   struct unescape_j
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         assert( ( ( in.size() + 1 ) % 6 ) == 0 );  // Expects multiple "\\u1234", starting with the first "u".
         // Work with offsets into the matched input rather than advancing a
         // pointer in steps of 6: this never forms a pointer beyond in.end()
         // and never reads hexdigits outside of the matched input.
         const char* const data = in.begin();
         const std::size_t size = in.size();
         for( std::size_t pos = 1; ( pos < size ) && ( size - pos >= 4 ); pos += 6 ) {
            const auto c = unhex_string< unsigned >( data + pos, data + pos + 4 );
            // A high surrogate that is followed by another complete "\u1234"?
            if( ( 0xd800 <= c ) && ( c <= 0xdbff ) && ( size - pos >= 10 ) ) {
               const auto d = unhex_string< unsigned >( data + pos + 6, data + pos + 10 );
               if( ( 0xdc00 <= d ) && ( d <= 0xdfff ) ) {
                  pos += 6;  // The low surrogate is consumed together with the high surrogate.
                  // The combined value always is in 0x10000 - 0x10ffff, therefore appending can not fail.
                  (void)utf8_append_utf32( s, ( ( ( c & 0x03ff ) << 10 ) | ( d & 0x03ff ) ) + 0x10000 );
                  continue;
               }
            }
            if( !utf8_append_utf32( s, c ) ) {
               throw parse_error( "invalid escaped unicode code point", in );
            }
         }
      }
   };

}  // namespace TAO_JSON_PEGTL_NAMESPACE::unescape

#endif