File: unescape.hpp

package info (click to toggle)
tao-json 0.0%2Bgit20200604.f357d72-2
links: PTS, VCS
area: main
in suites: bookworm, bullseye
size: 8,900 kB
sloc: cpp: 48,129; makefile: 55
file content (199 lines) | stat: -rw-r--r-- 6,689 bytes
parent folder | download | duplicates (2)
// Copyright (c) 2014-2020 Dr. Colin Hirsch and Daniel Frey
// Please see LICENSE for license or visit https://github.com/taocpp/PEGTL/

#ifndef TAO_JSON_PEGTL_CONTRIB_UNESCAPE_HPP
#define TAO_JSON_PEGTL_CONTRIB_UNESCAPE_HPP

#include <cassert>
#include <string>

#include "../ascii.hpp"
#include "../config.hpp"
#include "../parse_error.hpp"

namespace TAO_JSON_PEGTL_NAMESPACE::unescape
{
   // Utility functions for the unescape actions.

   [[nodiscard]] inline bool utf8_append_utf32( std::string& string, const unsigned utf32 )
   {
      if( utf32 <= 0x7f ) {
         string += char( utf32 & 0xff );
         return true;
      }
      if( utf32 <= 0x7ff ) {
         char tmp[] = { char( ( ( utf32 & 0x7c0 ) >> 6 ) | 0xc0 ),
                        char( ( ( utf32 & 0x03f ) ) | 0x80 ) };
         string.append( tmp, sizeof( tmp ) );
         return true;
      }
      if( utf32 <= 0xffff ) {
         if( utf32 >= 0xd800 && utf32 <= 0xdfff ) {
            // nope, this is a UTF-16 surrogate
            return false;
         }
         char tmp[] = { char( ( ( utf32 & 0xf000 ) >> 12 ) | 0xe0 ),
                        char( ( ( utf32 & 0x0fc0 ) >> 6 ) | 0x80 ),
                        char( ( ( utf32 & 0x003f ) ) | 0x80 ) };
         string.append( tmp, sizeof( tmp ) );
         return true;
      }
      if( utf32 <= 0x10ffff ) {
         char tmp[] = { char( ( ( utf32 & 0x1c0000 ) >> 18 ) | 0xf0 ),
                        char( ( ( utf32 & 0x03f000 ) >> 12 ) | 0x80 ),
                        char( ( ( utf32 & 0x000fc0 ) >> 6 ) | 0x80 ),
                        char( ( ( utf32 & 0x00003f ) ) | 0x80 ) };
         string.append( tmp, sizeof( tmp ) );
         return true;
      }
      return false;
   }

   // This function MUST only be called for characters matching TAO_JSON_PEGTL_NAMESPACE::ascii::xdigit!
   template< typename I >
   [[nodiscard]] I unhex_char( const char c )
   {
      switch( c ) {
         case '0':
         case '1':
         case '2':
         case '3':
         case '4':
         case '5':
         case '6':
         case '7':
         case '8':
         case '9':
            return I( c - '0' );
         case 'a':
         case 'b':
         case 'c':
         case 'd':
         case 'e':
         case 'f':
            return I( c - 'a' + 10 );
         case 'A':
         case 'B':
         case 'C':
         case 'D':
         case 'E':
         case 'F':
            return I( c - 'A' + 10 );
         default:                                                      // LCOV_EXCL_LINE
            throw std::runtime_error( "invalid character in unhex" );  // LCOV_EXCL_LINE
      }
   }

   template< typename I >
   [[nodiscard]] I unhex_string( const char* begin, const char* end )
   {
      I r = 0;
      while( begin != end ) {
         r <<= 4;
         r += unhex_char< I >( *begin++ );
      }
      return r;
   }

   // Actions for common unescape situations.

   struct append_all
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         s.append( in.begin(), in.size() );
      }
   };

   // This action MUST be called for a character matching T which MUST be TAO_JSON_PEGTL_NAMESPACE::one< ... >.
   template< typename T, char... Rs >
   struct unescape_c
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         assert( in.size() == 1 );
         s += apply_one( in, static_cast< const T* >( nullptr ) );
      }

      template< typename ActionInput, char... Qs >
      [[nodiscard]] static char apply_one( const ActionInput& in, const one< Qs... >* /*unused*/ )
      {
         static_assert( sizeof...( Qs ) == sizeof...( Rs ), "size mismatch between escaped characters and their mappings" );
         return apply_two( in, { Qs... }, { Rs... } );
      }

      template< typename ActionInput >
      [[nodiscard]] static char apply_two( const ActionInput& in, const std::initializer_list< char >& q, const std::initializer_list< char >& r )
      {
         const char c = *in.begin();
         for( std::size_t i = 0; i < q.size(); ++i ) {
            if( *( q.begin() + i ) == c ) {
               return *( r.begin() + i );
            }
         }
         throw parse_error( "invalid character in unescape", in );  // LCOV_EXCL_LINE
      }
   };

   // See src/example/pegtl/unescape.cpp for why the following two actions
   // skip the first input character. They also MUST be called
   // with non-empty matched inputs!

   struct unescape_u
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         assert( !in.empty() );  // First character MUST be present, usually 'u' or 'U'.
         if( !utf8_append_utf32( s, unhex_string< unsigned >( in.begin() + 1, in.end() ) ) ) {
            throw parse_error( "invalid escaped unicode code point", in );
         }
      }
   };

   struct unescape_x
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         assert( !in.empty() );  // First character MUST be present, usually 'x'.
         s += unhex_string< char >( in.begin() + 1, in.end() );
      }
   };

   // The unescape_j action is similar to unescape_u, however unlike
   // unescape_u it
   // (a) assumes exactly 4 hexdigits per escape sequence,
   // (b) accepts multiple consecutive escaped 16-bit values.
   // When applied to more than one escape sequence, unescape_j
   // translates UTF-16 surrogate pairs in the input into a single
   // UTF-8 sequence in s, as required for JSON by RFC 8259.

   struct unescape_j
   {
      template< typename ActionInput >
      static void apply( const ActionInput& in, std::string& s )
      {
         assert( ( ( in.size() + 1 ) % 6 ) == 0 );  // Expects multiple "\\u1234", starting with the first "u".
         for( const char* b = in.begin() + 1; b < in.end(); b += 6 ) {
            const auto c = unhex_string< unsigned >( b, b + 4 );
            if( ( 0xd800 <= c ) && ( c <= 0xdbff ) && ( b + 6 < in.end() ) ) {
               const auto d = unhex_string< unsigned >( b + 6, b + 10 );
               if( ( 0xdc00 <= d ) && ( d <= 0xdfff ) ) {
                  b += 6;
                  (void)utf8_append_utf32( s, ( ( ( c & 0x03ff ) << 10 ) | ( d & 0x03ff ) ) + 0x10000 );
                  continue;
               }
            }
            if( !utf8_append_utf32( s, c ) ) {
               throw parse_error( "invalid escaped unicode code point", in );
            }
         }
      }
   };

}  // namespace TAO_JSON_PEGTL_NAMESPACE::unescape

#endif