File: Tokenizer.pm

package info (click to toggle)
libjson-path-perl 1.0.6-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 692 kB
sloc: perl: 891; javascript: 62; sh: 3; makefile: 2
file content (219 lines) | stat: -rw-r--r-- 6,053 bytes
package JSON::Path::Tokenizer;
$JSON::Path::Tokenizer::VERSION = '1.0.6';
use strict;
use warnings;

use Carp;
use Readonly;
use JSON::Path::Constants qw(:symbols :operators);
use Exporter::Shiny 'tokenize';

Readonly my $ESCAPE_CHAR => qq{\\};
Readonly my %OPERATORS => (
    $TOKEN_ROOT                => 1,    # $
    $TOKEN_RECURSIVE           => 1,    # ..
    $TOKEN_CHILD               => 1,    # .
    $TOKEN_FILTER_OPEN         => 1,    # [?(
    $TOKEN_FILTER_SCRIPT_CLOSE => 1,    # )]
    $TOKEN_SCRIPT_OPEN         => 1,    # [(
    $TOKEN_SUBSCRIPT_OPEN      => 1,    # [
    $TOKEN_SUBSCRIPT_CLOSE     => 1,    # ]
    $TOKEN_QUOTE               => 1,    # "
);

# my $invocation = 0;

# ABSTRACT: Helper class for JSON::Path::Evaluator. Do not call directly.

# Take an expression and break it up into tokens
sub tokenize {
    my $expression = shift;
    #print "Tokenize \"$expression\"\n";
    my $chars = [ split //, $expression ];

    my @tokens;
    while ( defined( my $token = _read_to_next_token($chars) ) ) {

        #        print "$invocation: Got token: $token\n";
        push @tokens, $token;
        if ( $token eq $TOKEN_SCRIPT_OPEN || $token eq $TOKEN_FILTER_OPEN ) {

            #            print "$invocation: script/filter open: $token\n";
            push @tokens, _read_to_filter_script_close($chars);
        }
    }
    return @tokens;
}

sub _read_to_filter_script_close {
    my $chars = shift;

    my %escaped_chars = (
        "b"  => "\x{0008}",
        "f"  => "\x{000C}",
        "n"  => "\x{000A}",
        "r"  => "\x{000D}",
        "t"  => "\x{0009}",
        "v"  => "\x{000B}",
        "0"  => "\x{0000}",
        "'"  => "\x{0027}",
        '"'  => "\x{0022}",
        "\\" => "\x{005C}",
    );

    #print "$invocation: read to filter script close: " . join( '', @{$chars} ) . "\n";
    my $filter;
    my $in_regex;
    my $in_quote;
    my $escape = 0;

    my @quote_chars = ($APOSTROPHE, $QUOTATION_MARK);
    my @regex_chars = "/";

    while ( defined( my $char = shift @{$chars} ) ) {
        if ( $in_quote ) {
            if ( $escape ) {
                ## Replace \t by tab, \\ by \, etc
                $char = $escaped_chars{$char} || $char;
                $escape = 0;
            }
            elsif ( $char eq "\\" ) {
                ## Don't include \ and flag so next char in sequence
                ## is replaced correctly.
                $escape = 1;
                next;
            }
            elsif ( $char eq $in_quote ) {
                $in_quote = '';
            }
        }
        elsif ( $in_regex ) {
            if ( $escape ) {
                $escape = 0;
            }
            elsif ( $char eq "\\" ) {
                $escape = 1;
            }
            elsif ( $char eq $in_regex ) {
                $in_regex = '';
            }
        }
        elsif (grep { $_ eq $char } @quote_chars) {
            $in_quote = $char;
        }
        elsif (grep { $_ eq $char } @regex_chars) {
            $in_regex = $char;
        }

        $filter .= $char;

        last unless @{$chars};
        last if $chars->[0] eq $RIGHT_PARENTHESIS && !$in_quote && !$in_regex;
    }
    return $filter;
}

sub _read_to_next_token {
    #$invocation++;
    my $chars = shift;

    #print "$invocation: Get next token: " . join( '', @{$chars} ) . "\n";
    my $in_quote = '';
    my $token;
    while ( defined( my $char = shift @{$chars} ) ) {

        if ( $char eq $APOSTROPHE || $char eq $QUOTATION_MARK ) {
            #print "$invocation: Char is $APOSTROPHE or $QUOTATION_MARK. Char: $char, in_quote: $in_quote\n";
            if ( $in_quote && $in_quote eq $char ) {
                $in_quote = '';
                last;
            }
            $in_quote = $char;

            #print "$invocation: Set \$in_quote to $in_quote\n";
            next;
        }

        if ( $char eq $ESCAPE_CHAR && !$in_quote ) {

            #print "$invocation: Got escape char: $char\n";
            $token .= shift @{$chars};
            next;
        }

        #print "Append '$char' to token '$token'\n";
        $token .= $char;
        next if $in_quote;

 # Break out of the loop if the current character is the last one in the stream.
        last unless @{$chars};

        if ( $char eq $LEFT_SQUARE_BRACKET )
        {    # distinguish between '[', '[(', and '[?('
            if ( $chars->[0] eq $LEFT_PARENTHESIS ) {
                next;
            }
            if ( $chars->[0] eq $QUESTION_MARK ) {

# The below appends the '?'. The '(' will be appended in the next iteration of the loop
                $token .= shift @{$chars};
                next;
            }
        }
        elsif ( $char eq $RIGHT_PARENTHESIS ) {

            #print "$invocation: Found right parenthesis\n";

# A right parenthesis should be followed by a right square bracket, which itself is a token.
# Append the next character and proceed.
            $token .= shift @{$chars};

            #print "$invocation: Token is now: $token\n";
        }
        elsif ( $char eq $FULL_STOP ) {

# A full stop (i.e. a period, '.') may be the child operator '.' or the recursive operator '..'
            $token .= shift @{$chars} if $chars->[0] eq $FULL_STOP;
        }

        # If we've assembled an operator, we're done.
        last if $OPERATORS{$token};

        # Similarly, if the next character is an operator, we're done
        last if $OPERATORS{ $chars->[0] };
    }
    no warnings qw/uninitialized/;

    #print "$invocation: Token: $token\n";
    use warnings qw/uninitialized/;
    return $token;
}

1;

__END__

=pod

=encoding UTF-8

=head1 NAME

JSON::Path::Tokenizer - Helper class for JSON::Path::Evaluator. Do not call directly.

=head1 VERSION

version 1.0.6

=head1 AUTHOR

Aurelia Peters <popefelix@gmail.com>

=head1 COPYRIGHT AND LICENSE

This software is copyright (c) 2024 by Aurelia Peters.

This is free software; you can redistribute it and/or modify it under
the same terms as the Perl 5 programming language system itself.

=cut