File: Parser.pm

package info (click to toggle)
liblatex-tom-perl 1.06-1
links: PTS, VCS
area: main
in suites: bookworm, forky, sid, trixie
size: 304 kB
sloc: perl: 1,787; makefile: 2
file content (1723 lines) | stat: -rw-r--r-- 54,513 bytes
###############################################################################
#
# LaTeX::TOM::Parser
#
# The parsing class
#
###############################################################################

package LaTeX::TOM::Parser;

use strict;
use base qw(
    LaTeX::TOM::Node
    LaTeX::TOM::Tree
);
use constant true  => 1;
use constant false => 0;

use Carp qw(carp croak);
use File::Basename qw(fileparse);

our $VERSION = '0.13';

my %error_handlers = (
    0 => sub { warn "parse error: $_[0].\n" },
    1 => sub { die  "parse error: $_[0].\n" },
    2 => sub {},
);

# Constructor
#
sub _new {
    my $class = shift;

    no strict 'refs';

    my $self = bless {
        config => {
            BRACELESS          => \%{'LaTeX::TOM::BRACELESS'},
            INNERCMDS          => \%{'LaTeX::TOM::INNERCMDS'},
            MATHENVS           => \%{'LaTeX::TOM::MATHENVS'},
            MATHBRACKETS       => \%{'LaTeX::TOM::MATHBRACKETS'},
            PARSE_ERRORS_FATAL =>  ${'LaTeX::TOM::PARSE_ERRORS_FATAL'},
            TEXTENVS           => \%{'LaTeX::TOM::TEXTENVS'},
        },
    };

    $self->_init(@_);

    return $self;
}

sub new {
    # XXX deprecated as of 2023-01-30
    carp 'Direct use of LaTeX::TOM::Parser constructor is deprecated and will be removed in future version';
    shift->_new(@_);
}

# Set/reset "globals"
#
sub _init {
    my $parser = shift;
    my ($parse_errors_fatal, $readinputs, $applymappings) = @_;

    my $retrieve_opt_default = sub
    {
        my ($opt, $default) = @_;
        return $opt if defined $opt;
        return $default;
    };

    # set user options
    #
    $parser->{readinputs}         = $retrieve_opt_default->($readinputs, 0);
    $parser->{applymappings}      = $retrieve_opt_default->($applymappings, 0);
    $parser->{PARSE_ERRORS_FATAL} = $retrieve_opt_default->($parse_errors_fatal, $parser->{config}{PARSE_ERRORS_FATAL});

    # init internal stuff
    #
    $parser->{MATHBRACKETS} = $parser->{config}{MATHBRACKETS};

    # this will hold a running list/hash of commands that have been remapped
    $parser->{MAPPEDCMDS} = {};

    # this will hold a running list/hash of commands that have been used. We dont
    # bother apply mappings except to commands that have been used.
    $parser->{USED_COMMANDS} = {};

    # no file yet
    $parser->{file} = undef;
}

# Parse a LaTeX file, return a tree. You probably want this method.
#
sub parseFile {
    my $parser = shift;
    my $filename = shift;

    # init variables
    #
    $parser->{file} = $filename;        # file name member data
    my $tree = {};                      # init output tree

    # read in text from file or bomb out
    #
    my $text = _readFile($filename, true);

    # do the parse
    #
    $tree = $parser->parse($text);

    return $tree;
}

# main parsing entrypoint
#
sub parse {
    my $parser = shift;
    my ($text) = @_;

    # first half of parsing (goes up to finding commands, reading inputs)
    #
    my ($tree, $bracehash) = $parser->_parseA($text);
    _debug(
        'done with _parseA',
         sub { $tree->_warn() },
    );

    # handle mappings
    #
    $parser->_applyMappings($tree) if $parser->{applymappings};
    _debug(
        'done with _applyMappings',
         sub { $tree->_warn() },
    );

    # second half of parsing (environments)
    #
    $parser->_parseB($tree);
    _debug(
        'done with _parseB',
         sub { $tree->_warn() },
    );

    # once all the above is done we can propegate math/plaintext modes down
    #
    $parser->_propegateModes($tree, 0, 0);   # math = 0, plaintext = 0
    _debug(
        'done with _propegateModes',
         sub { $tree->_warn() },
    );

    # handle kooky \[ \] math mode
    #
    if (not exists $parser->{MAPPEDCMDS}->{'\\['}) {
        # math mode (\[ \], \( \))
        $parser->_stage5($tree, {'\\[' => '\\]', '\\(' => '\\)'}, 1);
        $parser->_propegateModes($tree, 0, 0);     # have to do this again of course
        $parser->{MATHBRACKETS}->{'\\['} = '\\]';  # put back in brackets list for
        $parser->{MATHBRACKETS}->{'\\('} = '\\)';  # printing purposes.
    }
    _debug(
        undef,
        sub { $tree->_warn() },
    );

    $tree->listify;     # add linked-list stuff

    return $tree;
}

# Parsing with no mappings and no externally accessible parser object.
#
sub _basicparse {
    my $parser = shift; # @_ would break code
    my $text   = shift;

    my $parse_errors_fatal = (defined $_[0] ? $_[0] : $parser->{config}{PARSE_ERRORS_FATAL});
    my $readinputs = (defined $_[1] ? $_[1] : 1);

    $parser = LaTeX::TOM::Parser->_new($parse_errors_fatal, $readinputs);
    my ($tree, $bracehash) = $parser->_parseA($text);

    $parser->_parseB($tree);

    $tree->listify; # add linked-list stuff

    return ($tree, $bracehash);
}

# start the tree. separate out comment and text nodes.
#
sub _stage1 {
    my $parser = shift;
    my $text = shift;

    my @nodes = _getTextAndCommentNodes($text, 0, length($text));

    return LaTeX::TOM::Tree->_new([@nodes]);
}

# this stage parses the braces ({}) and adds the corresponding structure to
# the tree.
#
sub _stage2 {
    my $parser = shift;

    my $tree = shift;
    my $bracehash = shift || undef;
    my $startidx = shift || 0;      # last two params for starting at some specific
    my $startpos = shift || 0;      # node and offset.

    my %blankhash;

    if (not defined $bracehash) {
        $bracehash = {%blankhash};
    }

    my $leftidx = -1;
    my $leftpos = -1;
    my $leftcount = 0;

    # loop through the nodes
    for (my $i = $startidx; $i < @{$tree->{nodes}}; $i++) {
        my $node = $tree->{nodes}[$i];
        my $spos = $node->{start};	# get text start position

        # set position placeholder within the text block
        my $pos = ($i == $startidx) ? $startpos : 0;

        if ($node->{type} eq 'TEXT') {

         _debug("parseStage2: looking at text node: [$node->{content}]", undef);

         my ($nextpos, $brace) = _findbrace($node->{content}, $pos);
         while ($nextpos != -1) {

            $pos = $nextpos + 1; # update position pointer

            # handle left brace
            if ($brace eq '{') {
                _debug("found '{' at position $nextpos, leftcount is $leftcount", undef);
                if ($leftcount == 0) {
                    $leftpos = $nextpos;
                    $leftidx = $i
                }
                $leftcount++;
            }

            # handle right brance
            elsif ($brace eq '}') {

                _debug("found '}' at position $nextpos, leftcount is $leftcount", undef);
                my $rightpos = $nextpos;
                $leftcount--;

                # found the corresponding right brace to our starting left brace
                if ($leftcount == 0) {

                    # see if we have to split the text node into 3 parts
                    #
                    if ($leftidx == $i) {

                        my ($leftside, $textnode3) = $node->split($rightpos, $rightpos);
                        my ($textnode1, $textnode2) = $leftside->split($leftpos, $leftpos);

                        # make the new GROUP node
                        my $groupnode = LaTeX::TOM::Node->_new(
                            {type => 'GROUP',
                             start => $textnode2->{start} - 1,
                             end => $textnode2->{end} + 1,
                             children => LaTeX::TOM::Tree->_new([$textnode2]),
                            });

                        # splice the new subtree into the old location
                        splice @{$tree->{nodes}}, $i, 1, $textnode1, $groupnode, $textnode3;

                        # add to the brace-pair lookup table
                        $bracehash->{$groupnode->{start}} = $groupnode->{end};
                        $bracehash->{$groupnode->{end}} = $groupnode->{start};

                        # recur into new child node
                        $parser->_stage2($groupnode->{children}, $bracehash);

                        $i++; # skip to textnode3 for further processing
                    }

                    # split across nodes
                    #
                    else {

                        my ($textnode1, $textnode2) = $tree->{nodes}[$leftidx]->split($leftpos, $leftpos);
                        my ($textnode3, $textnode4) = $node->split($rightpos, $rightpos);

                        # remove nodes in between the node we found '{' in and the node
                        # we found '}' in
                        #
                        my @removed = splice @{$tree->{nodes}}, $leftidx+1, $i-$leftidx-1;

                        # create a group node that contains the text after the left brace,
                        # then all the nodes up until the next text node, then the text
                        # before the right brace.
                        #
                        my $groupnode = LaTeX::TOM::Node->_new(
                            {type => 'GROUP',
                             start => $textnode2->{start} - 1,
                             end => $textnode3->{end} + 1,
                             children => LaTeX::TOM::Tree->_new(
                                [$textnode2,
                                 @removed,
                                 $textnode3]),
                            });

                        # replace the two original text nodes with the leftover left and
                        # right portions, as well as the group node with everything in
                        # the middle.
                        #
                        splice @{$tree->{nodes}}, $leftidx, 2, $textnode1, $groupnode, $textnode4;

                        # add to the brace-pair lookup table
                        $bracehash->{$groupnode->{start}} = $groupnode->{end};
                        $bracehash->{$groupnode->{end}} = $groupnode->{start};

                        # recur into new child nodes
                        $parser->_stage2($groupnode->{children}, $bracehash);

                        # step back to textnode4 on this level for further processing
                        $i -= scalar @removed;
                    }

                    $leftpos = -1; # reset left data
                    $leftidx = -1;
                    last;
                } # $leftcount == 0

                # check for '}'-based error
                #
                if ($leftcount < 0) {
                    $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->("'}' before '{' at " . ($spos + $rightpos));
                    $leftcount = 0; # reset and continue
                }
            } # right brace

            ($nextpos, $brace) = _findbrace($node->{content}, $pos);

         } # while (braces left)

         } # if TEXT

    } # loop over all nodes

    # check for extra '{' parse error
    #
    if ($leftcount > 0) {
        my $spos = $tree->{nodes}[$leftidx]->{start}; # get text start position
        $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->("unmatched '{' at " . ($spos + $leftpos));

        # try to continue on, after the offending brace
        $parser->_stage2($tree, $bracehash, $leftidx, $leftpos + 1);
    }

    return $bracehash;
}

# this stage finds LaTeX commands and accordingly turns GROUP nodes into
# command nodes, labeled with the command
#
sub _stage3 {
    my $parser = shift;

    my $tree = shift;
    my $parent = shift;

    for (my $i = 0; $i < @{$tree->{nodes}}; $i++) {

        my $node = $tree->{nodes}[$i];

        # check text node for command tag
        if ($node->{type} eq 'TEXT') {
            my $text = $node->{content};

            # inner command (such as {\command text text}). our regexp checks to see
            # if this text chunk begins with \command, since that would be the case
            # due to the previous parsing stages. if found, the parent node is
            # promoted to a command.
            #
            if ($text =~ /^\s*\\(\w+\*?)/ && defined $parent && $parser->{config}{INNERCMDS}->{$1}) {
                my $command = $1;

                # if the parent is already a command node, we have to make a new
                # nested command node
                #
                if ($parent->{type} eq 'COMMAND') {

                    # make a new command node
                    my $newnode = LaTeX::TOM::Node->_new(
                        {type => 'COMMAND',
                         command => $command,
                         start => $parent->{start},
                         end => $parent->{end},
                         position => 'inner',
                         children => $parent->{children} });

                    # point parent to it
                    $parent->{children} = LaTeX::TOM::Tree->_new([$newnode]);

                    # start over at this level (get additional inner commands)
                    $parent = $newnode;
                    $i = -1;

                    $parser->{USED_COMMANDS}->{$newnode->{command}} = 1;
                }

                # parent is a naked group, we can make it into a command node
                #
                elsif ($parent->{type} eq 'GROUP') {
                    $parent->{type} = 'COMMAND';
                    $parent->{command} = $command;
                    $parent->{position} = 'inner';

                    # start over at this level
                    $i = -1;

                    $parser->{USED_COMMANDS}->{$parent->{command}} = 1;
                }

                $node->{content} =~ s/^\s*\\(?:\w+\*?)//o;
            }

            # outer command (such as \command{parameters}). our regexp checks to
            # see if this text chunk ends in \command, since that would be the case
            # due to the previous parsing stages.
            #
            if ($text =~ /(?:^|[^\\])(\\\w+\*?(\s*\[.*?\])?)\s*$/os &&
                    defined $tree->{nodes}[$i+1] &&
                    $tree->{nodes}[$i+1]->{type} eq 'GROUP') {

                my $tag = $1;

                _debug("found text node [$text] with command tag [$tag]", undef);

                # remove the text
                $node->{content} =~ s/\\\w+\*?\s*(?:\[.*?\])?\s*$//os;

                # parse it for command and ops
                $tag =~ /^\\(\w+\*?)\s*(?:\[(.*?)\])?$/os;

                my $command = $1;
                my $opts = $2;

                # make the next node a command node with the above data
                my $next = $tree->{nodes}[$i+1];

                $next->{type} = 'COMMAND';
                $next->{command} = $command;
                $next->{opts} = $opts;
                $next->{position} = 'outer';

                $parser->{USED_COMMANDS}->{$next->{command}} = 1;
            }

            # recognize braceless commands
            #
            if ($text =~ /(\\(\w+\*?)[ \t]+(\S+))/gso || $text =~ /(\\(\w+)(\d+))/gso) {
                my $all = $1;
                my $command = $2;
                my $param = $3;

                if ($parser->{config}{BRACELESS}->{$command}) {
                 # warn "found braceless command $command with param $param";

                    # get location to split from node text
                    my $a = index $node->{content}, $all, 0;
                    my $b = $a + length($all) - 1;

                    # make all the new nodes

                    # new left and right text nodes
                    my ($leftnode, $rightnode) = $node->split($a, $b);

                    # param contents node
                    my $pstart = index $node->{content}, $param, $a;
                    my $newchild = LaTeX::TOM::Node->_new(
                        {type => 'TEXT',
                         start => $node->{start} + $pstart,
                         end => $node->{start} + $pstart + length($param) - 1,
                         content => $param });

                    # new command node
                    my $commandnode = LaTeX::TOM::Node->_new(
                        {type => 'COMMAND',
                         braces => 0,
                         command => $command,
                         start => $node->{start} + $a,
                         end => $node->{start} + $b,
                         children => LaTeX::TOM::Tree->_new([$newchild]),
                        });

                    $parser->{USED_COMMANDS}->{$commandnode->{command}} = 1;

                    # splice these all into the original array
                    splice @{$tree->{nodes}}, $i, 1, $leftnode, $commandnode, $rightnode;

                    # make the rightnode the node we're currently analyzing
                    $node = $rightnode;

                    # make sure outer loop will continue parsing *after* rightnode
                    $i += 2;
                }
            }
        }

        # recur
        if ($node->{type} eq 'GROUP' ||
            $node->{type} eq 'COMMAND') {

            $parser->_stage3($node->{children}, $node);
        }
    }
}

# this stage finds \begin{x} \end{x} environments and shoves their contents
#	down into a new child node, with a parent node of ENVIRONMENT type.
#
# this has the effect of making the tree deeper, since much of the structure
#	is in environment tags and will now be picked up.
#
# for ENVIRONMENTs, "start" means the ending } on the \begin tag,
# "end" means the starting \ on the \end tag,
# "ostart" is the starting \ on the "begin" tag,
# "oend" is the ending } on the "end" tag, and
# and "class" is the "x" from above.
#
sub _stage4 {
    my $parser = shift;
    my $tree = shift;

    my $bcount = 0; # \begin "stack count"
    my $class = ""; # environment class
    my $bidx = 0;   # \begin array index.

    for (my $i = 0; $i < @{$tree->{nodes}}; $i++) {
        my $node = $tree->{nodes}->[$i];

        # see if this is a "\begin" command node
        if ($node->{type} eq 'COMMAND' && $node->{command} eq 'begin') {

            _debug("parseStage4: found a begin COMMAND node, $node->{children}->{nodes}[0]->{content} @ $node->{start}", undef);

            # start a new "stack"
            if ($bcount == 0) {
                $bidx = $i;
                $bcount++;
                $class = $node->{children}->{nodes}->[0]->{content};
                _debug("parseStage4: opening environment tag found, class = $class", undef);
            }

            # add to the "stack"
            elsif ($node->{children}->{nodes}->[0]->{content} eq $class) {
                $bcount++;
                _debug("parseStage4: incrementing tag count for $class", undef);
            }
        }

        # handle "\end" command nodes
        elsif ($node->{type} eq 'COMMAND' &&
               $node->{command} eq 'end' &&
               $node->{children}->{nodes}->[0]->{content} eq $class) {

            $bcount--;
            _debug("parseStage4: decrementing tag count for $class", undef);

            # we found our closing "\end" tag. replace everything with the proper
            # ENVIRONMENT tag and subtree.
            #
            if ($bcount == 0) {

                _debug("parseStage4: closing environment $class", undef);

                # first we must take everything between the "\begin" and "\end"
                # nodes and put them in a new array, removing them from the old one
                my @newarray = splice @{$tree->{nodes}}, $bidx+1, $i - ($bidx + 1);

                # make the ENVIRONMENT node
                my $start = $tree->{nodes}[$bidx]->{end};
                my $end = $node->{start};
                my $envnode = LaTeX::TOM::Node->_new(
                    {type => 'ENVIRONMENT',
                     class => $class,
                     start => $start, # "inner" start and end
                     end => $end,
                     ostart => $start - length('begin') - length($class) - 2,
                     oend => $end + length('end') + length($class) + 2,
                     children => LaTeX::TOM::Tree->_new([@newarray]),
                    });

                if ($parser->{config}{MATHENVS}->{$envnode->{class}}) {
                    $envnode->{math} = 1;
                }

                # replace the \begin and \end COMMAND nodes with the single
                # environment node
                splice @{$tree->{nodes}}, $bidx, 2, $envnode;

                $class = ""; # reset class.

                # i is going to change by however many nodes we removed
                $i -= scalar @newarray;

                # recur into the children
                $parser->_stage4($envnode->{children});
            }
        }

        # recur in general
        elsif ($node->{children}) {
            $parser->_stage4($node->{children});
        }
    }

    # parse error if we're missing an "\end" tag.
    if ($bcount > 0) {
        $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->(
            "missing \\end{$class} for \\begin{$class} at position $tree->{nodes}[$bidx]->{end}"
        );
    }
}

# This is the "math" stage: here we grab simple-delimiter math modes from
# the text they are embedded in, and turn those into new groupings, with the
# "math" flag set.
#
# having this top level to go over all the bracket types prevents some pretty
# bad combinatorial explosion
#
sub _stage5 {
    my $parser = shift;

    my $tree = shift;
    my $caremath = shift || 0;

    my $brackets = $parser->{MATHBRACKETS};

    # loop through all the different math mode bracket types
    foreach my $left (sort {length($b) <=> length($a)} keys %$brackets) {
        my $right = $brackets->{$left};

        $parser->_stage5_r($tree, $left, $right, $caremath);
    }
}

# recursive meat of above
#
sub _stage5_r {
    my $parser = shift;

    my $tree = shift;
    my $left = shift;
    my $right = shift;
    my $caremath = shift || 0; # do we care if we're already in math mode?
                               # this matters for \( \), \[ \]

    my $leftpos = -1; # no text pos for found left brace yet.
    my $leftidx = -1; # no array index for found left brace yet.

        # loop through the nodes
        for (my $i = 0; $i < scalar @{$tree->{nodes}}; $i++) {
            my $node = $tree->{nodes}[$i];
            my $pos = 0; # position placeholder within the text block
            my $spos = $node->{start}; # get text start position

            if ($node->{type} eq 'TEXT' &&
               (!$caremath || (!$node->{math} && $caremath))) {

                # search for left brace if we haven't started a pair yet
                if ($leftidx == -1) {
                    $leftpos = _findsymbol($node->{content}, $left, $pos);

                    if ($leftpos != -1) {
                        _debug("found (left) $left in [$node->{content}]", undef);
                        $leftidx = $i;
                        $pos = $leftpos + 1; # next pos to search from
                    }
                }

                # search for a right brace
                if ($leftpos != -1) {
                    my $rightpos = _findsymbol($node->{content}, $right, $pos);

                    # found
                    if ($rightpos != -1) {

                        # we have to split the text node into 3 parts
                        if ($leftidx == $i) {
                            _debug("splitwithin: found (right) $right in [$node->{content}]", undef);

                            my ($leftnode, $textnode3) = $node->split($rightpos, $rightpos + length($right) - 1);
                            my ($textnode1, $textnode2) = $leftnode->split($leftpos, $leftpos + length($left) - 1);

                            my $startpos = $spos; # get text start position

                            # make the math ENVIRONMENT node
                            my $mathnode = LaTeX::TOM::Node->_new(
                                {type => 'ENVIRONMENT',
                                class => $left,	# use left delim as class
                                math => 1,
                                start => $startpos + $leftpos,
                                ostart => $startpos + $leftpos - length($left) + 1,
                                end => $startpos + $rightpos,
                                oend => $startpos + $rightpos + length($right) - 1,
                                children => LaTeX::TOM::Tree->_new([$textnode2]),
                                });

                            splice @{$tree->{nodes}}, $i, 1, $textnode1, $mathnode, $textnode3;

                            $i++; # skip ahead two nodes, so we'll be parsing textnode3
                        }

                        # split across nodes
                        else {

                            _debug("splitacross: found (right) $right in [$node->{content}]", undef);

                            # create new set of 4 smaller text nodes from the original two
                            # that contain the left and right delimiters
                            #
                            my ($textnode1, $textnode2) = $tree->{nodes}[$leftidx]->split($leftpos, $leftpos + length($left) - 1);
                            my ($textnode3, $textnode4) = $tree->{nodes}[$i]->split($rightpos, $rightpos + length($right) - 1);

                            # nodes to remove "from the middle" (between the left and right
                            # text nodes which contain the delimiters)
                            #
                            my @remnodes = splice @{$tree->{nodes}}, $leftidx+1, $i - $leftidx - 1;

                            # create a math node that contains the text after the left brace,
                            # then all the nodes up until the next text node, then the text
                            # before the right brace.
                            #
                            my $mathnode = LaTeX::TOM::Node->_new(
                                {type => 'ENVIRONMENT',
                                class => $left,
                                math => 1,
                                start => $textnode2->{start} - 1,
                                end => $textnode3->{end} + 1,
                                ostart => $textnode2->{start} - 1 - length($left) + 1,
                                oend => $textnode3->{end} + 1 + length($right) - 1,
                                children => LaTeX::TOM::Tree->_new(
                                [$textnode2,
                                 @remnodes,
                                 $textnode3]),
                                });

                            # replace (TEXT_A, ... , TEXT_B) with the mathnode created above
                            splice @{$tree->{nodes}}, $leftidx, 2, $textnode1, $mathnode, $textnode4;

                            # do all nodes again but the very leftmost
                            #
                            $i = $leftidx;
                        }

                        $leftpos = -1; # reset left data
                        $leftidx = -1;
                    } # right brace
                } # left brace
                else {

                    my $rightpos = _findsymbol($node->{content}, $right, $pos);

                    if ($rightpos != -1) {
                        my $startpos = $node->{start}; # get text start position
                        $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->("unmatched '$right' at " . ($startpos + $rightpos));
                    }
                }
            } # if TEXT

            # recur, but not into verbatim environments!
            #
            elsif ($node->{children} &&
                         !(
                             ($node->{type} eq 'COMMAND' && $node->{command} =~ /^verb/) ||
                             ($node->{type} eq 'ENVIRONMENT' && $node->{class} =~ /^verbatim/))) {

                if ($LaTeX::TOM::DEBUG) {
                    my $message  = "Recurring into $node->{type} node ";
                       $message .= $node->{command} if ($node->{type} eq 'COMMAND');
                       $message .= $node->{class}   if ($node->{type} eq 'ENVIRONMENT');
                    _debug($message, undef);
                }

                $parser->_stage5_r($node->{children}, $left, $right, $caremath);
            }

        } # loop over text blocks

        if ($leftpos != -1) {
            my $startpos = $tree->{nodes}[$leftidx]->{start};   # get text start position
            $error_handlers{$parser->{PARSE_ERRORS_FATAL}}->("unmatched '$left' at " . ($startpos + $leftpos));
        }
}

# This stage propegates the math mode flag and plaintext flags downward.
#
# After this is done, we can make the claim that only text nodes marked with
# the plaintext flag should be printed.	math nodes will have the "math" flag,
# and also plantext = 0.
#
sub _propegateModes {
    my $parser = shift;

    my $tree = shift;
    my $math = shift;       # most likely want to call this with 0
    my $plaintext = shift;  # ditto this-- default to nothing visible.

    foreach my $node (@{$tree->{nodes}}) {

        # handle text nodes on this level. set flags.
        #
        if ($node->{type} eq 'TEXT') {
            $node->{math} = $math;
            $node->{plaintext} = $plaintext;
        }

        # propegate flags downward, possibly modified
        #
        elsif (defined $node->{children}) {

            my $mathflag = $math;    # math propegates down by default
            my $plaintextflag = 0;   # plaintext flag does NOT propegate by default

            # handle math or plain text forcing envs
            #
            if ($node->{type} eq 'ENVIRONMENT' || $node->{type} eq 'COMMAND') {
                if (defined $node->{class} && (
                    $parser->{config}{MATHENVS}->{$node->{class}} ||
                    $parser->{config}{MATHENVS}->{"$node->{class}*"})
                   )
                {
                    $mathflag = 1;
                    $plaintextflag = 0;
                }
                elsif (($node->{type} eq 'COMMAND' &&
                                ($parser->{config}{TEXTENVS}->{$node->{command}} ||
                                 $parser->{config}{TEXTENVS}->{"$node->{command}*"})) ||
                             ($node->{type} eq 'ENVIRONMENT' &&
                                ($parser->{config}{TEXTENVS}->{$node->{class}} ||
                                 $parser->{config}{TEXTENVS}{"$node->{command}*"}))
                            ) {

                    $mathflag = 0;
                    $plaintextflag = 1;
                }
            }

            # groupings change nothing
            #
            elsif ($node->{type} eq 'GROUP') {
                $mathflag = $math;
                $plaintextflag = $plaintext;
            }

            # recur
            $parser->_propegateModes($node->{children}, $mathflag, $plaintextflag);
        }
    }
}

# apply a mapping to text nodes in a tree
#
# for newcommands and defs: mapping is a hash:
#
# {name, nparams, template, type}
#
# name is a string
# nparams is an integer
# template is a tree fragement containing text nodes with #x flags, where
# parameters will be replaced.
# type is "command"
#
# for newenvironments:
#
# {name, nparams, btemplate, etemplate, type}
#
# same as above, except type is "environment" and there are two templates,
# btemplate and etemplate.
#
sub _applyMapping {
    my $parser = shift;

    my $tree = shift;
    my $mapping = shift;
    my $i = shift || 0;  # index to start with, in tree.

    my $applications = 0; # keep track of # of applications

    for (; $i < @{$tree->{nodes}}; $i++) {

        my $node = $tree->{nodes}[$i];

        # begin environment nodes
        #
        if ($node->{type}                            eq 'COMMAND'
         && $node->{command}                         eq 'begin'
         && $node->{children}->{nodes}[0]->{content} eq $mapping->{name}
        ) {
            # grab the nparams next group nodes as parameters
            #
            my @params = ();

            my $remain = $mapping->{nparams};
            my $j = 1;
            while ($remain > 0 && ($i + $j) < scalar @{$tree->{nodes}}) {

                my $node = $tree->{nodes}[$i + $j];

                # grab group node
                if ($node->{type} eq 'GROUP') {
                    push @params, $node->{children};
                    $remain--;
                }

                $j++;
            }

            # if we didn't get enough group nodes, bomb out
            next if $remain;

            # otherwise make new subtree
            my $applied = _applyParamsToTemplate($mapping->{btemplate}, @params);

            # splice in the result
            splice @{$tree->{nodes}}, $i, $j, @{$applied->{nodes}};

            # skip past all the new stuff
            $i += scalar @{$applied->{nodes}} - 1;
        }

        # end environment nodes
        #
        elsif ($node->{type}                            eq 'COMMAND'
            && $node->{command}                         eq 'end'
            && $node->{children}->{nodes}[0]->{content} eq $mapping->{name}
        ) {
            # make new subtree (no params)
            my $applied = $mapping->{etemplate}->copy();

            # splice in the result
            splice @{$tree->{nodes}}, $i, 1, @{$applied->{nodes}};

            # skip past all the new stuff
            $i += scalar @{$applied->{nodes}} - 1;

            $applications++; # only count end environment nodes
        }

        # newcommand nodes
        #
        elsif ($node->{type}       eq 'COMMAND'
            && $node->{command}    eq $mapping->{name}
            && $mapping->{nparams}
        ) {
            my @params = ();

            # children of COMMAND node will be first parameter
            push @params, $node->{children};

            # find next nparams GROUP nodes and push their children onto @params
            my $remain = $mapping->{nparams} - 1;
            my $j = 1;
            while ($remain > 0 && ($i + $j) < scalar @{$tree->{nodes}}) {

                my $node = $tree->{nodes}[$i + $j];

                # grab group node
                if ($node->{type} eq 'GROUP') {
                    push @params, $node->{children};
                    $remain--;
                }

                $j++;
            }

            # if we didn't get enough group nodes, bomb out
            next if ($remain > 0);

            # apply the params to the template
            my $applied = _applyParamsToTemplate($mapping->{template}, @params);

            # splice in the result
            splice @{$tree->{nodes}}, $i, $j, @{$applied->{nodes}};

            # skip past all the new stuff
            $i += scalar @{$applied->{nodes}} - 1;

            $applications++;
        }

        # find 0-param mappings
        elsif ($node->{type} eq 'TEXT' && !$mapping->{nparams}) {

             my $text = $node->{content};
             my $command = $mapping->{name};

             # find occurrences of the mapping command
             #
             my $wordend = ($command =~ /\w$/ ? 1 : 0);
             while (($wordend && $text =~ /\\\Q$command\E(\W|$)/g) ||
                            (!$wordend && $text =~ /\\\Q$command\E/g)) {

                 _debug("found occurrence of mapping $command", undef);

                 my $idx = index $node->{content}, '\\' . $command, 0;

                 # split the text node at that command
                 my ($leftnode, $rightnode) = $node->split($idx, $idx + length($command));

                 # copy the mapping template
                 my $applied = $mapping->{template}->copy();

                 # splice the new nodes in
                 splice @{$tree->{nodes}}, $i, 1, $leftnode, @{$applied->{nodes}}, $rightnode;

                 # adjust i so we end up on rightnode when we're done
                 $i += scalar @{$applied->{nodes}} + 1;

                 # get the next node
                 $node = $tree->{nodes}[$i];

                 # count application
                 $applications++;
             }
        }

        # recur
        elsif ($node->{children}) {

            $applications += $parser->_applyMapping($node->{children}, $mapping);
        }
    }

    return $applications;
}

# find and apply all mappings in the tree, progressively and recursively.
# a mapping applies to the entire tree and subtree consisting of nodes AFTER
# itself in the level array.
#
sub _applyMappings {
    my $parser = shift;

    my $tree = shift;

    for (my $i = 0; $i < @{$tree->{nodes}}; $i++) {

        my $prev = $tree->{nodes}[$i-1];
        my $node = $tree->{nodes}[$i];

        # find newcommands
        if ($node->{type} eq 'COMMAND' &&
                $node->{command} =~ /^(re)?newcommand$/) {

            my $mapping = _makeMapping($tree, $i);
            next if (!$mapping->{name}); # skip fragged commands

            if ($parser->{USED_COMMANDS}->{$mapping->{name}}) {
                _debug("applying (nc) mapping $mapping->{name}", undef);
            } else {
                _debug("NOT applying (nc) mapping $mapping->{name}", undef);
                next;
            }

            # add to mappings list
            #
            $parser->{MAPPEDCMDS}->{"\\$mapping->{name}"} = 1;

            _debug("found a mapping with name $mapping->{name}, $mapping->{nparams} params", undef);

            # remove the mapping declaration
            #
            splice @{$tree->{nodes}}, $i, $mapping->{skip} + 1;

            # apply the mapping
            my $count = $parser->_applyMapping($tree, $mapping, $i);

            if ($count > 0) {
                _debug("printing altered subtree", sub { $tree->_warn() });
            }

            $i--; # since we removed the cmd node, check this index again
        }

        # handle "\newenvironment" mappings
        elsif ($node->{type} eq 'COMMAND' &&
                 $node->{command} =~ /^(re)?newenvironment$/) {

            # make a mapping hash
            #
            my $mapping = $parser->_makeEnvMapping($tree, $i);
            next if (!$mapping->{name}); # skip fragged commands.

            _debug("applying (ne) mapping $mapping->{name}", undef);

            # remove the mapping declaration
            #
            splice @{$tree->{nodes}}, $i, $mapping->{skip} + 1;

            # apply the mapping
            #
            my $count = $parser->_applyMapping($tree, $mapping, $i);
        }

        # handle "\def" stype commands.
        elsif ($node->{type} eq 'COMMAND' &&
                 defined $prev &&
                 $prev->{type} eq 'TEXT' &&
                 $prev->{content} =~ /\\def\s*$/o) {

             _debug("found def style mapping $node->{command}", undef);

             # remove the \def
             $prev->{content} =~ s/\\def\s*$//o;

             # make the mapping
             my $mapping = {name => $node->{command},
                nparams => 0,
                template => $node->{children}->copy(),
                type => 'command'};

             next if (!$mapping->{name}); # skip fragged commands

             if ($parser->{USED_COMMANDS}->{$mapping->{name}}) {
                 _debug("applying (def) mapping $mapping->{name}", undef);
             } else {
                 _debug("NOT applying (def) mapping $mapping->{name}", undef);
                 next;
             }

             # add to mappings list
             #
             $parser->{MAPPEDCMDS}->{"\\$mapping->{name}"} = 1;

             _debug("template is", sub { $mapping->{template}->_warn() });

             # remove the command node
             splice @{$tree->{nodes}}, $i, 1;

             # apply the mapping
             my $count = $parser->_applyMapping($tree, $mapping, $i);

             $i--; # check this index again
        }

        # recur
        elsif ($node->{children}) {

            $parser->_applyMappings($node->{children});
        }
    }
}

# read files from \input commands and place into the tree, parsed
#
# also include bibliographies
#
sub _addInputs {
    my $parser = shift;

    my $tree = shift;

    for (my $i = 0; $i < @{$tree->{nodes}}; $i++) {

        my $node = $tree->{nodes}[$i];

        if ($node->{type}    eq 'COMMAND'
         && $node->{command} eq 'input'
        ) {
            my $file = $node->{children}->{nodes}[0]->{content};
            next if $file =~ /pstex/; # ignore pstex images

            _debug("reading input file $file", undef);

            my $contents;
            my $filename = fileparse($file);
            my $has_extension = qr/\.\S+$/;

            # read in contents of file
            if (-e $file && $filename =~ $has_extension) {
                $contents = _readFile($file);
            }
            elsif ($filename !~ $has_extension) {
                $file = "$file.tex";
                $contents = _readFile($file) if -e $file;
            }

            # dump Psfig/TeX files, they aren't useful to us and have
            # nonconforming syntax. Use declaration line as our heuristic.
            #
            if (defined $contents
                     && $contents =~ m!^ \% \s*? Psfig/TeX \s* $!mx
            ) {
                undef $contents;
                carp "ignoring Psfig input `$file'";
            }

            # actually do the parse of the sub-content
            #
            if (defined $contents) {
                # parse into a tree
                my ($subtree,) = $parser->_basicparse($contents, $parser->{PARSE_ERRORS_FATAL});

                # replace \input command node with subtree
                splice @{$tree->{nodes}}, $i, 1, @{$subtree->{nodes}};

                # step back
                $i--;
            }
        }
        elsif ($node->{type}    eq 'COMMAND'
            && $node->{command} eq 'bibliography'
        ) {
             # try to find a .bbl file
             #
             foreach my $file (<*.bbl>) {

                 my $contents = _readFile($file);

                 if (defined $contents) {

                     my ($subtree,) = $parser->_basicparse($contents, $parser->{PARSE_ERRORS_FATAL});
                     splice @{$tree->{nodes}}, $i, 1, @{$subtree->{nodes}};
                     $i--;
                 }
             }
        }

        # recur
        if ($node->{children}) {
            $parser->_addInputs($node->{children});
        }
    }
}

# do pre-mapping parsing
#
sub _parseA {
    my $parser = shift;
    my $text = shift;

    my $tree = $parser->_stage1($text);
    my $bracehash = $parser->_stage2($tree);

    $parser->_stage3($tree);

    $parser->_addInputs($tree) if $parser->{readinputs};

    return ($tree, $bracehash);
}

# do post-mapping parsing (make environments)
#
sub _parseB {
    my $parser = shift;
    my $tree = shift;

    $parser->_stage4($tree);

    _debug("done with parseStage4", undef);

    $parser->_stage5($tree, 0);

    _debug("done with parseStage5", undef);
}

###############################################################################
#
# Parser "Static" Subroutines
#
###############################################################################

# find next unescaped char in some text
#
sub _uindex {
    my $text = shift;
    my $char = shift;
    my $pos = shift;

    my $realbrace = 0;
    my $idx = -1;

    # get next opening brace
    do {
        $realbrace = 1;
        $idx = index $text, $char, $pos;

        if ($idx != -1) {
            $pos = $idx + 1;
            my $prevchar = substr $text, $idx - 1, 1;
            if ($prevchar eq '\\') {
                $realbrace = 0;
                $idx = -1;
            }
        }
    } while (!$realbrace);

    return $idx;
}

sub _find {
    my ($text, $symbol, $pos) = @_;

    my ($found, $index);

    # get next occurrence of the symbol
    do {
        $found = true;
        $index = index $text, $symbol, $pos;

        if ($symbol eq '}' && $index - 1 >= 0 && substr($text, $index - 1, 1) eq ' ') {
            #$pos = $index + 1;
            $index = -1;
        }

        if ($index != -1) {
            $pos = $index + 1;

            # make sure this occurrence isn't escaped. this is imperfect.
            my $prev_char  = ($index - 1 >= 0) ? (substr $text, $index - 1, 1) : '';
            my $pprev_char = ($index - 2 >= 0) ? (substr $text, $index - 2, 1) : '';

            if ($prev_char eq '\\' && $pprev_char ne '\\') {
                $found = false;
            }
        }
    } until ($found);

    return $index;
}

# support function: find the next occurrence of some symbol which is
# not escaped.
#
sub _findsymbol {
    return _find(@_);
}

# support function: find the earliest next brace in some (flat) text
#
sub _findbrace {
    my ($text, $pos) = @_;

    my $index_o = _find($text, '{', $pos);
    my $index_c = _find($text, '}', $pos);

    # handle all find cases
    if ($index_o == -1 && $index_c == -1) {
        return (-1, '');
    }
    elsif ($index_c == -1 || ($index_o != -1 && $index_o < $index_c)) {
        return ($index_o, '{');
    }
    elsif ($index_o == -1 || $index_c < $index_o) {
        return ($index_c, '}');
    }
}


# skip "blank nodes" in a tree, starting at some position. will finish
# at the first non-blank node. (ie, not a comment or whitespace TEXT node.
#
sub _skipBlankNodes {
    my ($tree, $i) = @_;

    my $node = $tree->{nodes}[$$i];

    while ($node->{type}    eq 'COMMENT'
       || ($node->{type}    eq 'TEXT'
       &&  $node->{content} =~ /^\s*$/s)
    ) {
        $node = $tree->{nodes}[++$$i];
    }
}

# is the passed-in node a valid parameter node? for this to be true, it must
# either be a GROUP or a position = inner command.
#
sub _validParamNode {
    my ($node) = @_;

    if ($node->{type} eq 'GROUP'
    || ($node->{type} eq 'COMMAND' && $node->{position} eq 'inner')) {
        return true;
    }
    return false;
}

# duplicate a valid param node.	This means for a group, copy the child tree.
# for a command, make a new tree with just the command node and its child tree.
#
sub _duplicateParam {
    my $parser = shift;
    my $node = shift;

    if ($node->{type} eq 'GROUP') {
        return $node->{children}->copy();
    }
    elsif ($node->{type} eq 'COMMAND') {

        my $subtree = $node->{children}->copy(); # copy child subtree
        my $nodecopy = $node->copy(); # make a new node with old data
        $nodecopy->{children} = $subtree; # set the child pointer to new subtree

        # return a new tree with the new node (subtree) as its only element
        return LaTeX::TOM::Tree->_new([$nodecopy]);
    }

    return undef;
}

sub _getMapping {
    my ($type, $tree, $i) = @_;

    my $node = $tree->{nodes}[$$i];

    if ($node->{type}    ne 'COMMAND'
    || ($node->{command} ne "new$type"
     && $node->{command} ne "renew$type")
    ) {
        return ();
    }

    # figure out command (first child, text node)
    my $command = $node->{children}->{nodes}[0]->{content};
    if ($command =~ /^\s* \\(\S+) \s*$/x) {
        $command = $1;
    }

    $node = $tree->{nodes}[++$$i];

    # figure out number of params
    my $nparams = 0;
    if ($node->{type} eq 'TEXT') {
        my $text = $node->{content};

        if ($text =~ /^\s* \[\s* ([0-9]+) \s*\] \s*$/x) {
            $nparams = $1;
        }

        $$i++;
    }

    return ($command, $nparams);
}

# make a mapping from a newenvironment fragment
#
# newenvironments have the following syntax:
#
# \newenvironment{name}[nparams]?{beginTeX}{endTeX}
#
sub _makeEnvMapping {
    my $parser = shift;
    my ($tree, $index) = @_;

    my $i = $index;

    my ($command, $nparams) = _getMapping('environment', $tree, \$i) or return undef;

    # default templates-- just repeat the declarations
    #
    my ($btemplate) = $parser->_basicparse("\\begin{$command}", 2, 0);
    my ($etemplate) = $parser->_basicparse("\\end{$command}",   2, 0);

    my $end_pos = $i;

    # get two group subtrees... one for the begin and one for the end
    # templates. we only ignore whitespace TEXT nodes and comments
    #
    _skipBlankNodes($tree, \$i);
    my $node = $tree->{nodes}[$i];

    if (_validParamNode($node)) {
        $btemplate = $parser->_duplicateParam($node);

        $i++;
        _skipBlankNodes($tree, \$i);
        $node = $tree->{nodes}[$i];

        if (_validParamNode($node)) {
            $etemplate = $parser->_duplicateParam($node);
            $end_pos = $i;
        }
    }

    # build and return the mapping hash
    #
    return {
        type      => 'environment',
        name      => $command,
        nparams   => $nparams,
        btemplate => $btemplate,          # begin template
        etemplate => $etemplate,          # end template
        skip      => $end_pos - $index,
    };
}

# make a mapping from a newcommand fragment
# takes tree pointer and index of command node
#
# newcommands have the following syntax:
#
# \newcommand{\name}[nparams]?{anyTeX}
#
sub _makeMapping {
    my ($tree, $index) = @_;

    my $i = $index;

    my ($command, $nparams) = _getMapping('command', $tree, \$i) or return undef;

    # grab subtree template (array ref)
    #
    my $node = $tree->{nodes}[$i];
    my $template;

    if ($node->{type} eq 'GROUP') {
        $template = $node->{children}->copy();
    }
    else {
        return undef;
    }

    # build and return the mapping hash
    #
    return {
        type     => 'command',
        name     => $command,
        nparams  => $nparams,
        template => $template,
        skip     => $i - $index,
    };
}

# this sub is the main entry point for the sub that actually takes a set of
# parameter trees and inserts them into a template tree. the return result,
# newly allocated, should be plopped back into the original tree where the
# parameters (along with the initial command invocation)
#
sub _applyParamsToTemplate {
    my $template = shift;
    my @params = @_;

    # have to copy the template to a freshly allocated tree
    #
    my $applied = $template->copy();

    # now recursively apply the params.
    #
    _applyParamsToTemplate_r($applied, @params);

    return $applied;
}

# recursive helper for above
#
sub _applyParamsToTemplate_r {
    my $template = shift;
    my @params = @_;

    for (my $i = 0; $i < @{$template->{nodes}}; $i++) {

        my $node = $template->{nodes}[$i];

        if ($node->{type} eq 'TEXT') {

            my $text = $node->{content};

            # find occurrences of the parameter flags
            #
            if ($text =~ /(#([0-9]+))/) {

                my $all = $1;
                my $num = $2;

                # get the index of the flag we just found
                #
                my $idx = index $text, $all, 0;

                # split the node on the location of the flag
                #
                my ($leftnode, $rightnode) = $node->split($idx, $idx + length($all) - 1);

                # make a copy of the param we want
                #
                my $param = $params[$num - 1]->copy();

                # splice the new text nodes, along with the parameter subtree, into
                # the old location
                #
                splice @{$template->{nodes}}, $i, 1, $leftnode, @{$param->{nodes}}, $rightnode;

                # skip forward to where $rightnode is in $template on next iteration
                #
                $i += scalar @{$param->{nodes}};
            }
        }

        # recur
        elsif (defined $node->{children}) {

            _applyParamsToTemplate_r($node->{children}, @params);
        }
    }
}


# This sub takes a chunk of the document text between two points and makes
# it into a list of TEXT nodes and COMMENT nodes, as we would expect from
# '%' prefixed LaTeX comment lines
#
sub _getTextAndCommentNodes {
    my ($text, $begins, $ends) = @_;

    my $node_text = substr $text, $begins, $ends - $begins;

    _debug("getTextAndCommentNodes: looking at [$node_text]", undef);

    my $make_node = sub {
        my ($mode_type, $begins, $start_pos, $output) = @_;

        return LaTeX::TOM::Node->_new({
            type    => uc $mode_type,
            start   => $begins + $start_pos,
            end     => $begins + $start_pos + length($output) - 1,
            content => $output,
        });
    };

    my @lines = split (/(
       (?:\s*     # whitespace
         (?<!\\)  # unescaped
         \%[^\n]* # comment
       \n)+       # newline
    )/mx, $node_text);

    my @nodes;

    my $start_pos = 0;
    my $output;
    my $mode_type;
    my $first = true;

    foreach my $line (@lines) {

         my $line_type = (
                 $line =~ /^\s*\%/
         && $node_text !~ /
                           \\begin\{verbatim\}
                             .* \Q$line\E .*
                           \\end\{verbatim\}
                          /sx
        ) ? 'comment' : 'text';

        # if type stays the same, add to output and do nothing
        if ($first || $line_type eq $mode_type) {

            $output .= $line;

            # handle turning off initialization stuff
            $first &&= false;
            $mode_type ||= $line_type;
        }

        # if type changes, make new node from current chunk, change mode type
        # and start a new chunk
        else {
            push @nodes, $make_node->($mode_type, $begins, $start_pos, $output);

            $start_pos += length($output); # update start position
            $output = $line;

            $mode_type = $line_type;
        }
    }

    push @nodes, $make_node->($mode_type, $begins, $start_pos, $output) if defined $output;

    return @nodes;
}

# Read in the contents of a text file on disk. Return in string scalar.
#
sub _readFile {
    my ($file, $raise_error) = @_;

    $raise_error ||= false;

    my $opened = open(my $fh, '<', $file);

    unless ($opened) {
        croak "Cannot open `$file': $!" if $raise_error;
        return undef;
    }

    my $contents = do { local $/; <$fh> };
    close($fh);

    return $contents;
}

sub _debug {
    my ($message, $code) = @_;

    my $DEBUG = $LaTeX::TOM::DEBUG;

    return unless $DEBUG >= 1 && $DEBUG <= 2;

    my ($filename, $line) = (caller)[1,2];
    my $caller = join ':', (fileparse($filename))[0], $line;

    warn "$caller: $message\n" if $DEBUG >= 1 && defined $message;
    $code->()                  if $DEBUG == 2 && defined $code;
}

1;