#!/usr/bin/perl -w
# Copyright (C) 2000, 2001 Bob McElrath.
# See the file COPYING for redistribution and modification terms.
#
#    This program is free software; you can redistribute it and/or modify
#    it under the terms of the GNU General Public License as published by
#    the Free Software Foundation; either version 2 of the License, or
#    (at your option) any later version.
#
#    This program is distributed in the hope that it will be useful,
#    but WITHOUT ANY WARRANTY; without even the implied warranty of
#    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#    GNU General Public License for more details.
#
#    You should have received a copy of the GNU General Public License
#    along with this program; if not, write to the Free Software
#    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package FilterProxy::Rewrite;

use strict;
no strict 'subs';
no strict 'refs';
use vars qw($VERSION $CONFIG);
use Time::HiRes;
use URI::Escape;
$VERSION = 0.30;
push @FilterProxy::MODULES, "Rewrite";
$FilterProxy::Rewrite::CONFIG = {};
$CONFIG->{mime_types} = ["text/html", "application/x-javascript"];
$CONFIG->{order} = 4;
*logger = \&FilterProxy::logger;
my($whitespace) = qr/[\040\015\013\012\011]/si; # whitespace character class
my($notspace) = qr/[^\040\015\013\012\011]/si;  # not whitespace character class
my($regexdelim) = qr/[\#\/\%\&!,=:]/s;           # regex delimiters for tagspecs
# Regex matcher:
# ([\/#])(.*?[^\\](?:\\\\)*)?\1
my($dontcarecomm) = qr/(?:$whitespace|&nbsp;|<[bh]r[^>]*>|<p[^>]*>|<\!--(.*?)-->)*?/si; # no more than one comment.
my($dontcare) = qr/(?:$whitespace|&nbsp;|<[bh]r[^>]*>|<p[^>]*>)/si; 
my($nmatches, $nsuccess) = (0, 0);
my($markupinstead) = 0; # A flag telling us to mark where changes would be made,
                        # rather than rewriting/stripping.
my(@markup) = (); # list of lists of: rule name, start, length, replacement


# The main function that gets called by FilterProxy
sub filter {
  my($req, $res, $siteconfig) = @_;
  my(%filters);
  my($i) = 0;
  my($content_ref) = $res->content_ref;
# assemble master list of filter specs, processing ignore directives.
  foreach my $site (@$siteconfig) {
    foreach my $filter (@{$site->{'filters'}}) {
      if($filter =~ /^(\w+):(.*)$/s) {
        $filters{$1} = $2;
      } else {
        $filters{'UNNAMED_' . $i++} = $filter;
      }
    }
  }
  foreach my $filter (keys %filters) {
    if(defined $filters{$filter} && $filters{$filter} =~ /\G\s*ignore\s+/sg) {
      while($filters{$filter} =~ /\G(\w+)\s*/sg) {
        delete $filters{$1};
      }
      delete $filters{$filter};
    }
  }
# apply filters.
  my($start, $end) = (0,0);
  my($operation, $matcher, $key);
  my($filterstart, $filterend, $oldkey);
FILTER:
  foreach $key (sort keys %filters) {
    if($FilterProxy::CONFIG->{timing}) {
      $filterend = Time::HiRes::time;
      if($filterstart) {
        logger(TIMING, "  Rewrite: $oldkey took ", sprintf("%0.5f", $filterend-$filterstart), 
            " seconds, ", $nmatches-$nsuccess, " failed, $nsuccess successful\n");
      }
      $filterstart = Time::HiRes::time;
    }
    $oldkey = $key;
    $nmatches = 0;
    $nsuccess = 0;
    my($filter) = $filters{$key};
    if($filter =~ /\G\s*(\w+)\s+(\w+)/sg) {
      $operation = lc $1;
      pos($$content_ref) = 0;
      if(defined $2 && defined &{$2}) {
        $matcher = lc $2;
        my($startfilterpos) = pos($filter); # matcher may be called several times, and needs pos each time.
        my($filterpos) = $startfilterpos;
        my($docmatchstart, $docmatchend, $ndocmatches) = (0,0,0);
        if($FilterProxy::CONFIG->{timing}) {
          $docmatchstart = Time::HiRes::time;
        }
DOCMATCH:
        while(defined pos($$content_ref)) {
          $ndocmatches++;
          pos($filter) = $startfilterpos;
          $start = $end = undef;
          ($start, $end) = &{$matcher}($content_ref, \$filter, $start, $end);
          $filterpos = pos($filter);
          next FILTER unless (defined $start && defined $end);
#          print "initial filter found ($start, $end)\n";
          my($keyword);
          while($filter =~ /\G\s*-?(\w+)/sg) {
            $keyword = lc $1;
            $filterpos = pos($filter);
            if(defined &{"FilterProxy::Rewrite::$1"}) {
              my($newstart, $newend) = &{$1}($content_ref, \$filter, $start, $end);
              unless(defined $newstart && defined $newend) {
                pos($$content_ref) = $end;
                next DOCMATCH;
              } else {
                ($start, $end) = ($newstart, $newend);
              }
            } elsif(($operation eq "rewrite") and ($keyword eq "as")) {
              last;
            } else {
              logger(ERROR, "  Rewrite: unrecognized option flag: '$keyword'\n");
            }
            if(defined pos($filter)) { $filterpos = pos($filter); }
          } # when match fails, pos will be undef, so reset it to end of options.
          next unless(defined $start && defined $end);
#          print "filter has good ($start, $end), gonna strip\n";
          pos($filter) = $filterpos;
          if($operation eq 'strip') {
            if($FilterProxy::CONFIG->{debug}) {
              logger(DEBUG, "  Rewrite stripping at pos $start by rule $key:\n", substr($$content_ref, $start, $end-$start), "\n");
            }
            $nsuccess++;
            if($markupinstead) {
                push @markup, [$key, $start, $end-$start, ""];
            }
            substr($$content_ref, $start, $end-$start) = "";
            pos($$content_ref) = $start;
          } elsif($operation eq 'rewrite') {
            if($filter =~ /\G\s*(.*)$/sg) {
              my($replacement) = $1;
              if($FilterProxy::CONFIG->{debug}) {
                logger(DEBUG, "  Rewrite rewriting by rule $key: '", 
                    substr($$content_ref, $start, $end-$start), "' as '", $replacement, "'\n");
              }
              $nsuccess++;
              if($markupinstead) {
                  push @markup, [$key, $start, $end-$start, $replacement];
              }
              substr($$content_ref, $start, $end-$start) = $replacement;
              pos($$content_ref) = $start+length($replacement);
            } else {
              logger(ERROR, "  Rewrite: mangled rewrite specifier ('as' part): '$filter' in rule $key\n");
            }
          } else {
            logger(ERROR, "  Rewrite: unrecognized operation '$operation' in rule $key!\n");
          }
          pos($filter) = $filterpos; # reset match position for $filter.
        } 
      } else {
        logger(ERROR, "  Rewrite: unknown matcher '$matcher' in rule $key!\n");
      }
    } else {
      logger(ERROR, "  Rewrite: Rule does not start with an action (strip|rewrite|ignore): '$filter' in rule $key!\n");
    }
  }
  if($FilterProxy::CONFIG->{timing}) {
    $filterend = Time::HiRes::time;
    if($filterstart) {
      logger(TIMING, "  Rewrite: $oldkey took ", sprintf("%0.5f", $filterend-$filterstart), 
          " seconds, ", $nmatches-$nsuccess, " failed, $nsuccess successful\n");
    }
  }
}

# Matcher:     balanced
# Syntax:      add balanced
# Description: recursively add in balanced tags to an ad.  i.e. <center> ...ad... </center>
sub balanced {
  if(!wantarray) {
    logger(DEBUG, "  balanced called improperly!  Must be in array context!\n");
    return;
  }
  my($content_ref) = shift;
  shift; # ignore spec
  my($adstartpos) = shift;
  my($adendpos) = shift;
  my($tmpendpos, $tmpstartpos, $lasttmpstartpos);
  my($found);
ENCLOSINGTAG:
  while(1) {
    pos($$content_ref) = $adendpos;
    last ENCLOSINGTAG unless $$content_ref =~ /\G$dontcarecomm<(?:$whitespace)*\/([a-z]+)[^>]*>$dontcarecomm/sig;
    my($tag) = $2; # note 1 parenthesis in $dontcarecomm
    my($leadcomment, $trailcomment) = ($1, $3);
    # <!--(.*?)--> in $dontcarecomm matched a comment spanning more than one comment, 
    # so we consider this a failed match.
    last ENCLOSINGTAG if(defined $leadcomment && $leadcomment =~ /-->/);
    last ENCLOSINGTAG if(defined $trailcomment && $trailcomment =~ /-->/);
    $tmpendpos = pos($$content_ref);
    $tmpstartpos = $adstartpos;
    do {
      $lasttmpstartpos = $tmpstartpos;
      $tmpstartpos = &prevtag($content_ref, $tmpstartpos);
    } while(&hasopencomment($content_ref, $tmpstartpos, $adstartpos) == 1
        && $tmpstartpos > 0 && $tmpstartpos < $lasttmpstartpos);
# find previous tag
    pos($$content_ref) = $tmpstartpos;
    last ENCLOSINGTAG unless $$content_ref =~ /\G<(?:$whitespace)*$tag[^>]*>/sig;
    $adstartpos = $tmpstartpos;
    $adendpos = $tmpendpos;
  }
  pos($$content_ref) = $adendpos; # always keep pos at end.
  return($adstartpos, $adendpos);
}

# Matcher:     whitespace
# Syntax:      add whitespace
# Description: add in "whitespace", defined by $dontcare, which contains normal 
#   whitespace as well as the tags <br>, <hr>, <p>, and the entity &nbsp;
sub whitespace {
  my($content_ref) = shift;
  my($spec_ref) = shift;
  my($adstartpos) = shift;
  my($adendpos) = shift;
  my($origadstart, $origadend) = ($adstartpos, $adendpos);
  my($tmpstartpos) = $adstartpos;
  my($whitespacerx) = qr/(?:$whitespace+|&nbsp;|<[bh]r[^>]*>|<p[^>]*>|<\!--(.*?)-->)/si;

  pos($$content_ref) = $adendpos;
  while($$content_ref =~ /\G$whitespacerx+?/sig) {
    my($comment) = $1;
    # <!--(.*?)--> in $dontcarecomm matched a comment spanning more than one comment, 
    # so we consider this a failed match.
    unless(defined $comment && $comment =~ /-->/) { # FIXME We could use &hasopencommend and
      $adendpos = pos($$content_ref);               # be thorough about this, but whitespace
    }                                               # just isn't that important?
  }
  while(1) {
    $tmpstartpos = rindex $$content_ref, '<', $tmpstartpos-1;
#    logger(DEBUG, "whitespace() rewound to: " . substr($$content_ref, $tmpstartpos, 20));
    last if($tmpstartpos < 0); #find failed.
    last unless (substr($$content_ref, $tmpstartpos, $adstartpos-$tmpstartpos) =~ /^$whitespacerx$/si);
    my($comment) = $1;
#    logger(DEBUG, "  ... looks like whitespace");
    last if(defined $comment && $comment =~ /-->/); # open comment
    $adstartpos = $tmpstartpos;
  }
  pos($$content_ref) = $adendpos; # always keep pos at end.
  return($adstartpos, $adendpos);
}

# Matcher:     alternate
# Syntax:      add alternate
# Description: add in "alternate" content.  i.e. <nolayer> corresponding to a 
#   <layer>, and <noscript> corresponding to a <script> (and v/v).
sub alternate {
  if(!wantarray) {
    logger(DEBUG, "  alternate called improperly!  Must be in array context!\n");
    return(undef, undef);
  }
  my($content_ref) = shift;
  shift; # ignore spec
  my($adstartpos) = shift;
  my($adendpos) = shift;
  my($tmpendpos, $tmpstartpos) = ($adendpos, $adstartpos);
  my($tag, $alttag);
  my($gotalt) = 0;
  pos($$content_ref) = $adstartpos;
  if($$content_ref =~ /\G$dontcarecomm<(?:$whitespace)*([-\w]+)(?:(?:$whitespace)[^>]*>|>)/sig) {
    $tag = lc $1;
    pos($$content_ref) = &prevtag($content_ref, $adendpos);
    if($$content_ref =~ /\G<(?:$whitespace)*\/$tag(?:$whitespace)*>/sig) {
#  if(substr($$content_ref, $adstartpos, $adendpos-$adstartpos) =~
#      /^$dontcarecomm<(?:$whitespace)*(\w+)(?:(?:$whitespace)[^>]*>|>).*<(?:$whitespace)*\/\2(?:$whitespace)*>$dontcarecomm$/si) {
#    $tag = lc $2;
      if($tag eq "layer")    { $alttag = "nolayer"; }
      if($tag eq "ilayer")   { $alttag = "nolayer"; }
      if($tag eq "script")   { $alttag = "noscript"; }
      if($tag eq "noscript") { $alttag = "script"; }
      if($tag eq "nolayer")  { $alttag = "i?layer"; }
      return($adstartpos, $adendpos) unless defined $alttag;
      # Corresponding tag trails
      pos($$content_ref) = $adendpos;
      while($$content_ref =~ /\G$dontcarecomm/sig) {
        if(defined $1) { # there was a comment...
          if(&hasopencomment($content_ref, $adstartpos, pos($$content_ref))) {
            pos($$content_ref) = $adendpos;
            last;
          }
        }
        $adendpos = pos($$content_ref);
      }
      pos($$content_ref) = $adendpos; # last failed match in while loop will make pos undef.
      if($$content_ref =~ /\G<(?:$whitespace)*($alttag)(?:(?:$whitespace)[^>]*?>|>)(.*?)<(?:$whitespace)*\/\1(?:(?:$whitespace)*[^>]*?>|>)$dontcarecomm/sig) {
        my($block, $trailcomment) = ($2, $3);
        if(&hasopencomment($$content_ref, $adendpos, pos($$content_ref))) {
          pos($$content_ref) = $adendpos;
          if($$content_ref =~ /\G<(?:$whitespace)*($alttag)(?:$whitespace)[^>]*?>(.*?<!--.*?-->.*?)
                      <(?:$whitespace)*\/\2(?:$whitespace)[^>]*?>$dontcarecomm/sigx) {
            ($block, $trailcomment) = ($2, $3);
          }
        }
        unless(defined $trailcomment && $trailcomment =~ /-->/) {
          $adendpos = pos($$content_ref);
          $gotalt = 1;
        } else {
          logger(DEBUG, "  -addalt failed because of comment problems in trailing alt block.\n");
        }
      }
      # Corresponding tag leads
      unless($gotalt) { 
        my($first) = 1;
        while(1) {
          $tmpstartpos = &prevtag($content_ref, $tmpstartpos);
          last if($tmpstartpos < 0);
          last if($first && $$content_ref !~ /\G<(?:$whitespace)*\/$alttag(?:(?:$whitespace)[^>]*>|>)/sig); 
          $tmpstartpos = &rewindtotag($content_ref, $alttag, $tmpstartpos);
          last if($tmpstartpos < 0);
          $first = 0;
          next if &hasopencomment(substr($$content_ref, $tmpstartpos, $adendpos-$tmpstartpos));
          $adstartpos = $tmpstartpos;
          last;
        }
      }
    }
  }
  pos($$content_ref) = $adendpos;
  return($adstartpos, $adendpos);
}

# Qualifying Predicate: inside
# Syntax:               [preceding matcher] inside [following matcher]
# Description:          require that the preceding match be inside the block
#   found by the following matcher.  If it is not, the WHOLE MATCH fails.
sub inside {
  my($content_ref) = shift;
  my($spec_ref) = shift;
  my($adstartpos) = shift;
  my($adendpos) = shift;
  my($tmpendpos, $tmpstartpos) = ($adendpos, $adstartpos);
  my($encloserstart, $encloserend);
  
#  print "inside called with existing match ($adstartpos,$adendpos): ", substr($$content_ref, $adstartpos, $adendpos-$adstartpos), "\n";
  if($$spec_ref =~ /\G\s*(\w+)/sg && defined &{"FilterProxy::Rewrite::$1"}) {
    pos($$content_ref) = $adendpos;
    ($encloserstart, $encloserend) = &{$1}($content_ref, $spec_ref, $adstartpos, $adendpos, 1);
#    print "inside found enclosing block ($encloserstart, $encloserend)\n";
    if(defined $encloserstart && defined $encloserend && 
        $encloserstart < $adstartpos && $encloserend > $adendpos) {
    #  return($encloserstart, $encloserend); # successful match.
        return($adstartpos, $adendpos); # successful match.
    } else { # match is not inside, and therefore this is a failed match.
      return(undef,undef);
    }
  } else {
    logger(ERROR, "  unknown matcher passed to 'inside' in spec: $$spec_ref\n");
  }
}

# Qualifying Predicate: containing
# Syntax:               [preceding matcher] containing [following matcher]
# Description:          the preceding matched block must contain the text
#   matched by the following matcher.  If it is not, the WHOLE MATCH fails.
sub containing {
  my($content_ref) = shift;
  my($spec_ref) = shift;
  my($start) = shift;
  my($end) = shift;
  my($containstart, $containend);

  my($str) = substr($$content_ref, $start, $end-$start);
  if($$spec_ref =~ /\G\s*(\w+)/sg && defined &{"FilterProxy::Rewrite::$1"}) {
    pos($str) = 0;
    ($containstart, $containend) = &{$1}(\$str, $spec_ref, undef, undef);
    if(defined $containstart && defined $containend) {
      return($start, $end); # match succeeded
    } else { 
      return(undef, undef); # match failed
    }
  }
}

# Qualifying Predicate: preceeding
# Syntax:               [preceding matcher] preceeding [following matcher]
# Description:          the following matched block must contain the text
#   matched by the following matcher.  If it is not, the WHOLE MATCH fails.
sub preceeding {
  my($content_ref) = shift;
  my($spec_ref) = shift;
  my($start) = shift;
  my($end) = shift;
  my($containstart, $containend);

  if($$spec_ref =~ /\G\s*(\w+)/sg && defined &{"FilterProxy::Rewrite::$1"}) {
    pos($$content_ref) = $end;
    ($containstart, $containend) = &{$1}($content_ref, $spec_ref, $start, $end);
    if(defined $containstart && defined $containend) {
      return($start, $end); # match succeeded
    } else { 
      return(undef, undef); # match failed
    }
  }
}

# Qualifying Predicate: leader
# Syntax:               [matcher] add leader [matcher]
# Description:          Searches backward from the existing match to find 
# Note: It would be nice if this were a quailfying predicate instead, allowing
#   use of any matcher.  But that would require all matchers to be able to
#   search backwards!  (regex can't, and never will be able to search backwards)
sub leader {
  my($content_ref, $spec_ref, $start, $end) = @_;
  my(@ret);
  @ret = &tag($content_ref, $spec_ref, $start, $end, 0, 0, 0, 1);
  return(@ret);
}

# Expanding Predicate: add
# Syntax:              [matcher] add [matcher]
# Description:         makes the match bigger by including the results of the
#   following matcher
sub add {
  my($content_ref) = shift;
  my($spec_ref) = shift;
  my($adstartpos) = shift;
  my($adendpos) = shift;
  my($tmpendpos, $tmpstartpos) = ($adendpos, $adstartpos);
  my($tagstart, $tagend);
  
  if($$spec_ref =~ /\G\s*(\w+)/sg && defined &{"FilterProxy::Rewrite::$1"}) {
    pos($$content_ref) = $adendpos;
#    print "add calling $1\n";
    ($tagstart, $tagend) = &{$1}($content_ref, $spec_ref, $adstartpos, $adendpos);
    $adstartpos = defined $tagstart?($tagstart>$adstartpos?$adstartpos:$tagstart):$adstartpos;
    $adendpos = defined $tagend?($tagend>$adendpos?$tagend:$adendpos):$adendpos;
#    print "add returning ad ($adstartpos, $adendpos) was ($tagstart, $tagend): ", substr($$content_ref, $adstartpos, $adendpos-$adstartpos), "\n";
    return($adstartpos, $adendpos);
  } else {
    logger(ERROR, "  unknown matcher passed to 'add' in spec: $$spec_ref\n");
  }
}

# Matcher:     tagblock
# Syntax:      tagblock <tag [[attrib]=[~][value]] ...>
# Description: Matches a tag, including all text until the corresponding
#   closing tag.  Uses 'sub tag' to do its dirty work.
sub tagblock {
  my($content_ref, $spec_ref, $start, $end, $ifencloses) = @_;
  return(&tag($content_ref, $spec_ref, $start, $end, 1, $ifencloses));
}

# Matcher:     encloser
# Syntax:      [matcher] add encloser <tag [[attrib]=[~][value]] ...>
# Description: like tagblock, except that the block must enclose the previous
#   match.  (only makes sense as argument to 'add', and should really be named
#   "enclosing tag block" but that's too long)
sub encloser { # "enclosing tag block"
  my($content_ref, $spec_ref, $start, $end) = @_;
  return(&tag($content_ref, $spec_ref, $start, $end, 1, 1));
}

# Matcher: tag
# Syntax: tag <tag [[attrib]=[~][value]] ...>
# Description: find a tag given a specifier.  Any of [tag], [attrib], [value]
#   can be a regex by delimiting it with $regexdelim: #/%&!,=:
#   multiple attribs are allowed
#   and a value need not be specified.
sub tag {
  if(!wantarray) {
    logger(DEBUG, "  tag called improperly!  Must be in array context!\n");
    return;
  }
  my($content_ref, $spec_ref, $prevstart, $prevend, $tagblock, $ifencloses, $exactattribs, $backwards) = @_;
  my($contentpos) = pos($$content_ref);
  $tagblock=0 unless(defined $tagblock);
  $ifencloses=0 unless(defined $ifencloses);
  $exactattribs=0 unless(defined $exactattribs);
  $backwards=0 unless(defined $backwards);

  my($tag, $attribspec, %attribs, $foundtag, $found);
  my($tmpstartpos, $tmpendpos);
  my($start, $end) = ($prevstart, $prevend);
  my($specpos) = pos($$spec_ref);
  my($tagnestlevel) = 0;
# process options
  while($$spec_ref =~ /\G\s*(?:-(\w+)\s*)/sg) {
    if($1 eq 'tagonly') { $tagblock = 0; }
    if($1 eq 'tagblock') { $tagblock = 1; }
    if($1 eq 'ifencloses') { $ifencloses = 1; $tagblock = 1; }
    if($1 eq 'exactattribs') { $exactattribs = 1; } #not implemented yet.
  }
#  print "tag called: ($prevstart, $prevend, $tagblock, $ifencloses, $exactattribs)\n";
  pos($$spec_ref) = $specpos;
  if($$spec_ref =~ /\G\s*<\s*                    # start where we left off, find '<'
      (?:(\/?[-\w]+)|($regexdelim)(.*?)(?<!\\)\2)   # name of the tag (FIXME note lookbehind for '\')
      (?:((?:$whitespace).*?)\s*>|>)/sgx) {       # either > closes tag or attribs and closer >
    $tag = defined $1?$1:$3;
    $attribspec = $4; # strip '<tag', '>' from spec so only attribs remain
    if(defined $attribspec) { pos($attribspec) = 0; }
    while(defined $attribspec && defined pos($attribspec) && length($attribspec) > 0) {
      if($attribspec =~ /\G\s*(?:([-\w]+)|($regexdelim)(.*?)(?<!\\)\2)\s*(=(?:~)?\s*)?/sg) {
        my($attrib) = defined $1?$1:$3;
        if(defined $4) { # look for a value
          if($attribspec =~ /\G(?:($regexdelim)(.*?)(?<!\\)\1   # find regex attrib
              |(["'])([^\3]*?)\3                                # or a quoted value
              |([^"'][^>\s]*?)(?<!["']))                        # or an unquoted value
              (?=\s|$)/sgx) {                                   # end with space or end of string
            if(defined $2) { $attribs{$attrib} = $2; }
            elsif(defined $4) { $attribs{$attrib} = "^$4\$"; }
            elsif(defined $5) { $attribs{$attrib} = "^$5\$"; }
            else {
              logger(ERROR, "  malformed regexp for $attrib value: $attribspec\n");
            }
          } else {
            logger(ERROR, "  malformed regexp for $attrib: $attribspec\n");
            return;
          }
        } else {
          $attribs{$attrib} = undef; # don't care about value, it's a boolean attrib.
        }
      } else {
        logger(ERROR, "  malformed attrib specifier: '$$spec_ref', pos: ", 
          substr($$spec_ref, pos($$spec_ref), length($$spec_ref) - pos($$spec_ref)), "\n");
        return;
      }
      last if(defined pos($attribspec) && length($attribspec) == pos($attribspec));
    } 
  } else {
    logger(ERROR, "  malformed tag specifier: $$spec_ref\n");
    return;
  }
TAG:
  while(defined pos($$content_ref)) {
    $tmpstartpos = $start = pos($$content_ref);
    if($ifencloses || $backwards) { # Search backward
      $tmpstartpos = &rewindtotag($content_ref, "/?$tag", $tmpstartpos);
      last TAG if ($tmpstartpos < 0);
      unless($$content_ref =~ /\G(<(?:$whitespace)*(\/?)($tag)(?:(?:$whitespace)[^>]*>|>))/sig) {
        print "ifencloses matching error: data is: ", substr($$content_ref, $tmpstartpos, 20), "\n";
      }
      last TAG if(defined $2 && length $2 > 0); # we found a closing tag before an opening tag...
      $end = pos($$content_ref);
      pos($$content_ref) = $tmpstartpos; # don't want to hit the same match again.
      $found = $1;
      $foundtag = lc $3; # in case the tag specifier is a regex
    } else { # Search forward
      # first search with a regex that has no () in it for speed, then match again to grab
      # data.  (faster in the case of lots of false matches)
      last TAG unless ($$content_ref =~ /\G.*?<$whitespace*(?:$tag)(?:(?:$whitespace)[^>]*>|>)/sig);
      $tmpstartpos = pos($$content_ref);
      do {
        $tmpstartpos = rindex($$content_ref, '<', $tmpstartpos-1);
        if($tmpstartpos < 0) {
          pos($$content_ref) = $start;
          next TAG;
        }
        pos($$content_ref) = $tmpstartpos;
      } while ($$content_ref !~ /\G(<$whitespace*($tag)(?:(?:$whitespace)[^>]*)?>)/sig);
      $nmatches++;
      $found = $1;
      $foundtag = lc $2; # in case the tag specifier is a regex
      $end = pos($$content_ref);
    }
    if(!defined $end) {
      print "end is undef ($ifencloses)!\n";
      print "found is: $found\n";
    }
    if(!defined $found) {
      print "found is undef!\n";
    }
    $start = $end - length $found;
#    print "\n" . '=' x 80 . "tag found ($start-$end): \n$found\n";
    # test if the tag's attribs match:
    foreach my $attrib (keys %attribs) {
      if(defined $attribs{$attrib}) {
        next TAG unless $found =~ /$whitespace(?:$attrib)$whitespace*=$whitespace*(?:(["'])([^\1]*?)\1|($notspace*?))(?:$whitespace|>)/si;
        my($value) = (defined $1)?$2:$3;
        next TAG unless $value =~ /$attribs{$attrib}/si;
      } else {
        next TAG unless $found =~ /($whitespace$attrib(?:$whitespace|=|>))/si;
      }
    } 
#    print "\n" . '=' x 80 . "tag found ($start-$end) w/attribs: \n$found\n";
    # tags with closing tags -- try to find closer.
    if($foundtag !~ /^(img|p|[bh]r)$/) {
      $tagnestlevel = 1;
      if($ifencloses) { pos($$content_ref) = $end; }
CLOSER: # note this code is very slow in the failing case.  (lots of substr)
      while($tagblock && $$content_ref =~ /\G.*?(<(?:$whitespace)*(\/?)$foundtag(?:(?:$whitespace)[^>]*>|>))/sig) {
        my($tmpend) = pos($$content_ref);
#        my($found) = substr($$content_ref, $start, $tmpend-$start); # this is time consuming...
        my($hascomm) = &hasopencomment($content_ref, $start, $tmpend);
        if($hascomm) {
          if($hascomm == 2) { # has closing comment.
            if($ifencloses) { 
              pos($$content_ref) = $start - 1; 
            } else { 
              pos($$content_ref) = $end; 
            }
#            print "going to next tag because of comment($hascomm) in:\n", $found, "\n";
            next TAG;
          } elsif($hascomm == 1) { # has opening comment.
            my($tmpend) = pos($$content_ref);
#            print "going to next closer because of comment($hascomm) in:\n", $found, "\n";
            next CLOSER;
          }
        }
        if(defined $2 && $2 eq '/') { $tagnestlevel--; } else { $tagnestlevel++; }
        if($tagnestlevel == 0) { $end = $tmpend; last CLOSER; }
      }
    }
    if($ifencloses && ($contentpos > $end || $contentpos < $start)) {
      return($prevstart, $prevend);
    }
#    print "returning ad ($start, $end): ", substr($$content_ref, $start, $end-$start), "\n";
#    pos($$content_ref) = $end;
#    if($$content_ref =~ /\G$dontcare/sig) { $end = pos($$content_ref); }
    return ($start, $end);
  }
  return ($prevstart, $prevend); 
}

# Matcher: regex
# Syntax:  regex [regex]
# Description: looks for a simple regex.  [regex] must be delmited by one of #/%&!,=:
#   This does m//, not s///, tr///, or y/// (yet), only straight "matching" regexes.
sub regex {
  if(!wantarray) {
    logger(DEBUG, "  regex called improperly!  Must be in array context!\n");
    return;
  }
  my($content_ref, $spec_ref, $start, $end) = @_;
  my($regex, $found);

  if($$spec_ref =~ /\G\s*($regexdelim)(.*?)(?<!\\)\1/sg) {
    $regex = $2;
  } else {
    logger(ERROR, "  malformed regex specifier: $$spec_ref\n");
    return;
  }
  if(!defined pos($$content_ref)) { pos($$content_ref) = $end; }
  if(defined pos($$content_ref) && $$content_ref =~ /\G.*?($regex)/sig) {
    if(length($1) > 0) {
      $end = pos($$content_ref);
      $start = $end - length($1);
      $nsuccess++;
    }
  }
  $nmatches++;
#  print "regex returning ($start, $end): ", substr($$content_ref, $start, $end-$start), "\n";
#  print "pos(\$\$spec_ref) is ", pos($$spec_ref), "\n";
  return($start, $end);
}

# Matcher: attrib
# Syntax:  attrib <tag attrib[=value] [[attrib]=[value]] ...>
# Description: find an attribute given a specifier of the form <tag attrib[=value]>.
#   The first attrib is required, and is what will match.  Other attribs are
#   required to be present in the tag.  If you want to match an attrib in *any*
#   tag, specify the "tag" with an empty regex: //
sub attrib {
  if(!wantarray) {
    logger(DEBUG, "  attrib called improperly!  Must be in array context!\n");
    return;
  }
  my($content_ref) = shift;
  my($contentpos) = pos($$content_ref);
  my($spec_ref) = shift;
  my($tag, $attribspec, %attribs, $foundtag, $found);
  my($attrib) = undef; # the attrib I'm interested in (first one in attribspec)
  my($start, $end);
  my($tmpstartpos, $tmpendpos);
  my($specpos) = pos($$spec_ref);
  my($tagnestlevel) = 0;
# process options
  while($$spec_ref =~ /\G\s*(?:-(\w+)\s*)/sg) {
    logger(ERROR, "  unrecognized option to attrib finder: $1\n");
    return;
  }
  pos($$spec_ref) = $specpos;
  if($$spec_ref =~ /\G\s*<\s*                    # start where we left off, find '<'
      (?:(\/?[-\w]+)|($regexdelim)(.*?)(?<!\\)\2)   # name of the tag (FIXME note lookbehind for '\')
      ((?:$whitespace).*?)\s*>/sgx) {             # attrib and closer >
    $tag = defined $1?$1:$3;
    $attribspec = $4; # strip '<tag', '>' from spec so only attribs remain
    if(defined $attribspec) { pos($attribspec) = 0; }
    while(defined $attribspec && defined pos($attribspec) && length($attribspec) > 0) {
      if($attribspec =~ /\G\s*(?:([-\w]+)|($regexdelim)(.*?)(?<!\\)\2)\s*(=(?:~)?\s*)?/sg) {
        my($tmpattrib) = defined $1?$1:$3;
        $attrib = $tmpattrib unless(defined $attrib);
        if(defined $4) { # look for a value
          if($attribspec =~ /\G(?:($regexdelim)(.*?)(?<!\\)\1   # find regex attrib
              |(["'])([^\3]*?)\3                                # or a quoted value
              |([^"'][^>\s]*?)(?<!["']))                        # or an unquoted value
              (?=\s|$)/sgx) {                                    # end with space or end of string
            if(defined $2) { $attribs{$tmpattrib} = $2; }
            elsif(defined $4) { $attribs{$tmpattrib} = "^$4\$"; }
            elsif(defined $5) { $attribs{$tmpattrib} = "^$5\$"; }
            else {
              logger(ERROR, "  malformed regexp for $tmpattrib value: $attribspec\n");
            }
          } else {
            logger(ERROR, "  malformed regexp for $tmpattrib: $attribspec\n");
            return;
          }
        } else {
          $attribs{$tmpattrib} = undef; # don't care about value, it's a boolean attrib.
        }
      } else {
        logger(ERROR, "  malformed attrib specifier: '$$spec_ref', pos: ", 
          substr($$spec_ref, pos($$spec_ref), length($$spec_ref) - pos($$spec_ref)), "\n");
        return;
      }
      last if(defined pos($attribspec) && length($attribspec) == pos($attribspec));
    }
  } else {
    logger(ERROR, "  malformed attrib specifier: $$spec_ref\n");
    return;
  }
ATTRIB_TAG:
  while(defined pos($$content_ref)) {
    $tmpstartpos = $start = pos($$content_ref);
    # first search with a regex that has no () in it for speed, then match again to grab
    # data.  (faster in the case of lots of false matches)
    last ATTRIB_TAG unless ($$content_ref =~ /\G.*?<$whitespace*(?:$tag)(?:$whitespace)[^>]*>/sig);
    $tmpstartpos = pos($$content_ref);
    do {
      $tmpstartpos = rindex($$content_ref, '<', $tmpstartpos-1);
      if($tmpstartpos < 0) {
        pos($$content_ref) = $start;
        next ATTRIB_TAG;
      }
      pos($$content_ref) = $tmpstartpos;
    } while ($$content_ref !~ /\G(<$whitespace*($tag)(?:(?:$whitespace)[^>]*)?>)/sig);
    $nmatches++;
    $found = $1;
    $foundtag = lc $2; # in case the tag specifier is a regex
    $end = pos($$content_ref);
    if(!defined $end) {
      print "end is undef!\n";
      print "found is: $found\n";
    }
    if(!defined $found) {
      print "found is undef!\n";
    }
    $start = $end - length $found;
#    print "\n" . '=' x 80 . "attrib found ($start-$end): \n$found\n";
# test if the tag's attribs match:
    foreach my $attrib (keys %attribs) {
      if(defined $attribs{$attrib}) {
        next ATTRIB_TAG unless $found =~ /$whitespace(?:$attrib)$whitespace*=$whitespace*(?:(["'])([^\1]*?)\1|($notspace*?))(?:$whitespace|>)/si;
        my($value) = (defined $1)?$2:$3;
        next ATTRIB_TAG unless $value =~ /$attribs{$attrib}/si;
      } else {
        print "doing valueless attrib...\n";
        next ATTRIB_TAG unless $found =~ /($whitespace$attrib(?:$whitespace|=|>))/si;
      }
    } 
#    print "\n" . '=' x 80 . "attrib found w/attribs ($start-$end): \n$found\n";
    # find the attrib we're interested in.
    pos($$content_ref) = $start;
    if($$content_ref =~ /\G.*?$whitespace($whitespace*$attrib$whitespace*(?:=$whitespace*(?:(["'])([^\2]*?)\2|($notspace*?)))?$whitespace*)(?:$whitespace|>)/sig) {
      $end = pos($$content_ref) - 1;
      $start = $end - length $1;
    } else {
      logger(ERROR, "  lost attrib\n");
    }
#    print "returning ad ($start, $end): '", substr($$content_ref, $start, $end-$start), "'\n";
    return ($start, $end);
  }
  return (undef, undef); 
}

# ------------------------------ Helper functions ----------------------------

# returns 0 if no comment is open in the string, 1 if there's an opening 
# comment, 2 if there's a closing comment.  (make sure we don't mangle the html
# *too* badly.
sub hasopencomment2 {
  my($str) = shift;
  my($commentopen) = 0;
  pos($str) = 0;
  my($lastpos) = 0;
  while($str =~ /\G.*?<!--/sig) {
    if(defined pos($str)) { $lastpos = pos($str); }
    if(!$commentopen) { $commentopen = 1; }
    if($str =~ /\G.*?-->/sig) { $commentopen = 0; }
    else { pos($str) = $lastpos; }
    if(defined pos($str)) { $lastpos = pos($str); }
  }
  return 1 if ($commentopen);
  pos($str) = $lastpos;
  return 2 if ($str =~ /\G.*?-->/sig); # pos is undef at end of string?
  if($str =~ /^(.*?)-->/si) {
    my($precomment) = $1;
    return 2 unless($precomment =~ /<!--/);
  }
  return 0;
}

sub hasopencomment {
  my($content_ref, $start, $end) = @_;
  my($commentopen) = 0;
  my($oldpos) = pos($$content_ref);  # save it in case caller needs it
  pos($$content_ref) = $start;
  my($lastpos) = 0;
  while($$content_ref =~ /\G.*?(<!--|-->)/sig) {
    last if(pos($$content_ref) > $end);
    if($1 eq "<!--") { $commentopen++; }
    if($1 eq "-->") { $commentopen--; }
  }
  pos($$content_ref) = $oldpos;
  return 1 if ($commentopen > 0);
  return 2 if ($commentopen < 0); # pos is undef at end of string?
  return 0;
}

# returns a string of the unbalanced tags in the argument.
# TODO FIXME UNFINISHED
sub unbalancedtags { #rewrite to use position args in document?
  my $str = shift;
  index $str, '<';
}

# rewind to first tag (ignoring $dontcare)
sub prevtag {
  my($content_ref, $startpos) = @_;
  my($tmpstartpos) = $startpos;
  while(1) {
    $tmpstartpos = rindex $$content_ref, '<', $tmpstartpos-1;
    last if($tmpstartpos < 0); #find failed.
    if(substr($$content_ref, $tmpstartpos, $startpos-$tmpstartpos) !~ /^$dontcarecomm$/si) {
      $startpos = $tmpstartpos;
      last;
    }
    $startpos = $tmpstartpos;
  }
  pos($$content_ref) = $startpos;
  return($startpos);
}

# forward to next tag (ignoring $dontcare)
# TODO FIXME this isn't used or even finished...
sub nexttag {
  my($content_ref, $startpos) = @_;
  my($endpos);
  my($commentnest) = 0;
  my($oldpos) = pos($$content_ref);
  while(1) {
    $endpos = index $$content_ref, '>', $startpos+1;
    $startpos = index $$content_ref, '<', $startpos+1;
    print "found ($startpos, $endpos, $commentnest)\n";
    last if($startpos < 0);
    pos($$content_ref) = $startpos;
    if($$content_ref =~ /\G<\!--/g) { $commentnest++; }
    if($commentnest) {
      pos($$content_ref) = $endpos-2;
      if($$content_ref =~ /\G-->/g) { 
        $commentnest--; 
        $startpos = pos($$content_ref);
      }
    }
    pos($$content_ref) = $startpos;
    next if ($$content_ref =~ /\G$dontcare/g);
    last unless($commentnest);
  }
  pos($$content_ref) = $oldpos;
  return($startpos);
}

# look for a specific tag backwards.
sub rewindtotag {
  my($content_ref, $tag, $startpos) = @_;
  my($tmpstartpos) = $startpos;
  my($origstartpos) = $startpos;

  while($tmpstartpos > 0) {
    $tmpstartpos = rindex $$content_ref, '<', $tmpstartpos-1;
    last if($tmpstartpos < 0);
    pos($$content_ref) = $tmpstartpos;
    if($$content_ref =~ /\G<(?:$whitespace)*$tag(?:(?:$whitespace)[^>]*>|>)/sig) {
      $startpos = $tmpstartpos;
      last;
    }
  }
  pos($$content_ref) = $startpos;
  return($startpos==$origstartpos?-1:$startpos);
}

# Some functions to handle markup
sub dumpposlist {
    my($doc_ref) = shift;
    my($poslist) = shift;
    my($posentry);
    foreach $posentry (@{$poslist}) {
        if($posentry->[3]-$posentry->[2] <= 33) {
            logger(DEBUG, "\t(" . join(",", @{$posentry}) . ") " . substr($$doc_ref, $posentry->[2], $posentry->[3]-$posentry->[2]));
        } else {
            logger(DEBUG, "\t(" . join(",", @{$posentry}) . ") " . substr($$doc_ref, $posentry->[2], 15)
                . "..." . substr($$doc_ref, $posentry->[3]-15, 15));
        }
    }
}

sub poslist_check {
    my($doc_ref) = shift;
    my($poslist) = shift;
    my($posentry);
    my($last) = -1;
    foreach $posentry (@$poslist) {
        if($posentry->[1] - $posentry->[0] != $posentry->[3] - $posentry->[2]) {
            logger(ERROR, "poslist is inconsistent: lengths are not same for: ");
            logger(ERROR, "\t(" . join(",", @{$posentry}) . ") " . substr($$doc_ref, $posentry->[2], 15)
                . "..." . substr($$doc_ref, $posentry->[3]-15, 15));
        }
        if($posentry->[0]-1 != $last) {
            logger(ERROR, "poslist is inconsistent: segments are not contiguous: ");
            logger(ERROR, "\t(" . join(",", @{$posentry}) . ") " . substr($$doc_ref, $posentry->[2], 15)
                . "..." . substr($$doc_ref, $posentry->[3]-15, 15));
        }
        $last = $posentry->[1];
    }
    # $newposlist[$#newposlist]->[3]
    if($poslist->[$#$poslist]->[3] != length($$doc_ref)) {
        logger(ERROR, "poslist is inconsistent: length " . $poslist->[$#$poslist]->[3] . " does not match document length: " . length($$doc_ref));
        &dumpposlist($doc_ref, $poslist);
    }
}

# markup_insert and poslist_substr help implement the "view how this page was filtered"
# functionality.  They keep a list ref $poslist which contains a list of lists of four 
# numbers, the first two indicating a range in the ORIGINAL document (including stripping/
# rewriting) and the second two indicate where to find that same content in the marked-up
# document (passed in $doc_ref).  This was much harder to write than I had anticipated...
sub markup_insert {
    my($doc_ref) = shift;
    my($start) = shift;     # relative to UNSTRIPPED document.
    my($length) = shift;    # if $length is 0, it's markup.
    my($insertme) = shift;  # insert $insertme at position $start, and keep @poslist accurate
    my($poslist) = shift;   # list of ($start, $end, $docstart, $docend), mapping from unstripped
                            # document to $doc_ref
# If($length > 0 and length($insertme) > 0), this is from a rewrite rule
# If($length > 0 and length($insertme) == 0), this is from a strip rule
# If($length == 0 and length($insertme) > 0), this is markup
# If($length == 0 and length($insertme) == 0), I've made a mistake somewhere...
    my($strippedsize) = 0;  # bytes difference after insertion/deletion from document
    my($newmarkupsize) = 0;
    my($multiposstripedsize) = 0;
    my(@newposlist);
    my($posentry);
    logger(DEBUG, "markup_insert inserting at ($start, $length): \n\t'$insertme'");
    logger(DEBUG, "  poslist:");
    &dumpposlist($doc_ref, $poslist);
    foreach $posentry (@{$poslist}) {
        if($start >= $posentry->[0] && $start < $posentry->[1]) {
            if($start != $posentry->[0]) {
                push @newposlist, [$posentry->[0]-$strippedsize, $start-$strippedsize-1, $posentry->[2]-$strippedsize, 
                    $posentry->[2]+$start-$posentry->[0]-1-$strippedsize];
                logger(DEBUG, "  Added entry (" . join(",", @{$newposlist[$#newposlist]}) . ") to poslist(1): ", substr($$doc_ref, $newposlist[$#newposlist]->[2], 15));
            }
            if($length == 0) { # inserted text is markup.
                logger(DEBUG, "  (a)substr(\$\$doc_ref, $posentry->[2]+($start-$posentry->[0]), $length) = $insertme;");
                if($length <= 33) {
                    logger(DEBUG, "\t" . substr($$doc_ref, $posentry->[2]+($start-$posentry->[0]), $length));
                } else {
                    logger(DEBUG, "\t" . substr($$doc_ref, $posentry->[2]+($start-$posentry->[0]), 15) .
                        "..." . substr($$doc_ref, $posentry->[2]+($start-$posentry->[0])+$length-15, 15));
                }
                substr($$doc_ref, $posentry->[2]+($start-$posentry->[0]), $length) = $insertme;
                $newmarkupsize = length($insertme); # for subsequent posentry's.
                push @newposlist, [$start-$strippedsize, $posentry->[1]-$strippedsize, 
                    $posentry->[2]+$start-$posentry->[0]+length($insertme)-$strippedsize, 
                    $posentry->[3]+length($insertme)-$strippedsize];
                logger(DEBUG, "  Added entry (" . join(",", @{$newposlist[$#newposlist]}) . ") to poslist(2): ", substr($$doc_ref, $newposlist[$#newposlist]->[2], 15));
            } else { # inserted text is from a rewrite rule (strip or rewrite).
                if($start+$length >= $posentry->[1]) { # this posentry is dead! Skip it!
                    logger(DEBUG, "  (b)substr(\$\$doc_ref, $posentry->[2]+($start-$posentry->[0])-$strippedsize, $posentry->[1]-$start+1) = $insertme");
                    if($posentry->[1]-$start <= 33) {
                        logger(DEBUG, "\t" . substr($$doc_ref, $posentry->[2]+($start-$posentry->[0])-$strippedsize, $posentry->[1]-$start));
                    } else {
                        logger(DEBUG, "\t" . substr($$doc_ref, $posentry->[2]+($start-$posentry->[0])-$strippedsize, 15) . "..." .
                            substr($$doc_ref, $posentry->[2]+($start-$posentry->[0])+$posentry->[1]-$strippedsize-$start-15, 15));
                    }
                    substr($$doc_ref, $posentry->[2]+($start-$posentry->[0])-$strippedsize, $posentry->[1]-$start+1) = $insertme;
                    $length -= $posentry->[1]-$start+1; # n+1 chars between pos (0,n)
                    $strippedsize += $posentry->[1]-$start+1+length($insertme);
                    $start = $posentry->[1]+1;
                    $insertme = "";
                } else {
                    my($strippingsize) = $length-length($insertme);
                    logger(DEBUG, "  (c)substr(\$\$doc_ref, $posentry->[2]+($start-$posentry->[0])-$strippedsize, $length) = $insertme;");
                    if($posentry->[1]-$start <= 33) {
                        logger(DEBUG, "\t" . substr($$doc_ref, $posentry->[2]+($start-$posentry->[0]), $length));
                    } else {
                        logger(DEBUG, "\t" . substr($$doc_ref, $posentry->[2]+($start-$posentry->[0])-$strippedsize, 15)
                            . "..." . substr($$doc_ref, $posentry->[2]+($start-$posentry->[0])-$strippedsize+$length-15,15));
                    }
                    substr($$doc_ref, $posentry->[2]+($start-$posentry->[0])-$strippedsize, $length) = $insertme;
                    push @newposlist, [$start-$strippedsize, $posentry->[1]-$strippedsize-$strippingsize, 
                        $posentry->[2]+$start-$posentry->[0]-$strippedsize, 
                        $posentry->[3]-$strippedsize-$strippingsize];
                    $strippedsize += $strippingsize;
                    logger(DEBUG, "  Added entry (" . join(",", @{$newposlist[$#newposlist]}) . ") to poslist(3): ");
#                    logger(DEBUG, "  doc length is " . length($$doc_ref) . "\n");
                    logger(DEBUG, "  \t " . substr($$doc_ref, $newposlist[$#newposlist]->[2], 15));
                }
            }
        } else {
            # Adjust posentries after insert for inserted/deleted text.
            push @newposlist, [$posentry->[0]-$strippedsize, $posentry->[1]-$strippedsize,
                $posentry->[2]-$strippedsize+$newmarkupsize, $posentry->[3]-$strippedsize+$newmarkupsize];
            logger(DEBUG, "  Added entry (" . join(",", @{$newposlist[$#newposlist]}) . ") to poslist(4): ", substr($$doc_ref, $newposlist[$#newposlist]->[2], 15));
        }
    }
    &poslist_check($doc_ref, \@newposlist);
    return \@newposlist;
}

# Uses @$poslist to return a substring of the original document
sub poslist_substr {
    my($doc_ref) = shift;
    my($start) = shift;
    my($length) = shift;
    my($origlen) = $length;
    my($poslist) = shift;
    my($retval) = "";
    my($posentry);
#    logger(DEBUG, "poslist_substr(\$doc_ref, $start, $length, [");
#    &dumpposlist($doc_ref, $poslist);
#    logger(DEBUG, "  ])");
    foreach $posentry (@{$poslist}) {
        if($start >= $posentry->[0] && $start <= $posentry->[1]) {
            if($length <= $posentry->[1]-$posentry->[0]) {
                $retval .= substr($$doc_ref, $posentry->[2]+$start-$posentry->[0], $length);
#                logger(DEBUG, "poslist_substr has \$posentry: (" . join(", ", @$posentry) . ")");
#                logger(DEBUG, "poslist_substr returning(a) ($start, $length) actually (" . 
#                    ($posentry->[2]+$start-$posentry->[0]
#                    ) . ", $length): \n\t$retval");
                return $retval;
            } else {
#                logger(DEBUG, "poslist_substr adding substr: ", 
#                    substr($$doc_ref, $posentry->[2]+$start-$posentry->[0],
#                        $posentry->[1]-$start));
                $retval .= substr($$doc_ref, $posentry->[2]+$start-$posentry->[0], 
                    $posentry->[1]-$start);
                $length -= $posentry->[1]-$start+1; # +1 is because between positions 1 and 3 
                $start = $posentry->[1]+1;          # there are 3 characters, not 2.
            }
        }
    }
    if(length($retval) != $origlen) {
        logger(ERROR, "poslist_substr returning a string that is not the requested length: \n\t$retval");
    }
#    logger(DEBUG, "poslist_substr returning(b) ($start, $length): \n\t$retval");
    return $retval;
}

sub unhtmlify {
    my($html) = shift;
    $html =~ s/>/&gt;/g;
    $html =~ s/</&lt;/g;
    return $html;
}

sub getpos {
    my($pos) = shift;
    my($poslist) = shift;
    my($i);

    for($i=0;$i<$#$poslist;$i++) {
        if($pos > $poslist->[$i]->[0] && $pos < $poslist->[$i]->[1]) {
            return $poslist->[$i]->[3] + $pos - $poslist->[$i]->[0];
        }
    }
    logger(ERROR, "Position $pos is not in the document!");
    return -1;
}

sub Config {
  my($req, $cgi, $siteconfig) = @_;
  my($message) = "";
#  foreach my $param ($cgi->param()) {
#    logger(DEBUG, "parameter: $param = ", $cgi->param($param),  "\n");
#  }

  if($cgi->param()) {
    my($name) = $cgi->param('name');
    my($action) = ((defined $cgi->param('action'))?$cgi->param('action'):"") 
                    . ((defined $cgi->param('filter'))?(" " . $cgi->param('filter')):"");
    while($action =~ s/(?:\015|\013|\011)//s) {} # get rid of newlines
    if(defined $cgi->param('delete') || defined $cgi->param('delete.x')) {
      my(@newfilterlist) = ();
      foreach my $filter (@{$siteconfig->{filters}}) {
        if($name) {
          unless($filter =~ /^$name:/s) { push(@newfilterlist, $filter); }
        } else { # !$name
#          print "Comparing '$filter' to '$action'\n";
          unless($filter =~ /^$action$/s) { push(@newfilterlist, $filter); }
        }
      }
      delete $siteconfig->{filters};
      $siteconfig->{filters} = \@newfilterlist;
      $message = "Rule " . $cgi->param('name') . " succussfully deleted.\n";
    } elsif(defined $cgi->param('change') || defined $cgi->param('change.x')) {
      my(@newfilterlist) = ();
      foreach my $filter (@{$siteconfig->{filters}}) {
        if($name) {
          unless($filter =~ /^$name:/s) { push(@newfilterlist, $filter); }
        } else { # !$name
#          print "Comparing '$filter' to '$action'\n";
          unless($filter =~ /^$action$/s) { push(@newfilterlist, $filter); }
        }
      }
      push @newfilterlist, ((defined $name && $name)?($cgi->param('name') . ": "):("")) . $action;
      delete $siteconfig->{filters};
      $siteconfig->{filters} = \@newfilterlist;
      $message = "Rule " . $cgi->param('name') . " succussfully changed.\n";
    } elsif(defined $cgi->param('add') || defined $cgi->param('add.x')) {
# TODO FIXME Do some validity checking on the rule here.
      push @{$siteconfig->{filters}}, ((defined $name && $name)?($cgi->param('name') . ": "):("")) . $action;
      $message = "Rule " . $cgi->param('name') . " succussfully added.\n";
    } elsif(defined $cgi->param('showfiltering')) {
        my($origdoc);
        $markupinstead=1;
        @markup = ();
        my($myreq) = $req->clone;
        $myreq->uri(uri_unescape($cgi->param('showfiltering')));  # uri_unescape from URI::Escape module
        $myreq->header("Host" => $myreq->uri->host_port);
        $myreq->remove_header("If-Modified-Since");  # having it in the browser's cache doesn't help.
        &FilterProxy::handle_filtering($myreq, $myreq, -10); # Munges outgoing headers.
        my($res) = $FilterProxy::agent->request($myreq); # Use FilterProxy's UserAgent object
        if(defined $res->content_type && length ${$res->content_ref} > 0) {
            &FilterProxy::handle_filtering($myreq, $res, 1,2,3); # Decode content (FilterProxy::Compress)
            $origdoc = $res->content; # make a copy for comparison.
            &FilterProxy::handle_filtering($myreq, $res, $CONFIG->{order}); # Only do Rewrite.
        } else {
            logger(ERROR, "View unfiltered source returned HTTP code: " . $res->code);
            return("HTTP code: " . $res->code);
        }
        $res->content($origdoc);  # We don't care what the changed document looks like, we got
        my($doc) = "";            # everything we need in @markup
        my($markup);
        my($poslist) = [[0, length(${$res->content_ref}), 0, length(${$res->content_ref})]];
# The stuff in @markup assumes that stripping/rewriting IS APPLIED TO THE DOCUMENT.
#        foreach $markup (@markup) {
#            logger(DEBUG, "\@markup element: (" . $markup->[0] . "," . $markup->[1] . "," . $markup->[2] . "," . $markup->[3] . ")");
#        }
        foreach $markup (@markup) {
#            logger(DEBUG, "Marking up segment " . 
#                "(" . $markup->[0] . "," . $markup->[1] . "," . $markup->[2] . "," . 
#                $markup->[3] . ")");
            $poslist = markup_insert($res->content_ref, $markup->[1], 0,
                "<span class=\"rulename\">" . $markup->[0] . "</span>", $poslist);
            $poslist = markup_insert($res->content_ref, $markup->[1], 0,
                "<span class=\"strip\">" . 
                unhtmlify(poslist_substr($res->content_ref,$markup->[1],$markup->[2],$poslist)) . 
                "</span>" , $poslist);
            $poslist = markup_insert($res->content_ref, $markup->[1], $markup->[2], 
                $markup->[3], $poslist);
            if($markup->[3] ne "") {
                # substituted text is already in document.  Just put markers around it.
                $poslist = markup_insert($res->content_ref, $markup->[1], 0,
                    "<span class=\"rewrite\">", $poslist);
                $poslist = markup_insert($res->content_ref, $markup->[1]+length($markup->[3]), 0,
                    "</span>", $poslist);
            }
        }
        # traverse $poslist and unhtmlify non-markup.
        my($posentry);
#        my($crap, $crap2);
#        my($delta) = 0;
        my($lastend) = -1;
        foreach $posentry (@{$poslist}) {
#            $crap = substr(${$res->content_ref}, $posentry->[2], $posentry->[3]-$posentry->[2]);
#            $crap2 = unhtmlify($crap);
            if($lastend >= 0) {
                $doc .= substr(${$res->content_ref}, $lastend, $posentry->[2]-$lastend);
#                logger(DEBUG, "Adding ($lastend,", $posentry->[2]-$lastend, ")");
            }
            $doc .= unhtmlify(substr(${$res->content_ref}, $posentry->[2], $posentry->[3]-$posentry->[2]));
            $lastend = $posentry->[3];
        }
        if($lastend >= 0 && $poslist->[$#{$poslist}]->[2] > 0) {
            $doc .= substr(${$res->content_ref}, $lastend, $poslist->[$#{$poslist}]->[2]-$lastend);
#            logger(DEBUG, "Adding at end ($lastend,", $poslist->[$#{$poslist}]->[2]-$lastend, ")");
        }
        $markupinstead=0;
        @markup=();
        return $doc;
#        return ${$res->content_ref}; # This gets passed to Source.html as $ENV{MESSAGE};
    }
  }
  return $message; # message printed to user
}

# Some code so this can be run at the command line -- for filtering algorithm tests.
# assume an html file name is passed on the command line.  Further arguments are rules
# to be added to the list.  i.e.:
#   perl FilterProxy/Rewrite.pm index.html 'strip tagblock <script src=~/doubleclick.net/>'
sub main {
  defined $ARGV[0] || die "pass me the name of an html file in which to find banners.\n";
  if(!defined &FilterProxy::logger) {
    *logger = sub { shift; print @_; };
  }
  require HTTP::Request;
  require HTTP::Response;
  require HTTP::Headers;
  require Time::HiRes;
  push @FilterProxy::MODULES, pop @FilterProxy::MODULES; # supress warning
  $FilterProxy::CONFIG->{debug} = 1;
  $FilterProxy::CONFIG->{timing} = 1;
  $FilterProxy::CONFIG->{info} = 1;
  my($content) = `cat $ARGV[0]` || die "Unable to open file $ARGV[0]\n";
  my($reqheader) = new HTTP::Headers(Date => time);
  my($req) = new HTTP::Request("GET", $ARGV[0], $reqheader);
  my($resheader) = new HTTP::Headers(Date => time, Content_Type => 'text/html');
  my($res) = HTTP::Response->new(200, "OK", $resheader, $content);
  my($siteconfig) = ();
  push @$siteconfig, {};
  @$siteconfig->[0]->{'filters'} = [
    'ADS: strip tagblock </a|img|i?layer|i?frame|script|form/ /src|href|action/ =~ #(?:(?:/ad(?:cafe|_|click|buy|count(?:er)?|content)?(?:serv|link|click|verts?|log|graphic|banner|source|mosaic|intelligent|\\.cgi|\\.pl)|blipverts?|/ad(?:s|-bin|buy)?/|banners?(?:\\.(?:cgi|phtml|php[0-9])|man|click|/redirect(?:\\.(?:cgi|html|perl|pl|php[0-9]?|shtml)\\?AD))|/(?:event|html)\\.ng/|servfu\\.pl|/ads?[\\._]|(?:images?_?|click_.x\\.|adstream_.x\\.)ads?|sponsor|phpAds|_ad\\.html|click-through|www.amazon.com/exec/obidos/redirect-home|prohosting.com/click|\\.ad\\?)|http://(?:remote)?ad(?:s|s?[0-9]+|server|images?|redir(?:ect)?|_?click|content)?\\.|(?:(?:link4link|linkexchange|flycast|clicktrade|doubleclick|avenuea|blockstackers|mediaplex|focalink|valueclick|onresponse|imgis|admaximize|eads|datis|commission-junction|pennyweb|linkbuddies|preferences|dimeclicks|futurenet|247media|netadsrv\\.iworld|clk4|spanishbanner|hibtox|burstnet|swiftad|adzerver|hamster|ukbanners|spunkmedia|namezero|ecoupons|spinbox|adclub|advertising|hitbox|link4ads|bfast|gameadexchange|adbureau|linksynergy|superstats|st21\\.yahoo|phoenix-adrunner\\.mycomputer|counter\\.xoom|\\w+ads\\.osdn|euniverseads|popuptraffic|iadnet|adscholar|hightrafficads|trafficmarketplace|qksrv|fastclick|webstormmedia)\\.(?:com|net)|(?:freecity)\\.de))#> add encloser </(?:no)?script/> add balanced add alternate add balanced',
#    'ADNAMES: strip tagblock </a|img|i?layer|i?frame|script/ /src|href/ =~ #(ad(cafe|_|click)?(serv|link|click|verts?|log|graphic|banner|source|mosaic|intelligent|\\.cgi)|blipverts?|/ad(s|-bin)?/|banner(s?\\.(cgi|phtml|php3|php4)|man|click)|/(event|html)\\.ng/|servfu\\.pl|/ads?[\\._]|(images?_?|click_.x\\.|adstream_.x\\.)ads?|sponsor|phpAds|_ad\\.html|click-through)#> add encloser <script> add balanced add alternate add balanced',
#    'ADSERVPATS: strip tag </img|i?layer|i?frame|script/ src =~ #http://ad(s|s?[0-9]+|images|redir(ect)?|_click).#> -growto tag -ifencloses <script> -addbal -addalt',
#    'ADSERVERS: strip tag </img|i?layer|i?frame|script/ src =~ /(linkexchange|flycast|clicktrade|doubleclick|avenuea|blockstackers|mediaplex|focalink|valueclick|imgis|bfast|admaximize|eads|datis|commission-junction|pennyweb|linkbuddies|preferences|dimeclicks|futurenet|247media|netadserv\\.iworld|clk4|spanishbanner|hibtox|burstnet|swiftad|adzerver|hamster|ukbanners|spunkmedia|namezero|ecoupons)\\.(com|net)/> -growto tag -ifencloses </script/> -addbal -addalt',
#    'ANDOVER: strip tag <img src =~ #http://(209.207.224|images.slashdot.org/banner/)# > -growto tag -ifencloses <script> -addbal -addalt -addbal',
#    'ANDOVER_ADLOG: strip tag <a href =~ #adlog.pl#> -addbal',
#    'GEOCITIES_MBE: strip tag <img src=~#http://pic.geocities.com/images/mbe#> -growto tag -ifencloses <table> -addbal',
#    'GEOCITIES_YBEACON: strip tag <script src=~#www.geocities.com/js_source/#> -addbal',
#    'GEOCITIES_TOTO: strip tag <script src=~#http://geocities.yahoo.com/toto#> -growto tag -ifencloses <table> -addbal -addalt -addbal',
#    'WEBBUGS: rewrite tag <img width=1 height=1> add encloser <script> add alternate as <spacer width=1 height=1>',
#    'TEXTADS: strip tag </a/ href=~/(clk4|hamster)\\.com/> -growto tag -ifencloses <table> <addbal> -addalt',
#    'HU: strip tag </a|img/ /src|href/=http://(top100.isys.hu|hle.isys.hu)*> -addbal',
#    'UGO: strip tag <img src =~ /ugo/ width="55" height="60"> -addbal -addalt',
#    'UGO_AD: strip tag </iframe|img|script/ src=~/adserver.ugo.com/> -growto tag <a href=~#http://www.ugo.com#> -addalt -addbal',

  ];
  foreach my $arg (@ARGV[1..$#ARGV]) {
    push @$siteconfig, {};
    @$siteconfig->[$#$siteconfig]->{'filters'} = [ $arg ];
  }
  my($begin) = &Time::HiRes::time;
  my($buser, $bsys) = times;
  FilterProxy::Rewrite::filter($req, $res, $siteconfig);
  my($end) = &Time::HiRes::time;
  my($euser, $esys) = times;
  printf "Rewrite took %0.5f clock, %0.5f user, %0.5f system seconds.\n", 
    ($end-$begin), ($euser-$buser), ($esys-$bsys);
  open(OUT, "+>/tmp/newrewrite.html");
  print OUT $res->content();
  close(OUT);
  print "Modified content is in /tmp/newrewrite.html\n";
}

if($0 =~ /Rewrite/) {
  &main;
}

1;

# http://rd.yahoo.com/M=162726.1206194.2817452/D=egroupweb/S=1705375618:N/A=586445/R=0/*http://mobile.yahoo.com/shopping?.pno=shop
