File: URLProcessor.pm

package info (click to toggle)
sitescooper 3.1.2-1
links: PTS
area: main
in suites: sarge, woody
size: 3,000 kB
ctags: 662
sloc: perl: 8,677; makefile: 105
file content (1000 lines) | stat: -rw-r--r-- 29,059 bytes
#===========================================================================

package Sitescooper::URLProcessor;

# base class for LinksURLProcessor and StoryURLProcessor.

use Carp;
use File::Basename;
use FileHandle;

use Sitescooper::StripTablesFilter;
use Sitescooper::HTMLLobotomize;
use Sitescooper::HTMLFindNew;
use HTTP::Status;

use strict;

use vars       qw(
		@ISA $STATE_PRE_GET $STATE_NET_WAIT
		$STATE_CACHE_WAIT $STATE_POST_GET
		);

@ISA = qw();

$STATE_PRE_GET		= 1;
$STATE_NET_WAIT		= 2;
$STATE_CACHE_WAIT	= 3;
$STATE_POST_GET		= 4;

sub new {
  my $class = shift; $class = ref($class) || $class;

  my $scoop = shift;
  my $robot = shift;
  my $scf = shift;
  my $referrer = shift;
  my $url = shift;

  croak "scoop not defd" unless defined ($scoop);
  croak "url not defd" unless defined ($url);
  croak "scf not defd" unless defined ($scf);
  croak "robot not defd" unless defined ($robot);

  my $self = {
    'scoop'		=> $scoop,
    'robot'		=> $robot,
    'cache'		=> $robot->{cache},
    'cf'		=> $scoop->{cf},

    'referrer'		=> $referrer,
    'url'		=> $url,
    'scf'		=> $scf,
    'state'		=> $STATE_PRE_GET,
    'http_state'	=> undef,
    'url_regexp_cache'	=> { },

    'retry_with_url'	=> undef,
    'warn_about_ext_links' => 0,	# turned on where necessary
  };

  my $tostr = $url;
  $tostr =~ s/(?:default|index).[a-z]*//ig;	# trim index.html, default.htm, index.cgi, etc.
  ($tostr = $1) if ($tostr =~ /[\/\&\?](\S{5,20}?)$/);
  $self->{to_string_name} = $tostr;

  bless ($self, $class);
  $self;
}

sub run {
  my ($self) = @_;
  
  $self->{scoop}->dbg ("url-handler ".$self->to_string().
  			": running state ".$self->{state});

  if ($self->{state} == $STATE_PRE_GET)
  {
    # this is supposed to be done by Sitescooper::Robot::handler_start.
    croak ("handler_start was never called for ".$self->{url});
  }
  elsif ($self->{state} == $STATE_CACHE_WAIT)
  {
    return $self->finish_get();		# cache is always ready
  }
  elsif ($self->{state} == $STATE_NET_WAIT)
  {
    if (!defined $self->{http_state}) {
      croak "http_state is unset in STATE_NET_WAIT";
    }
    if ($self->{scoop}->{httpclient}->ready_to_finish ($self->{http_state}))
    {
      return $self->finish_get();
    } else {
      return 1;				# keep waiting!
    }
  }
  elsif ($self->{state} == $STATE_POST_GET)
  {
    if (defined $self->{http_state}) { croak "http_state is set!"; }
    return;		# means delete this handler, we're done
  }
  else {
    croak ("URLProcessor state is bad: $self->{state}");
  }
}

# --------------------------------------------------------------------------

sub get_ready_handlers {		# static
  my (@handlers) = @_;
  local ($_);

  my @rethdlrs = ();

  my %fh_to_handler = ();
  my @fhs = ();
  foreach $_ (@handlers) {
    next unless (defined $_);

    if ($_->{state} == $STATE_NET_WAIT) {
      my $fh = $_->{scoop}->{httpclient}->get_waiting_fh ($_->{http_state});

      # non-threaded impls return "undef" to indicate that the data
      # is ready whenever.
      if (defined $fh) {
	$fh_to_handler{$fh} = $_;
	push (@fhs, $fh);
	next;
      }
    }
    # otherwise it's ready for reading whenever, so add it to the ret
    # list straight away.
    push (@rethdlrs, $_);
  }

  if ($#rethdlrs >= 0) {
    return @rethdlrs;		# these are ready straight away
  }
  if ($#fhs < 0) {		# no fhs are waiting
    return (undef);		# all done
  }

  my ($rin, $rout, $ein, $eout);
  $rin = $ein = '';
  foreach $_ (@fhs) {
    vec ($rin, $_->fileno(), 1) = 1;
  }
  $ein = $rin;

  my ($nfound,$timeleft) =
    select ($rout = $rin, undef, $eout = $ein, 10.0);
  if ($nfound == 0) {
    return ();			# nothing's ready
  }

  foreach $_ (@fhs) {
    if (vec $rout, fileno($_), 1) {
      push (@rethdlrs, $fh_to_handler{$_});
    }
  }
  return @rethdlrs;		# return all ready handlers
}

# --------------------------------------------------------------------------

sub start_get {
  croak "URLProcessor::start_get called, this is the base class";
}

sub finish_get {
  croak "URLProcessor::finish_get called, this is the base class";
}

# --------------------------------------------------------------------------

sub set_state {
  my $self = shift;
  $self->{state} = $_[0];
}

sub get_state {
  my $self = shift;
  $self->{state};
}

# --------------------------------------------------------------------------

sub to_string {
  my $self = shift;
  $self->{to_string_name};
}

# --------------------------------------------------------------------------

sub sitewarn {
  my $self = shift;
  $self->{robot}->sitewarn (@_);
}

# --------------------------------------------------------------------------

sub writing_doc { my $self = shift; $self->{robot}->writing_doc(); }
sub writing_html { my $self = shift; $self->{robot}->writing_html(); }
sub writing_text { my $self = shift; $self->{robot}->writing_text(); }
sub writing_images { my $self = shift; $self->{robot}->writing_images(); }
sub dbg { my $self = shift; $self->{scoop}->dbg(@_); }
sub dbg2 { my $self = shift; $self->{robot}->dbg2(@_); }

# --------------------------------------------------------------------------

sub get_page {
  my $self = shift;
  $self->{get_url_retries} = 0;
  $self->get_url (@_, 0);	# only text content types
}

sub get_img {
  my $self = shift;
  $self->{get_url_retries} = 0;
  $self->get_url (@_, 1);	# allow binary files
}

# --------------------------------------------------------------------------

sub rules_permit_using_cached_version {
  my ($self, $url, $is_dynamic_html, $lastmod, $is_shared_cache) = @_;

  my $desc = 'cached version';
  if ($is_shared_cache) { $desc = 'shared-cache version'; }

  if ($is_dynamic_html == 0) {
    $self->dbg("$desc exists");
    return 1;
  }
  
  if (defined $lastmod) {
    $lastmod = (time() - $lastmod) + 0.0;
    $lastmod /= (24 * 60 * 60);		# secs -> days

    if ($lastmod < $self->{cf}->{cached_front_page_lifetime}
	&& $lastmod > 0)		# just make sure the clock is sane
    {
      $self->dbg("$desc is new enough: ".$lastmod." days");
      return 1;
    }
  }

  if ($self->{cf}->{use_only_cache}) {
    $self->dbg("-fromcache switch is on, using $desc");
    return 1;
  }

  0;
}

sub get_url {
  my ($self, $url, $is_dynamic_html, $allow_binary) = @_;

  $url = Sitescooper::Util::URLWithoutAnchor ($url);

  my $cobj = $self->{cache}->get_cached_page ($url);

  if ($url =~ /\.(?:rdf|rss|xml)/i) {
    $allow_binary = 1;	# some sites think these are binary :(
  }

  $self->{scoop}->{useragent}->clear_redirect ();

  $self->{sharedcachehtml} = undef;
  $self->{preloadedhtml} = undef;
  $self->{getting_url} = $url;
  $self->{allow_binary} = $allow_binary;
  $self->{is_dynamic_html} = $is_dynamic_html;

  if (defined $cobj) {
    $self->{cachedhtml} = $cobj->get_page();

    if ($self->rules_permit_using_cached_version ($url,
      		$is_dynamic_html, $cobj->get_lastmod(), 0))
    {
      $self->{should_return_cachedhtml} = 1;
      $self->set_state ($STATE_CACHE_WAIT);
      return 1;
    }

  } else {
    $self->{cachedhtml} = undef;
  }

  # see if we have it in the shared cache
  if (defined $self->{cf}->{sharedcache}) {
    my $scobj = $self->{cache}->get_shared_cache_page ($url);

    if (defined $scobj) {
      if ($self->rules_permit_using_cached_version ($url,
		$is_dynamic_html, $scobj->get_lastmod(), 1))
      {
	$self->{cachedhtml} = undef;
	$self->{sharedcachehtml} = $scobj->get_page();
	$self->{should_return_cachedhtml} = 1;
	$self->set_state ($STATE_CACHE_WAIT);
	return 1;
      }
    }
  }

  # if we're already loading the URL, wait for it to finish loading
  while ($self->{robot}->url_is_in_queue ($url)) {
    $self->{robot}->handler_run_queue_once();
  }

  my $parresp = $self->{scoop}->get_preloaded_page ($url);
  if (defined $parresp) {
    $self->dbg ("preload: using preloaded response");
    $self->{preloadedhtml} = $parresp;
    $self->{should_return_preloadedhtml} = 1;
    $self->set_state ($STATE_CACHE_WAIT);
    return 1;
  }

  if ($self->{cf}->{use_only_cache}) {
    $self->dbg("-fromcache switch is on, not doing HTTP request");

    $self->{should_return_cachedhtml} = 1;
    $self->set_state ($STATE_CACHE_WAIT);
    return undef;
  }

  if (!$allow_binary && ($url =~ /\.(ra|ram|wav|jpeg|jpg|gif|mov|zip|rar)$/i
    	|| $url =~ /\.(tar|tgz|gz|tbz|bz2|rpm|swf|mpeg|mpg)$/i))
  {
    $self->dbg("not retrieving non-HTML content: $url");
    $self->set_state ($STATE_CACHE_WAIT);
    return undef;
  }

  $self->{get_url_retries}++;
  if ($self->{get_url_retries} > 3) {
    $self->{scoop}->verbose ("Failed to GET url (caught in a loop): $url");
    $self->set_state ($STATE_CACHE_WAIT);
    return undef;
  }

  if ($Sitescooper::Main::got_intr_flag) { return undef; }

  $self->{resp} = undef;

  if (defined $self->{http_state}) { croak "http_state is set!"; }
  $self->{http_state} =
      $self->{scoop}->{httpclient}->start_get ($self->{referrer},
      $url, (defined $cobj) ? $cobj->get_lastmod() : undef);
  if (!defined $self->{http_state}) {
    croak "http_state is unset after start_get";
  }

  $self->{should_return_cachedhtml} = 0;
  $self->set_state ($STATE_NET_WAIT);
  1;
}

# --------------------------------------------------------------------------

sub get_url_reply {
  my ($self) = @_;

  my $url = $self->{getting_url};
  my $allow_binary = $self->{allow_binary};
  my $is_dynamic_html = $self->{is_dynamic_html};
  my $page = '';
  my $newlastmod;

  $self->{retry_with_url} = undef;

  my $cachedhtml = $self->{cachedhtml};
  delete $self->{cachedhtml};		# save memory

  my $sharedcachehtml = $self->{sharedcachehtml};
  delete $self->{sharedcachehtml};

  if ($self->{should_return_cachedhtml}) {
    $page = $cachedhtml;
    $page ||= $sharedcachehtml;
    $newlastmod = undef;

  } elsif ($self->{should_return_preloadedhtml}) {
    $page = $self->{preloadedhtml};
    delete $self->{preloadedhtml};
    $newlastmod = undef;

  } else
  {
    # use the real HTTP response
    my $resp;
    if (defined $self->{resp}) {
      $resp = $self->{resp}; delete $self->{resp};
    } else {
      if (!defined $self->{http_state}) {
	warn "http_state is unset in get_url_reply";
	return undef;		# TODO -- fix this bug
      }
      $resp = $self->{scoop}->{httpclient}->finish_get ($self->{http_state});
    }
    $self->{http_state} = undef;
    
    if (!defined $resp) {
      $self->sitewarn  ("HTTP GET timed out, timed out ".
			"without a response.");
      Sitescooper::Main::got_intr;		# TODO -- is this right?
    }
    if ($Sitescooper::Main::got_intr_flag) { return undef; }

    if ($resp->code == RC_NOT_MODIFIED)
    {
      $self->{scoop}->verbose ("Skipping (HTTP says no mod since last download): $url");
      return undef;
    }

    my $realm;
    if ($resp->code == RC_UNAUTHORIZED &&
	defined($realm = $self->{scoop}->{useragent}->get_last_auth_realm()))
    {
      if (defined $self->{scoop}->{useragent}->get_credentials_quietly($realm)) {
	$self->{scoop}->verbose ("Deleting incorrect username and password for this realm.");
	$self->{scoop}->{useragent}->clear_credential
			($self->{scoop}->{useragent}->get_last_auth_realm());
	return $self->need_http_retry ($url);

      } else {
	$self->{scoop}->verbose ("Cannot read page, it requires a username and password.");
      }
    }

    if (!$resp->is_success) {
      $self->sitewarn  ("HTTP GET failed: ".$resp->status_line." ($url)");
      return undef;
    }

    if (!$allow_binary && (defined($resp->content_type) &&
	$resp->content_type ne '' &&
	$resp->content_type !~ /^(text\/|multipart\/)/))
    {
      $self->{scoop}->verbose ("Non-text content: Content-Type: ".$resp->content_type.".");
      return undef;
    }

    # not sure about this one, this could probably go at some point.
    if (defined $self->{scf}->{need_login_url}) {
      if ($self->match_url ($resp->base, $self->{scf}->{need_login_url})) {
	$self->{scoop}->verbose ("Page requires a username and password, requesting...");
	$self->{scoop}->{useragent}->get_basic_credentials ($self->{robot}->{url}, $url);

	return $self->need_http_retry ($url);
      }
    }

    $page = $resp->content;
    $newlastmod = $resp->last_modified;

    my $clen = $resp->content_length();

    # For Kennis -- verify content lengths
    if ($self->{cf}->{debug} > 1) {
      $self->{scoop}->dbg ("Content-length: ".(defined $clen ? $clen : "undef")
      		."; got: ".(length($page))."; message dump: "
      		.$resp->as_string()."\n\n");
    }

    if (defined $clen && length ($page) < $clen) {
      $self->{scoop}->verbose ("Got incomplete response: ".(length ($page))." bytes is".
	  " less than Content-Length ".$clen.", retrying...");
      return $self->need_http_retry ($url);
    }
  }

  # handle (ugh) Javascript or meta-tag redirects
  if (defined $page &&
      $page =~ /meta\s+http-equiv=[\"\']?refresh[\"\']?\s+
      		content=[\"\']([^\"\'\;]*);?\s*url=([^\"\'\;]+)[\"\'\;]/isx)
  {
    my $timeout = $1;
    my $newurl = $2;

    if ($timeout =~ /(\d+)/ && $1+0 > 20) {
      $self->{scoop}->dbg ("Ignored slow meta-tag refresh: \"$timeout url=$newurl\"");
    } else {
      $newurl = Sitescooper::Util::AbsoluteURL ($url, $newurl);
      $self->{scoop}->verbose ("Redirected by META tag to: $newurl");
      $self->{scoop}->{useragent}->note_redirect ($newurl);

      $self->{robot}->got_redirected ($url, $newurl);
      return $self->need_http_retry ($newurl);
    }
  }

  my $lastmod;
  if (defined $newlastmod) {
    $lastmod = $newlastmod;
    $self->dbg ("last-modified time: $lastmod (".
    		$self->{scoop}->time2datestr($lastmod).")");

    my $oldlastmod = $self->{cache}->get_last_modtime ($url);
    if (defined($oldlastmod) && defined($lastmod)
      		&& $lastmod <= $oldlastmod
		&& !$self->{cf}->{refresh}
            	&& !$is_dynamic_html && !$allow_binary)
    {
      $self->{scoop}->verbose ("Skipping (no mod since last download): $url");
      return undef;
    }

  } else {
    $self->dbg ("last-modified time: not provided");
    $lastmod = time;
  }
  $self->{cache}->set_last_modtime ($url, $lastmod);

  if (!$is_dynamic_html && defined $cachedhtml && !$allow_binary
    	&& !$self->{cf}->{refresh} && $cachedhtml eq $page)
  {
    $self->{scoop}->verbose ("Skipping (HTML has not changed): $url");
    return undef;
  }
  $page;
}

# --------------------------------------------------------------------------

sub need_http_retry {
  my ($self, $url) = @_;
  $self->{retry_with_url} = $url;
  undef;
}

sub does_need_http_retry {
  my ($self) = @_;
  return (defined $self->{retry_with_url}) ? 1 : 0;
}

sub get_http_retry_url {
  my ($self) = @_;
  return $self->{retry_with_url};
}

# --------------------------------------------------------------------------

sub handle_redirects_and_base_url {
  my ($self, $url, $page) = @_;
  my ($newurl, $newbase);
  
  $newbase = $url;
  if ($self->{scoop}->{useragent}->redirect_occurred()) {
    $newurl = $self->{scoop}->{useragent}->get_last_redirect();

    $self->sitewarn ("Redirected to $newurl from $url\n");
    $self->{scoop}->dbg ("links will use redirect as base URL");
    $newbase = $newurl;
    $self->{robot}->got_redirected ($url, $newurl);
  }
  
  if ($page =~ /<head>.*<base\s+href\s*=\s*[\"\']?(\S+?)[\"\']?\s*>.*<\/head>/is)
  {
    $newbase = Sitescooper::Util::AbsoluteURL ($newbase, $1);
    $self->{scoop}->dbg ("BASE HREF tag found, setting new base URL: $newbase");
    # this is not a proper redirect; it just means that we have a new base URL.
    # so don't call got_redirected and don't return a redirect-from URL.

    $self->{robot}->add_snarfed_link ($url);	# TODO - is this needed?
  }

  if (defined $newurl) {
    ($url, $newurl, $newbase);
  } else {
    (undef, $url, $newbase);            # stay the same
  }
}

# --------------------------------------------------------------------------

sub make_printable {
  my $self = shift;
  my $nextpage = shift;
  my $warn_if_fail = shift;

  my $sub = $self->{scf}->get_story_param ('printable_sub', $nextpage);
  if (defined $sub) {
    my $new = $nextpage;
    $sub =~ s/\\(\d+)/\$$1/g;	# avoid warnings

    eval '$new =~ '.$sub.'; 1;'
      or $self->sitewarn  ("Printable substitution failed! ($!)\n");

    if ($nextpage ne $new) {
      # $self->{scoop}->verbose ("Using printable version instead: $new");
      my $limitto = $self->{scf}->get_story_param ('story_limit_to', $new);
      if (!defined $limitto) { $limitto = $self->{scf}->{def_story_limit_to}; }

      if (defined $limitto && !$self->match_url ($new, $limitto)) {
	if ($warn_if_fail) {
	  $self->sitewarn  ("Printable version does not match StoryURL ".
		"pattern, reverting from $new to $nextpage\n");
	}
      } else {
	$nextpage = $new;
      }
    }
  }
  $nextpage;
}

# ---------------------------------------------------------------------------

sub apply_url_preproc {
  my $self = shift;
  local ($_) = shift;

  my $proc = $self->{scf}->get_story_param ('url_preproc', $_);
  if (defined $proc) {
    my $origurl = $_;
    if (!eval $proc."; 1;") {
      $self->sitewarn ("URLProcess failed: $@");
      undef $_;
    }

    if (defined $_ && $_ ne $origurl) {
      $self->{robot}->got_redirected ($origurl, $_);
    }
  }
  $_;
}

# ---------------------------------------------------------------------------

sub get_new_bits {
  my $self = shift;
  my ($oldfile, $newfile) = @_;

  if ($self->{cf}->{refresh}) {
    $self->{scoop}->verbose ("-refresh is on, not looking for differences");
    return $newfile;
  }

  $self->{scoop}->verbose ("Finding differences between current page and cached version");
  my $differ = new Sitescooper::HTMLFindNew();
  if ($self->{cf}->{debugdiffs}) { $differ->set_debug (1); }
  $differ->set_diff_command ($self->{cf}->{diff});
  $differ->find_new ($oldfile, $newfile);
}

# ---------------------------------------------------------------------------

sub text_equals {
  my $self = shift;
  my $t1 = shift;
  my $t2 = shift;
  $t1 =~ s/[\s\r\n]+/ /gs; $t1 =~ s/^\s+//; $t1 =~ s/\s+$//;
  $t2 =~ s/[\s\r\n]+/ /gs; $t2 =~ s/^\s+//; $t2 =~ s/\s+$//;
  ($t1 eq $t2);
}

# ---------------------------------------------------------------------------

sub html_to_text_warn_about_ext_links {
  my ($self, $url, $page, $format) = @_;
  $self->{warn_about_ext_links} = 1;
  my $ret = $self->html_to_text ($url, $page, $format);
  $self->{warn_about_ext_links} = 0;
  return $ret;
}

sub html_to_text {
  my ($self, $url, $page, $format) = @_;

  my $lob = new Sitescooper::HTMLLobotomize();
  $lob->set_debug_callback (sub {
    $self->{scoop}->dbg (@_);
  });

  $lob->set_text ($page);
  $lob->set_url ($url);

  my $sformat = $self->{scf}->{site_format};
  if (defined($sformat) && ($sformat eq 'rss')) { $lob->rss_to_html(); }

  $lob->set_alt_tags_allowed_regexp
  	($self->{scf}->get_story_param ('use_alt_tags', $url));

  if ($self->{cf}->{allowimgs}) {
    $lob->set_keep_img_regexp
	($self->{scf}->get_story_param ('imageurl', $url));

    # define an anonymous subroutine to load images for
    # this URLProcessor object
    my $imgrobot = $self->{robot};
    $lob->set_image_loader ( sub { $imgrobot->download_image (@_); } );
  }

  $lob->strip_advanced_tags();
  $lob->clean_inline_images();
  $lob->balance_tags();
  $lob->convert_pre_to_html();

  if ($format == $Sitescooper::Main::OUT_DOC) {
    # Create DOC bookmarks at <a name> tags
    # From Brian Lalor <blalor@hcirisc.cs.binghamton.edu>
    # via Christopher Heschong's <chris@screwdriver.net>
    # webpage-to-pdb converter. Nice one lads, good trick!
    $lob->{page} =~ s/<a\s+name.*?>/$self->{cf}->{bookmark_char} /gis;
  }

  if ($format == $Sitescooper::Main::OUT_HTML) {
    $lob->{page} = $self->fix_links_and_anchors ($url, $lob->{page});
  }

  my $proc = $self->{scf}->get_story_param ('story_postproc', $url);
  if (defined $proc) {
    $lob->{page} = $self->apply_postproc ($lob->{page}, $proc);
  }

  $lob->clean_whitespace_and_colors();
  if (!$lob->rearrange_tables ($self->{scf}->get_story_param
		 ('table_render', $url)))
  {
    sitewarn ("TableProcess invalid");
  }

  if ($format == $Sitescooper::Main::OUT_DOC || $format == $Sitescooper::Main::OUT_TEXT) {
    $lob->convert_to_text ();
  }

  if ($format == $Sitescooper::Main::OUT_DOC) {	# trim multiple (blank) bookmarks
    $lob->{page} =~ s/($self->{cf}->{bookmark_char}\s+){2,}/$self->{cf}->{bookmark_char} /gs;
  }

  $lob->fix_entities();
  $lob->get_text();
}

# ---------------------------------------------------------------------------

sub fix_links_and_anchors {
  my ($self, $url, $page) = @_;

  if (!$self->{cf}->{fileperpage}) {
    # scope anchors so they won't conflict if they crop up on other
    # pages of this site.

    my $one_page_anchor = $self->{robot}->href_to_singlepage_href ($url);
    $one_page_anchor =~ s/[^-_A-Za-z0-9]/_/g;

    # the substitutions on the anchor name itself mimic what AbsoluteURL
    # will do to it.
    my ($pre, $post, $ank);
    $page =~ s{<a(\s[^>]*\s|\s)name=[\"\']([^\"\'>]+)[\"\'](|\s[^>]*)>}{
		$pre = $1; $post = $3; $ank = $2;
		$ank =~ s/ /_20/g; $ank =~ s/[^-_A-Za-z0-9]/_/g;
		"<!!!a $pre name=\"${one_page_anchor}__HASH__$ank\" $post>";
	      }gies;
    $page =~ s{<a(\s[^>]*\s|\s)name=([^ >]+?)(|\s[^>]*)>}{
		$pre = $1; $post = $3; $ank = $2;
		$ank =~ s/ /_20/g; $ank =~ s/[^-_A-Za-z0-9]/_/g;
		"<!!!a $pre name=\"${one_page_anchor}__HASH__$ank\" $post>";
	      }gies;
    $page =~ s{<!!!a}{<a}gs;
  }

  # note the conversion of href= to href!!!=. This stops the second
  # substitution from screwing up the output of the first one!
  $page =~ s/(<a\s+[^>]*href)\s*=\s*
		(?:\"|\'|%22)([^\'\">]+)(?:\"|\'|%22)([^>]*?>)
		(.*?)<\/a>/
	  $self->translate_link ($url, $2, $4, $1.'!!!=', $3);
      /giesx;
  $page =~ s/(<a\s+[^>]*href)\s*=\s*
		([^>\s\n]+)([^>]*>)
		(.*?)<\/a>/
	  $self->translate_link ($url, $2, $4, $1.'!!!=', $3);
      /giesx;
  $page =~ s/href!!!=/href=/gis;      # back to normal
  $page;
}

# ---------------------------------------------------------------------------

sub translate_link {
  my $self = shift;
  my ($url, $origlink, $text, $ahref, $posthref) = @_;

  if (!defined $ahref) { $ahref = "<a href="; }
  if (!defined $posthref) { $posthref = ">"; }

  my $nolink;
  my $link = Sitescooper::Util::AbsoluteURL ($url, $origlink);
  if ($text =~ /\S/) {
    $nolink = $self->{scoop}->delink_unscooped_internal_link ($link, $text);
  } else {
    $nolink = " ";		# sometimes there's no text!
  }

  return $nolink if ($link !~ /^(http|file):/i);	# only supported links

  if ($self->{cf}->{nolinkrewrite}
  	|| defined ($self->{robot}->{output_links_snarfed}->{$origlink}))
  {
    return $ahref."\"".$origlink."\"".$posthref.$text."</a>";
  }

  # <a href>s to images; already handled by ImageURLProcessor.
  if ($self->{cf}->{allowimgs} && $link =~ /\.(?:gif|png|jpeg|jpg|jpe)/i)
  {
    my $limitto = $self->{scf}->get_story_param ('imageurl', $url);
    if (defined $limitto) {
      if ($self->match_url ($link, $limitto)) {
	return $ahref."\"".$origlink."\"".$posthref.$text."</a>";
      }
    }
  }

  # translate to printable version first, in case the StoryURL pattern
  # only covers the printable style.
  $link = $self->make_printable ($link, 0);

  # Is the link one that we will be downloading? If not, just de-linkify
  # it. 1-level sites never have active links so we can just assume
  # the links should not be links.
  my $limitto = $self->{scf}->get_story_param ('story_limit_to', $url);
  if (!defined $limitto) { $limitto = $self->{scf}->{def_story_limit_to}; }

  if (!$self->match_url ($link, $limitto)
    && Sitescooper::Util::URLWithoutAnchor ($link) ne
	Sitescooper::Util::URLWithoutAnchor ($url))
  {
    # check the contents/issue levels as well.
    my $ok = 0;

    my $lev;
    for ($lev = $self->{scf}->{levels}; $lev >= 0; $lev--) {
      my $limitto = $self->{scf}->get_links_param
      				('links_limit_to', $url, $lev);
      if (defined $limitto) {
	if ($self->match_url ($link, $limitto)) { $ok = 1; last; }
      }
    }

    if ($ok == 0) {
      if ($self->{warn_about_ext_links}) {
	$self->dbg ("External link not translated: $link");
      }
      return $nolink;

      # REVISIT -- provide links at end of stories
    }
  }

  $self->dbg2 ("Translating link: $link");
  $link = $self->apply_url_preproc ($link);

  if (!defined $link) {
    # link is to be ignored
    return $nolink;
  }
 
  # Note that we always put in quotes around the URL.
  # remove_external_links, which is run later, requires this (and anyway
  # it makes for better HTML).
  #
  if ($self->{cf}->{fileperpage}) {
    my $relative = $self->{robot}->href_to_multipage_href ($link);
    $ahref."\"".$relative."\"".$posthref.$text."</a>";
  } else {
    my $anchor = $self->{robot}->href_to_singlepage_href ($link);
    $ahref."\"#".$anchor."\"".$posthref.$text."</a>";
  }
}

# ---------------------------------------------------------------------------

sub apply_postproc {
  my ($self, $page, $proc) = @_;
  my $bookmark_char = $self->{cf}->{bookmark_char};  # convenience for PostProc

  $_ = $page;
  if (!eval $proc."; 1;") {
    $self->sitewarn ("StoryPostProc failed: $@");
    # and keep the original $page
  } elsif (!defined $_) {
    $self->sitewarn ("StoryPostProc failed (\$_ is undefined): $@");
    # and keep the original $page
  } else {
    $page = $_;
  }
  $page;
}

# ---------------------------------------------------------------------------

sub match_url {
  my ($self, $url, $pat) = @_;
  $self->{scoop}->match_url ($self->{url_regexp_cache}, $url, $pat);
}

# ---------------------------------------------------------------------------

sub strip_html {
  my ($self, $url, $humantype, $level, $silently, $page) = @_;

  if (!defined $page) { return undef; }
  my $journal = (defined fileno Sitescooper::Main::JOURNAL);

  my ($startpat, $startincl, $endpat, $endincl, $usesmarts);

  if (!defined $level) {
    $startpat = $self->{scf}->get_story_param ('story_start', $url);
    $startincl = $self->{scf}->get_story_param ('story_incl_start', $url);
    $endpat = $self->{scf}->get_story_param ('story_end', $url);
    $endincl = $self->{scf}->get_story_param ('story_incl_end', $url);
    $usesmarts = $self->{scf}->get_story_param ('story_use_table_smarts', $url);

  } else {
    $startpat = $self->{scf}->get_links_param ('links_start', $url, $level);
    $startincl = $self->{scf}->get_links_param ('links_incl_start', $url, $level);
    $endpat = $self->{scf}->get_links_param ('links_end', $url, $level);
    $endincl = $self->{scf}->get_links_param ('links_incl_end', $url, $level);
    $usesmarts = $self->{scf}->get_links_param ('links_use_table_smarts', $url, $level);
  }

  $startincl ||= 0;
  $endincl ||= 0;
  if (!defined $usesmarts) { $usesmarts = 1; }

  # ok, now strip the headers
  my $incl = $startincl;
  my $pat = $startpat;
  if (defined $pat) {
    if (!$silently && $page =~ /${pat}.*${pat}/) {
      $self->sitewarn ($humantype."Start pattern \"$pat\" found multiple times in page $url\n");
    }

    my $res;
    if ($incl) {
      $res = ($page =~ s#^(.*?)(${pat})#$2#gs);
    } else {
      $res = ($page =~ s#^(.*?${pat})##gs);
    }

    if ($res) {
      if ($journal) { $self->{scoop}->journal ("pre_stripped", $1); }
      if ($page =~ s#^([^<>]*?>)##s) {
	if ($journal) { $self->{scoop}->journal ("pre_stripped_ends_of_tags", $1); }
      }
    } elsif (!$silently) {
      $self->sitewarn ($humantype."Start pattern \"$pat\" not found in page $url\n");
    }
  }

  # and now the end stuff, the footers
  $incl = $endincl;
  $pat = $endpat;
  if (defined $pat) {
    if (!$silently && $page =~ /${pat}.*${pat}/) {
      $self->sitewarn ($humantype."End pattern \"$pat\" found multiple times in page $url\n");
    }

    my $res;
    if ($incl) {
      $res = ($page =~ s#(${pat})(.*?)$#$1#s);
      if ($journal && $res) { $self->{scoop}->journal ("post_stripped", $2); }
    } else {
      $res = ($page =~ s#(${pat}.*)?$##s);
      if ($journal && $res) { $self->{scoop}->journal ("post_stripped", $1); }
    }
    if ($res) {
      if ($page =~ s#(<[^<>]*)$##s) {
	if ($journal) { $self->{scoop}->journal ("post_stripped_starts_of_tags", $1); }
      }
    } elsif (!$silently) {
      $self->sitewarn ($humantype."End pattern \"$pat\" not found in page $url\n");
    }
  }

  # smart_clean_table only operates on table items with size specifications.
  # TODO -- work out table sizes using images if possible.
  #
  if ($usesmarts) {
    if ($journal) { $self->{scoop}->journal ("pre_table_smarts", $page); }
    my $filter = Sitescooper::StripTablesFilter->new();
    $filter->set_main ($self->{scoop});
    $filter->parse ($page);
    $filter->parse ("\n");	# otherwise text bytes with no following NL will be lost

    $page = $filter->filtered_html();
    if ($journal) { $self->{scoop}->journal ("post_table_smarts", $page); }
  }

  $page =~ s/\r/ /g;    # strip CRs
  $page;
}

# ---------------------------------------------------------------------------

1;