1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99
|
#===========================================================================
package Sitescooper::PreloadURLProcessor;
use Sitescooper::URLProcessor;
use Carp;
use strict;
use vars qw(
@ISA
);
@ISA = qw(Sitescooper::URLProcessor);
# ---------------------------------------------------------------------------
# Constructor. May be invoked on a class name or on an existing object
# (in which case that object's class is reused). All real construction is
# delegated to the parent class's new() with the same five arguments.
sub new {
  my $invocant = shift;
  my $class = ref($invocant) || $invocant;
  my ($scoop, $robot, $scf, $ref, $url) = @_;

  # No bless here: whatever the parent constructor returns is handed back
  # to the caller unchanged.
  my $self = $class->SUPER::new ($scoop, $robot, $scf, $ref, $url);
  return $self;
}
# ---------------------------------------------------------------------------
# Kick off the preload fetch for this processor's URL.
#
# Croaks if the processor is not in STATE_PRE_GET, or if the HTTP client
# hands back no request state.
#
# Returns 1 once the request has been started and the processor has moved
# to STATE_NET_WAIT. Returns empty (undef in scalar context) without
# changing state when the URL is skipped: unsupported protocol, rejected by
# apply_url_preproc(), or the use_only_cache configuration flag is set.
sub start_get {
  my $self = shift;

  if ($self->get_state() != $Sitescooper::URLProcessor::STATE_PRE_GET) {
    croak ("state != STATE_PRE_GET");
  }

  # $fullurl is the URL as stored on the object and is used only in log
  # messages; $url is the anchor-stripped form that is actually fetched.
  my $fullurl = $self->{url};
  my $url = Sitescooper::Util::URLWithoutAnchor ($fullurl);

  # Protocols that are never fetched. mailto: URLs carry no "//" after the
  # scheme, so they need their own alternative; the "scheme://" form alone
  # would let them through.
  if ($url =~ m,^(?:(?:ftp|mailto|https|gopher|pnm)://|mailto:),) {
    $self->{scoop}->dbg ("Non-story URL ignored (bad protocol): $fullurl");
    return;
  }

  # The preprocessing hook may veto the URL (undef) or rewrite it.
  my $newurl = $self->apply_url_preproc($url);
  if (!defined $newurl) {
    $self->{scoop}->dbg ("URLProcess says URL should be ignored: $fullurl");
    return;
  }
  if ($newurl ne $url) {
    # Adopt the rewritten URL (again without its anchor) for the fetch and
    # remember it on the object.
    $url = Sitescooper::Util::URLWithoutAnchor ($newurl);
    $self->{url} = $url;
  }

  if ($self->{scoop}->{cf}->{use_only_cache}) {
    # Logged through the scoop object, like the other debug lines here.
    $self->{scoop}->dbg ("-fromcache switch is on, not preloading");
    return;
  }

  $self->{scoop}->verbose ("Preloading: $url");

  # NOTE(review): the meaning of the third (undef) argument is defined by
  # the HTTP client and is not visible here -- confirm against that class.
  $self->{http_state} =
      $self->{scoop}->{httpclient}->start_get ($self->{referrer}, $url, undef);
  if (!defined $self->{http_state}) {
    croak "http_state is unset after start_get";
  }

  $self->set_state ($Sitescooper::URLProcessor::STATE_NET_WAIT);
  1;
}
# ---------------------------------------------------------------------------
# Collect the result of the request begun by start_get().
#
# The processor is moved to STATE_POST_GET unconditionally. If the global
# interrupt flag is set, the method then returns empty without collecting
# the response (the stored http_state is left untouched in that case).
#
# Croaks if no http_state is stored on the object.
#
# Returns 1 after storing the response content under
# $self->{scoop}->{preloaded_responses}->{$url}; returns undef, after a
# site warning, when the response is not a success.
sub finish_get {
  my $self = shift;

  $self->set_state ($Sitescooper::URLProcessor::STATE_POST_GET);
  if ($Sitescooper::Main::got_intr_flag) { return; }

  my $url = $self->{url};

  if (!defined $self->{http_state}) {
    # Name this method in the message so the failure is easy to locate.
    croak "http_state is unset in finish_get";
  }

  # The request state is single-use: clear it as soon as it is redeemed.
  my $resp = $self->{scoop}->{httpclient}->finish_get ($self->{http_state});
  $self->{http_state} = undef;

  if (!$resp->is_success) {
    $self->sitewarn ("Preload GET failed: ".$resp->status_line." ($url)");
    return undef;
  }

  # Store the body, keyed by URL, in the scoop object's preload table.
  $self->{scoop}->{preloaded_responses}->{$url} = $resp->content;
  1;
}
# ---------------------------------------------------------------------------
1;
|