File: CacheSingleton.pm

package info (click to toggle)
sitescooper 3.1.2-1
  • links: PTS
  • area: main
  • in suites: sarge, woody
  • size: 3,000 kB
  • ctags: 662
  • sloc: perl: 8,677; makefile: 105
file content (204 lines) | stat: -rw-r--r-- 5,234 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
#===========================================================================
# 
# CacheSingleton; methods to implement the Singleton pattern in terms
# of cache use.  In other words, cache objects are actually pointers to
# reference-counted pages in a shared "page cache" directory.

package Sitescooper::CacheSingleton;

use Carp;
use Sitescooper::Main;

@ISA = qw();
use vars qw{ @ISA $SLASH $MAXFILENAMELEN };
use strict;

# ---------------------------------------------------------------------------

sub new {
  my $class = shift; $class = ref($class) || $class;

  my ($main, $factory) = @_;

  $SLASH                = $Sitescooper::Main::SLASH;
  my $self = {
    'main'		=> $main,
    'factory'		=> $factory,
    'lastmod'		=> 0,
    'refcount'		=> 1,
  };

  $MAXFILENAMELEN = 256;	# reasonable default I'd say
  if ($Sitescooper::Main::use_hashes_for_cache_filenames) { # set on MacOS 9.x
    $MAXFILENAMELEN = 32;
  }

  bless ($self, $class);
  $self;
}

# ---------------------------------------------------------------------------

sub get_cached_page {
  my ($self, $ptr) = @_;

  my $pagefile = $self->{factory}->{pagecachedir}.$SLASH.$ptr;

  open (IN, "<".$pagefile) or return undef;
  binmode IN; my $cachedpage = join ('', <IN>); close IN;

  if ($cachedpage =~ s/^<!-- ([^>]+) -->\n//s) {
    foreach my $nv (split (/\s+\/\/\s+/, $1)) {
      $nv =~ /^\s*(\S+): (\S+)\s*$/ or next;
      my $name = $1;
      my $val = $2;

      if ($name eq 'lastmod') { $self->{lastmod} = $val+0; }
      elsif ($name eq 'refcount') { $self->{refcount} = $val+0; }
    }
  }

  $cachedpage;
}

# ---------------------------------------------------------------------------

sub save_cached_page {
  my ($self, $url, $sum, $size, $pagetext, $lastmod) = @_;

  my $pagefile = $self->mk_pagefile_filename ($url, $sum, $size);
  my $pagecache = $self->{factory}->{pagecachedir};

  my $header1 = "<!-- refcount: ";
  my $header2 = " ";
  if (defined $lastmod) { $header2 = " lastmod: $lastmod "; }
  $header2 .= "-->\n";

  my $tries = 0;
  while (-f "$pagefile.lk") {
    warn "another process is updating $pagefile... sleeping.\n";
    $tries++; if ($tries > 20) {
      warn "overriding lock on $pagefile.\n"; last;
    }
    sleep 1;
  }

  open (PID, ">$pagefile.lk")
  		or warn "write to $pagefile.lk failed: $!\n";
  print PID $$;
  close PID;
  
  my $refcount = 0;
  if (open (IN, "<$pagefile")) {
    $_ = <IN>;
    / refcount: (\d+) / and $refcount = $1;
    close IN;
  }

  $refcount++;
  $header1 .= $refcount;

  open (OUT, ">$pagefile");
  binmode OUT;
  print OUT $header1, $header2, $pagetext;
  close OUT;

  unlink ("$pagefile.lk");

  $self->dbg ("cache singleton $pagefile: ref count now $refcount");

  # trim off the pagecache dir for return.
  $pagefile =~ s/\Q${pagecache}\E\///g;
  $pagefile;
}

# ---------------------------------------------------------------------------

sub dec_refcount_for_page {
  my ($self, $pagefile) = @_;
  local ($_);

  $pagefile = $self->{factory}->{pagecachedir}.$SLASH.$pagefile;
  my $decfile = $pagefile.".dec";
  my $bakfile = $pagefile.".bak";

  open (IN, "<$pagefile") or return undef;
  if (!open (OUT, ">$decfile")) {
    warn "Cannot write to $decfile\n";
    close IN;
    return undef;
  }

  # read the first line, and decrement the ref count therein.
  $_ = <IN>;
  my $refcount;
  if (/ refcount: (\d+) /) {
    $refcount = $1;
  } else {
    $refcount = 1;		# default value
    $_ =~ s/ -->/ refcount: 1 -->/;
  }

  $refcount--;
  s/ refcount: \d+ / refcount: ${refcount} /;
  print OUT $_;

  if ($refcount <= 0) {
    close IN; close OUT;
    unlink ($pagefile, $decfile);
    $self->dbg ("cache singleton $pagefile: deleted");
    return 1;
  }

  while (<IN>) { print OUT; }
  close IN;
  close OUT or warn "Failed to write to $decfile\n";

  rename ($pagefile, $bakfile) or warn "rename $pagefile failed";
  rename ($decfile, $pagefile) or warn "rename $pagefile failed";
  unlink ($bakfile);

  $self->dbg ("cache singleton $pagefile: ref count now $refcount");
}

# ---------------------------------------------------------------------------

sub mk_pagefile_filename {
  my ($self, $url, $sum, $size) = @_;

  # Generate a deep cache filename. It'll look like:
  #
  #    HOST         PATH              CKSUM SIZE
  #    sitename_com/some_page_in_site-95443-3243
  #
  # Very long names are shortened, by stripping off the start of the URL
  # and turning it into a 32-bit hash represented as a hex value. The
  # end of the URL is left untouched where possible to avoid filename
  # collisions.
  #
  if (length ($url) > $MAXFILENAMELEN-5) {
    my $splitpoint = length ($url) - ($MAXFILENAMELEN-10);

    my $start = substr ($url, 0, $splitpoint);	# hashed into a 32bit value
    my $end = substr ($url, $splitpoint);	# left as a string

    $url = sprintf ("%08lx_%s", unpack ("%32C*", $start), $end);
  }

  Sitescooper::PerSiteDirCache::mk_generic_cache_filename
  	($self->{factory}->{pagecachedir}, $url."-".$sum."-".$size, 1);
}

sub dbg {
  my $self = shift;
  $self->{main}->dbg(@_);
}

sub verbose {
  my $self = shift;
  $self->{main}->verbose(@_);
}

# ---------------------------------------------------------------------------

1;