File: spider_instance.rb

# Specialized spidering rules.
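#
# A SpiderInstance is normally driven through the gem's Spider front end
# rather than built by hand; assuming the usual Spider.start_at entry point,
# a session looks roughly like:
#
#   Spider.start_at('http://cashcats.biz/') do |s|
#     s.add_url_check { |a_url| a_url =~ %r{^http://cashcats\.biz.*} }
#     s.on(:success) { |a_url, resp, prior_url| puts a_url }
#   end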

require File.dirname(__FILE__)+'/robot_rules.rb'
require 'open-uri'
require 'uri'
require 'net/http'
require 'net/https'

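# Net::HTTPResponse has no success?/redirect? predicates of its own; add
# defaults here and override them in the relevant subclasses so callers can
# branch on a response without inspecting its class.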
module Net #:nodoc:
  class HTTPResponse #:nodoc:
    def success?; false; end
    def redirect?; false; end
  end
  class HTTPSuccess #:nodoc:
    def success?; true; end
  end
  class HTTPRedirection #:nodoc:
    def redirect?; true; end
  end
end

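# Let a nil header hash be merged as if it were empty (see HeaderSetter).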
class NilClass #:nodoc:
  def merge(h); h; end
end

class SpiderInstance
  def initialize(next_urls, seen = [], rules = nil, robots_seen = []) #:nodoc:
    @url_checks  = []
    @cache       = :memory
    @callbacks   = {}
    @next_urls   = [next_urls]
    @seen        = seen
    @rules       = rules || RobotRules.new("Ruby Spider #{Spider::VERSION}")
    @robots_seen = robots_seen
    @headers     = {}
    @setup       = nil
    @teardown    = nil
  end

  # Add a predicate that determines whether to continue down this URL's path.
  # All predicates must be true in order for a URL to proceed.
  #
  # Takes a block that takes a string and produces a boolean. For example, this
  # will ensure that the URL starts with 'http://cashcats.biz':
  #
  #  add_url_check { |a_url| a_url =~ %r{^http://cashcats.biz.*} }
  def add_url_check(&block)
    @url_checks << block
  end

  # The Web is a graph; to avoid cycles we store the nodes (URLs) already
  # visited. The Web is a really, really, really big graph; as such, this list
  # of visited nodes grows really, really, really big.
  #
  # Change the object used to store these seen nodes with this. The default
  # object is an instance of Array. Available with Spider is a wrapper of
  # memcached.
  #
  # You can implement a custom class for this; any object passed to
  # check_already_seen_with must understand just << and include? .
  #
  #  # default
  #  check_already_seen_with Array.new
  #
  #  # memcached
  #  require 'spider/included_in_memcached'
  #  check_already_seen_with IncludedInMemcached.new('localhost:11211')
  def check_already_seen_with(cacher)
    if cacher.respond_to?(:<<) && cacher.respond_to?(:include?)
      @seen = cacher
    else
      raise ArgumentError, 'expected something that responds to << and include?'
    end
  end

  # The Web is a really, really, really big graph; as such, this list
  # of nodes to visit grows really, really, really big.
  #
  # Change the object used to store nodes we have yet to walk. The default
  # object is an instance of Array. Available with Spider is a wrapper of
  # AmazonSQS.
  #
  # You can implement a custom class for this; any object passed to
  # store_next_urls_with must understand just push and pop .
  #
  #  # default
  #  store_next_urls_with Array.new
  #
  #  # AmazonSQS
  #  require 'spider/next_urls_in_sqs'
  #  store_next_urls_with NextUrlsInSQS.new(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY, queue_name)
  def store_next_urls_with(a_store)
    tmp_next_urls = @next_urls
    @next_urls = a_store
    tmp_next_urls.each do |a_url_hash|
      @next_urls.push a_url_hash
    end
  end

  # Add a response handler. A response handler's trigger can be :every,
  # :success, :failure, or any HTTP status code. The handler itself can be
  # either a Proc or a block.
  #
  # The arguments to the block are: the URL as a string, an instance of
  # Net::HTTPResponse, and the prior URL as a string.
  #
  # For example:
  #
  #  on 404 do |a_url, resp, prior_url|
  #    puts "URL not found: #{a_url}"
  #  end
  #
  #  on :success do |a_url, resp, prior_url|
  #    puts a_url
  #    puts resp.body
  #  end
  #
  #  on :every do |a_url, resp, prior_url|
  #    puts "Given this code: #{resp.code}"
  #  end
  def on(code, p = nil, &block)
    f = p ? p : block
    case code
    when Integer
      @callbacks[code] = f
    else
      @callbacks[code.to_sym] = f
    end
  end

  # Run before the HTTP request. Given the URL as a string.
  #  setup do |a_url|
  #    headers['Cookie'] = 'user_id=1;admin=true'
  #  end
  def setup(p = nil, &block)
    @setup = p ? p : block
  end

  # Run last, once for each page. Given the URL as a string.
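  #  teardown do |a_url|
  #    puts "finished crawling #{a_url}"
  #  end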
  def teardown(p = nil, &block)
    @teardown = p ? p : block
  end

  # Use like a hash:
  #  headers['Cookie'] = 'user_id=1;password=btrross3'
  def headers
    HeaderSetter.new(self)
  end

  def raw_headers #:nodoc:
    @headers
  end
  def raw_headers=(v) #:nodoc:
    @headers = v
  end

  # Reset the headers hash.
  def clear_headers
    @headers = {}
  end

  def start! #:nodoc:
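    # Let Ctrl-C finish the page currently being processed, then exit cleanly.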
    interrupted = false
    trap("SIGINT") { interrupted = true }
    begin
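      # Each queued entry maps the referring URL to the URL(s) found on it.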
      next_urls = @next_urls.pop
      tmp_n_u = {}
      next_urls.each do |prior_url, urls|
        urls = [urls] unless urls.kind_of?(Array)
        urls.map do |a_url|
          [a_url, (URI.parse(a_url) rescue nil)]
        end.select do |a_url, parsed_url|
          allowable_url?(a_url, parsed_url)
        end.each do |a_url, parsed_url|
          @setup.call(a_url) unless @setup.nil?
          get_page(parsed_url) do |response|
            do_callbacks(a_url, response, prior_url)
            #tmp_n_u[a_url] = generate_next_urls(a_url, response)
            #@next_urls.push tmp_n_u
            generate_next_urls(a_url, response).each do |a_next_url|
              @next_urls.push a_url => a_next_url
            end
            #exit if interrupted
          end
          @teardown.call(a_url) unless @teardown.nil?
          exit if interrupted
        end
      end
    end while !@next_urls.empty?
  end

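  # Classify a numeric HTTP status code as :success (2xx) or :failure.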
  def success_or_failure(code) #:nodoc:
    if code > 199 && code < 300
      :success
    else
      :failure
    end
  end

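  # A URL is followed only if it parses, has not been seen before, is allowed
  # by robots.txt, and passes every user-supplied URL check.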
  def allowable_url?(a_url, parsed_url) #:nodoc:
    !parsed_url.nil? && !@seen.include?(parsed_url) && allowed?(a_url, parsed_url) &&
      @url_checks.map{|url_check|url_check.call(a_url)}.all?
  end

  # True if the robots.txt for that URL allows access to it.
  def allowed?(a_url, parsed_url) # :nodoc:
    return false unless ['http','https'].include?(parsed_url.scheme)
    u = "#{parsed_url.scheme}://#{parsed_url.host}:#{parsed_url.port}/robots.txt"
    parsed_u = URI.parse(u)
    return false unless @url_checks.map{|url_check|url_check.call(a_url)}.all?
    begin
      unless @robots_seen.include?(u)
        #open(u, 'User-Agent' => 'Ruby Spider',
        #  'Accept' => 'text/html,text/xml,application/xml,text/plain', :ssl_verify => false) do |url|
        #  @rules.parse(u, url.read)
        #end
        get_page(parsed_u) do |r|
          @rules.parse(u, r.body)
        end
        @robots_seen << u
      end
      @rules.allowed?(a_url)
    rescue OpenURI::HTTPError
      true # No robots.txt
    rescue Exception # deliberately broad (Timeout::Error included) so a robots.txt failure never aborts the crawl
      false
    end
  end

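  # Fetch a single page over HTTP(S), following redirects recursively, and
  # yield the final Net::HTTPResponse to the block.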
  def get_page(parsed_url, &block) #:nodoc:
    @seen << parsed_url
    begin
      http = Net::HTTP.new(parsed_url.host, parsed_url.port)
      if parsed_url.scheme == 'https'
        http.use_ssl = true
        http.verify_mode = OpenSSL::SSL::VERIFY_NONE
      end
      # Use the block form of start so the connection is opened and closed automatically.
      r = http.start {|h| h.request(Net::HTTP::Get.new(parsed_url.request_uri, @headers))}
      if r.redirect?
        get_page(URI.parse(construct_complete_url(parsed_url,r['Location'])), &block)
      else
        block.call(r)
      end
    rescue Timeout::Error, Errno::EINVAL, Errno::ECONNRESET, EOFError => e
      p e
      nil
    end
  end

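  # Fire, in order: the :every handler, the :success or :failure handler, and
  # any handler registered for this specific status code.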
  def do_callbacks(a_url, resp, prior_url) #:nodoc:
    cbs = [@callbacks[:every],
      resp.success? ? @callbacks[:success] : @callbacks[:failure],
      @callbacks[resp.code.to_i]] # resp.code is a String; "on 404" stores an Integer key

    cbs.each do |cb|
      cb.call(a_url, resp, prior_url) if cb
    end
  end

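  # Scrape href attributes out of the response body and resolve each one
  # against the page's <base href> (if present) or the current URL's directory.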
  def generate_next_urls(a_url, resp) #:nodoc:
    web_page = resp.body
    base_url = (web_page.scan(/base\s+href="(.*?)"/i).flatten +
                [a_url[0,a_url.rindex('/')]])[0]
    base_url = remove_trailing_slash(base_url)
    web_page.scan(/href="(.*?)"/i).flatten.map do |link|
      begin
        parsed_link = URI.parse(link)
        if link[0, 1] == '#' # skip in-page anchor links
          nil
        else
          construct_complete_url(base_url, link, parsed_link)
        end
      rescue
        nil
      end
    end.compact
  end

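  # Resolve additional_url relative to base_url; links that already carry a
  # scheme are returned unchanged.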
  def construct_complete_url(base_url, additional_url, parsed_additional_url = nil) #:nodoc:
    parsed_additional_url ||= URI.parse(additional_url)
    case parsed_additional_url.scheme
    when nil
      u = base_url.is_a?(URI) ? base_url : URI.parse(base_url)
      if additional_url[0].chr == '/'
        "#{u.scheme}://#{u.host}#{additional_url}"
      elsif u.path.nil? || u.path == ''
        "#{u.scheme}://#{u.host}/#{additional_url}"
      elsif u.path[0].chr == '/'
        "#{u.scheme}://#{u.host}#{u.path}/#{additional_url}"
      else
        "#{u.scheme}://#{u.host}/#{u.path}/#{additional_url}"
      end
    else
      additional_url
    end
  end

  def remove_trailing_slash(s) #:nodoc:
    s.sub(%r{/*$},'')
  end

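  # Thin proxy returned by #headers: assigning headers['X-Foo'] = 'bar' merges
  # the pair into the owning SpiderInstance's raw header hash.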
  class HeaderSetter #:nodoc:
    def initialize(si)
      @si = si
    end
    def []=(k,v)
      @si.raw_headers = @si.raw_headers.merge({k => v})
    end
  end
end