File: spider_instance_spec.rb

package info (click to toggle)
ruby-spider 0.5.0-6
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 184 kB
  • sloc: ruby: 824; makefile: 2
file content (405 lines) | stat: -rw-r--r-- 14,014 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
require File.dirname(__FILE__)+'/../spec_helper'
require 'webrick'
require 'webrick/https'
local_require 'spider', 'spider/included_in_memcached'

describe 'SpiderInstance' do
  # Regression example (bug reported by Henri Cook): http://www.rcuk.ac.uk/
  # redirects to "/default.htm", which is only a path and not a complete URL.
  it 'should construct a complete redirect URL' do
    start_url = 'http://www.rcuk.ac.uk/'
    block_ran = false

    # The first response is a redirect whose [] lookup yields a bare path;
    # the second one is a plain, non-redirect response.
    redirect_response = stub(:redirect? => true, :[] => '/default.htm')
    final_response = stub(:redirect? => false)
    connection = stub(:request => true)

    redirecting_http = stub(:use_ssl= => true)
    redirecting_http.stubs(:start).yields(connection).returns(redirect_response)
    final_http = stub(:use_ssl= => true)
    final_http.stubs(:start).yields(connection).returns(final_response)

    # Two Net::HTTP objects are expected: one per request.
    Net::HTTP.expects(:new).times(2).
      returns(redirecting_http).then.returns(final_http)

    spider = SpiderInstance.new({nil => [start_url]})
    spider.get_page(URI.parse(start_url)) { |_resp| block_ran = true }
    block_ran.should be_true
  end

  # Runs the shared cycle check with an IncludedInMemcached as the
  # already-seen store, inside the with_memcached helper.
  it 'should prevent cycles with an IncludedInMemcached' do
    with_memcached do
      it_should_prevent_cycles_with(IncludedInMemcached.new('localhost:11211'))
    end
  end

  # Runs the shared cycle check with a plain, empty Array as the
  # already-seen store.
  it 'should prevent cycles with an Array' do
    it_should_prevent_cycles_with([])
  end

  # The setup callback must fire before the :every callback for a fetched
  # page. Each callback appends a marker to call_order, so the check is made
  # on the recorded sequence rather than on wall-clock timestamps (two
  # Time.now readings can compare equal on a coarse clock, which would make
  # a strict "<" comparison fail spuriously).
  it 'should call the "setup" callback before loading the Web page' do
    mock_successful_http
    call_order = []
    si = SpiderInstance.new({nil => ['http://example.com/']})
    si.stubs(:allowed?).returns(true)
    si.stubs(:generate_next_urls).returns([])
    si.setup       { |*a| call_order << :setup }
    si.on(:every)  { |*a| call_order << :every }
    si.start!
    # Both callbacks must have run at least once ...
    call_order.should include(:every)
    call_order.should include(:setup)
    # ... and the last setup call must precede the last :every call.
    call_order.rindex(:setup).should < call_order.rindex(:every)
  end

  # The teardown callback must fire after the :every callback for a fetched
  # page. Each callback appends a marker to call_order, so the check is made
  # on the recorded sequence rather than on wall-clock timestamps (two
  # Time.now readings can compare equal on a coarse clock, which would make
  # a strict ">" comparison fail spuriously).
  it 'should call the "teardown" callback after running all other callbacks' do
    mock_successful_http
    call_order = []
    si = SpiderInstance.new({nil => ['http://example.com/']})
    si.stubs(:allowed?).returns(true)
    si.stubs(:generate_next_urls).returns([])
    si.on(:every)  { |*a| call_order << :every }
    si.teardown    { |*a| call_order << :teardown }
    si.start!
    # Both callbacks must have run at least once ...
    call_order.should include(:every)
    call_order.should include(:teardown)
    # ... and the last teardown call must follow the last :every call.
    call_order.rindex(:teardown).should > call_order.rindex(:every)
  end

  # A header assigned inside the setup handler must be handed to
  # Net::HTTP::Get.new together with the request path; the teardown handler
  # clears the headers again.
  it 'should pass headers set by a setup handler to the HTTP request' do
    mock_successful_http
    Net::HTTP::Get.expects(:new).with('/foo', {'X-Header-Set' => 'True'})
    spider = SpiderInstance.new(nil => ['http://example.com/foo'])
    spider.stubs(:generate_next_urls).returns([])
    spider.stubs(:allowable_url?).returns(true)
    spider.setup { |_url| spider.headers['X-Header-Set'] = 'True' }
    spider.teardown { |_url| spider.clear_headers }
    spider.start!
  end

  # The five examples below all delegate to callback_arguments_on, which
  # checks the three arguments a handler receives: the current URL, the
  # response object and the prior URL. The descriptions say "response"
  # throughout, because the second handler argument is the (mocked) HTTP
  # response rather than a request.
  it 'should call the :every callback with the current URL, the response, and the prior URL' do
    mock_successful_http
    callback_arguments_on(:every)
  end

  it 'should call the :success callback with the current URL, the response, and the prior URL' do
    mock_successful_http
    callback_arguments_on(:success)
  end

  it 'should call the :failure callback with the current URL, the response, and the prior URL' do
    mock_failed_http
    callback_arguments_on(:failure)
  end

  it 'should call the HTTP status error code callback with the current URL, the response, and the prior URL' do
    mock_failed_http
    callback_arguments_on(404)
  end

  it 'should call the HTTP status success code callback with the current URL, the response, and the prior URL' do
    mock_successful_http
    callback_arguments_on(200)
  end

  # Bug reported by John Nagro, using the example source http://eons.com/.
  # The fix changed line 192, which now uses request_uri instead of path.
  it 'should handle query URLs without a path' do
    query_url = 'http://localhost:8888?s=1'
    parsed_url = URI.parse(query_url)
    block_ran = false
    with_web_server(QueryServlet) do
      spider = SpiderInstance.new({nil => [query_url]})
      spider.get_page(parsed_url) { block_ran = true }
    end
    block_ran.should be_true
  end

  # Covers a problem reported by John Nagro: after a redirect response, the
  # block given to get_page must still be run.
  it 'should handle redirects' do
    url = 'http://example.com/'
    parsed_url = URI.parse(url)
    block_ran = false
    mock_redirect_http
    spider = SpiderInstance.new({nil => [url]})
    spider.get_page(parsed_url) { block_ran = true }
    block_ran.should be_true
  end

  # Starts a WEBrick server with SSL enabled on port 10443 in a background
  # thread, fetches its root page over https:// and checks that the block
  # given to get_page is run.
  it 'should handle HTTPS' do
    u = 'https://localhost:10443/'
    u_p = URI.parse(u)
    @page_called = false
    server = WEBrick::HTTPServer.new(:Port => 10443,
                                     :Logger => null_logger,
                                     :AccessLog => [],
                                     :SSLEnable => true,
                                     :SSLCertName => [["O", "ruby-lang.org"], ["OU", "sample"], ["CN", WEBrick::Utils::getservername]],
                                     :SSLComment => 'Comment of some sort')
    server.mount('/', QueryServlet)
    Thread.new {server.start}
    begin
      si = SpiderInstance.new({nil => [u]})
      si.get_page(u_p) { @page_called = true }
    ensure
      # Shut the server down even when get_page raises, so a failure here
      # cannot leave the server running on port 10443 for later examples.
      server.shutdown
    end
    @page_called.should be_true
  end

  # When allowable_url? answers false for the queued URL, start! must never
  # call get_page.
  it 'should skip URLs when allowable_url? is false' do
    url = 'http://example.com/'
    parsed_url = URI.parse(url)
    response = stub(:redirect? => false, :success? => true, :code => 200,
                    :headers => 1, :body => 1)
    connection = stub(:request => response, :finish => nil)
    Net::HTTP.stubs(:new).returns(connection)
    spider = SpiderInstance.new({nil => [url]})
    spider.expects(:allowable_url?).with(url, parsed_url).returns(false)
    spider.expects(:get_page).times(0)
    spider.start!
  end

  # When allowable_url? answers true for the queued URL, start! must hand
  # the parsed URL to get_page.
  it 'should not skip URLs when allowable_url? is true' do
    url = 'http://example.com/'
    parsed_url = URI.parse(url)
    response = stub(:redirect? => false, :success? => true, :code => 200,
                    :headers => 1, :body => 1)
    connection = stub(:request => response, :finish => nil)
    Net::HTTP.stubs(:new).returns(connection)
    spider = SpiderInstance.new({nil => [url]})
    spider.expects(:allowable_url?).with(url, parsed_url).returns(true)
    spider.expects(:get_page).with(URI.parse(url))
    spider.start!
  end

  # The robots.txt for the host is expected to be opened with the spider's
  # request headers, its content handed to the rules object's parse, and the
  # rules object's allowed? answer (false here) to make the URL not allowable.
  it 'should disallow URLs when the robots.txt says to' do
    robots_url = 'http://example.com:80/robots.txt'
    robots_body = 'robots.txt content'
    page_url = 'http://example.com/'

    rules = stub
    SpiderInstance.any_instance.expects(:open).
      with(robots_url,
           'User-Agent' => 'Ruby Spider',
           'Accept' => 'text/html,text/xml,application/xml,text/plain').
      yields(stub(:read => robots_body))
    rules.expects(:parse).with(robots_url, robots_body)
    rules.expects(:allowed?).with(page_url).returns(false)

    spider = SpiderInstance.new({nil => [page_url]}, [], rules, [])
    spider.allowable_url?(page_url, URI.parse(page_url)).should be_false
  end

  # A single url_check block that returns false must make the URL not
  # allowable, even though allowed? is stubbed to return true.
  it 'should disallow URLs when they fail any url_check' do
    page_url = 'http://example.com/'
    spider = SpiderInstance.new({nil => [page_url]})
    spider.stubs(:allowed?).returns(true)
    spider.add_url_check { |_url| false }
    spider.allowable_url?(page_url, URI.parse(page_url)).should be_false
  end

  # Registers two url_check blocks, the first returning true and the second
  # returning false. Both must have been run, and the URL must end up not
  # allowable.
  it 'should support multiple url_checks' do
    page_url = 'http://example.com/'
    first_ran = false
    second_ran = false

    spider = SpiderInstance.new({nil => [page_url]})
    spider.stubs(:allowed?).returns(true)
    spider.add_url_check do |_url|
      first_ran = true
      true
    end
    spider.add_url_check do |_url|
      second_ran = true
      false
    end

    spider.allowable_url?(page_url, URI.parse(page_url)).should be_false
    first_ran.should be_true
    second_ran.should be_true
  end

  # The parsed URL is passed to SpiderInstance.new as the only entry of its
  # second argument; asking about that same URL must then answer false.
  it 'should avoid cycles' do
    url = 'http://example.com/'
    parsed_url = URI.parse(url)
    spider = SpiderInstance.new({nil => [url]}, [parsed_url])
    spider.stubs(:allowed?).returns(true)
    spider.allowable_url?(url, parsed_url).should be_false
    parsed_url.should_not be_nil
  end

  # With a mocked 404 response, the handler registered for 404 must run.
  it 'should call the 404 handler for 404s' do
    handler_ran = false
    mock_failed_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(404) { |*_args| handler_ran = true }
    spider.start!
    handler_ran.should be_true
  end

  # With a mocked successful response, the :success handler must run.
  it 'should call the :success handler on success' do
    handler_ran = false
    mock_successful_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(:success) { |*_args| handler_ran = true }
    spider.start!
    handler_ran.should be_true
  end

  # With a mocked failed response, the :success handler must stay unused.
  it 'should not call the :success handler on failure' do
    handler_ran = false
    mock_failed_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(:success) { |*_args| handler_ran = true }
    spider.start!
    handler_ran.should be_false
  end

  # With a mocked 200 response, both the :success handler and the handler
  # registered for 200 must run.
  it 'should call the :success handler and the 200 handler on 200' do
    code_handler_ran = false
    success_handler_ran = false
    mock_successful_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(:success) { |*_args| success_handler_ran = true }
    spider.on(200) { |*_args| code_handler_ran = true }
    spider.start!
    code_handler_ran.should be_true
    success_handler_ran.should be_true
  end

  # With a mocked successful response, the :failure handler must stay unused.
  it 'should not call the :failure handler on success' do
    handler_ran = false
    mock_successful_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(:failure) { |*_args| handler_ran = true }
    spider.start!
    handler_ran.should be_false
  end

  # With a mocked failed response, the :failure handler must run.
  it 'should call the :failure handler on failure' do
    handler_ran = false
    mock_failed_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(:failure) { |*_args| handler_ran = true }
    spider.start!
    handler_ran.should be_true
  end

  # With a mocked 404 response, both the :failure handler and the handler
  # registered for 404 must run.
  it 'should call the :failure handler and the 404 handler on 404' do
    code_handler_ran = false
    failure_handler_ran = false
    mock_failed_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(:failure) { |*_args| failure_handler_ran = true }
    spider.on(404) { |*_args| code_handler_ran = true }
    spider.start!
    code_handler_ran.should be_true
    failure_handler_ran.should be_true
  end

  # Registering an (empty) handler for a status code, 202 here, must not keep
  # the :every handler from running for the mocked successful response.
  it 'should call the :every handler even when a handler for the error code is defined' do
    every_ran = false
    mock_successful_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(:every) { |*_args| every_ran = true }
    spider.on(202) { |*_args| }
    spider.start!
    every_ran.should be_true
  end

  # A handler given to on as a literal block must be run.
  it 'should support a block as a response handler' do
    handler_ran = false
    mock_successful_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    spider.on(:every) { |*_args| handler_ran = true }
    spider.start!
    handler_ran.should be_true
  end

  # A handler given to on as a Proc object (second positional argument)
  # must be run.
  it 'should support a proc as a response handler' do
    handler_ran = false
    mock_successful_http
    spider = SpiderInstance.new({nil => ['http://example.com/']})
    spider.stubs(:allowed?).returns(true)
    spider.stubs(:generate_next_urls).returns([])
    handler = Proc.new { |*_args| handler_ran = true }
    spider.on(:every, handler)
    spider.start!
    handler_ran.should be_true
  end

  # Expects exactly one Net::HTTP.new call and makes it return a mock HTTP
  # object. That object accepts use_ssl= and expects one start call, which
  # yields a connection mock (whose request returns +http_req+) and itself
  # returns +http_req+.
  #
  # http_req - the stubbed object to hand back as the result of the request.
  def mock_http(http_req)
    http = mock(:use_ssl= => true)
    connection = mock(:request => http_req)
    http.expects(:start).yields(connection).returns(http_req)
    Net::HTTP.expects(:new).returns(http)
  end

  # Sets up mock_http with a stub that is not a redirect, reports success,
  # has code 200 and the body 'body'.
  def mock_successful_http
    mock_http(stub(:redirect? => false, :success? => true,
                   :code => 200, :body => 'body'))
  end

  # Sets up mock_http with a stub that is not a redirect, reports no success
  # and has code 404.
  def mock_failed_http
    mock_http(stub(:redirect? => false, :success? => false, :code => 404))
  end

  # Expects two Net::HTTP.new calls. The first HTTP object produces a
  # redirect stub whose 'Location' lookup returns 'http://example.com/'; the
  # second produces a non-redirect, successful stub with code 200.
  def mock_redirect_http
    redirect_response = stub(:redirect? => true, :success? => false, :code => 404)
    redirect_response.expects(:[]).with('Location').returns('http://example.com/')
    final_response = stub(:redirect? => false, :success? => true, :code => 200)

    first_http = mock(:use_ssl= => true)
    first_http.expects(:start).
      yields(mock(:request => redirect_response)).returns(redirect_response)

    second_http = mock(:use_ssl= => true)
    second_http.expects(:start).
      yields(mock(:request => final_response)).returns(final_response)

    Net::HTTP.expects(:new).times(2).
      returns(first_http).then.returns(second_http)
  end

  # Shared example body: registers a handler for +code+ on a SpiderInstance
  # whose only URL is 'http://example.com/' (queued under the prior URL
  # 'http://foo.com/'), starts it, and checks the handler's three arguments.
  #
  # The handler only records what it was given; every expectation runs after
  # start!, so the example also fails when the handler is never invoked
  # (expectations placed inside the handler would simply not be evaluated).
  #
  # code - the key passed to SpiderInstance#on; callers in this file pass
  #        :every, :success, :failure, 200 and 404.
  def callback_arguments_on(code)
    si = SpiderInstance.new('http://foo.com/' => ['http://example.com/'])
    si.stubs(:allowed?).returns(true)
    si.stubs(:generate_next_urls).returns([])
    received = nil
    si.on(code) do |a_url, resp, prior_url|
      received = [a_url, resp, prior_url]
    end
    si.start!
    # nil here means the handler was never called.
    received.should_not be_nil
    a_url, resp, prior_url = received
    a_url.should == 'http://example.com/'
    resp.should_not be_nil
    prior_url.should == 'http://foo.com/'
  end

  # Shared example body: inside with_web_server(LoopingServlet), starts a
  # SpiderInstance at http://localhost:8888/ after handing +cacher+ to
  # check_already_seen_with.
  #
  # There is no explicit expectation here. NOTE(review): presumably the
  # example passes by start! returning instead of crawling forever — confirm
  # against LoopingServlet in the spec helper.
  #
  # cacher - the object passed to check_already_seen_with; callers in this
  #          file pass an Array and an IncludedInMemcached.
  def it_should_prevent_cycles_with(cacher)
    with_web_server(LoopingServlet) do
      si = SpiderInstance.new(nil => ['http://localhost:8888/'])
      si.check_already_seen_with cacher
      si.start!
    end
  end
end