File: spider.rb

package info (click to toggle)
ruby-mechanize 2.7.6-1%2Bdeb10u1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,480 kB
  • sloc: ruby: 11,380; makefile: 5; sh: 4
file content (22 lines) | stat: -rw-r--r-- 469 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
require 'rubygems'
require 'mechanize'

agent = Mechanize.new
agent.max_history = nil # unlimited history
stack = agent.get(ARGV[0]).links

while l = stack.pop
  next unless l.uri
  host = l.uri.host
  next unless host.nil? or host == agent.history.first.uri.host
  next if agent.visited? l.href

  puts "crawling #{l.uri}"
  begin
    page = l.click
    next unless Mechanize::Page === page
    stack.push(*page.links)
  rescue Mechanize::ResponseCodeError
  end
end