1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159
|
require 'mechanize'
require 'tsort'
##
# This example implements the alt-text of http://xkcd.com/903/ which states:
#
# Wikipedia trivia: if you take any article, click on the first link in the
# article text not in parentheses or italics, and then repeat, you will
# eventually end up at "Philosophy".
class WikipediaLinksToPhilosophy

  ##
  # Matches hrefs that must not be followed: namespaced pages such as
  # "/wiki/File:..." or "/wiki/Special:...", and disambiguation pages.

  SKIPPED_HREF = %r{/wiki/\w+:|\(disambiguation\)}

  ##
  # Creates the Mechanize agent and the URLs used for searching and for
  # fetching a random article.

  def initialize
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari' # Wikipedia blocks "mechanize"

    @history = @agent.history

    @wiki_url   = URI 'http://en.wikipedia.org'
    @search_url = @wiki_url + '/w/index.php'
    @random_url = @wiki_url + '/wiki/Special:Random'

    @title    = nil
    @seen     = nil
    @dead_end = false
  end

  ##
  # Stores and returns the title of the current page, i.e. the part of the
  # HTML title preceding " - Wikipedia". The result is nil when the page
  # title is missing or does not have that form.

  def extract_title
    @title = @page.title.to_s[/(.*) - Wikipedia/, 1]
  end

  ##
  # Retrieves the initial page. If +query+ is not given a random page is
  # chosen.

  def fetch_first_page(query)
    if query
      search query
    else
      random
    end
  end

  ##
  # The search is finished if we've seen the page before, the current page
  # offers no link to follow, or we've reached Philosophy.

  def finished?
    @seen || @dead_end || @title == 'Philosophy'
  end

  ##
  # Prints the current title, then follows the first non-parenthetical,
  # non-italic link in the main body of the article.
  #
  # When the page has no such link and no list link to fall back on, the
  # crawl is marked as a dead end instead of trying to click a missing link.
  # When the link target was already visited, the loop is recorded in @seen
  # and the link is not followed.

  def follow_first_link
    puts @title

    link_node = first_article_link || first_list_link

    unless link_node
      @dead_end = true
      return
    end

    # convert a Nokogiri HTML element back to a mechanize link
    link = Mechanize::Page::Link.new link_node, @agent, @page

    @seen = @agent.visited?(link)
    return if @seen

    @page = link.click

    extract_title
  end

  ##
  # Is +link_node+ in an open parenthetical section?
  #
  # Only the text of the siblings that precede +link_node+ under its parent
  # is examined: the link counts as parenthetical when that text contains
  # more "(" than ")".

  def in_parenthetical?(link_node)
    preceding = link_node.parent.children.take_while do |node|
      node != link_node
    end

    preceding_text = preceding.map { |node| node.text }.join

    preceding_text.count('(') > preceding_text.count(')')
  end

  ##
  # Prints the result of the search followed by the number of pages
  # visited.

  def print_result
    if @seen
      puts "[Loop detected]"
    elsif @dead_end
      puts "[Dead end]"
    else
      puts @title
    end

    puts

    # subtract initial search or Special:Random
    puts "After #{@agent.history.length - 1} pages"
  end

  ##
  # Retrieves a random page from wikipedia

  def random
    @page = @agent.get @random_url

    extract_title
  end

  ##
  # Entry point. Starts from the search result for +query+, or from a random
  # page when +query+ is nil, and follows links until finished.

  def run(query = nil)
    fetch_first_page query

    follow_first_link until finished?

    print_result
  end

  ##
  # Searches for +query+ on wikipedia

  def search(query)
    @page = @agent.get @search_url, search: query

    extract_title
  end

  private

  ##
  # Returns the first link that is a direct child of a body paragraph
  # (> p > a rejects italics), is not a special page, image, file or
  # disambiguation page, and is not inside an open parenthetical. Returns
  # nil when there is none.

  def first_article_link
    candidates = @page.root.css('.mw-content-ltr > p > a[href^="/wiki/"]')

    candidates.find do |link_node|
      link_node['href'] !~ SKIPPED_HREF && !in_parenthetical?(link_node)
    end
  end

  ##
  # Disambiguation page? Returns the first item in the list, or nil when
  # the page has no such list link.

  def first_list_link
    @page.root.css('.mw-content-ltr > ul > li > a[href^="/wiki/"]').first
  end

end
# Run the crawl only when this file is executed directly; the optional first
# command-line argument is used as the search query.
if __FILE__ == $PROGRAM_NAME
  WikipediaLinksToPhilosophy.new.run(ARGV.shift)
end
|