File: wikipedia_links_to_philosophy.rb

package info (click to toggle)
ruby-mechanize 2.7.6-1%2Bdeb10u1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,480 kB
  • sloc: ruby: 11,380; makefile: 5; sh: 4
file content (159 lines) | stat: -rw-r--r-- 3,199 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
require 'mechanize'
require 'tsort'

##
# This example implements the alt-text of http://xkcd.com/903/ which states:
#
# Wikipedia trivia: if you take any article, click on the first link in the
# article text not in parentheses or italics, and then repeat, you will
# eventually end up at "Philosophy".

class WikipediaLinksToPhilosophy

  ##
  # CSS selector for article links that are direct children of a top-level
  # paragraph.  The "> p > a" child combinators skip links nested inside
  # other elements such as italics.

  ARTICLE_LINKS = '.mw-content-ltr > p > a[href^="/wiki/"]'

  ##
  # CSS selector for links in top-level list items.  Used as a fallback when
  # no paragraph link qualifies, e.g. on a disambiguation page.

  LIST_LINKS = '.mw-content-ltr > ul > li > a[href^="/wiki/"]'

  ##
  # Matches hrefs that must not be followed: namespaced pages such as
  # special pages, images and files ("/wiki/Name:...") and disambiguation
  # pages.

  SKIPPED_HREF = %r%/wiki/\w+:|\(disambiguation\)%

  ##
  # Creates the Mechanize agent and the URLs used to start a walk.

  def initialize
    @agent = Mechanize.new
    @agent.user_agent_alias = 'Mac Safari' # Wikipedia blocks "mechanize"

    # Loop detection (visited?) and the final page count both read the
    # agent's history, so it must not be truncated on long chains.
    @agent.max_history = nil

    @history = @agent.history

    @wiki_url = URI 'http://en.wikipedia.org'
    @search_url = @wiki_url + '/w/index.php'
    @random_url = @wiki_url + '/wiki/Special:Random'

    @title = nil
    @seen = nil
    @dead_end = false
  end

  ##
  # Retrieves the title of the current page by stripping the " - Wikipedia"
  # suffix from the HTML title.  Stores and returns the article title, or
  # nil when the page title is missing or does not have the expected form.

  def extract_title
    @title = @page.title.to_s[/(.*) - Wikipedia/, 1]
  end

  ##
  # Retrieves the initial page.  If +query+ is not given a random page is
  # chosen

  def fetch_first_page query
    if query then
      search query
    else
      random
    end
  end

  ##
  # The search is finished if we've seen the page before, the current page
  # has no link to follow, or we've reached Philosophy

  def finished?
    @seen || @dead_end || @title == 'Philosophy'
  end

  ##
  # Follows the first non-parenthetical, non-italic link in the main body of
  # the article.
  #
  # Prints the title of the current page first.  When the page offers no
  # link to follow the walk is marked as a dead end; when the link target
  # was already visited the walk is marked as a loop.  In both cases the
  # current page is left unchanged.

  def follow_first_link
    puts @title

    link_node = first_article_link

    unless link_node then
      # nothing to click on; let finished? stop the walk instead of crashing
      @dead_end = true
      return
    end

    # convert a Nokogiri HTML element back to a mechanize link
    link = Mechanize::Page::Link.new link_node, @agent, @page

    return if @seen = @agent.visited?(link)

    @page = link.click

    extract_title
  end

  ##
  # Is +link_node+ in an open parenthetical section?
  #
  # Only the text of the siblings that precede +link_node+ under its parent
  # is considered: the link is parenthetical when that text contains more
  # "(" than ")".

  def in_parenthetical? link_node
    preceding = link_node.parent.children.take_while do |node|
      node != link_node
    end

    preceding_text = preceding.map { |node| node.text }.join

    preceding_text.count('(') > preceding_text.count(')')
  end

  ##
  # Prints the result of the search

  def print_result
    if @seen then
      puts "[Loop detected]"
    elsif @dead_end then
      puts "[Dead end]"
    else
      puts @title
    end
    puts
    # subtract initial search or Special:Random
    puts "After #{@history.length - 1} pages"
  end

  ##
  # Retrieves a random page from wikipedia

  def random
    @page = @agent.get @random_url

    extract_title
  end

  ##
  # Entry point.  Starts from the search result for +query+ (or a random
  # page when +query+ is nil), follows first links until finished? and then
  # prints the result.

  def run query = nil
    fetch_first_page query

    follow_first_link until finished?

    print_result
  end

  ##
  # Searches for +query+ on wikipedia

  def search query
    @page = @agent.get @search_url, search: query

    extract_title
  end

  private

  ##
  # Returns the HTML node of the link to follow from the current page, or
  # nil when there is none.
  #
  # The first paragraph link that is neither skipped by SKIPPED_HREF nor
  # inside an open parenthetical wins.  Otherwise the first list-item link
  # is used (disambiguation page? try the first item in the list).

  def first_article_link
    content = @page.root

    candidates = content.css(ARTICLE_LINKS).reject do |link_node|
      link_node['href'] =~ SKIPPED_HREF || in_parenthetical?(link_node)
    end

    candidates.first || content.css(LIST_LINKS).first
  end

end

# Run a walk only when this file is executed directly.  The first
# command-line argument, if any, is used as the starting search query.
if __FILE__ == $PROGRAM_NAME
  start_query = ARGV.shift
  WikipediaLinksToPhilosophy.new.run start_query
end