File: wikicrawl.pl

#!/usr/bin/perl 

use strict;
use Graph::Easy;
use LWP;
use HTML::TokeParser;
use utf8;
use Getopt::Long;
use Encode;
use Data::Dumper;

my $VERSION = 0.03;

# things that shouldn't be looked at
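# (namespace prefixes for the English, German and French Wikipedias)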
my %bad = map { $_ => 1 } qw/
  Wikipedia Image Talk Help Template Portal Special User Category
  Wikipedia Bild Diskussion Hilfe Vorlage Portal Spezial Benutzer Kategorie
  Wikipédia Image Discuter Modèle Mod%C3%A9le Aide Utilisateur Catégorie Cat%C3%A9gorie
  /;
# do not crawl these:
my $skip = qr/\((disambiguation|Begriffsklärung|Homonymie)\)/i;
# to figure out redirections
my $redir = qr/(Weitergeleitet von|Redirected from|Redirig. depuis).*?title="(.*?)"/i;

# the default settings are defined in get_options()
# option handling
my $help_requested = 0; $help_requested = 1 if @ARGV == 0;

my $opt = get_options();

# error?
$help_requested = 1 if !ref($opt);

# no error and --help was specified
$help_requested = 2 if ref($opt) && $opt->{help} ne '';

my $copyright = "wikicrawl v$VERSION  (c) by Tels 2008.  "
        	."Released under the GPL 2.0 or later.\n\n"
        	."After a very cool idea by 'integral' on forum.xkcd.com. Thanx! :)\n\n";

if (ref($opt) && $opt->{version} != 0)
  {
  print $copyright;
  print "Running under Perl v$].\n\n";
  exit 2;
  }

if ($help_requested > 0)
  {
  print STDERR $copyright;
  require Pod::Usage;
  if ($help_requested > 1 && $Pod::Usage::VERSION < 1.35)
    {
    # The way old Pod::Usage executes "perldoc" might fail:
    system('perldoc', $0);
    exit 2;
    }
  Pod::Usage::pod2usage( { -exitval => 2, -verbose => $help_requested } );
  }

my $verbose = $opt->{verbose};

output ($copyright);

my $graph = Graph::Easy->new();
# set some default attributes on the graph
$graph->set_attribute('node','shape',$opt->{nodeshape});
$graph->set_attribute('node','font-size','80%');
$graph->set_attribute('edge','arrowstyle','filled');
$graph->set_attribute('graph','label',"Wikipedia map for $opt->{root}");
$graph->set_attribute('graph','font-size', '200%');
$graph->set_attribute('graph','comment', "Created with wikicrawl.pl v$VERSION");

output ("Using the following settings:\n");
print Data::Dumper->Dump([$opt], ['opt']);

# don't crawl stuff twice
my %visitedLinks;
# re-use the UserAgent object
my $ua = LWP::UserAgent->new();
#$ua->agent("WikiCrawl/$VERSION - " . $ua->_agent . " - vGraph::Easy $Graph::Easy::VERSION");

# count how many we have done
my $nodes = 0;

# enable UTF-8 output
binmode STDERR, ':utf8';
binmode STDOUT, ':utf8';

# push the first node onto the todo queue
my @todo = [$opt->{root},0];
# and work on it (this will take one off and then push more nodes on it)
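# crawl() returns a false value once the depth or node limit is reached, which stops the loop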
while (@todo && crawl()) { };

my $file = "wikicrawl-$opt->{lang}.txt";
output ("Generating $file:\n");
open(my $DATA, ">", "$file") or die("Could not write to '$file': $!");
binmode ($DATA,':utf8');
print $DATA $graph->as_txt();
close $DATA;
output ("All done.\n");

my $png = $file; $png =~ s/\.txt$/.png/;

output ("Generating $png:\n");
`perl -Ilib bin/graph-easy --png --renderer=$opt->{renderer} $file`;

output ("All done.\n");

########################################################################################

# main crawl routine
sub crawl {
  no warnings 'recursion';

  # all done?
  return if @todo == 0;
  my ($name,$depth) = ($todo[0]->[0],$todo[0]->[1]);
  shift @todo;
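  # @todo is used as a FIFO queue (new links are pushed at the back below),
  # so the crawl proceeds breadth-first.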

  my $page = "http://$opt->{lang}.wikipedia.org/wiki/$name";

  # limit depth
  return if $depth + 1 > $opt->{maxdepth};
  # already crawled the maximum number of nodes?
  return if $opt->{maxnodes} > 0 && $nodes > $opt->{maxnodes};
  # skip this page
  return 1 if exists $visitedLinks{$page};

  # crawl page
  my $res = $ua->request(HTTP::Request->new(GET => $page));
  return 1 unless $res->is_success();

  # remove the " - Wikipedia" (en) or " – Wikipedia" (de) from the title
  my $title = decode('utf8',$res->title);	# decode the UTF-8 bytes into Perl characters
  $title =~ s/ [–-] Wikip[ée]dia.*//;
  return 1 if $title =~ $skip;			# no disambiguation pages

  # tels: not sure when/why these happen:
  print STDERR "# $title ",$res->title()," $page\n" if $title eq '';

  output ("Crawling node #$nodes '$title' at depth $depth\n"); $nodes++;

  # set flag
  $visitedLinks{$page} = undef;
  my $content = $res->content;

  # parse anchors from the raw (undecoded) content
  my $parser = HTML::TokeParser->new(\$content) or die("Could not parse page.");

  # handle redirects: match against a decoded copy and only use $2 if the match succeeded
  my $decoded = decode('utf-8', $content);
  my $old; $old = $2 if $decoded =~ $redir;

  if ($old)
    {
    output (" Redirected to '$title' from '$old'\n");
    # find the node named "$old" (at the same time adding it if it didn't exist yet)
    my $source = $graph->add_node($old);
    # and mention the redirect in the label
    $source->set_attribute('label', "$old\\n($title)");
    # now force edges to come from that node
    $title = $old; 
    }

  # iterate over all links
  for(my $i = 0; (my $token = $parser->get_tag("a")) && ($i < $opt->{maxspread} || $opt->{maxspread} == 0);)
    {
    my $url = $token->[1]{href};
    my $alt = $token->[1]{title};

    next unless defined $url && defined $alt;	# skip anchors without an href or title
    # we do not crawl these:
    next if $url !~ m/^\/wiki\//;	 	# no pages outside of wikipedia
    next if $alt =~ $skip;			# no disambiguation pages
    next if $alt =~ m/\[/;			# no brackets

    my @chunks = split ":", substr(decode('utf-8',$url), 6);	# extract special pages, if any
    next if exists $bad{$chunks[0]};		# no bad pages

    $i++;
    if ($title ne $alt)
      {
      output (" Adding link from '$title' to '$alt'\n", 1);
      my ($from,$to,$edge) = $graph->add_edge_once($title,$alt);
      if (defined $to)
        {
        my $old_depth = $to->raw_attribute('rank');
        if (!$old_depth)
          {
          my $color = sprintf("%i", (360 / $opt->{maxdepth}) * ($depth));
          $to->set_attribute('fill', 'hsl(' . $color . ',1,0.7)');
          # store rank
          $to->set_attribute('rank', $depth+1);
          }
        }
      }
    my $u = $url; $u =~ s/^\/wiki\///;
    push @todo, [$u,$depth+1];
    }

  # continue
  return 1;
  }

sub get_options
  {
  my $opt = {};
  $opt->{help} = '';
  $opt->{version} = 0;
  # max depth to crawl
  $opt->{maxdepth} = 4;
  # max number of links per node
  $opt->{maxspread} = 5;
  # stop after so many nodes, -1 to disable
  $opt->{maxnodes} = -1;
  # language
  $opt->{lang} = 'en';
  # root node
  $opt->{root} = 'Xkcd';
  $opt->{renderer} = 'neato';
  $opt->{nodeshape} = 'rect';
  # be quiet unless --verbose is given
  $opt->{verbose} = 0;
  my @o = (
    "language=s" => \$opt->{lang},
    "root=s" => \$opt->{root},
    "maxdepth=i" => \$opt->{maxdepth},
    "maxspread=i" => \$opt->{maxspread},
    "maxnodes=i" => \$opt->{maxnodes},
    "version" => \$opt->{version},
    "help|?" => \$opt->{help},
    "verbose" => \$opt->{verbose},
    "nodeshape" => \$opt->{nodeshape},
    );
  return unless Getopt::Long::GetOptions (@o);
  $opt;
  }

sub output
  {
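  # Print a status message to STDERR; level 0 (the default) is always
  # printed, higher levels only when --verbose is given.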
  my ($txt, $level) = @_;

  $level ||= 0;

  print STDERR $txt if $opt->{verbose} || $level == 0;
  }

=pod

=head1 NAME

wikicrawl - crawl Wikipedia and generate a graph from the article links found

=head1 SYNOPSIS

Crawl Wikipedia and create a L<Graph::Easy> text file describing the
inter-article links found during the crawl.

At least one argument must be given to start:

	perl examples/wikicrawl.pl --lang=fr

=head1 ARGUMENTS

Here are the options:

=over 12

=item --help

Print the full documentation, not just this short overview.

=item --version

Write version info and exit.

=item --language

Select the language edition of Wikipedia to crawl. Currently supported
are 'de', 'en' and 'fr'. Default is 'en'.

=item --root

Set the root node where the crawl should start. Default is of course 'Xkcd'.

=item --maxdepth

The maximum depth to which the crawl should go. Please choose a small value, below 10. Default is 4.

=item --maxspread

The maximum number of links to follow per article. Please choose a small value, below 10. Default is 5.

=item --maxnodes

The maximum number of nodes we crawl. Set to -1 (default) to disable.

=back
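
For example, a small French-language crawl that stops after a few dozen
nodes could be started like this (the root article name is only an
illustration):

	perl examples/wikicrawl.pl --language=fr --root=Astronomie \
		--maxdepth=3 --maxspread=4 --maxnodes=40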

=head1 SEE ALSO

L<http://forums.xkcd.com/viewtopic.php?f=2&t=21300&p=672184> and
L<Graph::Easy>.

=head1 LICENSE

This library is free software; you can redistribute it and/or modify
it under the terms of the GPL.

See the LICENSE file of Graph::Easy for a copy of the GPL.

X<license>

=head1 AUTHOR

Copyright (C) 2008 by integral L<forum.xkcd.com>
Copyright (C) 2008 by Tels L<http://bloodgate.com>

=cut