File: downloadMSOfficeDocuments.pl

package info (click to toggle)
calligra 1%3A2.9.11%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 189,332 kB
  • sloc: cpp: 919,806; xml: 27,759; ansic: 10,472; python: 8,190; perl: 2,724; yacc: 2,557; sh: 1,675; lex: 1,431; java: 1,304; sql: 903; ruby: 734; makefile: 48
file content (145 lines) | stat: -rwxr-xr-x 3,847 bytes parent folder | download | duplicates (11)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#! /usr/bin/env perl
use warnings;
use strict;
use File::Path;
use LWP::UserAgent;
use HTML::LinkExtor;
use URI;
use URI::URL;
use URI::Escape;

# This script downloads MS Office files for you. You should provide a search
# term which is used to get a list of documents.
# To download 20 presentations about pears do:
#    downloadMSOfficeDocuments.pl 20 pear ppt
#
# Copyright 2009 Jos van den Oever <jos@vandenoever.info>

# Command line: exactly three arguments are required. The first must be a
# non-negative integer and the second must consist of word characters only.
die "Usage: $0 number term type\n"
	unless @ARGV == 3
	&& $ARGV[0] =~ m/^\d+$/
	&& $ARGV[1] =~ m/^\w+$/;

# $maxresults bounds the search result offsets that are requested, $term is
# the search term and $type the file extension to look for.
my ($maxresults, $term, $type) = @ARGV;

# Upper limit for the number of child processes running at the same time.
my $maxjobs = 10;

# Supported file extensions and the MIME type expected for each of them.
my %mimetypes = (
	ppt  => "application/vnd.ms-powerpoint",
	pdf  => "application/pdf",
	pptx => "application/vnd.openxmlformats-officedocument.presentationml.presentation",
	xlsx => "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
	docx => "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
	doc  => "application/msword",
	xls  => "application/vnd.ms-excel",
	rtf  => "application/rtf",
	ods  => "application/vnd.oasis.opendocument.spreadsheet",
	odt  => "application/vnd.oasis.opendocument.text",
	odp  => "application/vnd.oasis.opendocument.presentation",
);

# MIME type a document must report to be downloaded; unknown extensions are
# rejected right away.
my $mimetype = $mimetypes{$type};
die "Unknown type '$type'.\n" unless defined $mimetype;

# User-Agent header value sent with every request made by this script.
my $agentstring = 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1)';

# User agent used to dispatch web requests: 10 second timeout, proxy settings
# read from the environment.
my $ua = LWP::UserAgent->new(
	agent     => $agentstring,
	timeout   => 10, # seconds
	env_proxy => 1,
);
# Link attribute values collected from the document currently being parsed.
my @pages;

# callback($tag, %attr)
# Link handler handed to HTML::LinkExtor. Every tag other than <a> is
# ignored; for an <a> tag all reported attribute values are appended to
# @pages.
sub callback {
	my $tag = shift;
	my %attr = @_;
	return unless $tag eq 'a';
	push @pages, values %attr;
}
# Parser that extracts links from an HTML document; it is constructed with
# callback() as its link handler.
my $p = HTML::LinkExtor->new(\&callback);

# Number of child processes that were forked and have not been reaped yet.
my $runningjobs = 0;

# startJob($uri)
# Forks a child process that checks one candidate URI and downloads it when it
# has the wanted MIME type.
#
#   $uri - the URI to check.
#
# The parent returns as soon as the child is started. When $maxjobs children
# are already running, the parent first blocks until one of them has exited.
# If the fork fails a warning is printed and the URI is skipped.
sub startJob {
	my $uri = shift;

	# At the limit: reap one finished child before starting another one.
	if ($runningjobs >= $maxjobs) {
		wait;
		$runningjobs--;
	}

	my $pid = fork();
	if (!defined $pid) {
		# fork returns undef on failure. Without this check the parent would
		# fall into the child branch below and exit.
		warn "Could not fork to check $uri: $!\n";
		return;
	}
	if ($pid) {
		# Parent: count the child only once it really exists.
		$runningjobs++;
		return;
	}

	# Child: works with its own user agent and always ends in exit.
	my $localua = LWP::UserAgent->new;
	$localua->timeout(10); # seconds
	$localua->env_proxy;
	$localua->agent($agentstring);

	# A HEAD request is enough to learn the content type without fetching
	# the document itself.
	my $res = $localua->request(HTTP::Request->new(HEAD => $uri));
	if ($res->content_type() eq $mimetype) {
		# The file name is the URI without its scheme, with characters that
		# are awkward in file names percent-escaped. https is stripped as
		# well because addJob() queues both http and https URIs.
		my $filename = uri_unescape($uri);
		$filename =~ s#^https?://##;
		$filename = uri_escape($filename, '/:\!&*$?;:= \'"');
		print $uri."\n";
		$localua->get($uri, ':content_file'  => $filename);
	}
	exit;
}

# URIs that were accepted but have not been handed to startJob() yet.
my @jobs;
# URIs that were accepted before, so that none is processed twice.
my %done;

# addJob($uri)
# Queues a URI object for checking unless it was seen before or its scheme is
# neither http nor https. Queued jobs are then started for as long as fewer
# than $maxjobs jobs are counted as running.
sub addJob {
	my ($uri) = @_;
	my $scheme = $uri->scheme;

	return if exists $done{$uri};
	return unless $scheme eq "http" || $scheme eq "https";

	$done{$uri} = 1;
	push @jobs, $uri;

	# Most recently queued URI first, as long as there is a free job slot.
	startJob(pop @jobs) while @jobs && $runningjobs < $maxjobs;
	return;
}

# Query the search engine one result page (up to 100 hits) at a time and pass
# every link found on it to addJob(). Links whose host matches "google" are
# not queued themselves; each element of their query string is tried as a
# target URL instead.
for (my $start = 0; $start < $maxresults; $start += 100) {
	if ($start > 0) {
		sleep 3; # do not query search engine too often
	}
	@pages = ();
	my $base = "http://www.google.com/";
	my $url = $base . "search?q=$term+filetype:$type&start=$start&num=100";
	my $res = $ua->request(HTTP::Request->new(GET => $url), sub {$p->parse($_[0])});
	# Signal the end of this document so the parser flushes what it still
	# buffers and nothing leaks into the next result page.
	$p->eof;
	if (!$res->is_success) {
		warn "Search request for $url failed: " . $res->status_line . "\n";
	}
	foreach my $link (@pages) {
		my $uri = URI->new_abs($link, $base);
		# Not every URI class provides host() (mailto: and javascript: links
		# do not), so skip such links instead of dying on the method call.
		my $host = $uri->can('host') ? $uri->host : undef;
		next if !defined $host;
		if ($host !~ m/google/) {
			addJob($uri);
			next;
		}
		# query_form returns a flat list of keys and values. Each element is
		# tried as a URL; only http(s) URLs outside the search engine are kept.
		foreach my $param ($uri->query_form) {
			my $target = URI->new_abs($param, $base);
			my $scheme = $target->scheme;
			next unless defined $scheme
				&& ($scheme eq "http" || $scheme eq "https");
			my $targethost = $target->host;
			next if !defined $targethost || $targethost =~ m/google/;
			addJob($target);
		}
	}
}

# Start everything that is still queued; startJob() blocks whenever the job
# limit is reached.
startJob(pop @jobs) while @jobs;

# Reap the children: wait returns -1 once no child process is left.
my $pid;
while (1) {
	$pid = wait;
	last if $pid == -1;
}