File: spiderctl.py

package info: python-scrapy 0.8-3
  • area: main
  • in suites: squeeze
  • size: 2,904 kB
  • sloc: python: 15,349; xml: 199; makefile: 68; sql: 64; sh: 34
file content: 155 lines | 5,781 bytes
"""
Extensions for allowing spider control from web console
"""

from scrapy.xlib.pydispatch import dispatcher

from scrapy.core import signals
from scrapy.core.manager import scrapymanager
from scrapy.core.engine import scrapyengine
from scrapy.spider import spiders
from scrapy.management.web import banner
from scrapy.conf import settings

class Spiderctl(object):
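    """Web console module: renders a control panel for scheduling, stopping
    and re-running spiders, and applies the actions submitted from it."""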
    webconsole_id = 'spiderctl'
    webconsole_name = 'Spider control panel'

    def __init__(self):
        self.running = {}
        self.finished = set()
        self.enabled_domains = []  # populated by webconsole_discover_module
        dispatcher.connect(self.spider_opened, signal=signals.spider_opened)
        dispatcher.connect(self.spider_closed, signal=signals.spider_closed)

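        # register with the web console so this panel is discovered at startup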
        from scrapy.management.web import webconsole_discover_module
        dispatcher.connect(self.webconsole_discover_module, signal=webconsole_discover_module)

    def spider_opened(self, spider):
        self.running[spider.domain_name] = spider

    def spider_closed(self, spider):
        del self.running[spider.domain_name]
        self.finished.add(spider.domain_name)

    def webconsole_render(self, wc_request):
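        """Render the control panel: one column each for idle, scheduled,
        running and finished spiders, plus a summary of any actions that
        were just applied."""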
        if wc_request.args:
            changes = self.webconsole_control(wc_request)

        self.scheduled = [s.domain_name for s in scrapyengine.spider_scheduler._pending_spiders]
        self.idle = [d for d in self.enabled_domains if d not in self.scheduled
                                                        and d not in self.running
                                                        and d not in self.finished]

        s = banner(self)
        s += '<table border="1">\n'
        s += "<tr><th>Idle (%d)</th><th>Scheduled (%d)</th><th>Running (%d/%d)</th><th>Finished (%d)</th></tr>\n" % \
                (len(self.idle),
                 len(self.scheduled),
                 len(self.running),
                 settings['CONCURRENT_SPIDERS'],
                 len(self.finished))
        s += "<tr>\n"

        # idle
        s += "<td valign='top'>\n"
        s += '<form method="post" action=".">\n'
        s += '<select name="add_pending_domains" multiple="multiple">\n'
        for domain in sorted(self.idle):
            s += "<option>%s</option>\n" % domain
        s += '</select><br>\n'
        s += '<br />'
        s += '<input type="submit" value="Schedule selected">\n'
        s += '</form>\n'
        s += "</td>\n"

        # scheduled
        s += "<td valign='top'>\n"
        s += '<form method="post" action=".">\n'
        s += '<select name="remove_pending_domains" multiple="multiple">\n'
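        # listed in scheduler order (unlike the other columns, which are sorted)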
        for domain in self.scheduled:
            s += "<option>%s</option>\n" % domain
        s += '</select><br>\n'
        s += '<br />'
        s += '<input type="submit" value="Remove selected">\n'
        s += '</form>\n'
        s += "</td>\n"

        # running
        s += "<td valign='top'>\n"
        s += '<form method="post" action=".">\n'
        s += '<select name="stop_running_domains" multiple="multiple">\n'
        for domain in sorted(self.running):
            s += "<option>%s</option>\n" % domain
        s += '</select><br>\n'
        s += '<br />'
        s += '<input type="submit" value="Stop selected">\n'
        s += '</form>\n'
        s += "</td>\n"

        # finished
        s += "<td valign='top'>\n"
        s += '<form method="post" action=".">\n'
        s += '<select name="rerun_finished_domains" multiple="multiple">\n'
        for domain in sorted(self.finished):
            s += "<option>%s</option>\n" % domain
        s += '</select><br>\n'
        s += '<br />'
        s += '<input type="submit" value="Re-schedule selected">\n'
        s += '</form>\n'
        s += "</td>\n"

        s += "</tr>\n"
        s += "</table>\n"

        if wc_request.args:
            s += changes

        s += "</body>\n"
        s += "</html>\n"

        return s

    def webconsole_control(self, wc_request):
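        """Apply the actions submitted from the control panel form and
        return an HTML fragment summarizing what was done."""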
        args = wc_request.args
        s = "<hr />\n"

        if "stop_running_domains" in args:
            s += "<p>"
            stopped_domains = []
            for domain in args["stop_running_domains"]:
                if domain in self.running:
                    scrapyengine.close_spider(self.running[domain])
                    stopped_domains.append(domain)
            s += "Stopped spiders: <ul><li>%s</li></ul>" % "</li><li>".join(stopped_domains)
            s += "</p>"
        if "remove_pending_domains" in args:
            removed = []
            for domain in args["remove_pending_domains"]:
                if scrapyengine.spider_scheduler.remove_pending_domain(domain):
                    removed.append(domain)
            if removed:
                s += "<p>"
                s += "Removed scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["remove_pending_domains"])
                s += "</p>"
        if "add_pending_domains" in args:
            for domain in args["add_pending_domains"]:
                if domain not in scrapyengine.scheduler.pending_requests:
                    scrapymanager.crawl(domain)
            s += "<p>"
            s += "Scheduled spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["add_pending_domains"])
            s += "</p>"
        if "rerun_finished_domains" in args:
            for domain in args["rerun_finished_domains"]:
                if domain not in scrapyengine.scheduler.pending_requests:
                    scrapymanager.crawl(domain)
                self.finished.remove(domain)
            s += "<p>"
            s += "Re-scheduled finished spiders: <ul><li>%s</li></ul>" % "</li><li>".join(args["rerun_finished_domains"])
            s += "</p>"

        return s
        
    def webconsole_discover_module(self):
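        """Handler for the webconsole_discover_module signal: cache the list
        of enabled spider domains and return this instance so the web console
        registers it as a panel."""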
        self.enabled_domains = spiders.list()
        return self
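
# Rough illustration of the control flow (assumption: wc_request behaves like
# a Twisted web request, whose .args maps form field names to lists of
# values).  FakeRequest and the domain are made up for the example:
#
#     class FakeRequest(object):
#         args = {"add_pending_domains": ["example.com"]}
#
#     panel = Spiderctl()
#     panel.enabled_domains = ["example.com"]
#     html = panel.webconsole_control(FakeRequest())  # schedules the crawl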