import asyncio
import sys

from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.defer import deferred_from_coro
from twisted.internet.defer import Deferred


class UppercasePipeline:
    async def _open_spider(self, spider):
        spider.logger.info("async pipeline opened!")
        await asyncio.sleep(0.1)

    def open_spider(self, spider) -> Deferred:
        # open_spider() does not support coroutine syntax directly, so
        # wrap the coroutine in a Deferred that Scrapy can wait on.
        return deferred_from_coro(self._open_spider(spider))

    def process_item(self, item, spider):
        return {"url": item["url"].upper()}


class UrlSpider(Spider):
    name = "url_spider"
    # An empty data: URL, so the example runs without network access.
    start_urls = ["data:,"]
    custom_settings = {
        "ITEM_PIPELINES": {UppercasePipeline: 100},
    }

    def parse(self, response):
        yield {"url": response.url}


if __name__ == "__main__":
    # Optionally take an event loop class import path from the command
    # line; with no argument, the default asyncio event loop is used.
    try:
        ASYNCIO_EVENT_LOOP = sys.argv[1]
    except IndexError:
        ASYNCIO_EVENT_LOOP = None

    process = CrawlerProcess(
        settings={
            # The asyncio reactor is required for coroutine integration.
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            "ASYNCIO_EVENT_LOOP": ASYNCIO_EVENT_LOOP,
        }
    )
    process.crawl(UrlSpider)
    process.start()
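A note on running it: with the script saved as, say, asyncio_pipeline.py (the filename here is only illustrative), python asyncio_pipeline.py uses the default asyncio event loop, while passing an event loop import path as the first argument, for example python asyncio_pipeline.py uvloop.Loop, makes Scrapy install that loop class via the ASYNCIO_EVENT_LOOP setting (uvloop must be installed for that particular example).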
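For comparison, process_item() itself can be declared with coroutine syntax in Scrapy 2.0+, so awaitable per-item work needs no deferred_from_coro() wrapping there; the explicit wrapping above is only needed for methods like open_spider() that lack native coroutine support. A minimal sketch, with an illustrative pipeline name:

import asyncio


class AsyncUppercasePipeline:
    # process_item() supports coroutine syntax natively (with the
    # asyncio reactor enabled, as in the script above), so awaitable
    # work can happen inline before returning the item.
    async def process_item(self, item, spider):
        await asyncio.sleep(0.1)  # stand-in for real async work
        return {"url": item["url"].upper()}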