"""Example: running a coroutine from a pipeline method under the asyncio reactor.

The open_spider hook wraps a coroutine in a Twisted Deferred via
deferred_from_coro; an optional command-line argument selects a custom
asyncio event loop class.
"""

from __future__ import annotations

import asyncio
import sys

from scrapy import Spider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.defer import deferred_from_coro


class UppercasePipeline:
    async def _open_spider(self, spider):
        spider.logger.info("async pipeline opened!")
        await asyncio.sleep(0.1)

    def open_spider(self, spider):
        # Wrap the coroutine in a Twisted Deferred so Scrapy can wait on it.
        return deferred_from_coro(self._open_spider(spider))

    def process_item(self, item, spider):
        return {"url": item["url"].upper()}


class UrlSpider(Spider):
    name = "url_spider"
    # A "data:" URL keeps the example self-contained: no network access needed.
    start_urls = ["data:,"]
    custom_settings = {
        # The pipeline class itself serves as the key in place of an import path.
        "ITEM_PIPELINES": {UppercasePipeline: 100},
    }

    def parse(self, response):
        yield {"url": response.url}


if __name__ == "__main__":
    # Optional first argument: import path of an asyncio event loop class.
    ASYNCIO_EVENT_LOOP: str | None
    try:
        ASYNCIO_EVENT_LOOP = sys.argv[1]
    except IndexError:
        ASYNCIO_EVENT_LOOP = None

    process = CrawlerProcess(
        settings={
            # The asyncio reactor is required for coroutine support on top of Twisted.
            "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
            "ASYNCIO_EVENT_LOOP": ASYNCIO_EVENT_LOOP,
        }
    )
    process.crawl(UrlSpider)
    process.start()
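
# Usage sketch (not part of the original listing; the script filename and the
# uvloop example are assumptions):
#
#   python url_spider.py              -> runs on the default asyncio event loop
#   python url_spider.py uvloop.Loop  -> runs on uvloop, assuming it is installed
#
# ASYNCIO_EVENT_LOOP takes the import path of an event loop class; reading it
# from argv[1] is just this example's convention.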