1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145
|
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
# SPDX-FileCopyrightText: © 2019 Daniel Ring
from __future__ import annotations
import json
import re
from contextlib import suppress
from re import compile
from .. import util
from ..helpers import indirectStarter
from ..scraper import BasicScraper, ParserScraper
from ..util import tagre
from .common import ComicControlScraper, WordPressNavi, WordPressScraper
class UberQuest(ParserScraper):
    """Scraper for UberQuest that reads strip data from the site's JSON endpoints."""

    baseUrl = 'https://uberquest.studiokhimera.com/'
    url = baseUrl + 'wp-json/keeros_comics/v1/chapters'
    stripUrl = baseUrl + 'wp-json/wp/v2/cfx_comic_page?page_number=%s'
    firstStripUrl = stripUrl % 'cover'

    def starter(self):
        """Return the strip URL for the last page of the last listed chapter."""
        # Retrieve comic metadata from API
        response = self.session.get(self.url)
        response.raise_for_status()
        chapters = response.json()
        newest_page = chapters[-1]['pages'][-1]
        return self.stripUrl % newest_page['page_number']

    def getPrevUrl(self, url, data):
        """Build the previous strip URL from ``prev_id`` of the first JSON record."""
        record = json.loads(data.text_content())[0]
        return self.stripUrl % record['prev_id']

    def extract_image_urls(self, url, data):
        """Return a one-element list with the first record's ``attachment`` value."""
        record = json.loads(data.text_content())[0]
        return [record['attachment']]

    def namer(self, imageUrl, pageUrl):
        """Name the file after whatever follows the last ``=`` in the page URL."""
        _, _, page_id = pageUrl.rpartition('=')
        return 'UberQuest-' + page_id
class Underling(WordPressNavi):
    """Underling, read from a web.archive.org snapshot and flagged end-of-life."""

    # Archived snapshot URL wrapping the original site address
    url = 'https://web.archive.org/web/20190806120425/http://underlingcomic.com/'
    firstStripUrl = url + 'page-one/'
    endOfLife = True
class Undertow(BasicScraper):
    """Regex-configured scraper for Undertow (indirectStarter plus latestSearch)."""

    url = 'http://undertow.dreamshards.org/'
    starter = indirectStarter
    # Link preceding the "Most recent page" text
    latestSearch = re.compile(r'href="(.+?)".+?Most recent page')
    # Link preceding the "teynpoint" marker in the page source
    prevSearch = re.compile(r'href="(.+?)".+?teynpoint')
    # Any <img> whose src ends in .jpg
    imageSearch = re.compile(tagre("img", "src", r'([^"]+\.jpg)'))
class unDivine(ComicControlScraper):
    """Scraper for unDivine with filename clean-up for irregularly named images."""

    url = 'https://www.undivinecomic.com/'
    stripUrl = url + 'comic/%s'
    firstStripUrl = stripUrl % 'page-1'

    def namer(self, image_url, page_url):
        """Derive a tidy filename from the last path segment of ``image_url``."""
        # Fix inconsistent filenames
        # Applied in this order: (UUID-style name, replacement)
        renames = (
            ('10B311D9-0992-4D74-AEB8-DAB714DA67C6', 'UD-322'),
            ('99266624-7EF7-4E99-9EC9-DDB5F59CBDFD', 'UD-311'),
            ('33C6A5A1-F703-4A0A-BCD5-DE1A09359D8E', 'UD-310'),
            ('6CE01E81-C299-43C7-A221-8DE0670EFA30', 'ch4endbonusq4'),
            ('DB66D93B-1FE5-49C7-90E0-FFF981DCD6B3', 'bipolar'),
        )
        name = util.urlpathsplit(image_url)[-1].replace(' ', '-')
        for odd_name, fixed_name in renames:
            name = name.replace(odd_name, fixed_name)
        # Names longer than 15 characters that start with a digit and carry a
        # '-' at index 10 lose their first 11 characters.
        has_dashed_prefix = len(name) > 15 and name[0].isdigit() and name[10] == '-'
        return name[11:] if has_dashed_prefix else name
class UnicornJelly(BasicScraper):
    """Regex-configured scraper for Unicorn Jelly; strips are addressed as uniNNN.html."""

    baseUrl = 'http://unicornjelly.com/'
    url = baseUrl + 'uni666.html'
    stripUrl = baseUrl + 'uni%s.html'
    firstStripUrl = stripUrl % '001'
    # Image following the closing </TABLE>, optionally wrapped in a FONT tag
    imageSearch = re.compile(
        r'</TABLE>(?:<FONT COLOR="BLACK">)?<IMG SRC="(images/[^"]+)" WIDTH=')
    # Anchor whose content is the back00.gif button
    prevSearch = re.compile(
        r'<A HREF="(uni\d{3}[bcs]?\.html)">(<FONT COLOR="BLACK">)?<IMG SRC="images/back00\.gif"')
    help = 'Index format: nnn'
class Unsounded(ParserScraper):
    """Scraper for Unsounded; one strip page may contribute several images."""

    url = 'https://www.casualvillain.com/Unsounded/'
    startUrl = url + 'comic+index/'
    stripUrl = url + 'comic/ch%s/ch%s_%s.html'
    firstStripUrl = stripUrl % ('01', '01', '01')
    imageSearch = '//div[@id="comic"]//img'
    prevSearch = '//a[d:class("back")]'
    latestSearch = '//div[@id="chapter_box"][1]//a[last()]'
    multipleImagesPerStrip = True
    starter = indirectStarter
    # Captures the url(...) argument of an inline background-image under pageart/
    style_bg_regex = re.compile(r'background-image: url\((.*pageart/.*)\)')
    help = 'Index format: chapter-page'

    def extract_image_urls(self, url, data):
        """Collect image URLs from the parent lookup plus any CSS background.

        Raises ValueError when neither source yields a URL.
        """
        collected = []
        try:
            collected.extend(super().extract_image_urls(url, data))
        except ValueError:
            # Deliberately ignored: the CSS background below may still supply an image
            pass
        # Include background for multi-image pages
        background = self.extract_css_bg(data)
        if background:
            collected.append(background)
        if collected:
            return collected
        raise ValueError(f'No comic found at {url!r}')

    def extract_css_bg(self, page) -> str | None:
        """Return the background-image URL from the comic div's inline style, if any."""
        containers = self.match(page, '//div[@id="comic"]')
        if not containers:
            return None
        inline_style = containers[0].attrib.get('style')
        if not inline_style:
            return None
        found = self.style_bg_regex.search(inline_style)
        return found.group(1) if found else None

    def namer(self, image_url, page_url):
        """Name the image, prefixing the page's leading part when the names differ."""
        image_name = util.urlpathsplit(image_url)[-1]
        page_name = util.urlpathsplit(page_url)[-1]
        # Compare the parts before the first '.' of each name
        if page_name.partition('.')[0] == image_name.partition('.')[0]:
            return image_name
        # Prefix with the page name's part before its first '_'
        return page_name.partition('_')[0] + '_' + image_name

    def getPrevUrl(self, url, data):
        """Return the previous strip URL, with one hard-coded link override."""
        if 'ch13/you_let_me_fall' not in url:
            return super().getPrevUrl(url, data)
        # Fix missing navigation links between chapters
        return self.stripUrl % ('13', '13', '85')

    def getIndexStripUrl(self, index):
        """Turn a ``chapter-page`` index into a strip URL."""
        chapter, page = index.split('-')
        return self.stripUrl % (chapter, chapter, page)
class UrgentTransformationCrisis(WordPressScraper):
    """Scraper for Urgent Transformation Crisis with two filename corrections."""

    url = 'http://www.catomix.com/utc/'
    firstStripUrl = url + 'comic/cover1'

    def namer(self, imageUrl, pageUrl):
        """Return the image's last path segment with known odd names substituted."""
        # Fix inconsistent filenames
        name = util.urlpathsplit(imageUrl)[-1]
        for odd, fixed in (('FVLYHD', 'LYHDpage'), ('UTC084web', '20091218c')):
            name = name.replace(odd, fixed)
        return name
|