1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201
|
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2022 Tobias Gruetzmacher
import codecs
import html
import json
import os
import re
import sys
import time

import lxml
import lxml.html

from dosagelib import http
from dosagelib.scraper import scrapers
from dosagelib.util import get_page
def first_lower(x):
    """Sort key: the lowercased first element (or character) of *x*."""
    head = x[0]
    return head.lower()
class ComicListUpdater(object):
    """Scrape a comic-listing site and sync the result into a module file.

    Subclasses override collect_results() to fill self.res via add_comic()
    and get_entry() to render one module definition.  Running with no CLI
    arguments scrapes and caches results to a JSON file; running with
    (min_comics, filename) rewrites the region between the START and END
    markers inside *filename* from that cache.
    """

    # Name templates (each containing one '%s' placeholder); a collected
    # name expanded through these is compared against existing scraper
    # module names to detect duplicates — see find_dups().
    dup_templates: tuple[str, ...] = ()
    # Comic names that must never be written to the module file.
    excluded_comics: tuple[str, ...] = ()

    # Markers delimiting the auto-generated region of the module file.
    START = "# START AUTOUPDATE"
    END = "# END AUTOUPDATE"

    def __init__(self, name: str):
        # The JSON cache sits next to the updater script (foo.py -> foo.json).
        self.json = name.replace(".py", ".json")
        self.session = http.default_session
        # Seconds to pause after each successful page fetch (0 = no delay).
        self.sleep = 0

    def get_url(self, url: str, expand=True):
        """Get an HTML page and parse it with LXML.

        With expand=True (default), relative links in the document are
        rewritten to absolute URLs based on *url*.  IOErrors are logged to
        stderr and re-raised for the caller to handle.
        """
        print("Parsing", url, file=sys.stderr)
        try:
            pagetext = get_page(url, self.session).text
            data = lxml.html.document_fromstring(pagetext)
            if expand:
                data.make_links_absolute(url)
            if self.sleep > 0:
                # Optional politeness delay between fetches.
                time.sleep(self.sleep)
            return data
        except IOError as msg:
            print("ERROR:", msg, file=sys.stderr)
            raise

    def should_skip(self, name: str):
        """Return True if *name* was already collected (ignoring case)."""
        if contains_case_insensitive(self.res, name):
            # we cannot handle two comics that only differ in case
            print("INFO: skipping possible duplicate", repr(name),
                  file=sys.stderr)
            return True
        return False

    def get_results(self):
        """Collect comics and save dictionary in JSON file."""
        self.res = {}
        self.collect_results()
        if not self.res:
            print("ERROR:", "did not match any comics", file=sys.stderr)
            return
        with codecs.open(self.json, 'wb', 'utf-8') as f:
            json.dump(self.res, f, sort_keys=True, indent=2,
                      separators=(',', ': '))

    def add_comic(self, name: str, data, count=None):
        """Add a collected comic with a specific number of comics.

        Returns True if the comic was added, False if it was skipped as a
        case-insensitive duplicate.  count=None means "unknown".
        """
        name = format_name(name)
        if not self.should_skip(name):
            self.res[name] = {'count': count, 'data': data}
            return True
        return False

    def collect_results(self):
        # Subclass hook: populate self.res via add_comic().
        raise NotImplementedError

    def print_results(self, args):
        """Print all comics that have at least the given number of minimum
        comic strips."""
        min_comics, filename = args
        min_comics = int(min_comics)
        oldf = codecs.open(filename, 'r', 'utf-8')
        newf = codecs.open(filename + '.new', 'w', 'utf-8')
        with oldf, newf:
            # Copy everything up to and including the START marker; its
            # column position becomes the indent for generated entries.
            indent = self.copy_until_start(oldf, newf)
            with codecs.open(self.json, 'rb', 'utf-8') as f:
                data = json.load(f)
            for name, entry in sorted(data.items(), key=first_lower):
                self.write_entry(newf, name, entry, min_comics, indent)
            self.copy_after_end(oldf, newf)
        # Swap the rewritten file into place over the original.
        os.replace(filename + '.new', filename)

    def copy_until_start(self, src, dest):
        """Copy lines through the START marker; return the marker's column."""
        for line in src:
            dest.write(line)
            if line.strip().startswith(self.START):
                return line.find(self.START)
        raise RuntimeError("can't find start marker!")

    def copy_after_end(self, src, dest):
        """Skip lines until the END marker, then copy the rest verbatim."""
        skip = True
        for line in src:
            if line.strip().startswith(self.END):
                skip = False
            if not skip:
                dest.write(line)
        if skip:
            raise RuntimeError("can't find end marker!")

    def write_entry(self, fp, name, entry, min_comics, indent):
        """Write one generated module entry (or a duplicate note) to *fp*."""
        if name in self.excluded_comics:
            return
        count = entry['count']
        # A None count is falsy, so comics of unknown size never get
        # filtered out by the minimum.
        if count and count < min_comics:
            return
        dup = self.find_dups(name)
        fp.write(" " * indent)
        if dup is not None:
            fp.write(u"# %s has a duplicate in %s\n" % (name, dup))
        else:
            # Re-indent multi-line entries so they line up with the marker.
            fp.write(self.get_entry(
                truncate_name(name),
                entry['data']).replace("\n", "\n" + (" " * indent)) + "\n")

    def find_dups(self, name):
        """Check if comic name already exists.

        Expands name through dup_templates and compares (case-insensitively)
        against all known scraper names; returns the matching scraper name
        or None.
        """
        names = [(tmpl % name).lower() for tmpl in self.dup_templates]
        if names:
            for scraper in scrapers.all():
                lname = scraper.name.lower()
                if lname in names:
                    return scraper.name
        return None

    def get_entry(self, name, data):
        """Return an entry for the module generator."""
        raise NotImplementedError

    def run(self):
        # CLI dispatch: with arguments, rewrite the module file from the
        # JSON cache; without, scrape the site and (re)build the cache.
        if len(sys.argv) > 1:
            self.print_results(sys.argv[1:])
        else:
            self.get_results()
def contains_case_insensitive(adict, akey):
    """Check if key is in adict. The search is case insensitive.

    The lowercasing of *akey* is hoisted out of the loop so it is
    computed once instead of once per key.
    """
    needle = akey.lower()
    return any(key.lower() == needle for key in adict)
def capfirst(text):
    """Return *text* with its first character uppercased.

    Unlike str.capitalize(), the rest of the string is left untouched.
    """
    if text:
        return text[0].upper() + text[1:]
    return text
def save_result(res, json_file):
    """Serialize *res* to *json_file* as sorted, pretty-printed JSON."""
    text = json.dumps(res, sort_keys=True, indent=2, separators=(',', ': '))
    with codecs.open(json_file, 'wb', 'utf-8') as out:
        out.write(text)
def load_result(json_file):
    """Read *json_file* and return its deserialized JSON content."""
    with codecs.open(json_file, 'rb', 'utf-8') as infile:
        return json.loads(infile.read())
def truncate_name(text):
    """Limit a comic name to at most 50 characters."""
    if len(text) <= 50:
        return text
    return text[:50]
def asciify(name):
    """Strip every character that is not an ASCII letter, digit or
    underscore (this drops ASCII punctuation too, not only non-ASCII)."""
    keep_word_chars = re.compile("[^0-9a-zA-Z_]")
    return keep_word_chars.sub("", name)
# Translation table applied in format_name() before asciify(): it spells
# out characters that asciify() would otherwise drop entirely ('&', '@')
# and maps two accented letters to their closest ASCII equivalent.
TRANS = str.maketrans({
    '&': 'And',
    '@': 'At',
    'ñ': 'n',
    'á': 'a',
})
def format_name(text):
    """Turn a raw comic title into a CamelCase ASCII identifier.

    HTML entities are unescaped, each space-separated word gets its first
    character uppercased, special characters are transliterated via TRANS,
    and everything outside [0-9a-zA-Z_] is stripped.
    """
    unescaped = html.unescape(text)
    camel = "".join(capfirst(word) for word in unescaped.split(" "))
    return asciify(camel.translate(TRANS))
|