File: gocomics.py

package info (click to toggle)
dosage 3.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky
  • size: 1,400 kB
  • sloc: python: 12,703; sh: 55; makefile: 6
file content (63 lines) | stat: -rwxr-xr-x 2,201 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
#!/usr/bin/env python3
# SPDX-License-Identifier: MIT
# SPDX-FileCopyrightText: © 2004 Tristan Seligmann and Jonathan Jacobs
# SPDX-FileCopyrightText: © 2012 Bastian Kleineidam
# SPDX-FileCopyrightText: © 2015 Tobias Gruetzmacher
"""
Script to get a list of gocomics and save the info in a JSON file for further
processing.
"""

from scriptutil import ComicListUpdater


class GoComicsUpdater(ComicListUpdater):
    """Collect the comics listed on gocomics.com.

    Every comic found is handed to ``add_comic`` together with a
    ``(link, lang)`` tuple: ``link`` is the first path segment of the comic's
    href and ``lang`` is ``"es"`` for comics detected as Spanish, otherwise
    ``None``. ``get_entry`` turns such a tuple into a ``cls(...)`` line.
    """

    # NOTE(review): consumed by the base class; presumably name templates of
    # modules that may already provide the same comic -- confirm in scriptutil.
    dup_templates = (
        "ComicsKingdom/%s",
    )

    # names of comics to exclude
    excluded_comics = (
        # Has its own module
        'Widdershins',
        # Moved to webtoons
        "FalseKnees",
    )
    # Link names of comics listed in the "en Español" section of the overview
    # page. The class-level set is only an empty default and is never mutated;
    # find_spanish() binds a fresh set on the instance.
    spanish: set[str] = set()

    def handle_atozlist(self, url: str) -> None:
        """Parse one GoComics alphabetic page and register each comic on it.

        :param url: address of the A-to-Z listing page to fetch.
        """
        data = self.get_url(url, expand=False)

        for comiclink in self.xpath(data, '//a[d:class_start("ComicsAtoZ_comics__link_")]'):
            # Assumes site-relative hrefs ("/<link>/..."), so the element
            # after the first slash identifies the comic.
            link = comiclink.attrib['href'].split('/')[1].strip()
            name = comiclink.xpath('.//h3')[0].text
            self.add_comic(name, (link, self.detect_lang(name, link)))

    def detect_lang(self, name: str, link: str) -> str | None:
        """Language heuristics.

        :param name: comic title as shown on the listing page.
        :param link: link name of the comic (first href path segment).
        :return: ``"es"`` if the title contains "en Español", the link
            contains "spanish" or "espanol", or the link was recorded by
            ``find_spanish``; ``None`` otherwise.
        """
        if ("en Español" in name or
                "spanish" in link or "espanol" in link or
                link in self.spanish):
            return "es"
        return None

    def find_spanish(self) -> None:
        """Record the link names of all comics in the "en Español" section."""
        data = self.get_url('https://www.gocomics.com/comics', expand=False)
        found = {
            comiclink.attrib['href'].split('/')[1].strip()
            for comiclink in self.xpath(
                data, '//section[.//h2[contains(text(), "en Español")]]//a')
        }
        # Bind a new set on the instance instead of mutating the class-level
        # default, which would be shared by every instance and subclass. The
        # union keeps results from earlier calls, as the in-place add did.
        self.spanish = self.spanish | found

    def collect_results(self) -> None:
        """Parse all listing pages."""
        # The Spanish section must be read first: detect_lang() consults
        # self.spanish while the A-to-Z pages are processed.
        self.find_spanish()
        self.handle_atozlist('https://www.gocomics.com/comics/a-to-z')
        self.handle_atozlist('https://www.gocomics.com/political-cartoons/political-a-to-z')

    def get_entry(self, name: str, data: tuple[str, str | None]) -> str:
        """Format one comic as a ``cls(...)`` line.

        :param name: comic name to emit as first argument.
        :param data: ``(link, lang)`` tuple as stored by ``handle_atozlist``;
            ``lang`` may be ``None``.
        :return: ``cls('<name>', '<link>'),`` or, when a language is set,
            ``cls('<name>', '<link>', '<lang>'),``.
        """
        url, lang = data
        # The language argument is only emitted for a non-empty language.
        langopt = f", '{lang}'" if lang else ''
        return f"cls('{name}', '{url}'{langopt}),"


if __name__ == '__main__':
    # Run the updater only when executed as a script; the updater is
    # constructed with this file's path, exactly as before.
    updater = GoComicsUpdater(__file__)
    updater.run()