File: check_comics.py

package info (click to toggle)
dosage 3.2-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 1,400 kB
  • sloc: python: 12,703; sh: 55; makefile: 6
file content (156 lines) | stat: -rw-r--r-- 5,471 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# SPDX-License-Identifier: MIT
# Copyright (C) 2004-2008 Tristan Seligmann and Jonathan Jacobs
# Copyright (C) 2012-2014 Bastian Kleineidam
# Copyright (C) 2015-2021 Tobias Gruetzmacher
import json
import multiprocessing
import os
import re
import warnings
from urllib.parse import urlsplit


# Dictionary with per-host locks (host name -> BoundedSemaphore), filled
# lazily by get_lock().
_locks = {}
# Allowed number of connections per host
MaxConnections = 2
# Maximum number of strips to get to test a comic
MaxStrips = 5
# Match (already-escaped) archive.org URL
# (the lookbehind matches the re.escape()d text "web\.archive\.org/web",
# hence the doubled backslashes in the pattern)
ARCHIVE_ORG_MATCH = re.compile(r'(?<=web\\.archive\\.org/web)/\d+/')
# Matches some (maybe-escaped - because Python 2) printf-style format specifiers
PRINTF_MATCH = re.compile(r'\\?%[0-9]*[sd]')
# Classes where the modules are very similar, so that testing the history of
# each modules doesn't make much sense
# NOTE(review): "standarized" is a typo for "standardized"; kept as-is since
# renaming a module-level name could break external references.
standarized_modules = {
    'ComicSherpa',
    'ComicsKingdom',
    'GoComics',
    'MangaDex',
    'WebToons',
}
# Already seen classes (first members of the families above that already got
# a full history test)
seen_modules = set()


def get_lock(host):
    """Return the bounded semaphore for the given host, creating it on
    first use. The semaphore allows at most ``MaxConnections`` holders."""
    try:
        return _locks[host]
    except KeyError:
        semaphore = multiprocessing.BoundedSemaphore(MaxConnections)
        _locks[host] = semaphore
        return semaphore


def test_comicmodule(tmpdir, scraperobj, worker_id):
    '''Test a scraper. It must be able to traverse backward for at least 5
    strips from the start, and find strip images on at least 4 pages.'''
    # Serialize tests against the same host to limit connections.
    host = urlsplit(scraperobj.url).hostname
    with get_lock(host):
        maxstrips = MaxStrips
        family, sep, _rest = scraperobj.name.partition('/')
        if sep and family in standarized_modules:
            # Only the first module of a standardized family gets the full
            # history walk; its siblings fetch a single strip.
            if family in seen_modules:
                maxstrips = 1
            else:
                seen_modules.add(family)

        _test_comic(str(tmpdir), scraperobj, maxstrips)


def _test_comic(outdir, scraperobj, maxstrips):
    """Fetch up to maxstrips strips with scraperobj into outdir and verify
    image counts, URL patterns and traversal depth."""
    PROXYMAP.apply(scraperobj.name)
    saved_files = []
    last_strip = None
    count = 0
    for count, last_strip in enumerate(scraperobj.getStrips(maxstrips), start=1):
        saved_files.append(_check_strip(outdir, last_strip,
                                        scraperobj.multipleImagesPerStrip))
        # The first page is reached directly, so only subsequent pages have
        # to match the module's stripUrl pattern.
        if count > 1:
            _check_stripurl(last_strip, scraperobj)

    if scraperobj.prevSearch and not scraperobj.hitFirstStripUrl:
        # subtract the number of skipped URLs with no image from the expected
        # image number
        expected = maxstrips - len(scraperobj.skippedUrls)
        msg = 'Traversed %d strips instead of %d.' % (count, expected)
        if last_strip:
            msg += " Check the prevSearch pattern at %s" % last_strip.strip_url
        assert count == expected, msg
        if last_strip:
            _check_scraperesult(saved_files, expected, last_strip, scraperobj)


def _check_strip(outdir, strip, multipleImagesPerStrip):
    '''Check that a specific page yields images and the comic module correctly
    declares if there are multiple images per page.'''
    images = []
    files = []
    for image in strip.getImages():
        images.append(image.url)

        # write a fake image (to download less)
        fakeimg = image._fnbase(outdir) + '.fake'
        with open(fakeimg, 'w') as f:
            f.write("fake image for testing")

        fn, _ = image.save(outdir)
        files.append(fn)
    assert images, 'failed to find images at %s' % strip.strip_url
    if not multipleImagesPerStrip:
        assert len(images) == 1, 'found more than 1 image at {}: {}'.format(
            strip.strip_url, images)
    return files


def _check_scraperesult(saved_images, num_images_expected, strip, scraperobj):
    '''Check that exactly or for multiple pages at least num_strips images are
    saved. This checks saved files, ie. it detects duplicate filenames.'''
    num_images = len(saved_images)

    attrs = (num_images, saved_images, num_images_expected)
    if scraperobj.multipleImagesPerStrip:
        err = 'saved %d %s instead of at least %d images' % attrs
        assert num_images >= num_images_expected, err
    else:
        err = 'saved %d %s instead of %d images' % attrs
        assert num_images == num_images_expected, err


def _check_stripurl(strip, scraperobj):
    """Warn if the fetched strip URL does not fit the module's stripUrl
    template (skipped for modules without indexing support)."""
    if not scraperobj.stripUrl:
        # no indexing support
        return
    # Build a regex from the printf-style stripUrl template: escape it,
    # then widen format specifiers and archive.org timestamps.
    pattern = re.escape(scraperobj.stripUrl)
    pattern = PRINTF_MATCH.sub('.+', pattern)
    pattern = ARCHIVE_ORG_MATCH.sub(r'/\\d+/', pattern)
    if not re.compile(pattern).match(strip.strip_url):
        warnings.warn('strip URL {!r} does not match stripUrl pattern {}'.format(
            strip.strip_url, pattern))


class ProxyConfig:
    """Loads proxy config from an environment variable and applies it for each test.

    The PROXYMAP variable holds a JSON object mapping module-name regexes
    to proxy servers.
    """
    def __init__(self):
        raw = os.environ.get('PROXYMAP')
        if raw is None:
            self.config = {}
        else:
            self.config = {
                re.compile(pattern): server
                for pattern, server in json.loads(raw).items()
            }

    def apply(self, name):
        """Point http(s)_proxy at the first server whose regex matches
        *name*, or clear both variables if nothing matches."""
        chosen = next(
            (server for pattern, server in self.config.items()
             if pattern.match(name)),
            '')
        os.environ['http_proxy'] = chosen
        os.environ['https_proxy'] = chosen


# External proxy config to fetch some modules via proxies
# (module-level singleton; _test_comic() applies it before each download).
PROXYMAP = ProxyConfig()