File: decompression.py

package info (click to toggle)
python-scrapy 2.4.1-2%2Bdeb11u1
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 4,748 kB
  • sloc: python: 32,888; xml: 199; makefile: 90; sh: 7
file content (83 lines) | stat: -rw-r--r-- 2,527 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
""" This module implements the DecompressionMiddleware which tries to recognise
and extract the potentially compressed responses that may arrive.
"""

import bz2
import gzip
import logging
import tarfile
import zipfile
from io import BytesIO
from tempfile import mktemp

from scrapy.responsetypes import responsetypes


logger = logging.getLogger(__name__)


class DecompressionMiddleware:
    """ This middleware tries to recognise and extract the possibly compressed
    responses that may arrive. """

    def __init__(self):
        self._formats = {
            'tar': self._is_tar,
            'zip': self._is_zip,
            'gz': self._is_gzip,
            'bz2': self._is_bzip2
        }

    def _is_tar(self, response):
        archive = BytesIO(response.body)
        try:
            tar_file = tarfile.open(name=mktemp(), fileobj=archive)
        except tarfile.ReadError:
            return

        body = tar_file.extractfile(tar_file.members[0]).read()
        respcls = responsetypes.from_args(filename=tar_file.members[0].name, body=body)
        return response.replace(body=body, cls=respcls)

    def _is_zip(self, response):
        archive = BytesIO(response.body)
        try:
            zip_file = zipfile.ZipFile(archive)
        except zipfile.BadZipfile:
            return

        namelist = zip_file.namelist()
        body = zip_file.read(namelist[0])
        respcls = responsetypes.from_args(filename=namelist[0], body=body)
        return response.replace(body=body, cls=respcls)

    def _is_gzip(self, response):
        archive = BytesIO(response.body)
        try:
            body = gzip.GzipFile(fileobj=archive).read()
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def _is_bzip2(self, response):
        try:
            body = bz2.decompress(response.body)
        except IOError:
            return

        respcls = responsetypes.from_args(body=body)
        return response.replace(body=body, cls=respcls)

    def process_response(self, request, response, spider):
        if not response.body:
            return response

        for fmt, func in self._formats.items():
            new_response = func(response)
            if new_response:
                logger.debug('Decompressed response with format: %(responsefmt)s',
                             {'responsefmt': fmt}, extra={'spider': spider})
                return new_response
        return response