File: test_responsetypes.py

package info (click to toggle)
python-scrapy 2.13.3-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,664 kB
  • sloc: python: 52,028; xml: 199; makefile: 25; sh: 7
file content (124 lines) | stat: -rw-r--r-- 4,999 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
from scrapy.http import (
    Headers,
    HtmlResponse,
    JsonResponse,
    Response,
    TextResponse,
    XmlResponse,
)
from scrapy.responsetypes import responsetypes


class TestResponseTypes:
    """Check which response class each ``responsetypes`` lookup returns.

    Most tests build a table of ``(source, expected class)`` pairs and hand
    it, together with the lookup under test, to :meth:`_check`.
    """

    @staticmethod
    def _check(lookup, mappings):
        """Assert that ``lookup(source)`` is ``cls`` for every pair.

        ``mappings`` is an iterable of ``(source, cls)`` pairs. The result is
        compared by identity, and the failure message reports the offending
        source together with the returned and the expected class.
        """
        for source, cls in mappings:
            retcls = lookup(source)
            assert retcls is cls, f"{source} ==> {retcls} != {cls}"

    def test_from_filename(self):
        """Look up the response class from a bare file name."""
        self._check(
            responsetypes.from_filename,
            (
                ("data.bin", Response),
                ("file.txt", TextResponse),
                ("file.xml.gz", Response),
                ("file.xml", XmlResponse),
                ("file.html", HtmlResponse),
                ("file.unknownext", Response),
            ),
        )

    def test_from_content_disposition(self):
        """Look up the response class from Content-Disposition values.

        The table includes file names encoded with several non-UTF-8 codecs.
        """
        self._check(
            responsetypes.from_content_disposition,
            (
                (b'attachment; filename="data.xml"', XmlResponse),
                (b"attachment; filename=data.xml", XmlResponse),
                ("attachment;filename=data£.tar.gz".encode(), Response),
                ("attachment;filename=dataµ.tar.gz".encode("latin-1"), Response),
                ("attachment;filename=data高.doc".encode("gbk"), Response),
                ("attachment;filename=دورهdata.html".encode("cp720"), HtmlResponse),
                (
                    "attachment;filename=日本語版Wikipedia.xml".encode("iso2022_jp"),
                    XmlResponse,
                ),
            ),
        )

    def test_from_content_type(self):
        """Look up the response class from Content-Type values."""
        self._check(
            responsetypes.from_content_type,
            (
                ("text/html; charset=UTF-8", HtmlResponse),
                ("text/xml; charset=UTF-8", XmlResponse),
                ("application/xhtml+xml; charset=UTF-8", HtmlResponse),
                ("application/vnd.wap.xhtml+xml; charset=utf-8", HtmlResponse),
                ("application/xml; charset=UTF-8", XmlResponse),
                ("application/octet-stream", Response),
                ("application/json; encoding=UTF8;charset=UTF-8", JsonResponse),
                ("application/x-json; encoding=UTF8;charset=UTF-8", JsonResponse),
                ("application/json-amazonui-streaming;charset=UTF-8", JsonResponse),
                (b"application/x-download; filename=\x80dummy.txt", Response),
            ),
        )

    def test_from_body(self):
        """Look up the response class from the leading bytes of a body."""
        self._check(
            responsetypes.from_body,
            (
                (b"\x03\x02\xdf\xdd\x23", Response),
                (b"Some plain text\ndata with tabs\t and null bytes\0", TextResponse),
                (b"<html><head><title>Hello</title></head>", HtmlResponse),
                # https://codersblock.com/blog/the-smallest-valid-html5-page/
                (b"<!DOCTYPE html>\n<title>.</title>", HtmlResponse),
                (b'<?xml version="1.0" encoding="utf-8"', XmlResponse),
            ),
        )

    def test_from_headers(self):
        """Look up the response class from a full set of headers."""
        raw_cases = (
            ({"Content-Type": ["text/html; charset=utf-8"]}, HtmlResponse),
            (
                {
                    "Content-Type": ["text/html; charset=utf-8"],
                    "Content-Encoding": ["gzip"],
                },
                Response,
            ),
            (
                {
                    "Content-Type": ["application/octet-stream"],
                    "Content-Disposition": ["attachment; filename=data.txt"],
                },
                TextResponse,
            ),
        )
        # Wrap each plain dict in Headers before the lookup, so that a failure
        # message shows the Headers object that was actually passed in.
        self._check(
            responsetypes.from_headers,
            [(Headers(raw), cls) for raw, cls in raw_cases],
        )

    def test_from_args(self):
        """Look up the response class from keyword arguments."""

        def lookup(kwargs):
            # Each table entry is a dict of keyword arguments for from_args().
            return responsetypes.from_args(**kwargs)

        # TODO: add more tests that check precedence between the different arguments
        self._check(
            lookup,
            (
                ({"url": "http://www.example.com/data.csv"}, TextResponse),
                # headers takes precedence over url
                (
                    {
                        "headers": Headers(
                            {"Content-Type": ["text/html; charset=utf-8"]}
                        ),
                        "url": "http://www.example.com/item/",
                    },
                    HtmlResponse,
                ),
                (
                    {
                        "headers": Headers(
                            {
                                "Content-Disposition": [
                                    'attachment; filename="data.xml.gz"'
                                ]
                            }
                        ),
                        "url": "http://www.example.com/page/",
                    },
                    Response,
                ),
            ),
        )

    def test_custom_mime_types_loaded(self):
        """Check that the custom MIME type for ``.scrapytest`` is known."""
        # check that mime.types files shipped with scrapy are loaded
        guess = responsetypes.mimetypes.guess_type("x.scrapytest")
        assert guess[0] == "x-scrapy/test"