File: test_scanners.py

package info (click to toggle)
puremagic 2.1.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,472 kB
  • sloc: python: 2,138; makefile: 9; sh: 7
file content (137 lines) | stat: -rw-r--r-- 5,554 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
import puremagic
from test.common import IMAGE_DIR, OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR
from puremagic.scanners import python_scanner, json_scanner, sndhdr_scanner

# Lorem-ipsum template used by test_text_scanner. The test strips the physical
# newlines of this literal and then substitutes every `{ending}` placeholder
# with the line terminator under test (LF, CRLF or CR), so the placeholders are
# the only line breaks that end up in the generated files.
sample_text = b"""Lorem ipsum dolor sit amet, consectetur adipiscing elit,{ending}
sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.{ending}
{ending}
Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.{ending}
Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.{ending}
Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.{ending}
"""


def test_text_scanner():
    """Check that plain ASCII text is classified by its line-terminator style.

    For each of LF, CRLF and CR a file is generated in ``OFFICE_DIR`` from
    ``sample_text`` and the top ``puremagic.magic_file`` match is verified.
    """
    # Drop the literal's own newlines so that only the ``{ending}`` markers
    # remain as line-break positions.
    template = sample_text.replace(b"\n", b"")

    cases = (
        ("text_lf.txt", b"\n", "ascii text, with LF line terminators"),
        ("text_crlf.txt", b"\r\n", "ascii text, with CRLF line terminators"),
        ("text_cr.txt", b"\r", "ascii text, with CR line terminators"),
    )
    for file_name, terminator, expected_name in cases:
        target = OFFICE_DIR / file_name
        target.write_bytes(template.replace(b"{ending}", terminator))

        top_match = puremagic.magic_file(target)[0]
        assert top_match.extension == ".txt"
        assert top_match.name == expected_name
        assert top_match.mime_type == "text/plain"
        assert top_match.confidence == 0.9


def test_utf16_le_not_mp1():
    """GH #134: data starting with a UTF-16 LE BOM (FF FE) must not be reported as .mp1."""
    bom = b"\xff\xfe"
    payload = bom + "a,b,c\n1,2,3\n".encode("utf-16-le")

    # Extension lookup first, then the MIME lookup on the same bytes.
    extension = puremagic.from_string(payload)
    assert extension != ".mp1", "UTF-16 LE data misidentified as .mp1"

    mime_type = puremagic.from_string(payload, mime=True)
    assert mime_type != "audio/mpeg", "UTF-16 LE data misidentified as audio/mpeg"


def test_utf16_le_csv_deep_scan():
    """GH #134: the UTF-16 LE CSV sample should be detected as CSV by the deep scan."""
    top_match = puremagic.magic_file(OFFICE_DIR / "test_utf16le.csv")[0]

    assert top_match.extension == ".csv"
    assert top_match.mime_type == "text/csv"
    assert "comma" in top_match.name
    assert top_match.confidence >= 0.9


def test_from_string_nonexistent_filename():
    """GH #137: a ``filename`` hint naming a missing file must not raise FileNotFoundError."""
    # PDF-like bytes let the magic-number lookup find a match; the deep scan is
    # then skipped because the named file does not exist, and the match is
    # returned as-is.
    missing_name = "nonexistent.pdf"
    fake_pdf = b"%PDF-1.4 fake content"

    assert puremagic.from_string(fake_pdf, filename=missing_name) == ".pdf"

    # magic_string must cope with the same hint without crashing.
    matches = puremagic.magic_string(fake_pdf, filename=missing_name)
    assert ".pdf" in (match.extension for match in matches)


def test_python_scanner():
    """Run the Python scanner directly on the sample script and compare it with ``magic_file``."""
    script = SYSTEM_DIR / "test.py"

    scanned = python_scanner.main(script, None, None)
    file_matches = puremagic.magic_file(script)

    assert scanned is not None
    assert scanned.extension == ".py"
    # The direct scanner result and the top overall match must agree on confidence.
    assert scanned.confidence == file_matches[0].confidence
    assert scanned.name == "Python Script"
    assert scanned.mime_type == "text/x-python"
    assert scanned.confidence == 1.0


def test_json_scanner():
    """Run the JSON scanner directly on the sample file and compare it with ``magic_file``."""
    sample = SYSTEM_DIR / "test.json"

    # The scanner is handed the opening and closing brace as its head/foot bytes.
    scanned = json_scanner.main(sample, b"{", b"}")
    file_matches = puremagic.magic_file(sample)

    assert scanned is not None
    assert scanned.confidence == file_matches[0].confidence
    assert scanned.extension == ".json"
    assert scanned.name == "JSON File"
    assert scanned.mime_type == "application/json"
    assert scanned.confidence == 1.0


def test_eml_scanner():
    """The sample .eml file should come back as an RFC 2822 message with full confidence."""
    top_match = puremagic.magic_file(OFFICE_DIR / "test.eml")[0]

    assert top_match.extension == ".eml"
    assert top_match.name == "RFC 2822 Email Message"
    assert top_match.mime_type == "message/rfc822"
    assert top_match.confidence == 1.0


def test_jpg_without_extension():
    """GH #141: a JPEG file without an extension should still be identified as image/jpeg."""
    import struct

    # Hand-built minimal JFIF-style byte stream; the pieces are concatenated
    # in exactly this order.
    jpeg_bytes = b"".join(
        (
            b"\xff\xd8\xff\xe0",
            struct.pack(">H", 16),
            b"JFIF\x00\x01\x01\x00",
            struct.pack(">HH", 1, 1),
            b"\x00\x00\xff\xd9",
        )
    )

    target = IMAGE_DIR / "test_jpeg_no_ext"
    target.write_bytes(jpeg_bytes)
    try:
        detected = puremagic.from_file(target, mime=True)
        assert detected == "image/jpeg", f"Expected image/jpeg, got {detected}"
    finally:
        # Always remove the temporary file, even when the assertion fails.
        target.unlink()


def test_sndhdr_scanner():
    """Feed the first 512 bytes of the sample .sndr file to the sndhdr scanner."""
    sample = AUDIO_DIR / "test.sndr"
    with open(sample, "rb") as stream:
        header = stream.read(512)

    scanned = sndhdr_scanner.main(None, header, None)
    # NOTE(review): the return value is discarded; presumably this only checks
    # that full detection of the file does not raise - confirm intent.
    puremagic.magic_file(sample)

    assert scanned is not None
    assert scanned.extension == ".sndr"
    assert scanned.name.startswith("Macintosh SNDR Resource")
    assert scanned.mime_type == "audio/x-sndr"
    assert scanned.confidence == 0.1