# Tests for puremagic scanners (text, Python, JSON, EML, sndhdr) and related detection regressions.
import puremagic
from test.common import IMAGE_DIR, OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR
from puremagic.scanners import python_scanner, json_scanner, sndhdr_scanner
# Lorem-ipsum fixture. Every line carries a "{ending}" placeholder; the text
# scanner test strips the literal newlines and substitutes a concrete line
# terminator for each placeholder.
sample_text = (
    b"Lorem ipsum dolor sit amet, consectetur adipiscing elit,{ending}\n"
    b"sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.{ending}\n"
    b"{ending}\n"
    b"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.{ending}\n"
    b"Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.{ending}\n"
    b"Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.{ending}\n"
)
def test_text_scanner():
    """Check plain-text detection for LF, CRLF and CR line terminators.

    For each terminator a file is written under OFFICE_DIR from
    ``sample_text`` and the first ``magic_file`` match is inspected.
    """
    cases = (
        ("text_lf.txt", b"\n", "ascii text, with LF line terminators"),
        ("text_crlf.txt", b"\r\n", "ascii text, with CRLF line terminators"),
        ("text_cr.txt", b"\r", "ascii text, with CR line terminators"),
    )
    # Strip the literal newlines so that only the "{ending}" placeholders
    # end up as line breaks in the generated files.
    template = sample_text.replace(b"\n", b"")

    for filename, terminator, expected_name in cases:
        target = OFFICE_DIR / filename
        target.write_bytes(template.replace(b"{ending}", terminator))

        top = puremagic.magic_file(target)[0]
        assert top.extension == ".txt"
        assert top.name == expected_name
        assert top.mime_type == "text/plain"
        assert top.confidence == 0.9
def test_utf16_le_not_mp1():
    """GH #134: data starting with a UTF-16 LE BOM (FF FE) must not be reported as .mp1."""
    bom = b"\xff\xfe"
    payload = bom + "a,b,c\n1,2,3\n".encode("utf-16-le")

    extension = puremagic.from_string(payload)
    assert extension != ".mp1", "UTF-16 LE data misidentified as .mp1"

    mime = puremagic.from_string(payload, mime=True)
    assert mime != "audio/mpeg", "UTF-16 LE data misidentified as audio/mpeg"
def test_utf16_le_csv_deep_scan():
    """GH #134: the UTF-16 LE CSV fixture should come back as CSV as the top match."""
    csv_path = OFFICE_DIR / "test_utf16le.csv"
    best = puremagic.magic_file(csv_path)[0]

    assert best.extension == ".csv"
    assert best.mime_type == "text/csv"
    assert "comma" in best.name
    assert best.confidence >= 0.9
def test_from_string_nonexistent_filename():
    """GH #137: a filename hint for a file that does not exist must not raise.

    PDF-like bytes are used so that a match is expected from the magic
    numbers alone, without needing the named file on disk.
    """
    hint = "nonexistent.pdf"
    fake_pdf = b"%PDF-1.4 fake content"

    assert puremagic.from_string(fake_pdf, filename=hint) == ".pdf"

    # magic_string takes the same hint and must not crash either.
    matches = puremagic.magic_string(fake_pdf, filename=hint)
    assert any(match.extension == ".pdf" for match in matches)
def test_python_scanner():
    """Run the Python scanner on the sample script and compare it with magic_file."""
    script = SYSTEM_DIR / "test.py"
    scanned = python_scanner.main(script, None, None)
    top_match = puremagic.magic_file(script)[0]

    assert scanned is not None
    assert scanned.extension == ".py"
    # The direct scanner call and the public API should agree on confidence.
    assert scanned.confidence == top_match.confidence
    assert scanned.name == "Python Script"
    assert scanned.mime_type == "text/x-python"
    assert scanned.confidence == 1.0
def test_json_scanner():
    """Run the JSON scanner on the sample file and compare it with magic_file."""
    sample = SYSTEM_DIR / "test.json"
    scanned = json_scanner.main(sample, b"{", b"}")
    top_match = puremagic.magic_file(sample)[0]

    assert scanned is not None
    # The direct scanner call and the public API should agree on confidence.
    assert scanned.confidence == top_match.confidence
    assert scanned.extension == ".json"
    assert scanned.name == "JSON File"
    assert scanned.mime_type == "application/json"
    assert scanned.confidence == 1.0
def test_eml_scanner():
    """The sample .eml file should be identified as an RFC 2822 message with full confidence."""
    best = puremagic.magic_file(OFFICE_DIR / "test.eml")[0]

    assert best.extension == ".eml"
    assert best.name == "RFC 2822 Email Message"
    assert best.mime_type == "message/rfc822"
    assert best.confidence == 1.0
def test_jpg_without_extension():
    """GH #141: a JPEG stored without a file extension must still map to image/jpeg."""
    import struct

    # Assemble a small JPEG-like byte string piece by piece.
    jpeg_bytes = b"".join(
        [
            b"\xff\xd8\xff\xe0",
            struct.pack(">H", 16),
            b"JFIF\x00\x01\x01\x00",
            struct.pack(">HH", 1, 1),
            b"\x00\x00\xff\xd9",
        ]
    )

    target = IMAGE_DIR / "test_jpeg_no_ext"
    target.write_bytes(jpeg_bytes)
    try:
        result = puremagic.from_file(target, mime=True)
        assert result == "image/jpeg", f"Expected image/jpeg, got {result}"
    finally:
        # Remove the temporary file whether or not the assertion held.
        target.unlink()
def test_sndhdr_scanner():
    """Feed the first 512 bytes of the .sndr sample to the sndhdr scanner."""
    sample = AUDIO_DIR / "test.sndr"
    with open(sample, "rb") as handle:
        header = handle.read(512)

    scanned = sndhdr_scanner.main(None, header, None)
    # The public API is also exercised on the same file; its result is not inspected.
    puremagic.magic_file(sample)

    assert scanned is not None
    assert scanned.extension == ".sndr"
    assert scanned.name.startswith("Macintosh SNDR Resource")
    assert scanned.mime_type == "audio/x-sndr"
    assert scanned.confidence == 0.1