File: test_scanners.py

package info (click to toggle)
puremagic 2.1.1-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 3,472 kB
sloc: python: 2,138; makefile: 9; sh: 7
file content (137 lines) | stat: -rw-r--r-- 5,554 bytes
import puremagic
from test.common import IMAGE_DIR, OFFICE_DIR, SYSTEM_DIR, AUDIO_DIR
from puremagic.scanners import python_scanner, json_scanner, sndhdr_scanner

# Template paragraph used by the line-terminator tests. Every logical line ends
# with a literal ``{ending}`` placeholder followed by a real newline; callers
# strip the newlines and substitute the placeholder with the terminator under test.
sample_text = b"\n".join(
    (
        b"Lorem ipsum dolor sit amet, consectetur adipiscing elit,{ending}",
        b"sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.{ending}",
        b"{ending}",
        b"Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat.{ending}",
        b"Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur.{ending}",
        b"Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.{ending}",
        b"",  # empty tail so the joined value ends with a newline
    )
)


def test_text_scanner():
    # Test the text scanner against generated ASCII files that differ only in
    # their line terminator (LF, CRLF, CR). Each file must be reported as a
    # ".txt" / "text/plain" match named after its terminator, with 0.9 confidence.
    #
    # The files are generated in a throwaway temporary directory rather than in
    # the shared fixture directory, so a test run leaves no artifacts behind and
    # does not need write access to the source tree.
    import tempfile
    from pathlib import Path

    # (file name, terminator bytes, label used in the expected match name)
    cases = (
        ("text_lf.txt", b"\n", "LF"),
        ("text_crlf.txt", b"\r\n", "CRLF"),
        ("text_cr.txt", b"\r", "CR"),
    )

    # Drop the literal newlines of the triple-quoted template so that only the
    # {ending} placeholders decide where lines break.
    template = sample_text.replace(b"\n", b"")

    with tempfile.TemporaryDirectory() as scratch_dir:
        for file_name, terminator, label in cases:
            text_file = Path(scratch_dir) / file_name
            text_file.write_bytes(template.replace(b"{ending}", terminator))

            best_match = puremagic.magic_file(text_file)[0]

            # The file name is attached as the assertion message so a failure
            # identifies which terminator variant broke.
            assert best_match.extension == ".txt", file_name
            assert best_match.name == f"ascii text, with {label} line terminators", file_name
            assert best_match.mime_type == "text/plain", file_name
            assert best_match.confidence == 0.9, file_name


def test_utf16_le_not_mp1():
    # GH #134: content that begins with the UTF-16 LE BOM (FF FE) must not be
    # reported as .mp1 / audio/mpeg.
    bom = b"\xff\xfe"
    payload = bom + "a,b,c\n1,2,3\n".encode("utf-16-le")

    # Extension lookup first, then the MIME lookup on the same bytes.
    detected_ext = puremagic.from_string(payload)
    assert detected_ext != ".mp1", "UTF-16 LE data misidentified as .mp1"

    detected_mime = puremagic.from_string(payload, mime=True)
    assert detected_mime != "audio/mpeg", "UTF-16 LE data misidentified as audio/mpeg"


def test_utf16_le_csv_deep_scan():
    # GH #134: the UTF-16 LE CSV fixture should come back as CSV (per the issue,
    # via the text scanner's deep scan). Only the top-ranked match is inspected.
    top_match = puremagic.magic_file(OFFICE_DIR / "test_utf16le.csv")[0]

    assert top_match.extension == ".csv"
    assert top_match.mime_type == "text/csv"
    assert "comma" in top_match.name
    assert top_match.confidence >= 0.9


def test_from_string_nonexistent_filename():
    # GH #137: a filename supplied only as an extension hint must not raise
    # FileNotFoundError when no such file exists on disk.
    # The payload starts with a PDF signature so the magic-number lookup finds a
    # match; the deep scan is then skipped (there is no file) and that match is
    # what gets returned.
    hint_name = "nonexistent.pdf"
    fake_pdf = b"%PDF-1.4 fake content"

    assert puremagic.from_string(fake_pdf, filename=hint_name) == ".pdf"

    # magic_string must cope with the missing file as well.
    matches = puremagic.magic_string(fake_pdf, filename=hint_name)
    assert ".pdf" in [match.extension for match in matches]


def test_python_scanner():
    # Run the Python scanner directly on the sample script and cross-check its
    # confidence against the top result of the regular magic_file lookup.
    script_path = SYSTEM_DIR / "test.py"
    scanned = python_scanner.main(script_path, None, None)
    top_match = puremagic.magic_file(script_path)[0]

    assert scanned is not None
    assert scanned.extension == ".py"
    assert scanned.confidence == top_match.confidence
    assert scanned.name == "Python Script"
    assert scanned.mime_type == "text/x-python"
    assert scanned.confidence == 1.0


def test_json_scanner():
    # Run the JSON scanner directly, passing "{" and "}" as the second and third
    # arguments, and cross-check its confidence against the top result of the
    # regular magic_file lookup.
    json_path = SYSTEM_DIR / "test.json"
    scanned = json_scanner.main(json_path, b"{", b"}")
    top_match = puremagic.magic_file(json_path)[0]

    assert scanned is not None
    assert scanned.confidence == top_match.confidence
    assert scanned.extension == ".json"
    assert scanned.name == "JSON File"
    assert scanned.mime_type == "application/json"
    assert scanned.confidence == 1.0


def test_eml_scanner():
    # The sample .eml fixture must be top-ranked as an RFC 2822 message with
    # full confidence.
    top_match = puremagic.magic_file(OFFICE_DIR / "test.eml")[0]

    assert top_match.extension == ".eml"
    assert top_match.name == "RFC 2822 Email Message"
    assert top_match.mime_type == "message/rfc822"
    assert top_match.confidence == 1.0


def test_jpg_without_extension():
    # GH #141: a JPEG stored under a name with no extension must still be
    # identified as image/jpeg.
    import struct

    # Small hand-built payload: the FF D8 FF E0 lead-in, a big-endian 16, a
    # "JFIF" tag block, two big-endian 1s and a tail ending in FF D9.
    jpeg_bytes = b"".join(
        (
            b"\xff\xd8\xff\xe0",
            struct.pack(">H", 16),
            b"JFIF\x00\x01\x01\x00",
            struct.pack(">HH", 1, 1),
            b"\x00\x00\xff\xd9",
        )
    )

    extensionless = IMAGE_DIR / "test_jpeg_no_ext"
    extensionless.write_bytes(jpeg_bytes)
    try:
        result = puremagic.from_file(extensionless, mime=True)
        assert result == "image/jpeg", f"Expected image/jpeg, got {result}"
    finally:
        # Always remove the generated file so the fixture directory stays clean.
        extensionless.unlink()


def test_sndhdr_scanner():
    # Feed the first 512 bytes of the SNDR sample straight to the sndhdr scanner
    # (no path, no footer) and check the match it reports.
    sndr_path = AUDIO_DIR / "test.sndr"
    with open(sndr_path, "rb") as handle:
        header = handle.read(512)

    scanned = sndhdr_scanner.main(None, header, None)

    # The return value is discarded: as written this only verifies that the
    # regular lookup completes without raising.
    # NOTE(review): confirm whether its result should be compared with `scanned`.
    puremagic.magic_file(sndr_path)

    assert scanned is not None
    assert scanned.extension == ".sndr"
    assert scanned.name.startswith("Macintosh SNDR Resource")
    assert scanned.mime_type == "audio/x-sndr"
    assert scanned.confidence == 0.1