1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163
|
import os
import sys
import pytest
from internetarchive import get_item
from internetarchive.exceptions import DirectoryTraversalError
from internetarchive.files import File
from internetarchive.item import Item
from internetarchive.utils import (
is_path_within_directory,
sanitize_windows_filename,
sanitize_windows_relpath,
)
IS_WIN = os.name == 'nt'
pytestmark = pytest.mark.skipif(not IS_WIN, reason='Windows specific tests')
def test_control_char_encoding():
name = 'bad\x05name'
sanitized, modified = sanitize_windows_filename(name)
assert modified
assert sanitized == 'bad%05name'
@pytest.mark.parametrize(('reserved','expected'), [
('AUX', 'AU%58'),
('CON', 'CO%4E'),
('COM1', 'COM%31'),
('LPT9', 'LPT%39'),
('NUL', 'NU%4C'),
])
def test_reserved_names(reserved, expected):
sanitized, modified = sanitize_windows_filename(reserved)
assert modified
assert sanitized == expected
@pytest.mark.parametrize(('filename','expected'), [
('AUX.txt', 'AU%58.txt'),
('con.log', 'co%6E.log'),
('Com1.bin', 'Com%31.bin'),
('COM3.txt.txt', 'COM%33.txt.txt'),
])
def test_reserved_with_extension_sanitized(filename, expected):
sanitized, modified = sanitize_windows_filename(filename)
assert modified
assert sanitized == expected
@pytest.mark.parametrize(('filename','expected'), [
('name.', 'name%2E'),
('name..', 'name%2E%2E'),
('trailspace ', 'trailspace%20'),
('both. ', 'both%2E%20'),
])
def test_trailing_dot_space(filename, expected):
sanitized, modified = sanitize_windows_filename(filename)
assert modified
assert sanitized == expected
@pytest.mark.parametrize(('ch','enc'), [
(':', '%3A'),
('*', '%2A'),
('<', '%3C'),
('>', '%3E'),
('|', '%7C'),
('?', '%3F'),
('\\', '%5C'),
('"', '%22')
])
def test_invalid_chars(ch, enc):
sanitized, modified = sanitize_windows_filename(f'a{ch}b')
assert modified
assert sanitized == f'a{enc}b'
@pytest.mark.parametrize('name', [
'back\\slash', 'dir\\\\file'
])
def test_backslash_always_encoded(name):
sanitized, modified = sanitize_windows_filename(name)
assert '%5C' in sanitized
def test_full_filename_combined_sanitization(tmp_path, monkeypatch):
"""Simulate downloading a file whose remote name contains many invalid characters
including a backslash. We only test the sanitization logic up to path formation
(not actual network download)."""
remote_name = 'hello < > : " \\ | ? *.txt'
# Use direct sanitize to assert expected output
sanitized, modified = sanitize_windows_filename(remote_name)
assert modified
# Ensure each invalid char encoded
for ch in ['<','>','|','?','*',':','\\','"',' ']:
assert ch not in sanitized or ch == ' ' # trailing/inner spaces become %20
assert '%5C' in sanitized # backslash
def test_reserved_identifier_directory_sanitized(tmp_path):
"""Ensure that an item identifier that is a reserved device name is sanitized when
constructing download paths."""
# This test focuses on sanitize_windows_filename, as item.Download path building now
# sanitizes components.
reserved = 'AUX'
sanitized, modified = sanitize_windows_filename(reserved)
assert modified
assert (sanitized.startswith('AU') and sanitized.endswith(b'X'.hex().upper()[:])) \
or sanitized == 'AU%58'
def test_directory_traversal_exception_handled(monkeypatch, tmp_path):
# Use is_path_within_directory directly
base = tmp_path
outside = tmp_path.parent / 'outside.txt'
outside.write_text('x')
assert not is_path_within_directory(str(base), str(outside))
@pytest.mark.parametrize('attempt', [
'../evil.txt', '..\\evil.txt', '..%2Fevil.txt', '%2e%2e/evil.txt'
])
def test_traversal_attempt_sanitization(attempt):
# sanitize_windows_relpath should NOT remove traversal but higher layer blocks it;
# here we just ensure it encodes backslashes
sanitized, _ = sanitize_windows_relpath(attempt, verbose=False)
# Backslashes encoded
if '\\' in attempt:
assert '%5C' in sanitized or sanitized.replace('\\', '%5C')
@pytest.mark.parametrize('name', [
'hello%20world', '%41already'
])
def test_existing_percent_sequences(name):
# If no other encoding needed, percent remains unless part of %HH sequence
# and no other changes?
sanitized, modified = sanitize_windows_filename(name)
# existing sequences remain unchanged because no other encoding triggered
assert sanitized == name
@pytest.mark.parametrize('name', [
'needs:encoding%20plus', 'AUX%41' # reserved triggers change
])
def test_percent_gets_encoded_when_other_modifications(name):
sanitized, modified = sanitize_windows_filename(name)
if '%' in name and modified:
assert '%25' in sanitized or name.count('%') == sanitized.count('%25')
# Directory traversal guard logic tests
# (cross-platform semantics validated on Windows here)
def test_is_path_within_directory_true(tmp_path):
base = tmp_path
target = base / 'subdir' / 'file.txt'
target.parent.mkdir()
target.write_text('x')
assert is_path_within_directory(str(base), str(target))
def test_is_path_within_directory_false(tmp_path):
base = tmp_path / 'a'
other = tmp_path / 'b' / 'file.txt'
base.mkdir()
(tmp_path / 'b').mkdir()
other.write_text('x')
assert not is_path_within_directory(str(base), str(other))
|