File: test_upload_large_folder.py

package info (click to toggle)
huggingface-hub 1.2.2-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 5,424 kB
  • sloc: python: 45,857; sh: 434; makefile: 33
file content (157 lines) | stat: -rw-r--r-- 6,718 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# tests/test_upload_large_folder.py
import unittest
from unittest.mock import MagicMock, patch

import pytest

from huggingface_hub._upload_large_folder import (
    COMMIT_SIZE_SCALE,
    MAX_FILES_PER_FOLDER,
    MAX_FILES_PER_REPO,
    LargeUploadStatus,
    _validate_upload_limits,
)


@pytest.fixture
def status():
    """Return a ``LargeUploadStatus`` constructed with an empty ``items`` list."""
    empty_status = LargeUploadStatus(items=[])
    return empty_status


def test_target_chunk_default(status):
    """An untouched status must target the entry at index 1 of ``COMMIT_SIZE_SCALE``."""
    observed_chunk = status.target_chunk()
    assert observed_chunk == COMMIT_SIZE_SCALE[1]


@pytest.mark.parametrize(
    "start_idx, success, delta_items, duration, expected_idx",
    [
        # drop by one on failure
        (2, False, 0, 10, 1),
        # never go below zero
        (0, False, 0, 10, 0),
        # duration >= 40 --> no bump
        (1, True, 0, 50, 1),
        # nb_items < threshold --> no bump
        (1, True, -1, 30, 1),
        # fast enough and enough items
        (1, True, 0, 30, 2),
        # never exceed last index
        (len(COMMIT_SIZE_SCALE) - 1, True, 0, 10, len(COMMIT_SIZE_SCALE) - 1),
    ],
)
def test_update_chunk_transitions(status, start_idx, success, delta_items, duration, expected_idx):
    """Check how ``update_chunk`` moves ``_chunk_idx`` for each parametrized scenario.

    The status is forced to ``start_idx``, then ``update_chunk`` is called with
    an item count of ``COMMIT_SIZE_SCALE[start_idx] + delta_items``. Afterwards
    both the stored index and the value returned by ``target_chunk`` must match
    ``expected_idx``.
    """
    status._chunk_idx = start_idx

    # The item count is expressed relative to the scale entry for the starting index.
    status.update_chunk(
        success=success,
        nb_items=COMMIT_SIZE_SCALE[start_idx] + delta_items,
        duration=duration,
    )

    assert status._chunk_idx == expected_idx
    expected_target = COMMIT_SIZE_SCALE[expected_idx]
    assert status.target_chunk() == expected_target


class TestValidateUploadLimits(unittest.TestCase):
    """Call ``_validate_upload_limits`` directly and inspect the patched module logger."""

    class MockPath:
        """Stand-in for LocalUploadFilePaths exposing ``path_in_repo`` and ``file_path``."""

        def __init__(self, path_in_repo, size_bytes=1000):
            # ``file_path.stat().st_size`` reports the requested size.
            fake_file = MagicMock()
            fake_file.stat.return_value.st_size = size_bytes
            self.path_in_repo = path_in_repo
            self.file_path = fake_file

    @staticmethod
    def _logged_warnings(mock_logger):
        """Return the string form of every call recorded on ``mock_logger.warning``."""
        return [str(recorded) for recorded in mock_logger.warning.call_args_list]

    @patch("huggingface_hub._upload_large_folder.logger")
    def test_no_warnings_under_limits(self, mock_logger):
        """Three small files spread over a few folders must not log any warning."""
        small_tree = [
            self.MockPath(repo_path)
            for repo_path in ("file1.txt", "data/file2.txt", "data/sub/file3.txt")
        ]

        _validate_upload_limits(small_tree)

        # Only the warning channel is checked here.
        mock_logger.warning.assert_not_called()

    @patch("huggingface_hub._upload_large_folder.logger")
    def test_warns_too_many_total_files(self, mock_logger):
        """Ten files more than MAX_FILES_PER_REPO must produce a total-file-count warning."""
        nb_files = MAX_FILES_PER_REPO + 10
        too_many = [self.MockPath(f"file{i}.txt") for i in range(nb_files)]

        _validate_upload_limits(too_many)

        # The warning must quote the formatted file count and mention the recommended limit.
        messages = self._logged_warnings(mock_logger)
        assert any(f"{MAX_FILES_PER_REPO + 10:,} files" in msg for msg in messages)
        assert any("exceeds the recommended limit" in msg for msg in messages)

    @patch("huggingface_hub._upload_large_folder.logger")
    def test_warns_too_many_subdirectories(self, mock_logger):
        """A folder holding more than MAX_FILES_PER_FOLDER subdirectories must be reported."""
        # One file in each of many distinct subdirectories of "data".
        nb_subdirs = MAX_FILES_PER_FOLDER + 10
        spread_out = [self.MockPath(f"data/subdir{i:05d}/file.txt") for i in range(nb_subdirs)]

        _validate_upload_limits(spread_out)

        # The warning must name the "data" folder and the formatted subdirectory count.
        messages = self._logged_warnings(mock_logger)
        assert any("data" in msg and "subdirectories" in msg for msg in messages)
        assert any(f"{MAX_FILES_PER_FOLDER + 10:,} subdirectories" in msg for msg in messages)

    @patch("huggingface_hub._upload_large_folder.logger")
    def test_counts_files_and_subdirs_separately(self, mock_logger):
        """Files and subdirectories of one folder are tallied separately, then summed."""
        # 5000 files sit directly in data/ ...
        direct_files = [self.MockPath(f"data/file{i}.txt") for i in range(5000)]
        # ... alongside 5100 subdirectories that each contain a single file.
        nested_files = [self.MockPath(f"data/subdir{i}/file.txt") for i in range(5100)]

        _validate_upload_limits(direct_files + nested_files)

        # Expect "data" reported with 10,100 entries (5000 files + 5100 subdirs).
        messages = self._logged_warnings(mock_logger)
        assert any("data" in msg and "10,100 entries" in msg for msg in messages)
        assert any("5,000 files" in msg and "5,100 subdirectories" in msg for msg in messages)

    @patch("huggingface_hub._upload_large_folder.logger")
    def test_file_size_decimal_gb(self, mock_logger):
        """A file of 21 * 10^9 bytes must be reported as 21GB, i.e. sizes use decimal GB."""
        twenty_one_gb = 21 * 1_000_000_000
        big_file = self.MockPath("large_file.bin", size_bytes=twenty_one_gb)

        _validate_upload_limits([big_file])

        # The warning must show the file size and the 20GB recommended limit.
        messages = self._logged_warnings(mock_logger)
        assert any("21.0GB" in msg or "21GB" in msg for msg in messages)
        assert any("20GB (recommended limit)" in msg for msg in messages)

    @patch("huggingface_hub._upload_large_folder.logger")
    def test_very_large_file_warning(self, mock_logger):
        """A file of 51 * 10^9 bytes must trigger the 50GB hard-limit warning."""
        fifty_one_gb = 51 * 1_000_000_000
        huge_file = self.MockPath("huge_file.bin", size_bytes=fifty_one_gb)

        _validate_upload_limits([huge_file])

        # The warning must show the file size and reference the 50GB hard limit.
        messages = self._logged_warnings(mock_logger)
        assert any("51.0GB" in msg or "51GB" in msg for msg in messages)
        assert any("50GB hard limit" in msg for msg in messages)

    @patch("huggingface_hub._upload_large_folder.logger")
    def test_nested_directory_structure(self, mock_logger):
        """Deeply nested paths with few entries per folder must not log any warning."""
        deep_tree = [
            self.MockPath(repo_path)
            for repo_path in (
                "a/b/c/d/e/file1.txt",
                "a/b/c/d/e/file2.txt",
                "a/b/c/d/f/file3.txt",
                "a/b/c/g/file4.txt",
            )
        ]

        _validate_upload_limits(deep_tree)

        # Every folder in this tree holds at most 2 entries, so nothing should be flagged.
        mock_logger.warning.assert_not_called()