File: test_ls_file_collection.py

from pathlib import (
    Path,
    PurePath,
)
import pytest

from datalad.api import ls_file_collection

from datalad_next.constraints import CommandParametrizationError
# we need this fixture
from datalad_next.iter_collections.tests.test_iterzip import sample_zip
from datalad_next.tests import skipif_no_network

from ..ls_file_collection import LsFileCollectionParamValidator


def test_ls_file_collection_insufficient_args():
    with pytest.raises(CommandParametrizationError):
        ls_file_collection()

    # any collection needs some kind of identifier; the type
    # parameter alone is not enough
    with pytest.raises(CommandParametrizationError):
        ls_file_collection('tarfile')

    # individual collection types have particular requirements regarding
    # the identifier -- tarfile wants an existing path
    with pytest.raises(CommandParametrizationError):
        ls_file_collection('tarfile', 'http://example.com')

    # not a known collection type
    with pytest.raises(CommandParametrizationError):
        ls_file_collection('bogus', 'http://example.com')


def _check_archive_member_result(r, collection):
    # basics of a result
    assert r['action'] == 'ls_file_collection'
    assert r['status'] == 'ok'
    # a collection identifier, here the archive location
    assert 'collection' in r
    assert r['collection'] == collection
    # an item identifier, here a str-path of an archive member
    assert 'item' in r
    assert isinstance(r['item'], str)
    # item type info, here some filesystem-related category
    assert 'type' in r
    assert r['type'] in ('file', 'directory', 'symlink', 'hardlink')


def test_ls_file_collection_zipfile(sample_zip, no_result_rendering):
    for res in (
        ls_file_collection('zipfile', sample_zip),
        ls_file_collection('zipfile', sample_zip, hash='md5'),
    ):
        assert len(res) == 4
        # test a few basic properties that should be true for any result
        for r in res:
            _check_archive_member_result(r, sample_zip)


@skipif_no_network
def test_ls_file_collection_tarfile(sample_tar_xz, no_result_rendering):
    for res in (
        ls_file_collection('tarfile', sample_tar_xz),
        ls_file_collection('tarfile', sample_tar_xz, hash='md5'),
    ):
        assert len(res) == 6
        # test a few basic properties that should be true for any result
        for r in res:
            _check_archive_member_result(r, sample_tar_xz)


def test_ls_file_collection_directory(tmp_path, no_result_rendering):
    # smoke test on an empty dir
    res = ls_file_collection('directory', tmp_path)
    assert len(res) == 0


def test_ls_file_collection_gitworktree(existing_dataset, no_result_rendering):
    # smoke test on a plain dataset
    res = ls_file_collection('gitworktree', existing_dataset.pathobj)
    assert len(res) > 1
    assert all('gitsha' in r for r in res)

    # and with hashing
    res_hash = ls_file_collection('gitworktree', existing_dataset.pathobj,
                                  hash='md5')
    assert len(res) == len(res_hash)
    assert all('hash-md5' in r for r in res_hash)


def test_ls_file_collection_validator():
    val = LsFileCollectionParamValidator()

    with pytest.raises(RuntimeError):
        val.get_collection_iter(type='bogus', collection='any', hash=None)


@skipif_no_network
def test_replace_add_archive_content(sample_tar_xz, existing_dataset,
                                     no_result_rendering):
    ds = existing_dataset
    archive_path = ds.pathobj / '.datalad' / 'myarchive.tar.xz'
    # get archive copy in dataset (not strictly needed, but
    # add-archive-content worked like this)
    ds.download({sample_tar_xz.as_uri(): archive_path})
    # properly save to the dataset (download is ignorant of datasets)
    res = ds.save(message='add archive')
    # the first result has the archive addition, snatch the archive key from it
    assert res[0]['path'] == str(archive_path)
    archive_key = res[0]['key']

    # now we can scan the archive and register keys for its content.
    # the order and specific composition of the following steps is flexible.
    # we could simply extract the local archive, save the content to the
    # dataset, and then register `dl+archive` URLs.
    # however, we will use an approach that does not require any data
    # to be present locally (actually not even the archive that we have locally
    # already for this test), but is instead based on some metadata
    # that is provided by `ls-file-collection` (but could come from elsewhere,
    # including `ls-file-collection` executed on a different host).
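    # for illustration only (hypothetical values): based on the key/URL
    # templates used further below, a record for a regular file member of
    # the sample archive is expected to look roughly like
    #   {'item': 'test-archive/123.txt', 'type': 'file',
    #    'size': 4, 'hash-md5': '<32-char hex digest>', ...}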
    file_recs = [
        r for r in ls_file_collection(
            'tarfile', sample_tar_xz, hash=['md5'],
        )
        # ignore any non-file item; it would not have an annex key.
        # This also ignores hardlinks (they consume no space (size=0), but
        # could be represented as regular copies of a shared key; however,
        # this requires further processing of the metadata records, in order
        # to find the size of the item that has the same checksum as this one)
        if r.get('type') == 'file'
    ]
    # we enable the `datalad-archives` special remote using the
    # configuration that `add-archive-content` would use.
    # this special remote can act on the `dl+archive` URLs that we
    # will add next
    ds.repo.call_annex([
        'initremote', 'datalad-archives', 'type=external',
        'externaltype=datalad-archives', 'encryption=none', 'autoenable=true'])
    # assign special `dl+archive` URLs to all file keys
    # the `datalad-archives` special remote will see them and perform the
    # extraction of file content from the archive on demand.
    # the entire operation performs no extraction or data retrieval,
    # because we have all the information necessary to generate the keys
    ds.addurls(
        # takes an iterable of dicts
        file_recs,
        # urlformat: handcrafted dl+archive URL, as expected by
        # datalad-archives
        # (double braces to keep item and size as placeholders for addurls)
        f'dl+archive:{archive_key}#path={{item}}&size={{size}}',
        # filenameformat
        '{item}',
        key='et:MD5-s{size}--{hash-md5}',
    )
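    # for illustration only (hypothetical values): with the templates above,
    # a single record would expand to a URL like
    #   dl+archive:<archive_key>#path=test-archive/123.txt&size=4
    # and, via the `key` template, to a corresponding MD5-based annex key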
    # because we added the above URLs using a purely metadata-driven
    # approach, git-annex does not yet know that the archives remote
    # actually has the keys. we could use `annex setpresentkey` for that
    # (a fast local operation), but here we use `fsck` to achieve a
    # comprehensive smoke test of the compatibility of our hand-crafted
    # keys with the special remote implementation
    # (actually: without --fast the special remote crashes with a protocol
    #  error -- likely a bug in the special remote)
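    # a minimal sketch of the `setpresentkey` alternative mentioned above
    # (not executed here; `remote_uuid` is hypothetical and would need to be
    # looked up, e.g. from `git annex info datalad-archives`, and `key` is
    # the annex key of a file added by the addurls call above):
    #   ds.repo.call_annex(['setpresentkey', key, remote_uuid, '1'])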
    ds.repo.call_annex(
        ['fsck', '--fast', '-f', 'datalad-archives'],
        files=['test-archive'],
    )
    # at this point we are done
    # check retrieval of a test file that is not yet present locally
    testfile = ds.pathobj / 'test-archive' / '123_hard.txt'
    assert ds.status(
        testfile, annex='availability')[0]['has_content'] is False
    ds.get(testfile)
    assert testfile.read_text() == '123\n'


def test_ls_renderer():
    # nothing more than a smoke test
    ls_file_collection(
        'directory',
        Path(__file__).parent,
        result_renderer='tailored',
    )


def test_ls_annexworktree_empty_dataset(existing_dataset):
    res = ls_file_collection(
        'annexworktree',
        existing_dataset.pathobj,
        result_renderer='disabled'
    )
    assert len(res) == 3
    annexed_files = [annex_info for annex_info in res if 'annexkey' in annex_info]
    assert len(annexed_files) == 0


def test_ls_annexworktree_simple_dataset(existing_dataset):

    (existing_dataset.pathobj / 'sample.bin').write_bytes(b'\x00' * 1024)
    existing_dataset.save(message='add sample file')

    res = ls_file_collection(
        'annexworktree',
        existing_dataset.pathobj,
        result_renderer='disabled'
    )
    assert len(res) == 4
    annexed_files = [annex_info for annex_info in res if 'annexkey' in annex_info]
    assert len(annexed_files) == 1
    assert annexed_files[0]['type'] == 'annexed file'
    assert {
        'annexkey',
        'annexsize',
        'annexobjpath'
    }.issubset(set(annexed_files[0].keys()))