File: test_file_store.py

"""
Tests for FileStore

Desired behavior
----------------
- A FileStore is initialized on a directory containing files
- The FileStore reads the files and populates itself with file metadata
- If there is a FileStore.json present, its contents are read and merged with
  the file metadata
- If there are records (file_id) in the JSON metadata that are not associated
  with a file on disk anymore, they are marked as orphans with 'orphan: True'
  and added to the store.
- If there is no FileStore.json present:
    - if read_only=False, the file is created
    - if read_only=True, no metadata is read in
- If read_only=False, the update() method is enabled
- If a FileStore is moved to a different location on disk (but all contents of the
  main directory are preserved), file_ids should not change and metadata should
  remain intact.
"""

import hashlib
import shutil
from datetime import datetime, timezone
from pathlib import Path

import pytest

from maggma.core import StoreError
from maggma.stores.file_store import FileStore
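

# Illustrative sketch (not part of the test suite): the basic FileStore
# lifecycle described in the module docstring above and exercised by the
# tests below. This is a minimal usage example based only on API calls that
# appear in these tests; the directory argument is hypothetical.
def _example_filestore_usage(directory):
    fs = FileStore(directory, read_only=False)
    # connect() scans the directory and, because read_only=False, creates or
    # merges FileStore.json with the collected file metadata
    fs.connect()
    records = list(fs.query({"name": "input.in"}))
    # update()/add_metadata() are only enabled when read_only=False
    fs.add_metadata({"metadata": {"note": "example"}}, {"name": "input.in"})
    fs.close()
    return records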


@pytest.fixture()
def test_dir(tmp_path):
    module_dir = Path(__file__).resolve().parent
    test_dir = module_dir / ".." / "test_files" / "file_store_test"
    shutil.copytree(str(test_dir), str(tmp_path), dirs_exist_ok=True)
    return tmp_path.resolve()
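
# Note on the fixture data (inferred from the assertions in the tests below,
# not from the test_files tree itself): the copied directory contains six
# files in total: one at the top level, four one level deep (including
# calculation1/input.in and calculation2/input.in), and one two levels deep
# (file_2_levels_deep.json). Only the input.in and file_2_levels_deep.json
# names are referenced explicitly by the tests.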


def test_record_from_file(test_dir):
    """
    Test functionality of _create_record_from_file
    """
    fs = FileStore(test_dir, read_only=True)
    fs.connect()
    f = Path(test_dir / "calculation1" / "input.in")

    relative_path = f.relative_to(test_dir)
    digest = hashlib.md5()
    digest.update(str(relative_path).encode())
    file_id = str(digest.hexdigest())
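    # this mirrors how FileStore is expected to compute file_id: the md5 hash
    # of the path relative to the store root (asserted below)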

    d = fs._create_record_from_file(f)
    assert d["name"] == "input.in"
    assert d["parent"] == "calculation1"
    assert d["path"] == test_dir / "calculation1" / "input.in"
    assert d["size"] == pytest.approx(90, abs=1)
    assert isinstance(d["hash"], str)
    assert d["file_id"] == file_id
    assert d["last_updated"] == datetime.fromtimestamp(f.stat().st_mtime, tz=timezone.utc)


def test_newer_in_on_local_update(test_dir):
    """
    Init a FileStore
    modify one of the files on disk
    Init another FileStore on the same directory
    confirm that one record shows up in newer_in
    """
    fs = FileStore(test_dir, read_only=False)
    fs.connect()
    with open(test_dir / "calculation1" / "input.in", "w") as f:
        f.write("Ryan was here")
    fs2 = FileStore(test_dir, read_only=False)
    fs2.connect()

    assert fs2.last_updated > fs.last_updated
    assert (
        fs2.query_one({"path": {"$regex": "calculation1/input.in"}})["last_updated"]
        > fs.query_one({"path": {"$regex": "calculation1/input.in"}})["last_updated"]
    )
    assert len(fs.newer_in(fs2)) == 1


def test_max_depth(test_dir):
    """
    test max_depth parameter

    NOTE this test only creates a single temporary directory, meaning that
    the JSON file created by the first FileStore.init() persists for the other
    tests. This creates the possibility of orphaned metadata.
    """
    # default (None) should parse all 6 files
    fs = FileStore(test_dir, read_only=False)
    fs.connect()
    assert len(list(fs.query())) == 6

    # 0 depth should parse 1 file
    fs = FileStore(test_dir, read_only=False, max_depth=0)
    fs.connect()
    assert len(list(fs.query())) == 1

    # 1 depth should parse 5 files
    fs = FileStore(test_dir, read_only=False, max_depth=1)
    fs.connect()
    assert len(list(fs.query())) == 5

    # 2 depth should parse 6 files
    fs = FileStore(test_dir, read_only=False, max_depth=2)
    fs.connect()
    assert len(list(fs.query())) == 6


def test_orphaned_metadata(test_dir):
    """
    test behavior when orphaned metadata is found

    NOTE the design of this test exploits the fact that the test only creates
    a single temporary directory, meaning that the JSON file created by the
    first FileStore.init() persists for the other tests.
    """
    # make a FileStore of all files and add metadata to all of them
    fs = FileStore(test_dir, read_only=False)
    fs.connect()
    data = list(fs.query())
    for d in data:
        d.update({"tags": "Ryan was here"})
    fs.update(data)

    assert len(list(fs.query())) == 6
    assert len(list(fs.query({"tags": {"$exists": True}}))) == 6
    # the orphan field should be populated for all documents
    assert len(list(fs.query({"orphan": {"$exists": True}}))) == 6
    fs.close()

    # re-init the store with a different max_depth parameter
    # this will result in orphaned metadata
    # with include_orphans=True, this should be returned in queries
    fs = FileStore(test_dir, read_only=True, max_depth=1, include_orphans=True)
    with pytest.warns(UserWarning, match="Orphaned metadata was found in FileStore.json"):
        fs.connect()
    assert len(list(fs.query())) == 6
    assert len(list(fs.query({"tags": {"$exists": True}}))) == 6
    # all items, including orphans, should have a file_id and path_relative
    assert len(list(fs.query({"file_id": {"$exists": True}}))) == 6
    assert len(list(fs.query({"path_relative": {"$exists": True}}))) == 6
    assert len(list(fs.query({"orphan": True}))) == 1
    fs.close()

    # re-init the store after renaming one of the files on disk
    # this will result in orphaned metadata
    # with include_orphans=False (default), that metadata should be
    # excluded from query results
    Path(test_dir / "calculation1" / "input.in").rename(test_dir / "calculation1" / "input_renamed.in")
    fs = FileStore(test_dir, read_only=True, include_orphans=False)
    with pytest.warns(UserWarning, match="Orphaned metadata was found in FileStore.json"):
        fs.connect()
    assert len(list(fs.query())) == 6
    assert len(list(fs.query({"tags": {"$exists": True}}))) == 5
    assert len(list(fs.query({"path": {"$exists": True}}))) == 6
    # manually specifying orphan: True should still work
    assert len(list(fs.query({"orphan": True}))) == 1
    fs.close()


def test_store_files_moved(test_dir):
    """
    Test behavior when the directory that constitutes the FileStore is
    moved (simulated here by copying it) to a new location on disk
    """
    # make a FileStore of all files and add metadata to all of them
    fs = FileStore(test_dir, read_only=False)
    fs.connect()
    data = list(fs.query())
    for d in data:
        d.update({"tags": "Ryan was here"})
    fs.update(data)

    # the orphan field should be populated for all documents, and False
    assert len(list(fs.query({"orphan": False}))) == 6
    original_file_ids = {f["file_id"] for f in fs.query()}
    original_paths = {f["path"] for f in fs.query()}
    fs.close()

    # now copy the entire FileStore to a new directory and re-initialize
    shutil.copytree(test_dir, str(test_dir / "new_store_location"), dirs_exist_ok=True)
    fs = FileStore(test_dir / "new_store_location", read_only=False)
    fs.connect()
    assert len(list(fs.query({"orphan": False}))) == 6
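    # file_ids are derived from each file's path relative to the store root
    # (see test_record_from_file), so they survive relocation of the root
    # directory even though absolute paths change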
    assert {f["file_id"] for f in fs.query()} == original_file_ids
    # absolute paths should change to follow the FileStore
    assert {f["path"] for f in fs.query()} != original_paths
    for d in fs.query(properties=["path"]):
        assert str(d["path"]).startswith(str(fs.path))


def test_file_filters(test_dir):
    """
    Make sure multiple file_filters patterns work correctly
    """
    # here, we should get 2 input.in files and the file_2_levels_deep.json
    # the store's FileStore.json should be skipped even though .json is
    # in the file patterns
    fs = FileStore(test_dir, read_only=False, file_filters=["*.in", "*.json"])
    fs.connect()
    assert len(list(fs.query())) == 3


def test_read_only(test_dir):
    """
    Make sure nothing is written to a read-only FileStore and that
    documents cannot be deleted
    """
    with pytest.warns(UserWarning, match="JSON file 'random.json' not found"):
        fs = FileStore(test_dir, read_only=True, json_name="random.json")
        fs.connect()
    assert not Path(test_dir / "random.json").exists()
    file_id = fs.query_one()["file_id"]
    with pytest.raises(StoreError, match="read-only"):
        fs.update({"file_id": file_id, "tags": "something"})
    with pytest.raises(StoreError, match="read-only"):
        fs.remove_docs({})


def test_query(test_dir):
    """
    - File contents should be read unless the file is too large
    - size and path keys should not be returned unless explicitly requested
    - querying on 'contents' should raise a warning
    - contents should be replaced with a placeholder message if the file exceeds
      the size limit
    - properties=None should return contents, size, and path (along with
      everything else)
    """
    fs = FileStore(test_dir, read_only=True)
    fs.connect()
    d = fs.query_one(
        {"name": "input.in", "parent": "calculation1"},
        properties=["file_id", "contents"],
    )
    assert not d.get("size")
    assert not d.get("path")
    assert d.get("file_id")
    assert d.get("contents")
    assert "This is the file named input.in" in d["contents"]

    d = fs.query_one(
        {"name": "input.in", "parent": "calculation1"},
        properties=None,
    )
    assert d.get("size")
    assert d.get("path")
    assert d.get("file_id")
    assert d.get("contents")

    with pytest.warns(UserWarning, match="'contents' is not a queryable field!"):
        fs.query_one({"contents": {"$regex": "input.in"}})

    d = fs.query_one(
        {"name": "input.in", "parent": "calculation1"},
        properties=["name", "contents"],
        contents_size_limit=50,
    )
    assert d["contents"] == "File exceeds size limit of 50 bytes"
    assert d.get("name")


def test_remove(test_dir):
    """
    Test behavior of remove_docs()
    """
    fs = FileStore(test_dir, read_only=False)
    fs.connect()
    paths = [d["path"] for d in fs.query()]
    with pytest.raises(StoreError, match="about to delete 6 items"):
        fs.remove_docs({})
    fs.remove_docs({"name": "input.in"}, confirm=True)
    assert len(list(fs.query())) == 4
    assert not Path(test_dir / "calculation1" / "input.in").exists()
    assert not Path(test_dir / "calculation2" / "input.in").exists()
    fs.remove_docs({}, confirm=True)
    assert not any(Path(p).exists() for p in paths)


def test_metadata(test_dir):
    """
    1. init a FileStore
    2. add some metadata to both 'input.in' files
    3. confirm metadata written to .json
    4. close the store, init a new one
    5. confirm metadata correctly associated with the files
    """
    fs = FileStore(test_dir, read_only=False, last_updated_field="last_change")
    fs.connect()
    query = {"name": "input.in", "parent": "calculation1"}
    key = next(iter(fs.query(query)))[fs.key]
    fs.add_metadata(
        {
            "metadata": {"experiment date": "2022-01-18"},
            fs.last_updated_field: "this should not be here",
        },
        query,
    )

    # make sure metadata has been added to the item without removing other contents
    item_from_store = next(iter(fs.query({"file_id": key})))
    assert item_from_store.get("name", False)
    assert item_from_store.get("metadata", False)
    fs.close()

    # only the updated item should have been written to the JSON,
    # and it should not contain any of the protected keys
    data = fs.metadata_store.read_json_file(fs.path / fs.json_name)
    assert len(data) == 1
    item_from_file = next(d for d in data if d["file_id"] == key)
    assert item_from_file["metadata"] == {"experiment date": "2022-01-18"}
    assert not item_from_file.get("name")
    assert not item_from_file.get("path")
    assert not item_from_file.get(fs.last_updated_field)
    assert item_from_file.get("path_relative")

    # make sure metadata is preserved after reconnecting
    fs2 = FileStore(test_dir, read_only=True)
    fs2.connect()
    data = fs2.metadata_store.read_json_file(fs2.path / fs2.json_name)
    item_from_file = next(d for d in data if d["file_id"] == key)
    assert item_from_file["metadata"] == {"experiment date": "2022-01-18"}

    # make sure reconnected store properly merges in the metadata
    item_from_store = next(iter(fs2.query({"file_id": key})))
    assert item_from_store["name"] == "input.in"
    assert item_from_store["parent"] == "calculation1"
    assert item_from_store.get("metadata") == {"experiment date": "2022-01-18"}
    fs2.close()

    # make sure reconnecting with read_only=False doesn't remove metadata from the JSON
    fs3 = FileStore(test_dir, read_only=False)
    fs3.connect()
    data = fs3.metadata_store.read_json_file(fs3.path / fs3.json_name)
    item_from_file = next(d for d in data if d["file_id"] == key)
    assert item_from_file["metadata"] == {"experiment date": "2022-01-18"}
    item_from_store = next(iter(fs3.query({"file_id": key})))
    assert item_from_store["name"] == "input.in"
    assert item_from_store["parent"] == "calculation1"
    assert item_from_store.get("metadata") == {"experiment date": "2022-01-18"}
    fs3.close()

    # test automatic metadata assignment
    def add_data_from_name(d):
        return {"calc_name": d["name"][0:5]}

    fs4 = FileStore(test_dir, read_only=False)
    fs4.connect()
    # apply the auto function to all records
    fs4.add_metadata(auto_data=add_data_from_name)
    for d in fs4.query():
        print(d)
        assert d.get("calc_name", False) == d["name"][0:5]


def test_json_name(test_dir):
    """
    Make sure custom .json name works
    """
    fs = FileStore(test_dir, read_only=False, json_name="random.json")
    fs.connect()
    assert Path(test_dir / "random.json").exists()


def test_this_dir():
    """
    Make sure connect() works when path is "."
    """
    fs = FileStore(".")
    fs.connect()
    assert not fs.name.endswith(".")


def test_encoding():
    """
    Make sure custom encoding works
    """
    fs = FileStore(".", read_only=False, encoding="utf8")
    fs.connect()
    assert Path("FileStore.json").exists()