File: datasets.py

"""Module providing access to all known dataset definitions."""

from __future__ import annotations

import hashlib
import os
import textwrap
from pathlib import Path
from typing import Any

import importlib_resources
import yaml

_hashinfo_formatversion = 1

# Populated once at import time by _load_yml_definitions() below:
definition: dict[str, Any]
fileinfo_dirty: set[str]


def _load_yml_definitions() -> None:
    """
    Read dataset .yml files from definitions/ and hashinfo/ directories.
    This is done once during the module import stage.
    """
    global definition, fileinfo_dirty
    definition = {}
    fileinfo_dirty = set()
    base_directory = importlib_resources.files("dials_data") / "definitions"
    hash_directory = importlib_resources.files("dials_data") / "hashinfo"
    for definition_file in base_directory.glob("*.yml"):
        dataset_definition = definition_file.read_bytes()
        dataset_name = definition_file.stem
        definition[dataset_name] = yaml.safe_load(dataset_definition)
        dhash = hashlib.sha256()
        dhash.update(dataset_definition)
        definition[dataset_name]["hash"] = dhash.hexdigest()

        h_file = hash_directory / definition_file.name
        if not h_file.exists():
            fileinfo_dirty.add(dataset_name)
            continue
        hashinfo = yaml.safe_load(h_file.read_bytes())
        if (
            hashinfo["definition"] == definition[dataset_name]["hash"]
            and hashinfo["formatversion"] == _hashinfo_formatversion
        ):
            definition[dataset_name]["hashinfo"] = hashinfo
        else:
            fileinfo_dirty.add(dataset_name)


_load_yml_definitions()
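
# Illustrative sketch (not part of the upstream code): after the call above,
# a definition entry for a hypothetical dataset "example_dataset" whose
# hashinfo file is present and up to date would look roughly like
#
#   definition["example_dataset"] == {
#       # keys copied from definitions/example_dataset.yml, e.g.:
#       "name": "Example dataset",
#       "description": "Free-text description of the dataset...",
#       # added by _load_yml_definitions():
#       "hash": "<sha256 hex digest of the .yml definition file>",
#       "hashinfo": {
#           "definition": "<the same sha256 digest>",
#           "formatversion": 1,
#           "verify": [{"size": 12345, ...}, ...],  # per-file records, summed below
#       },
#   }
#
# Datasets whose hashinfo is missing or stale end up in fileinfo_dirty instead.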


def create_integrity_record(dataset_name: str) -> dict[str, Any]:
    """
    Generate a dictionary for the integrity information of a specific dataset.
    """
    return {
        "definition": definition[dataset_name]["hash"],
        "formatversion": _hashinfo_formatversion,
    }
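
# Illustrative note: for a hypothetical dataset "example_dataset" the record
# returned above is simply
#
#   {"definition": definition["example_dataset"]["hash"], "formatversion": 1}
#
# i.e. exactly the two fields that _load_yml_definitions() compares against the
# stored hashinfo/<name>.yml file when deciding whether a dataset is dirty.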


def repository_location() -> Path:
    """
    Returns an appropriate location where the downloaded regression data should
    be stored.

    In order of evaluation:
    * If the environment variable DIALS_DATA is set and the directory it
      points to exists or can be created, use that location
    * If a Diamond Light Source specific path exists, use that location
    * If the environment variable LIBTBX_BUILD is set and the directory
      'dials_data' exists or can be created underneath that location, use
      that location
    * Use ~/.cache/dials_data if it exists or can be created
    * Otherwise fail with a RuntimeError
    """
    if os.getenv("DIALS_DATA"):
        try:
            repository = Path(os.environ["DIALS_DATA"])
            repository.mkdir(parents=True, exist_ok=True)
            return repository
        except (KeyError, TypeError, OSError):
            pass
    try:
        repository = Path("/dls/science/groups/scisoft/DIALS/dials_data")
        if repository.is_dir():
            return repository
    except OSError:
        pass
    if os.getenv("LIBTBX_BUILD"):
        try:
            repository = Path(os.environ["LIBTBX_BUILD"]) / "dials_data"
            repository.mkdir(parents=True, exist_ok=True)
            return repository
        except (KeyError, TypeError, OSError):
            pass
    try:
        repository = Path.home() / ".cache" / "dials_data"
        repository.mkdir(parents=True, exist_ok=True)
        return repository
    except (TypeError, OSError):
        raise RuntimeError(
            "Could not determine regression data location. Use environment variable DIALS_DATA"
        )
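
# Usage sketch (illustrative, not part of the upstream code): because
# DIALS_DATA is checked first, pointing it at a writable directory overrides
# every other location, e.g. on a POSIX system with a writable /tmp:
#
#   >>> import os
#   >>> os.environ["DIALS_DATA"] = "/tmp/my-dials-data"   # hypothetical path
#   >>> repository_location()
#   PosixPath('/tmp/my-dials-data')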


def get_resident_size(ds: str) -> int:
    """Return the combined size in bytes of a verified dataset, or 0 if unverified."""
    if ds in fileinfo_dirty:
        return 0
    return sum(item["size"] for item in definition[ds]["hashinfo"]["verify"])


def _human_readable(num: float, suffix: str = "B") -> str:
    """Format a number of bytes as a short human-readable string."""
    for unit in ("", "k", "M", "G"):
        if num < 10:
            return f"{num:.1f}{unit}{suffix}"
        if num < 1024:
            return f"{num:.0f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.0f}T{suffix}"
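
# Worked examples for the formatting loop above (illustrative):
#   _human_readable(532)          -> "532B"   (below 1024: no scaling applied)
#   _human_readable(5_214)        -> "5.1kB"  (5214 / 1024 ≈ 5.09, one decimal below 10)
#   _human_readable(123_456_789)  -> "118MB"  (≈ 117.7 MiB, rounded to a whole number)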


def list_known_definitions(ds_list, quiet: bool = False) -> None:
    """Print a summary of the given dataset definitions to standard output."""
    indent = " " * 4
    for shortname in sorted(ds_list):
        if quiet:
            print(shortname)
            continue
        dataset = definition[shortname]
        if shortname in fileinfo_dirty:
            size_information = "unverified dataset"
        else:
            size_information = _human_readable(get_resident_size(shortname))
        print(f"{shortname}: {dataset['name']} ({size_information})")
        author = dataset.get("author", "unknown author")
        license_text = dataset.get("license", "unknown license")
        print(f"{indent}{author} ({license_text})")
        if dataset.get("url"):
            print(f"{indent}{dataset['url']}")
        description = textwrap.fill(
            dataset["description"],
            initial_indent=indent,
            subsequent_indent=indent,
        )
        print(f"\n{description}\n")
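
# Output sketch (illustrative, hypothetical values): a verified dataset prints as
#
#   example_dataset: Example dataset (1.2GB)
#       Some Author (CC-BY 4.0)
#       https://example.com/example_dataset
#
#       Wrapped free-text description, indented by four spaces and filled to
#       the default line width by textwrap.fill().
#
# Unverified datasets show "(unverified dataset)" in place of the size.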