"""Module providing access to all known dataset definitions."""
from __future__ import annotations
import hashlib
import os
import textwrap
from pathlib import Path
from typing import Any
import importlib_resources
import yaml
_hashinfo_formatversion = 1
definition: dict[str, Any]
fileinfo_dirty: set[str]


def _load_yml_definitions() -> None:
    """
    Read dataset .yml files from the definitions/ and hashinfo/ directories.

    This is done once, during the module import stage.
    """
    global definition, fileinfo_dirty
    definition = {}
    fileinfo_dirty = set()
    base_directory = importlib_resources.files("dials_data") / "definitions"
    hash_directory = importlib_resources.files("dials_data") / "hashinfo"
    for definition_file in base_directory.glob("*.yml"):
        dataset_definition = definition_file.read_bytes()
        dataset_name = definition_file.stem
        definition[dataset_name] = yaml.safe_load(dataset_definition)

        # Store a checksum of the definition file so that stale hashinfo
        # records can be detected below.
        dhash = hashlib.sha256()
        dhash.update(dataset_definition)
        definition[dataset_name]["hash"] = dhash.hexdigest()

        h_file = hash_directory / definition_file.name
        if not h_file.exists():
            fileinfo_dirty.add(dataset_name)
            continue
        hashinfo = yaml.safe_load(h_file.read_bytes())
        if (
            hashinfo["definition"] == definition[dataset_name]["hash"]
            and hashinfo["formatversion"] == _hashinfo_formatversion
        ):
            definition[dataset_name]["hashinfo"] = hashinfo
        else:
            # The definition changed since the hashinfo record was generated.
            fileinfo_dirty.add(dataset_name)


_load_yml_definitions()
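# The shape of a loaded entry, sketched from the code above. Keys other than
# "hash" and "hashinfo" come from the .yml file itself; the dataset name and
# values shown here are purely illustrative, not a real dataset:
#
#     definition["example_dataset"] = {
#         "name": "Example dataset",   # plus author/license/url/description
#         "hash": "<sha256 hex digest of the .yml definition file>",
#         "hashinfo": {...},  # present only if the hashinfo record is current
#     }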


def create_integrity_record(dataset_name: str) -> dict[str, Any]:
    """
    Generate a dictionary of the integrity information for a specific dataset.
    """
    return {
        "definition": definition[dataset_name]["hash"],
        "formatversion": _hashinfo_formatversion,
    }
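# Example of a generated record (the hash value is illustrative):
#
#     >>> create_integrity_record("example_dataset")
#     {'definition': 'ab12...', 'formatversion': 1}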


def repository_location() -> Path:
    """
    Return an appropriate location where the downloaded regression data
    should be stored.

    In order of evaluation:
    * If the environment variable DIALS_DATA is set and the location exists
      or can be created, then use that location.
    * If a Diamond Light Source specific path exists, then use that location.
    * If the environment variable LIBTBX_BUILD is set and the directory
      'dials_data' exists or can be created underneath that location, then
      use that.
    * Use ~/.cache/dials_data if it exists or can be created.
    * Otherwise fail with a RuntimeError.
    """
    if os.getenv("DIALS_DATA"):
        try:
            repository = Path(os.environ["DIALS_DATA"])
            repository.mkdir(parents=True, exist_ok=True)
            return repository
        except (KeyError, TypeError, OSError):
            pass
    try:
        # Diamond Light Source specific path
        repository = Path("/dls/science/groups/scisoft/DIALS/dials_data")
        if repository.is_dir():
            return repository
    except OSError:
        pass
    if os.getenv("LIBTBX_BUILD"):
        try:
            repository = Path(os.environ["LIBTBX_BUILD"]) / "dials_data"
            repository.mkdir(parents=True, exist_ok=True)
            return repository
        except (KeyError, TypeError, OSError):
            pass
    try:
        repository = Path.home() / ".cache" / "dials_data"
        repository.mkdir(parents=True, exist_ok=True)
        return repository
    except (TypeError, OSError):
        raise RuntimeError(
            "Could not determine regression data location. Use environment variable DIALS_DATA"
        )
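# The resolved path depends entirely on the environment. On a machine where
# only the home-directory fallback applies, this might return, for example:
#
#     >>> repository_location()
#     PosixPath('/home/user/.cache/dials_data')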


def get_resident_size(ds: str) -> int:
    """Return the combined size in bytes of all files in a verified dataset,
    or 0 if the dataset is unverified."""
    if ds in fileinfo_dirty:
        return 0
    return sum(item["size"] for item in definition[ds]["hashinfo"]["verify"])


def _human_readable(num: float, suffix: str = "B") -> str:
    """Convert a raw byte count into a human-readable string, e.g. '1.5MB'."""
    for unit in ("", "k", "M", "G"):
        if num < 10:
            return f"{num:.1f}{unit}{suffix}"
        if num < 1024:
            return f"{num:.0f}{unit}{suffix}"
        num /= 1024.0
    return f"{num:.0f}T{suffix}"


def list_known_definitions(ds_list, quiet=False) -> None:
    """Print information about the given datasets; in quiet mode print only
    the dataset short names."""
    indent = " " * 4
    for shortname in sorted(ds_list):
        if quiet:
            print(shortname)
            continue
        dataset = definition[shortname]
        if shortname in fileinfo_dirty:
            size_information = "unverified dataset"
        else:
            size_information = _human_readable(get_resident_size(shortname))
        print(
            "{shortname}: {dataset[name]} ({size_information})".format(
                shortname=shortname, dataset=dataset, size_information=size_information
            )
        )
        print(
            "{indent}{author} ({license})".format(
                author=dataset.get("author", "unknown author"),
                indent=indent,
                license=dataset.get("license", "unknown license"),
            )
        )
        if dataset.get("url"):
            print("{indent}{dataset[url]}".format(indent=indent, dataset=dataset))
        print(
            "\n{}\n".format(
                textwrap.fill(
                    dataset["description"],
                    initial_indent=indent,
                    subsequent_indent=indent,
                )
            )
        )
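# Example output for a single, hypothetical dataset (all values illustrative):
#
#     example_dataset: Example dataset (118MB)
#         A. Author (CC-BY 4.0)
#         https://example.com/example_dataset
#
#         A short description of the dataset, wrapped by textwrap.fill and
#         indented by four spaces.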