1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367
|
"""Classes for tracking external data used by mmCIF models.
"""
import os
class Location:
    """Identifies the location where a resource can be found.

    Do not use this class itself, but one of its subclasses.
    Typically the resource may be found in a file (either on the local
    disk or at a DOI) - for this use one of the subclasses of
    :class:`FileLocation`. Alternatively the resource may be found in
    an experiment-specific database such as PDB or EMDB - for this use
    :class:`DatabaseLocation` or one of its subclasses. A Location may
    be passed to

      - a :class:`~ihm.dataset.Dataset` to point to where an
        experimental dataset may be found;
      - an :class:`~ihm.model.Ensemble` to point to coordinates for an
        entire ensemble, for example as a DCD file;
      - a :class:`ihm.model.LocalizationDensity` to point to an external
        localization density, for example in MRC format;
      - :data:`ihm.System.locations` to point to other files relating
        to the modeling in general, such as a modeling control script
        (:class:`WorkflowFileLocation`) or a command script for a
        visualization package such as ChimeraX
        (:class:`VisualizationFileLocation`);
      - a :class:`ihm.protocol.Step` or :class:`ihm.analysis.Step` to
        describe an individual modeling step;
      - or a :class:`~ihm.startmodel.StartingModel` to describe how a
        starting model was constructed.

    :param str details: Additional details about the dataset, if known.
    """

    # Names of the attributes that take part in equality and hashing.
    # 'details' can differ without affecting dataset equality
    _eq_keys = []
    # If True, every instance only ever compares equal to itself
    _allow_duplicates = False

    def __init__(self, details=None):
        self.details = details

    # Locations compare equal iff they are the same class, have the
    # same attributes, and allow_duplicates=False
    def _eq_vals(self):
        """Return the hashable value used by ``__eq__`` and ``__hash__``."""
        if self._allow_duplicates:
            return id(self)
        return tuple([self.__class__]
                     + [getattr(self, x) for x in self._eq_keys])

    def __eq__(self, other):
        # We can never be equal to None, nor to any other object that is
        # not a Location; returning NotImplemented lets Python fall back to
        # the other operand (and finally to identity) rather than raising
        # AttributeError on the missing _eq_vals method.
        if not isinstance(other, Location):
            return NotImplemented
        return self._eq_vals() == other._eq_vals()

    def __hash__(self):
        return hash(self._eq_vals())
class DatabaseLocation(Location):
    """Location of a dataset held in an official database.

    Examples of such databases are PDB, EMDB and PRIDE. Generally the
    subclass specific to the database should be used - for example,
    :class:`PDBLocation`, :class:`EMDBLocation`, or
    :class:`PRIDELocation` - although this base class can be used
    directly for "other" databases not currently supported by the
    IHM dictionary.

    :param str db_code: The accession code inside the database.
    :param str version: The version of the dataset in the database.
    :param str details: Additional details about the dataset, if known.
    """

    # Name of the database
    db_name = 'Other'
    # Attributes that must match for two database locations to be equal
    _eq_keys = [*Location._eq_keys, 'db_name', 'access_code', 'version']

    def __init__(self, db_code, version=None, details=None):
        super().__init__(details)
        self.version = version
        self.access_code = db_code

    def __str__(self):
        class_name = self.__class__.__name__
        return "<%s.%s(%r)>" % (self.__module__, class_name,
                                self.access_code)
class EMDBLocation(DatabaseLocation):
    """A resource held in the EMDB database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "EMDB"
class PDBLocation(DatabaseLocation):
    """A resource held in the PDB database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "PDB"
class PDBDevLocation(DatabaseLocation):
    """A resource held in the PDB-Dev database.

    Use this only for legacy entries. All former PDB-Dev entries
    (now PDB-IHM) should now have PDB identifiers, so
    :class:`PDBLocation` should be used instead.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "PDB-Dev"
class ModelArchiveLocation(DatabaseLocation):
    """A resource held in Model Archive.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "MODEL ARCHIVE"
class BMRBLocation(DatabaseLocation):
    """A resource held in the BMRB database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "BMRB"
class MassIVELocation(DatabaseLocation):
    """A resource held in the MassIVE database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "MASSIVE"
class EMPIARLocation(DatabaseLocation):
    """A resource held in the EMPIAR database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "EMPIAR"
class SASBDBLocation(DatabaseLocation):
    """A resource held in the SASBDB database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "SASBDB"
class PRIDELocation(DatabaseLocation):
    """A resource held in the PRIDE database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "PRIDE"
class JPOSTLocation(DatabaseLocation):
    """A resource held in the JPOST database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "jPOSTrepo"
class BioGRIDLocation(DatabaseLocation):
    """A resource held in the BioGRID database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "BioGRID"
class ProXLLocation(DatabaseLocation):
    """A resource held in the ProXL database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "ProXL"
class IProXLocation(DatabaseLocation):
    """A resource held in the iProX database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "iProX"
class AlphaFoldDBLocation(DatabaseLocation):
    """A resource held in the AlphaFoldDB database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "AlphaFoldDB"
class ProteomeXchangeLocation(DatabaseLocation):
    """A resource held in the ProteomeXchange database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "ProteomeXchange"
class BMRbigLocation(DatabaseLocation):
    """A resource held in the BMRbig database.

    The parameters are described in :class:`DatabaseLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    db_name = "BMRbig"
class FileLocation(Location):
    """Base class for a single externally stored file or directory.

    :param str path: the location of the file or directory (this can
           be `None` if `repo` is set, to refer to the entire repository)
    :param repo: object that describes the repository containing the
           file, or `None` if it is stored on the local disk
    :type repo: :class:`Repository`
    :param str details: optional description of the file
    :param str file_format: optional file type (e.g. TXT, PNG, FASTA)
    :raises ValueError: if no `repo` is given and `path` does not exist
           on the local disk
    """

    content_type = 'Other'
    # Attributes that must match for two file locations to be equal
    _eq_keys = Location._eq_keys + ['repo', 'path', 'content_type']

    def __init__(self, path, repo=None, details=None, file_format=None):
        super().__init__(details)
        self.repo = repo
        self.file_format = file_format
        if repo:
            # The path is stored as given; the file size cannot be
            # determined for a non-local file
            self.path, self.file_size = path, None
            return
        # Local file: it must exist so that its size can be recorded
        if not os.path.exists(path):
            raise ValueError("%s does not exist" % path)
        self.file_size = os.stat(path).st_size
        # Keep the absolute path in case the working directory changes later
        self.path = os.path.abspath(path)

    def __str__(self):
        class_name = self.__class__.__name__
        return "<%s.%s(%r)>" % (self.__module__, class_name, self.path)
class InputFileLocation(FileLocation):
    """A file stored externally that is used as input.

    The parameters are described in :class:`FileLocation`; see
    :class:`Location` for discussion of the usage of these objects.

    For example, any :class:`~ihm.dataset.Dataset` that isn't stored in
    a domain-specific database would use this class.
    """

    content_type = "Input data or restraints"
class OutputFileLocation(FileLocation):
    """A file stored externally that is used for output.

    The parameters are described in :class:`FileLocation`; see
    :class:`Location` for discussion of the usage of these objects.

    For example, this can be used to point to an externally-stored
    :class:`model ensemble <ihm.model.Ensemble>` or a
    :class:`localization density <ihm.model.LocalizationDensity>`.
    """

    content_type = 'Modeling or post-processing output'
class WorkflowFileLocation(FileLocation):
    """A file stored externally that controls the workflow (e.g. a script).

    The parameters are described in :class:`FileLocation`; see
    :class:`Location` for discussion of the usage of these objects.

    Typically these objects are used to provide more information on how
    a :class:`~ihm.startmodel.StartingModel` was generated, how an
    individual :class:`ihm.protocol.Step` or :class:`ihm.analysis.Step`
    was performed, or to describe the overall modeling (by addition
    to :data:`ihm.System.locations`). This can be useful to capture fine
    details of the modeling that aren't covered by the mmCIF dictionary,
    and to allow models to be precisely reproduced.
    """

    content_type = 'Modeling workflow or script'
class VisualizationFileLocation(FileLocation):
    """A file stored externally that is used for visualization.

    The parameters are described in :class:`FileLocation`; see
    :class:`Location` for discussion of the usage of these objects.
    """

    content_type = 'Visualization script'
class Repository:
    """A repository containing modeling files, i.e. a collection of related
    files at a remote, public location. This can include code repositories
    such as GitHub, file archival services such as Zenodo, or any other
    service that provides a DOI, such as the supplementary information for
    a publication.

    This can also be used if the script plus related files are part of a
    repository, which has been archived somewhere with a DOI.

    This will be used to construct permanent references to files
    used in this modeling, even if they haven't been uploaded to
    a database such as PDB or EMDB.

    See :meth:`ihm.System.update_locations_in_repositories`.

    See also :class:`FileLocation`.

    :param str doi: the Digital Object Identifier for the repository
    :param str root: the path on the local disk to the top-level
           directory of the repository, or `None` if files in this
           repository aren't checked out.
    :param str url: If given, a location that this repository can be
           downloaded from.
    :param str top_directory: If given, prefix all paths for files in
           this repository with this value. This is useful when the
           archived version of the repository is found in a subdirectory
           at the URL or DOI (for example, GitHub repositories
           archived at Zenodo get placed in a subdirectory named
           for the repository and git hash).
    :param str details: Additional text describing this repository
    """

    reference_type = 'DOI'

    def __init__(self, doi, root=None, url=None, top_directory=None,
                 details=None):
        # todo: DOI should be optional (could also use URL, local path)
        self.doi = doi
        self.url, self.top_directory = url, top_directory
        self.details = details
        if root is not None:
            # Store absolute path in case the working directory changes later
            self._root = os.path.abspath(root)

    # Two repositories compare equal if their DOIs and URLs are the same
    def __eq__(self, other):
        # A repository is never equal to None or to an unrelated object.
        # Returning NotImplemented (rather than touching other.doi) matters
        # because FileLocation compares 'repo' attributes, one of which
        # may be None for a file on the local disk.
        if not isinstance(other, Repository):
            return NotImplemented
        return self.doi == other.doi and self.url == other.url

    def __hash__(self):
        return hash((self.doi, self.url))

    def __str__(self):
        return "<ihm.location.Repository(%r)>" % self.doi

    @property
    def reference(self):
        """The reference for this repository (its DOI)."""
        return self.doi

    @property
    def reference_provider(self):
        """'Zenodo' if the reference contains 'zenodo', otherwise None."""
        if self.reference and 'zenodo' in self.reference:
            return 'Zenodo'
        return None

    @property
    def refers_to(self):
        """'Archive' if the URL ends in ".zip", 'File' for any other URL,
        or 'Other' if no URL is set."""
        if self.url:
            return 'Archive' if self.url.endswith(".zip") else 'File'
        return 'Other'

    @staticmethod
    def _update_in_repos(fileloc, repos):
        """If the given FileLocation maps to somewhere within one of the
        passed repositories, update it to reflect that.

        Locations that already have a repository are left untouched, and
        repositories that were created without a local `root` are
        skipped, since no local path can be inside them.
        """
        if fileloc.repo:
            return
        orig_path = fileloc.path
        outside_prefix = os.pardir + os.sep
        for repo in repos:
            root = getattr(repo, '_root', None)
            if root is None:
                # Repository is not checked out locally
                continue
            relpath = os.path.relpath(orig_path, root)
            # The file is outside the repository only if the relative path
            # climbs to a parent directory; a plain startswith('..') test
            # would also reject in-repository names such as '..hidden'.
            if relpath == os.pardir or relpath.startswith(outside_prefix):
                continue
            # Prefer the shortest paths if multiple repositories can match
            if fileloc.repo is None or len(fileloc.path) > len(relpath):
                fileloc.repo = repo
                fileloc.path = relpath

    def _get_full_path(self, path):
        """Prefix the given path with our top-level directory"""
        return os.path.join(self.top_directory or "", path)
|