File: location.py

package info (click to toggle)
python-ihm 2.7-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 3,368 kB
  • sloc: python: 30,422; ansic: 5,990; sh: 24; makefile: 20
file content (367 lines) | stat: -rw-r--r-- 14,861 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
"""Classes for tracking external data used by mmCIF models.
"""

import os


class Location:
    """Identifies the location where a resource can be found.

       Do not use this class itself, but one of its subclasses.
       Typically the resource may be found in a file (either on the local
       disk or at a DOI) - for this use one of the subclasses of
       :class:`FileLocation`. Alternatively the resource may be found in
       an experiment-specific database such as PDB or EMDB - for this use
       :class:`DatabaseLocation` or one of its subclasses. A Location may
       be passed to

         - a :class:`~ihm.dataset.Dataset` to point to where an
           experimental dataset may be found;
         - an :class:`~ihm.model.Ensemble` to point to coordinates for an
           entire ensemble, for example as a DCD file;
         - a :class:`ihm.model.LocalizationDensity` to point to an external
           localization density, for example in MRC format;
         - :data:`ihm.System.locations` to point to other files relating
           to the modeling in general, such as a modeling control script
           (:class:`WorkflowFileLocation`) or a command script for a
           visualization package such as ChimeraX
           (:class:`VisualizationFileLocation`);
         - a :class:`ihm.protocol.Step` or :class:`ihm.analysis.Step` to
           describe an individual modeling step;
         - or a :class:`~ihm.startmodel.StartingModel` to describe how a
           starting model was constructed.

       Two locations compare equal (and hash identically) if they are of
       the same class and every attribute named in ``_eq_keys`` matches,
       unless ``_allow_duplicates`` is set, in which case a location is
       only ever equal to itself.

       :param str details: Additional details about the dataset, if known.

    """

    # Names of the attributes that take part in equality and hashing.
    # 'details' is deliberately absent: it can differ without affecting
    # dataset equality.
    _eq_keys = []
    # If True, every instance is treated as unique (compared by identity)
    _allow_duplicates = False

    def __init__(self, details=None):
        self.details = details

    def _eq_vals(self):
        """Return the hashable value used for both equality and hashing.

           This is ``id(self)`` if duplicates are allowed, otherwise a
           tuple of the class followed by each ``_eq_keys`` attribute.
        """
        if self._allow_duplicates:
            return id(self)
        return (self.__class__,) + tuple(getattr(self, key)
                                         for key in self._eq_keys)

    def __eq__(self, other):
        # We can never be equal to None
        if other is None:
            return False
        # Objects that cannot provide comparison values (e.g. str, int)
        # are not comparable with us; defer to Python's default handling
        # (which yields "not equal") rather than raising AttributeError.
        other_eq_vals = getattr(other, '_eq_vals', None)
        if other_eq_vals is None:
            return NotImplemented
        return self._eq_vals() == other_eq_vals()

    def __hash__(self):
        return hash(self._eq_vals())


class DatabaseLocation(Location):
    """A dataset stored in an official database (PDB, EMDB, PRIDE, etc.).

       Normally a database-specific subclass should be used instead - for
       example, :class:`PDBLocation`, :class:`EMDBLocation`, or
       :class:`PRIDELocation`. This base class can however be used directly
       for "other" databases not currently supported by the IHM dictionary.

       :param str db_code: The accession code inside the database
              (stored as the ``access_code`` attribute).
       :param str version: The version of the dataset in the database.
       :param str details: Additional details about the dataset, if known.
    """

    # Database name, accession code and version all take part in equality
    _eq_keys = [*Location._eq_keys, 'db_name', 'access_code', 'version']

    # Name of the database; subclasses override this
    db_name = 'Other'

    def __init__(self, db_code, version=None, details=None):
        super().__init__(details)
        self.version = version
        self.access_code = db_code

    def __str__(self):
        cls = self.__class__
        return "<%s.%s(%r)>" % (self.__module__, cls.__name__,
                                self.access_code)


class EMDBLocation(DatabaseLocation):
    """Something stored in the EMDB database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'EMDB'


class PDBLocation(DatabaseLocation):
    """Something stored in the PDB database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'PDB'


class PDBDevLocation(DatabaseLocation):
    """Something stored in the PDB-Dev database.

       This should only be used for legacy entries. All former PDB-Dev
       entries (now PDB-IHM) should now have PDB identifiers; use
       :class:`PDBLocation` instead.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'PDB-Dev'


class ModelArchiveLocation(DatabaseLocation):
    """Something stored in Model Archive.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'MODEL ARCHIVE'


class BMRBLocation(DatabaseLocation):
    """Something stored in the BMRB database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'BMRB'


class MassIVELocation(DatabaseLocation):
    """Something stored in the MassIVE database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'MASSIVE'


class EMPIARLocation(DatabaseLocation):
    """Something stored in the EMPIAR database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'EMPIAR'


class SASBDBLocation(DatabaseLocation):
    """Something stored in the SASBDB database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'SASBDB'


class PRIDELocation(DatabaseLocation):
    """Something stored in the PRIDE database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'PRIDE'


class JPOSTLocation(DatabaseLocation):
    """Something stored in the jPOST database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation.
    # Note that it is spelled 'jPOSTrepo', unlike the class name.
    db_name = 'jPOSTrepo'


class BioGRIDLocation(DatabaseLocation):
    """Something stored in the BioGRID database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'BioGRID'


class ProXLLocation(DatabaseLocation):
    """Something stored in the ProXL database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'ProXL'


class IProXLocation(DatabaseLocation):
    """Something stored in the iProX database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'iProX'


class AlphaFoldDBLocation(DatabaseLocation):
    """Something stored in the AlphaFoldDB database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'AlphaFoldDB'


class ProteomeXchangeLocation(DatabaseLocation):
    """Something stored in the ProteomeXchange database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'ProteomeXchange'


class BMRbigLocation(DatabaseLocation):
    """Something stored in the BMRbig database.

       See :class:`DatabaseLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Database name; replaces the 'Other' default from DatabaseLocation
    db_name = 'BMRbig'


class FileLocation(Location):
    """Base class for an individual file or directory stored externally.

       If no repository is given, the path must exist on the local disk;
       its size is recorded and the path is made absolute. If a repository
       is given, the path is stored as-is and the file size is unknown.

       :param str path: the location of the file or directory (this can
              be `None` if `repo` is set, to refer to the entire repository)
       :param repo: object that describes the repository
              containing the file, or `None` if it is stored on the local disk
       :type repo: :class:`Repository`
       :param str details: optional description of the file
       :param str file_format: optional file type (e.g. TXT, PNG, FASTA)
       :raises ValueError: if `repo` is not set and `path` does not exist.
    """

    # Repository, path and content type all take part in equality
    _eq_keys = [*Location._eq_keys, 'repo', 'path', 'content_type']

    # Kind of content in the file; subclasses override this
    content_type = 'Other'

    def __init__(self, path, repo=None, details=None, file_format=None):
        super().__init__(details)
        self.repo = repo
        self.file_format = file_format
        if repo:
            # The file lives in a remote repository, so we can neither
            # check that it exists nor determine its size
            self.path = path
            self.file_size = None
            return
        # Local file: it must be present so that it can be measured
        if not os.path.exists(path):
            raise ValueError("%s does not exist" % path)
        self.file_size = os.stat(path).st_size
        # Keep the absolute path, since the working directory
        # may change later
        self.path = os.path.abspath(path)

    def __str__(self):
        cls = self.__class__
        return "<%s.%s(%r)>" % (self.__module__, cls.__name__, self.path)


class InputFileLocation(FileLocation):
    """An externally stored file used as input.

       See :class:`FileLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.

       For example, any :class:`~ihm.dataset.Dataset` that isn't stored in
       a domain-specific database would use this class.
    """
    # Content type; replaces the 'Other' default from FileLocation
    content_type = 'Input data or restraints'


class OutputFileLocation(FileLocation):
    """An externally stored file used for output.

       See :class:`FileLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.

       For example, this can be used to point to an externally-stored
       :class:`model ensemble <ihm.model.Ensemble>` or a
       :class:`localization density <ihm.model.LocalizationDensity>`.
    """
    # Content type; replaces the 'Other' default from FileLocation
    content_type = "Modeling or post-processing output"


class WorkflowFileLocation(FileLocation):
    """An externally stored file that controls the workflow (e.g. a script).

       See :class:`FileLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.

       Typically these objects are used to provide more information on how
       a :class:`~ihm.startmodel.StartingModel` was generated, how an
       individual :class:`ihm.protocol.Step` or :class:`ihm.analysis.Step`
       was performed, or to describe the overall modeling (by addition
       to :data:`ihm.System.locations`). This can be useful to capture fine
       details of the modeling that aren't covered by the mmCIF dictionary,
       and to allow models to be precisely reproduced.
    """
    # Content type; replaces the 'Other' default from FileLocation
    content_type = "Modeling workflow or script"


class VisualizationFileLocation(FileLocation):
    """An externally stored file that is used for visualization.

       See :class:`FileLocation` for a description of the parameters
       and :class:`Location` for discussion of the usage of these objects.
    """
    # Content type; replaces the 'Other' default from FileLocation
    content_type = "Visualization script"


class Repository:
    """A repository containing modeling files, i.e. a collection of related
       files at a remote, public location. This can include code repositories
       such as GitHub, file archival services such as Zenodo, or any other
       service that provides a DOI, such as the supplementary information for
       a publication.

       This can also be used if the script plus related files are part of a
       repository, which has been archived somewhere with a DOI.
       This will be used to construct permanent references to files
       used in this modeling, even if they haven't been uploaded to
       a database such as PDB or EMDB.
       See :meth:`ihm.System.update_locations_in_repositories`.

       See also :class:`FileLocation`.

       :param str doi: the Digital Object Identifier for the repository
       :param str root: the path on the local disk to the top-level
              directory of the repository, or `None` if files in this
              repository aren't checked out.
       :param str url: If given, a location that this repository can be
              downloaded from.
       :param str top_directory: If given, prefix all paths for files in
              this repository with this value. This is useful when the
              archived version of the repository is found in a subdirectory
              at the URL or DOI (for example, GitHub repositories
              archived at Zenodo get placed in a subdirectory named
              for the repository and git hash).
       :param str details: Additional text describing this repository
    """

    reference_type = 'DOI'

    # Two repositories compare equal if their DOIs and URLs are the same
    def __eq__(self, other):
        # Anything that is not a Repository (notably None, which is the
        # 'repo' of a local FileLocation) is simply "not equal"; defer to
        # Python's default handling rather than failing on other.doi
        if not isinstance(other, Repository):
            return NotImplemented
        return self.doi == other.doi and self.url == other.url

    def __hash__(self):
        return hash((self.doi, self.url))

    def __str__(self):
        return "<ihm.location.Repository(%r)>" % self.doi

    def __init__(self, doi, root=None, url=None, top_directory=None,
                 details=None):
        # todo: DOI should be optional (could also use URL, local path)
        self.doi = doi
        self.url, self.top_directory = url, top_directory
        self.details = details
        if root is not None:
            # Store absolute path in case the working directory changes later
            self._root = os.path.abspath(root)

    # The reference for this repository is simply its DOI
    reference = property(lambda self: self.doi)

    def __get_reference_provider(self):
        """Return 'Zenodo' if the reference mentions zenodo, else None."""
        if self.reference and 'zenodo' in self.reference:
            return 'Zenodo'
        return None
    reference_provider = property(__get_reference_provider)

    def __get_refers_to(self):
        """Return 'Archive' for a .zip URL, 'File' for any other URL,
           or 'Other' if no URL is set."""
        if self.url:
            return 'Archive' if self.url.endswith(".zip") else 'File'
        return 'Other'
    refers_to = property(__get_refers_to)

    @staticmethod
    def _update_in_repos(fileloc, repos):
        """If the given FileLocation maps to somewhere within one of the
           passed repositories, update it to reflect that.

           Locations that already have a repository are left untouched,
           and repositories without a local root (not checked out) are
           skipped since no local path can lie within them.
        """
        if fileloc.repo:
            return
        orig_path = fileloc.path
        # Prefix of any relative path that climbs out of the repository
        outside_prefix = os.pardir + os.sep
        for repo in repos:
            # _root is only set if the repository was given a local root
            root = getattr(repo, '_root', None)
            if root is None:
                continue
            try:
                relpath = os.path.relpath(orig_path, root)
            except ValueError:
                # e.g. on Windows, path and root are on different drives
                continue
            # Test for the parent-directory *component*, not just a '..'
            # string prefix, so that a file named e.g. '..hidden' at the
            # top of the repository is still recognized as inside it
            if relpath == os.pardir or relpath.startswith(outside_prefix):
                continue
            # Prefer the shortest paths if multiple repositories can match
            if fileloc.repo is None or len(fileloc.path) > len(relpath):
                fileloc.repo = repo
                fileloc.path = relpath

    def _get_full_path(self, path):
        """Prefix the given path with our top-level directory"""
        return os.path.join(self.top_directory or "", path)