File: _format.py

package info (click to toggle)
q2-fragment-insertion 2024.5.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 776 kB
  • sloc: python: 2,004; makefile: 32; sh: 13
file content (97 lines) | stat: -rw-r--r-- 3,836 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import re

import ijson
import skbio

import qiime2.plugin.model as model
from qiime2.plugin import ValidationError

from q2_types.feature_data import AlignedDNAFASTAFormat
from q2_types.tree import NewickFormat


class PlacementsFormat(model.TextFileFormat):
    fields = {'tree', 'placements', 'metadata', 'version', 'fields'}

    def _validate_(self, level):
        # doi.org/10.1371/journal.pone.0031009
        keys_found = set()

        # Can't self.open(mode='rb'), so we defer to the backing pathlib object
        with self.path.open(mode='rb') as fh:
            root_element = None
            for prefix, event, value in ijson.parse(fh):
                if root_element is None:
                    if event != 'start_map':
                        raise ValidationError('Root element of file must be a '
                                              'JSON object')
                    else:
                        root_element = True

                # Skip parsing attributes that could be prohibitively large
                if prefix.startswith('placements') \
                        or prefix.startswith('tree'):
                    continue

                # Restricted to only checking root-level keys
                if event == 'map_key' and prefix == '':
                    keys_found.add(value)

        if keys_found != self.fields:
            raise ValidationError('Expected the following fields: %s, found '
                                  '%s.' % (sorted(self.fields),
                                           sorted(keys_found)))


PlacementsDirFmt = model.SingleFileDirectoryFormat(
    'PlacementsDirFmt', 'placements.json', PlacementsFormat)


class RAxMLinfoFormat(model.TextFileFormat):
    def _validate_(self, level):
        sigs = ['This is RAxML version', 'Base frequencies',
                'Final GAMMA likelihood']

        info = self.path.read_text()

        for sig in sigs:
            new_sig = sig.replace(r' ', r'\W+')
            if not re.search(new_sig, info):
                raise ValidationError('Missing structured content: "%s".'
                                      % sig)


class SeppReferenceDirFmt(model.DirectoryFormat):
    alignment = model.File(r'aligned-dna-sequences.fasta',
                           format=AlignedDNAFASTAFormat)
    phylogeny = model.File(r'tree.nwk', format=NewickFormat)
    raxml_info = model.File(r'raxml-info.txt', format=RAxMLinfoFormat)

    def _validate_(self, level):
        seqs = self.alignment.view(skbio.TabularMSA)
        tree = self.phylogeny.view(skbio.TreeNode)

        seqs.reassign_index(minter='id')
        alignment_ids = set(seqs.index)
        phylogeny_ids = {t.name for t in tree.tips()}

        if alignment_ids != phylogeny_ids:
            raise ValidationError('IDs found in the alignment file that are '
                                  'missing in the phylogeny file: %s. IDs '
                                  'found in the phylogeny file that are '
                                  'missing in the alignment file: %s.'
                                  % (sorted(alignment_ids - phylogeny_ids),
                                     sorted(phylogeny_ids - alignment_ids)))

        # NOTE: not worrying about validating raxml info file at present. In
        # the future we will have a method that will _run_ raxml as part of the
        # database construction process, which will guarantee that the tree
        # matches the raxml info file.