File: fastareader.py

package info (click to toggle)
python-dendropy 4.2.0%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 68,392 kB
  • ctags: 3,947
  • sloc: python: 41,840; xml: 1,400; makefile: 15
file content (145 lines) | stat: -rw-r--r-- 7,333 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
#! /usr/bin/env python

##############################################################################
##  DendroPy Phylogenetic Computing Library.
##
##  Copyright 2010-2015 Jeet Sukumaran and Mark T. Holder.
##  All rights reserved.
##
##  See "LICENSE.rst" for terms and conditions of usage.
##
##  If you use this work or any portion thereof in published work,
##  please cite it as:
##
##     Sukumaran, J. and M. T. Holder. 2010. DendroPy: a Python library
##     for phylogenetic computing. Bioinformatics 26: 1569-1571.
##
##############################################################################

"""
Implementation of FASTA-format data reader.
"""

from dendropy.dataio import ioservice
from dendropy.utility.error import DataParseError
from dendropy.utility import deprecate

class FastaReader(ioservice.DataReader):
    """
    Encapsulates loading and parsing of a FASTA format file.

    Each line starting with ``>`` opens a new sequence whose name is the
    remainder of that line (surrounding whitespace removed); every following
    non-blank line contributes symbols to that sequence until the next ``>``
    line. Blank lines and whitespace inside sequence lines are ignored.
    """

    def __init__(self, **kwargs):
        """
        Keyword Arguments
        -----------------
        data_type: str
            When reading into a |DataSet| object, the type of data must be
            specified: "dna", "rna", "protein", "restriction", "infinite",
            "standard", or "continuous".
        default_state_alphabet: |StateAlphabet| instance
            A |StateAlphabet| object to be used to manage the alphabet of the
            characters (|StandardCharacterMatrix| **only**).

        Raises
        ------
        ValueError
            If ``default_state_alphabet`` is given together with a
            ``data_type`` other than "standard".
        """
        ioservice.DataReader.__init__(self)
        self.data_type = kwargs.pop("data_type", None)
        self.default_state_alphabet = kwargs.pop("default_state_alphabet", None)
        if self.default_state_alphabet is not None:
            # A custom alphabet implies "standard" data when no type is given;
            # pairing it with any other explicit type is rejected.
            if self.data_type is None:
                self.data_type = "standard"
            elif self.data_type != "standard":
                raise ValueError("Cannot specify 'default_state_alphabet' with data type of '{}'".format(self.data_type))
        # Whatever is left in ``kwargs`` was not consumed above.
        self.check_for_unused_keyword_arguments(kwargs)

    def _read(self,
            stream,
            taxon_namespace_factory=None,
            tree_list_factory=None,
            char_matrix_factory=None,
            state_alphabet_factory=None,
            global_annotations_target=None):
        """
        Parses FASTA-formatted text from ``stream`` into one character matrix.

        Parameters
        ----------
        stream : iterable of str
            Source of the FASTA text, consumed line by line.
        taxon_namespace_factory : callable
            Called as ``taxon_namespace_factory(label=None)`` to obtain the
            taxon namespace in which sequence names are registered.
        tree_list_factory : callable
            Not used by this reader.
        char_matrix_factory : callable
            Called with the data type as first argument plus ``label`` and
            ``taxon_namespace`` keywords (and ``default_state_alphabet`` for
            "standard" data with a custom alphabet) to obtain the character
            matrix that is populated.
        state_alphabet_factory : callable
            Not used by this reader.
        global_annotations_target : object
            Not used by this reader.

        Returns
        -------
        A ``self.Product`` instance whose ``char_matrices`` is a one-element
        list holding the populated matrix; ``taxon_namespaces`` and
        ``tree_lists`` are |None|.

        Raises
        ------
        TypeError
            If no data type was specified for the reader.
        DataParseError
            If a sequence name is repeated, a sequence name is directly
            followed by another sequence name, sequence data appears before
            any sequence name, or a symbol is not in the state alphabet.
        """
        # Validate the reader's configuration before invoking any of the
        # caller-supplied factories, so that a misconfigured read fails
        # without a namespace or matrix having been requested first.
        if self.data_type is None:
            raise TypeError("Data type must be specified for this schema")
        taxon_namespace = taxon_namespace_factory(label=None)
        matrix_kwargs = {
            "label": None,
            "taxon_namespace": taxon_namespace,
        }
        if self.data_type == "standard" and self.default_state_alphabet is not None:
            matrix_kwargs["default_state_alphabet"] = self.default_state_alphabet
        char_matrix = char_matrix_factory(self.data_type, **matrix_kwargs)
        symbol_state_map = char_matrix.default_state_alphabet.full_symbol_state_map
        curr_vec = None
        curr_taxon = None
        # Line numbers reported in errors are 1-based.
        for line_num, line in enumerate(stream, 1):
            s = line.strip()
            if not s:
                continue
            if s.startswith('>'):
                name = s[1:].strip()
                curr_taxon = taxon_namespace.require_taxon(label=name)
                if curr_taxon in char_matrix:
                    raise DataParseError(message="FASTA error: Repeated sequence name ('{}') found".format(name), line_num=line_num, stream=stream)
                # The previous record must have received at least one symbol
                # before a new record may begin.
                if curr_vec is not None and len(curr_vec) == 0:
                    raise DataParseError(message="FASTA error: Expected sequence, but found another sequence name ('{}')".format(name), line_num=line_num, stream=stream)
                curr_vec = char_matrix[curr_taxon]
            elif curr_vec is None:
                raise DataParseError(message="FASTA error: Expecting a lines starting with > before sequences", line_num=line_num, stream=stream)
            else:
                # Translate the whole line first and extend the sequence only
                # once every symbol on it has been recognized.
                states = []
                for col_ind, c in enumerate(s):
                    c = c.strip()
                    if not c:
                        # Whitespace embedded in a sequence line is skipped.
                        continue
                    try:
                        state = symbol_state_map[c]
                    except KeyError:
                        raise DataParseError(message="Unrecognized sequence symbol '{}'".format(c), line_num=line_num, col_num=col_ind + 1, stream=stream)
                    states.append(state)
                curr_vec.extend(states)
        product = self.Product(
                taxon_namespaces=None,
                tree_lists=None,
                char_matrices=[char_matrix])
        return product


class DnaFastaReader(FastaReader):
    """
    Deprecated reader for the 'dnafasta' schema.

    Issues a DendroPy deprecation warning on construction and otherwise
    behaves as a |FastaReader| whose ``data_type`` is forced to "dna".
    """

    def __init__(self, **kwargs):
        """
        Accepts the same keyword arguments as ``FastaReader.__init__``; any
        ``data_type`` passed in is overridden with "dna".
        """
        legacy_usage = (
            "d = dendropy.CharacterMatrix.get_from_path(schema='dnafasta', ...)\n"
            "d = dendropy.DataSet.get_from_path(schema='dnafasta', ...)"
        )
        current_usage = (
            "d = dendropy.DnaCharacterMatrix.get(path=..., schema='fasta', ...)\n"
            "d = dendropy.DataSet.get(path=..., schema='fasta', data_type='dna', ...)"
        )
        # NOTE(review): stacklevel=7 is presumably tuned to the depth of the
        # reader-construction call chain -- confirm before changing.
        deprecate.dendropy_deprecation_warning(
                preamble="Deprecated since DendroPy 4:",
                old_construct=legacy_usage,
                new_construct=current_usage,
                stacklevel=7)
        kwargs.update(data_type="dna")
        FastaReader.__init__(self, **kwargs)

class RnaFastaReader(FastaReader):
    """
    Deprecated reader for the 'rnafasta' schema.

    Issues a DendroPy deprecation warning on construction and otherwise
    behaves as a |FastaReader| whose ``data_type`` is forced to "rna".
    """

    def __init__(self, **kwargs):
        """
        Accepts the same keyword arguments as ``FastaReader.__init__``; any
        ``data_type`` passed in is overridden with "rna".
        """
        legacy_usage = (
            "d = dendropy.CharacterMatrix.get_from_path(schema='rnafasta', ...)\n"
            "d = dendropy.DataSet.get_from_path(schema='rnafasta', ...)"
        )
        current_usage = (
            "d = dendropy.RnaCharacterMatrix.get(path=..., schema='fasta', ...)\n"
            "d = dendropy.DataSet.get(path=..., schema='fasta', data_type='rna', ...)"
        )
        # NOTE(review): stacklevel=7 is presumably tuned to the depth of the
        # reader-construction call chain -- confirm before changing.
        deprecate.dendropy_deprecation_warning(
                preamble="Deprecated since DendroPy 4:",
                old_construct=legacy_usage,
                new_construct=current_usage,
                stacklevel=7)
        kwargs.update(data_type="rna")
        FastaReader.__init__(self, **kwargs)

class ProteinFastaReader(FastaReader):
    """
    Deprecated reader for the 'proteinfasta' schema.

    Issues a DendroPy deprecation warning on construction and otherwise
    behaves as a |FastaReader| whose ``data_type`` is forced to "protein".
    """

    def __init__(self, **kwargs):
        """
        Accepts the same keyword arguments as ``FastaReader.__init__``; any
        ``data_type`` passed in is overridden with "protein".
        """
        legacy_usage = (
            "d = dendropy.CharacterMatrix.get_from_path(schema='proteinfasta', ...)\n"
            "d = dendropy.DataSet.get_from_path(schema='proteinfasta', ...)"
        )
        current_usage = (
            "d = dendropy.ProteinCharacterMatrix.get(path=..., schema='fasta', ...)\n"
            "d = dendropy.DataSet.get(path=..., schema='fasta', data_type='protein', ...)"
        )
        # NOTE(review): stacklevel=7 is presumably tuned to the depth of the
        # reader-construction call chain -- confirm before changing.
        deprecate.dendropy_deprecation_warning(
                preamble="Deprecated since DendroPy 4:",
                old_construct=legacy_usage,
                new_construct=current_usage,
                stacklevel=7)
        kwargs.update(data_type="protein")
        FastaReader.__init__(self, **kwargs)