1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169
|
#!/usr/bin/env python
# Copyright (c) 2005 David D. Ding <dding@berkeley.edu>
#
# This software is distributed under the MIT Open Source License.
# <http://www.opensource.org/licenses/mit-license.html>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
"""Reads Sequences in interleaved Phylip format (not sequential) and returns a
list of sequences. Phylip is a very common phylogeny generating sequence type
that has the following traits:
1) First line contains number of species and number of characters in a species'
sequence. Options may follow, and they can be spaced or unspaced. Options are
simply letters such as A and W after the number of characters.
2) Options don't have to contain U in order for a usertree to appear.
3) If there are options then options appear first, then the sequences. For the
first iteration of sequences the first ten spaces are reserved for names of
options and species, the rest is for sequences.
4) For the second and following iterations the names are removed, only
sequence appears
5) At end of file a usertree may appear. First there is a number that indicates
the number of lines the usertree will take, and then the usertrees follow.
Examples:
6 50 W
W 0101001111 0101110101 01011
dmras1 GTCGTCGTTG GACCTGGAGG CGTGG
hschras GTGGTGGTGG GCGCCGGCCG TGTGG
ddrasa GTTATTGTTG GTGGTGGTGG TGTCG
spras GTAGTTGTAG GAGATGGTGG TGTTG
scras1 GTAGTTGTCG GTGGAGGTGG CGTTG
scras2 GTCGTCGTTG GTGGTGGTGG TGTTG
0101001111 0101110101 01011
GTCGTCGTTG GACCTGGAGG CGTGG
GTGGTGGTGG GCGCCGGCCG TGTGG
GTTATTGTTG GTGGTGGTGG TGTCG
GTAGTTGTAG GAGATGGTGG TGTTG
GTAGTTGTCG GTGGAGGTGG CGTTG
GTCGTCGTTG GTGGTGGTGG TGTTG
1
((dmras1,ddrasa),((hschras,spras),(scras1,scras2)));
"""
from typing import Iterator, Optional, TextIO
from ..seq import Alphabet, Seq, SeqList
# Format names handled by this module (single-entry tuple).
# NOTE(review): presumably consulted by the package's format lookup — confirm.
names = ("phylip",)
# File extensions associated with this format, written without a leading dot.
# NOTE(review): presumably used for extension-based format detection — confirm.
extensions = ("phy",)
def iterseq(fin: TextIO, alphabet: Optional[Alphabet] = None) -> Iterator[Seq]:
    """Return an iterator over the sequences parsed from ``fin``.

    The stream is parsed in full by ``read`` before iteration starts; this
    function only wraps the parsed result in an iterator.
    """
    parsed = read(fin, alphabet)
    return iter(parsed)
def read(fin: TextIO, alphabet: Optional[Alphabet] = None) -> SeqList:
    """Parse an interleaved Phylip stream and return its sequences.

    The stream is consumed line by line with ``readline`` until it is
    exhausted. Blank lines are ignored. The first data line must hold the
    number of species and the sequence length, optionally followed by option
    letters. In the first block the first ten columns of each sequence line
    are the species name; in later blocks the whole line is sequence data.
    Whitespace inside sequence data is removed.

    Args:
        fin: Open text stream containing the Phylip data.
        alphabet: Passed through unchanged to every ``Seq`` that is built.

    Returns:
        A ``SeqList`` holding one ``Seq(sequence, alphabet, name)`` per
        species, in the order the species appear in the first block.

    Raises:
        ValueError: If the header line is missing or malformed, an expected
            option line is absent, a user tree line is met while the last
            sequence does not have the announced length, or the number or
            length of the collected sequences disagrees with the header.
    """
    sequence: list = []  # accumulated sequence text, one string per species
    idents = []  # species names taken from the first block
    num_seq = 0  # number of species announced by the header
    num_total_seq = 0  # announced length of one species' sequence
    tracker = 0  # index of the species the next sequence line belongs to
    usertree_tracker = 0  # user-tree lines still to be skipped
    options = ""  # option letters from the header, concatenated
    options_per_block = 0  # option lines preceding each block (U excluded)
    num_options = 0  # option lines still expected before the next block

    line = fin.readline()
    while line:
        s_line = line.split()

        if not s_line:
            # Blank line: nothing to do.
            pass

        elif (
            len(s_line) == 1
            and s_line[0].isdigit()
            # Guard: without it, a lone integer seen before any header
            # (sequence empty, num_seq still 0) indexed sequence[0] and
            # crashed with IndexError instead of reporting a parse error.
            and sequence
            and len(sequence) == num_seq
            and len(sequence[0]) == num_total_seq
        ):
            # A lone integer once the sequences are complete announces how
            # many user-tree lines follow.
            usertree_tracker = int(s_line[0])

        elif num_options > 0:
            # An option line is still owed before the next block.
            if len(sequence) < num_seq and s_line[0][0] not in options:
                # Before the species have been read, an option line must
                # start with one of the option letters from the header.
                raise ValueError(
                    "Not an option, but it should be one"
                )  # pragma: no cover
            num_options -= 1

        elif usertree_tracker > 0:  # basically skip usertree
            if len(sequence[num_seq - 1]) == num_total_seq:
                usertree_tracker -= 1  # pragma: no cover
            else:
                raise ValueError("User Tree in Wrong Place")

        elif s_line[0].isdigit():
            # A line starting with a number is only valid as the header,
            # i.e. before any sequence data has been collected.
            if len(s_line) >= 2 and len(sequence) == 0:
                num_seq = int(s_line[0])
                num_total_seq = int(s_line[1])
                if len(s_line) > 2:
                    # Option letters may be spaced or unspaced; U does not
                    # contribute an option line.
                    options = "".join(s_line[2:])
                    options_per_block = len(options) - options.count("U")
                    num_options = options_per_block
            else:
                raise ValueError("parse error")

        elif num_options == 0:
            # Option lines for this block are done: this is sequence data.
            if num_seq == 0:
                raise ValueError("Empty File, or possibly wrong file")
            elif tracker < num_seq:
                if num_seq > len(sequence):
                    # First block: columns 0-9 are the species name.
                    sequence.append("".join(line[10:].split()))
                    idents.append(line[0:10].strip())
                else:
                    # Later blocks: the whole line is sequence data.
                    sequence[tracker] += "".join(s_line)
                tracker += 1

                if tracker == num_seq:
                    # Block finished: start over with the first species and
                    # expect the option lines again.
                    tracker = 0
                    num_options = options_per_block

        line = fin.readline()

    if len(sequence) != len(idents) or len(sequence) != num_seq:
        raise ValueError("Number of different sequences wrong")  # pragma: no cover

    seqs = []
    for ident, residues in zip(idents, sequence):
        if len(residues) != num_total_seq:
            raise ValueError("extra sequence in list")  # pragma: no cover
        seqs.append(Seq(residues, alphabet, ident))

    return SeqList(seqs)
|