1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
#!/usr/bin/env python
# Copyright (c) 2005 Clare Gollnick <cgollnick@berkeley.edu>
#
# This software is distributed under the MIT Open Source License.
# <http://www.opensource.org/licenses/mit-license.html>
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
# to deal in the Software without restriction, including without limitation
# the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the
# Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
"""Read sequence information in MSF format.
MSF is a file format for biological sequence data. The sequences are interleaved, and each line is
labeled with the sequence name. The MSF format can be identified in one or more of the following
ways:
1. The word PileUp on the first line (optional)
2. The word !!AA_MULTIPLE_ALIGNMENT or !!NA_MULTIPLE_ALIGNMENT at the start of the file (optional)
3. The word MSF on the first line of the file, and the characters ".." at the end of that line
(optional)
4. A header containing sequence information, followed by a line with the characters "//"
"""
import re
from typing import Iterator, Optional, TextIO
from ..seq import Alphabet, Seq, SeqList
from ..utils import Token
# Sample MSF text: a "PileUp" line, an "MSF:" summary line, one "Name:" entry
# per sequence, the "//" end-of-header marker, and then rows that each start
# with a sequence name followed by groups of residues.
example = """
PileUp
MSF: 64 Type: P Check: 767 ..
Name: Cow Len: 100 Check: 3761 Weight: 1.00
Name: Carp Len: 100 Check: 1550 Weight: 1.00
Name: Chicken Len: 100 Check: 2397 Weight: 1.00
Name: Human Len: 100 Check: 9021 Weight: 1.00
Name: Loach Len: 100 Check: 984 Weight: 1.00
Name: Mouse Len: 100 Check: 2993 Weight: 1.00
//
Cow MAYPMQLGFQ DATSPIMEEL LHFHDHTLMI VFLISSLVLY IISLMLTTKL
Carp MAHPTQLGFK DAAMPVMEEL LHFHDHALMI VLLISTLVLY IITAMVSTKL
Chicken MANHSQLGFQ DASSPIMEEL VEFHDHALMV ALAICSLVLY LLTLMLMEKL
Human MAHAAQVGLQ DATSPIMEEL ITFHDHALMI IFLICFLVLY ALFLTLTTKL
Loach MAHPTQLGFQ DAASPVMEEL LHFHDHALMI VFLISALVLY VIITTVSTKL
Mouse MAYPFQLGLQ DATSPIMEEL MNFHDHTLMI VFLISSLVLY IISLMLTTKL
Cow THTSTMDAQE VETIWTILPA IILILIALPS LRILYMMDEI NNPSLTVKTM
Carp TNKYILDSQE IEIVWTILPA VILVLIALPS LRILYLMDEI NDPHLTIKAM
Chicken S.SNTVDAQE VELIWTILPA IVLVLLALPS LQILYMMDEI DEPDLTLKAI
Human TNTNISDAQE METVWTILPA IILVLIALPS LRILYMTDEV NDPSLTIKSI
Loach TNMYILDSQE IEIVWTVLPA LILILIALPS LRILYLMDEI NDPHLTIKAM
Mouse THTSTMDAQE VETIWTILPA VILIMIALPS LRILYMMDEI NNPVLTVKTM
"""
# Identifiers for this format.
# NOTE(review): presumably used by the enclosing package to look up this
# reader by format name — confirm against the caller.
names = ("msf", "gcg-msf", "gcg", "PileUp")
# NOTE(review): presumably the file extensions associated with this format —
# confirm against the caller.
extensions = ("msf",)
# End-of-header marker: "//" followed only by optional whitespace. It is used
# with .match(), so the "//" must sit at the very start of the line.
end_header = re.compile(r"(//)(\s*)$")
# A sequence row: optional leading whitespace, a whitespace-free label
# (group 1), at least one whitespace character, then the rest of the line
# (group 2). The class [\S\s.?] accepts any character, so group 2 still
# contains the spaces between residue groups and the trailing newline.
seq_line = re.compile(r"\s*(\S+)\s+([\S\s.?]+)$")
def iterseq(fin: TextIO, alphabet: Optional[Alphabet] = None) -> Iterator[Seq]:
    """Return an iterator over the sequences parsed from an MSF file.

    The whole file is parsed up front by ``read``; this function only wraps
    the resulting collection in an iterator.

    Args:
        fin: Text stream containing the MSF data.
        alphabet: Alphabet forwarded unchanged to ``read``.

    Returns:
        An iterator over the sequences that ``read`` produced.

    Raises:
        ValueError: Propagated from ``read`` when the input cannot be parsed.
    """
    parsed = read(fin, alphabet)
    return iter(parsed)
def read(fin: TextIO, alphabet: Optional[Alphabet] = None) -> SeqList:
    """Parse an MSF file into a SeqList.

    The token stream from ``_line_is`` is consumed in order. Within each
    block, the n-th sequence row is appended to the n-th sequence collected so
    far; labels are recorded only when a row position is seen for the first
    time.

    Args:
        fin: Text stream containing the MSF data.
        alphabet: Passed through the ``Alphabet`` constructor; the result is
            used to validate every sequence row and is attached to each Seq.

    Returns:
        A SeqList with one Seq per label, each built by concatenating that
        sequence's rows from every block.

    Raises:
        ValueError: If a sequence row is rejected by ``alphabet.alphabetic``,
            or if no sequence label was found at all.
    """
    alphabet = Alphabet(alphabet)
    labels = []
    fragments: list = []
    row = 0  # position of the next sequence row inside the current block

    for tok in _line_is(fin):
        kind = tok.typeof

        if kind == "begin_block":
            # Every block starts again with the first sequence.
            row = 0
            continue

        if kind == "seq_id":
            # Register a label only the first time this row position appears.
            if row >= len(fragments):
                labels.append(tok.data)
                fragments.append([])
            continue

        if kind != "seq":
            continue

        residues = tok.data
        assert residues is not None
        if not alphabet.alphabetic(residues):
            raise ValueError(
                "Character on line: %d not in alphabet: %s : %s"
                % (tok.lineno, alphabet, tok.data)
            )
        fragments[row].append(residues)
        row += 1

    if not labels:
        raise ValueError("Parse error, possible wrong format")

    sequences = [
        Seq("".join(parts), alphabet, name=label)
        for parts, label in zip(fragments, labels)
    ]
    return SeqList(sequences)
def _line_is(fin: TextIO) -> Iterator[Token]:
    """Scan an MSF file line by line and yield parser tokens.

    The scanner is a three-state machine:

    * header -- every line is skipped until one matches ``end_header``
      (a line starting with "//").
    * body   -- blank lines between blocks are skipped; the first non-blank
      line opens a block and is itself processed as the block's first row.
    * block  -- each line is a sequence row; a blank line closes the block.

    Tokens are yielded in this order: ``begin``; ``end_header``; then for each
    block ``begin_block``, a ``seq_id``/``seq`` pair per row, and
    ``end_block`` when a blank line follows the block. No ``end_block`` is
    emitted if the input ends while still inside a block.

    Line numbers are 1-based, both in the parse-error message and on the
    ``seq_id``/``seq`` tokens, so that the "Character on line" error raised by
    ``read`` can point at the offending row.

    NOTE(review): assumes ``Token`` accepts a ``lineno`` keyword argument
    (``read`` consumes ``token.lineno``) -- confirm against ``utils.Token``.

    Args:
        fin: Text stream containing the MSF data.

    Yields:
        Token objects as described above.

    Raises:
        ValueError: If a line inside a block does not match ``seq_line``.
    """
    header, body, block = range(3)

    yield Token("begin")
    state = header

    # Count from 1 so reported positions match what a text editor shows.
    for lineno, line in enumerate(fin, start=1):
        if state == header:
            # Blank lines can never match the "//" marker, so a single test
            # covers both "skip" cases of the header.
            if end_header.match(line) is not None:
                yield Token("end_header")
                state = body
            continue

        if state == body:
            if line.isspace():
                continue
            yield Token("begin_block")
            state = block
            # Deliberate fall-through: this line is the block's first row.

        if state == block:
            if line.isspace():
                yield Token("end_block")
                state = body
                continue

            m = seq_line.match(line)
            if m is None:
                raise ValueError("Parse error on line: %d" % lineno)  # pragma: no cover

            # Rows made up only of digits (e.g. "1 50") carry no sequence
            # data; presumably they are column-position rulers. Skip them.
            if m.group(1).isdigit() and m.group(2).strip().isdigit():
                continue

            yield Token("seq_id", m.group(1).strip(), lineno=lineno)
            # split()/join removes all whitespace, including the spaces
            # between residue groups and the trailing newline.
            yield Token("seq", "".join(m.group(2).split()), lineno=lineno)
|