1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115
|
# Copyright 2008-2015 by Peter Cock. All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license. Please see the LICENSE file that should have been included
# as part of this package.
#
# This module is for reading and writing IntelliGenetics format files as
# SeqRecord objects. This file format appears to be the same as the MASE
# multiple sequence alignment format.
"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.
You are expected to use this module via the Bio.SeqIO functions.
"""
from __future__ import print_function
from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
def IgIterator(handle, alphabet=single_letter_alphabet):
"""Iterate over IntelliGenetics records (as SeqRecord objects).
handle - input file
alphabet - optional alphabet
The optional free format file header lines (which start with two
semi-colons) are ignored.
The free format commentary lines at the start of each record (which
start with a semi-colon) are recorded as a single string with embedded
new line characters in the SeqRecord's annotations dictionary under the
key 'comment'.
Example:
>>> with open("IntelliGenetics/TAT_mase_nuc.txt") as handle:
... for record in IgIterator(handle):
... print("%s length %i" % (record.id, len(record)))
...
A_U455 length 303
B_HXB2R length 306
C_UG268A length 267
D_ELI length 309
F_BZ163A length 309
O_ANT70 length 342
O_MVP5180 length 348
CPZGAB length 309
CPZANT length 309
A_ROD length 390
B_EHOA length 420
D_MM251 length 390
STM_STM length 387
VER_AGM3 length 354
GRI_AGM677 length 264
SAB_SAB1C length 219
SYK_SYK length 330
"""
# Skip any file header text before the first record (;; lines)
while True:
line = handle.readline()
if not line:
break # Premature end of file, or just empty?
if not line.startswith(";;"):
break
while line:
# Now iterate over the records
if line[0] != ";":
raise ValueError(
"Records should start with ';' and not:\n%r" % line)
# Try and agree with SeqRecord convention from the GenBank parser,
# (and followed in the SwissProt parser) which stores the comments
# as a long string with newlines under annotations key 'comment'.
# Note some examples use "; ..." and others ";..."
comment_lines = []
while line.startswith(";"):
# TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
comment_lines.append(line[1:].strip())
line = handle.readline()
title = line.rstrip()
seq_lines = []
while True:
line = handle.readline()
if not line:
break
if line[0] == ";":
break
# Remove trailing whitespace, and any internal spaces
seq_lines.append(line.rstrip().replace(" ", ""))
seq_str = "".join(seq_lines)
if seq_str.endswith("1"):
# Remove the optional terminator (digit one)
seq_str = seq_str[:-1]
if "1" in seq_str:
raise ValueError(
"Potential terminator digit one found within sequence.")
# Return the record and then continue...
record = SeqRecord(Seq(seq_str, alphabet),
id=title, name=title)
record.annotations['comment'] = "\n".join(comment_lines)
yield record
# We should be at the end of the file now
assert not line
if __name__ == "__main__":
from Bio._utils import run_doctest
run_doctest(verbose=0)
|