File: IgIO.py

package info (click to toggle)
python-biopython 1.73%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 57,852 kB
  • sloc: python: 169,977; xml: 97,539; ansic: 15,653; sql: 1,208; makefile: 159; sh: 63
file content (117 lines) | stat: -rw-r--r-- 3,880 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# Copyright 2008-2015 by Peter Cock.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the "ig" (IntelliGenetics or MASE) file format.

This module is for reading and writing IntelliGenetics format files as
SeqRecord objects.  This file format appears to be the same as the MASE
multiple sequence alignment format.

You are expected to use this module via the Bio.SeqIO functions.
"""

from __future__ import print_function

from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def IgIterator(handle, alphabet=single_letter_alphabet):
    """Iterate over IntelliGenetics records (as SeqRecord objects).

    handle - input file
    alphabet - optional alphabet

    The optional free format file header lines (which start with two
    semi-colons) are ignored.

    The free format commentary lines at the start of each record (which
    start with a semi-colon) are recorded as a single string with embedded
    new line characters in the SeqRecord's annotations dictionary under the
    key 'comment'.

    Examples
    --------
    >>> with open("IntelliGenetics/TAT_mase_nuc.txt") as handle:
    ...     for record in IgIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    ...
    A_U455 length 303
    B_HXB2R length 306
    C_UG268A length 267
    D_ELI length 309
    F_BZ163A length 309
    O_ANT70 length 342
    O_MVP5180 length 348
    CPZGAB length 309
    CPZANT length 309
    A_ROD length 390
    B_EHOA length 420
    D_MM251 length 390
    STM_STM length 387
    VER_AGM3 length 354
    GRI_AGM677 length 264
    SAB_SAB1C length 219
    SYK_SYK length 330

    """
    # Skip any file header text before the first record (;; lines)
    while True:
        line = handle.readline()
        if not line:
            break  # Premature end of file, or just empty?
        if not line.startswith(";;"):
            break

    while line:
        # Now iterate over the records
        if line[0] != ";":
            raise ValueError(
                "Records should start with ';' and not:\n%r" % line)

        # Try and agree with SeqRecord convention from the GenBank parser,
        # (and followed in the SwissProt parser) which stores the comments
        # as a long string with newlines under annotations key 'comment'.

        # Note some examples use "; ..." and others ";..."
        comment_lines = []
        while line.startswith(";"):
            # TODO - Extract identifier from lines like "LOCUS\tB_SF2"?
            comment_lines.append(line[1:].strip())
            line = handle.readline()
        title = line.rstrip()

        seq_lines = []
        while True:
            line = handle.readline()
            if not line:
                break
            if line[0] == ";":
                break
            # Remove trailing whitespace, and any internal spaces
            seq_lines.append(line.rstrip().replace(" ", ""))
        seq_str = "".join(seq_lines)
        if seq_str.endswith("1"):
            # Remove the optional terminator (digit one)
            seq_str = seq_str[:-1]
        if "1" in seq_str:
            raise ValueError(
                "Potential terminator digit one found within sequence.")

        # Return the record and then continue...
        record = SeqRecord(Seq(seq_str, alphabet),
                           id=title, name=title)
        record.annotations['comment'] = "\n".join(comment_lines)
        yield record

    # We should be at the end of the file now
    assert not line


if __name__ == "__main__":
    from Bio._utils import run_doctest
    run_doctest(verbose=0)