File: TabIO.py

package info (click to toggle)
python-biopython 1.73%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 57,852 kB
  • sloc: python: 169,977; xml: 97,539; ansic: 15,653; sql: 1,208; makefile: 159; sh: 63
file content (126 lines) | stat: -rw-r--r-- 4,490 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
# Copyright 2008-2017 by Peter Cock.  All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Bio.SeqIO support for the "tab" (simple tab separated) file format.

You are expected to use this module via the Bio.SeqIO functions.

The "tab" format is an ad-hoc plain text file format where each sequence is
on one (long) line.  Each line contains the identifier/description, followed
by a tab, followed by the sequence.  For example, consider the following
short FASTA format file::

    >ID123456 possible binding site?
    CATCNAGATGACACTACGACTACGACTCAGACTAC
    >ID123457 random sequence
    ACACTACGACTACGACTCAGACTACAAN

Apart from the descriptions, this can be represented in the simple two column
tab separated format as follows::

    ID123456(tab)CATCNAGATGACACTACGACTACGACTCAGACTAC
    ID123457(tab)ACACTACGACTACGACTCAGACTACAAN

When reading this file, "ID123456" or "ID123457" will be taken as the record's
.id and .name properties.  There is no other information to record.

Similarly, when writing to this format, Biopython will ONLY record the record's
.id and .seq (and not the description or any other information) as in the
example above.
"""

from __future__ import print_function

from Bio.Alphabet import single_letter_alphabet
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO.Interfaces import SequentialSequenceWriter
from Bio.SeqIO.Interfaces import _clean, _get_seq_string


def TabIterator(handle, alphabet=single_letter_alphabet):
    """Parse simple tab separated lines, yielding SeqRecord objects.

    Every line is expected to hold exactly one tab character, with the
    identifier before the tab and the complete sequence after it.

    Arguments:
     - handle - input file (iterated over line by line)
     - alphabet - optional alphabet, passed on to each Seq object

    The text before the tab is used for both the record's .id and .name
    (spaces inside it are kept), and the text after the tab becomes the
    sequence.  Surrounding whitespace is stripped from both fields and the
    description is left as an empty string.

    Lines which do not split into exactly two fields are skipped if they
    are blank (whitespace only), otherwise a ValueError is raised reporting
    the number of tabs found and the offending line.

    Examples
    --------
    >>> with open("GenBank/NC_005816.tsv") as handle:
    ...     for record in TabIterator(handle):
    ...         print("%s length %i" % (record.id, len(record)))
    gi|45478712|ref|NP_995567.1| length 340
    gi|45478713|ref|NP_995568.1| length 260
    gi|45478714|ref|NP_995569.1| length 64
    gi|45478715|ref|NP_995570.1| length 123
    gi|45478716|ref|NP_995571.1| length 145
    gi|45478717|ref|NP_995572.1| length 357
    gi|45478718|ref|NP_995573.1| length 138
    gi|45478719|ref|NP_995574.1| length 312
    gi|45478720|ref|NP_995575.1| length 99
    gi|45478721|ref|NP_995576.1| length 90

    """
    for row in handle:
        try:
            # Unpacking raises ValueError unless there is exactly one tab
            raw_id, raw_seq = row.split("\t")
        except ValueError:
            if not row.strip():
                # Nothing but whitespace on this line, skip it
                continue
            tab_count = row.count("\t")
            raise ValueError("Each line should have one tab separating the"
                             " title and sequence, this line has %i tabs: %r"
                             % (tab_count, row))
        identifier = raw_id.strip()
        residues = raw_seq.strip()  # also drops the trailing new line
        record = SeqRecord(Seq(residues, alphabet),
                           id=identifier,
                           name=identifier,
                           description="")
        yield record


class TabWriter(SequentialSequenceWriter):
    """Writer for the simple tab separated sequence format (OBSOLETE).

    Every record is written as a single "id(tab)sequence" line.

    The description, name and any other annotation are not written.

    This class is obsolete; prefer the ``as_tab`` function, or call the
    top level ``Bio.SeqIO.write()`` function with ``format="tab"``.
    """

    def write_record(self, record):
        """Write one record to the handle as a single tab separated line."""
        # The header flag must be set and the footer flag must not be
        assert self._header_written and not self._footer_written
        self._record_written = True
        emit = self.handle.write
        emit(as_tab(record))


def as_tab(record):
    """Return record as a tab separated (id(tab)seq) string.

    Only the record's .id and its sequence are used, giving one line of
    the form "id(tab)sequence(new line)".  The description and any other
    annotation are not included.

    Arguments:
     - record - object with an .id attribute and a sequence
       (NOTE(review): presumably a SeqRecord - confirm against callers)

    Returns the formatted line as a string, including the trailing new line.

    Raises AssertionError if the cleaned identifier or the sequence string
    contains a tab, new line or carriage return, since any of these would
    break the one-record-per-line, two-column layout.  The exception type
    is the one the previous ``assert`` statements raised, so existing
    callers are unaffected.
    """
    title = _clean(record.id)
    seq = _get_seq_string(record)  # Catches sequence being None
    # Explicit raises rather than assert statements: asserts are removed
    # when Python runs with -O, which would let a corrupt line be written.
    for label, text in (("identifier", title), ("sequence", seq)):
        for char in ("\t", "\n", "\r"):
            if char in text:
                raise AssertionError(
                    "Cannot write record %r in tab format, its %s contains %r"
                    % (title, label, char))
    return "%s\t%s\n" % (title, seq)


if __name__ == "__main__":
    # Only runs when this file is executed directly as a script, not when
    # it is imported.
    # NOTE(review): run_doctest presumably executes the docstring examples
    # in this module (e.g. the one in TabIterator), with verbose=0 keeping
    # the output quiet - confirm against Bio._utils.
    from Bio._utils import run_doctest
    run_doctest(verbose=0)