File: SwissIO.py

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 46,860 kB
  • ctags: 13,237
  • sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (146 lines) | stat: -rw-r--r-- 6,309 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
# Copyright 2006-2013 by Peter Cock.
# Revisions copyright 2008-2009 by Michiel de Hoon.
# All rights reserved.
#
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

"""Bio.SeqIO support for the "swiss" (aka SwissProt/UniProt) file format.

You are expected to use this module via the Bio.SeqIO functions.
See also the Bio.SwissProt module which offers more than just accessing
the sequences as SeqRecord objects.

See also Bio.SeqIO.UniprotIO.py which supports the "uniprot-xml" format.
"""

from __future__ import print_function

from Bio import Seq
from Bio import SeqRecord
from Bio import Alphabet
from Bio import SeqFeature
from Bio import SwissProt


def _make_position(location_string, offset=0):
    """Turn a Swiss location position into a SeqFeature position object (PRIVATE).

    An offset of -1 is used with a start location to make it pythonic.
    """
    if location_string == "?":
        return SeqFeature.UnknownPosition()
    # Hack so that feature from 0 to 0 becomes 0 to 0, not -1 to 0.
    try:
        return SeqFeature.ExactPosition(max(0, offset + int(location_string)))
    except ValueError:
        pass
    if location_string.startswith("<"):
        try:
            return SeqFeature.BeforePosition(max(0, offset + int(location_string[1:])))
        except ValueError:
            pass
    elif location_string.startswith(">"):  # e.g. ">13"
        try:
            return SeqFeature.AfterPosition(max(0, offset + int(location_string[1:])))
        except ValueError:
            pass
    elif location_string.startswith("?"):  # e.g. "?22"
        try:
            return SeqFeature.UncertainPosition(max(0, offset + int(location_string[1:])))
        except ValueError:
            pass
    raise NotImplementedError("Cannot parse location '%s'" % location_string)


def _make_seqfeature(name, from_res, to_res, description, ft_id):
    """Construct SeqFeature from feature data from parser (PRIVATE)."""
    loc = SeqFeature.FeatureLocation(_make_position(from_res, -1),
                                     _make_position(to_res, 0))
    if not ft_id:
        ft_id = "<unknown id>"  # The default in SeqFeature object
    return SeqFeature.SeqFeature(loc, type=name, id=ft_id,
                                 qualifiers={"description": description})


def SwissIterator(handle):
    """Breaks up a Swiss-Prot/UniProt file into SeqRecord objects.

    Every section from the ID line to the terminating // becomes
    a single SeqRecord with associated annotation and features.

    This parser is for the flat file "swiss" format as used by:
     - Swiss-Prot aka SwissProt
     - TrEMBL
     - UniProtKB aka UniProt Knowledgebase

    For consistency with BioPerl and EMBOSS we call this the "swiss"
    format. See also the SeqIO support for "uniprot-xml" format.

    Rather than calling it directly, you are expected to use this
    parser via Bio.SeqIO.parse(..., format="swiss") instead.
    """
    swiss_records = SwissProt.parse(handle)
    for swiss_record in swiss_records:
        # Convert the SwissProt record to a SeqRecord
        seq = Seq.Seq(swiss_record.sequence, Alphabet.generic_protein)
        record = SeqRecord.SeqRecord(seq,
                                     id=swiss_record.accessions[0],
                                     name=swiss_record.entry_name,
                                     description=swiss_record.description,
                                     features=[_make_seqfeature(*f) for f
                                               in swiss_record.features],
                                     )
        record.description = swiss_record.description
        for cross_reference in swiss_record.cross_references:
            if len(cross_reference) < 2:
                continue
            database, accession = cross_reference[:2]
            dbxref = "%s:%s" % (database, accession)
            if dbxref not in record.dbxrefs:
                record.dbxrefs.append(dbxref)
        annotations = record.annotations
        annotations['accessions'] = swiss_record.accessions
        if swiss_record.created:
            annotations['date'] = swiss_record.created[0]
        if swiss_record.sequence_update:
            annotations[
                'date_last_sequence_update'] = swiss_record.sequence_update[0]
        if swiss_record.annotation_update:
            annotations['date_last_annotation_update'] = swiss_record.annotation_update[0]
        if swiss_record.gene_name:
            annotations['gene_name'] = swiss_record.gene_name
        annotations['organism'] = swiss_record.organism.rstrip(".")
        annotations['taxonomy'] = swiss_record.organism_classification
        annotations['ncbi_taxid'] = swiss_record.taxonomy_id
        if swiss_record.host_organism:
            annotations['organism_host'] = swiss_record.host_organism
        if swiss_record.host_taxonomy_id:
            annotations['host_ncbi_taxid'] = swiss_record.host_taxonomy_id
        if swiss_record.comments:
            annotations['comment'] = "\n".join(swiss_record.comments)
        if swiss_record.references:
            annotations['references'] = []
            for reference in swiss_record.references:
                feature = SeqFeature.Reference()
                feature.comment = " ".join("%s=%s;" % k_v for k_v in reference.comments)
                for key, value in reference.references:
                    if key == 'PubMed':
                        feature.pubmed_id = value
                    elif key == 'MEDLINE':
                        feature.medline_id = value
                    elif key == 'DOI':
                        pass
                    elif key == 'AGRICOLA':
                        pass
                    else:
                        raise ValueError(
                            "Unknown key %s found in references" % key)
                feature.authors = reference.authors
                feature.title = reference.title
                feature.journal = reference.location
                annotations['references'].append(feature)
        if swiss_record.keywords:
            record.annotations['keywords'] = swiss_record.keywords
        yield record