File: biopython_integration.py

package info (click to toggle)
python-gffutils 0.13-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 10,164 kB
  • sloc: python: 5,557; makefile: 57; sh: 13
file content (95 lines) | stat: -rw-r--r-- 2,662 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
Module for integration with BioPython, specifically SeqRecords and SeqFeature
objects.
"""

try:
    from Bio.SeqFeature import SeqFeature, FeatureLocation
except ImportError:
    import warnings

    warnings.warn("BioPython must be installed to use this module")
from .feature import Feature, feature_from_line

_biopython_strand = {
    "+": 1,
    "-": -1,
    ".": None,
    "?": 0,
}
_feature_strand = dict((v, k) for k, v in _biopython_strand.items())


def to_seqfeature(feature):
    """
    Build a Bio.SeqFeature object from a gffutils.Feature object.

    The GFF fields `source`, `score`, `seqid`, and `frame` are each stored as
    a single-item list qualifier.  The GFF `attributes` are then merged into
    the same qualifiers dictionary.

    Parameters
    ----------
    feature : Feature object, or string
        A string is treated as a GFF or GTF-format line and parsed with
        `feature_from_line`; anything else is used directly.
    """
    if isinstance(feature, str):
        feature = feature_from_line(feature)

    # Fixed GFF columns first, each wrapped in a one-element list.
    qualifiers = {
        field: [getattr(feature, field)]
        for field in ("source", "score", "seqid", "frame")
    }
    # Attributes are merged afterwards, so a same-named attribute replaces
    # the corresponding column qualifier.
    qualifiers.update(feature.attributes)

    # GFF coordinates are 1-based; BioPython uses standard Python 0-based
    # indexing, so only the start is shifted down by one.
    location = FeatureLocation(
        feature.start - 1,
        feature.stop,
        strand=_biopython_strand[feature.strand],
    )
    return SeqFeature(
        location,
        id=feature.id,
        type=feature.featuretype,
        qualifiers=qualifiers,
    )


def from_seqfeature(s, **kwargs):
    """
    Build a gffutils.Feature object from a Bio.SeqFeature object.

    The qualifiers `source`, `score`, `seqid`, and `frame` are read as the GFF
    fields of the same name: the first item of each is used, or "." when the
    qualifier is absent.  All remaining qualifiers are passed on as the GFF
    attributes.

    Any extra keyword arguments are forwarded to the `Feature` constructor.
    """
    gff_fields = ("source", "score", "seqid", "frame")
    qualifiers = s.qualifiers

    # `[0]` takes the first item of a qualifier value; applied to the "."
    # default it yields "." itself.
    source, score, seqid, frame = (
        qualifiers.get(field, ".")[0] for field in gff_fields
    )
    strand = _feature_strand[s.location.strand]

    # BioPython holds positions 0-based; GFF is 1-based, so the start is
    # shifted up by one while the end is used unchanged.
    location = s.location
    start = location.start + 1
    stop = location.end
    featuretype = s.type
    feature_id = s.id

    # Everything that is not one of the fixed GFF columns becomes an
    # attribute.  Work on a copy so the SeqFeature's qualifiers stay intact.
    attributes = dict(qualifiers)
    for field in gff_fields:
        attributes.pop(field, None)

    return Feature(
        seqid,
        source,
        featuretype,
        start,
        stop,
        score,
        strand,
        frame,
        attributes,
        id=feature_id,
        **kwargs
    )