File: core.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (86 lines) | stat: -rw-r--r-- 2,611 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Support for "biological sequence" files.

:Author: Bob Harris (rsharris@bx.psu.edu)

See seq.py for more information
"""

import struct

from . import (
    fasta,
    nib,
    qdna,
)

# DNA reverse complement table
#
# A 256-character lookup string indexed by character code: the character at
# position ord(c) is the complement of c.  The nucleotide letters
# A B C D G H K M N R S T V W X Y are complemented in both upper and lower
# case, "-" maps to itself, and every other code below 256 maps to a space.

DNA_COMP = (
    "                                             -                  "
    " TVGH  CD  M KN   YSA BWXR       tvgh  cd  m kn   ysa bwxr      "
    "                                                                "
    "                                                                "
)

# The same table as bytes; bytes.translate() requires a 256-byte table and
# rejects a str one.
_DNA_COMP_BYTES = DNA_COMP.encode("ascii")


def reverse_complement(text):
    """Return the reverse complement of a DNA sequence.

    Each character is complemented through ``DNA_COMP`` and the result is
    reversed.  Characters that the table does not list as nucleotides (or
    "-") come back as spaces.

    :param text: the sequence, as ``str``, ``bytes`` or ``bytearray``
    :return: the reverse-complemented sequence, of the same type as *text*
    """
    if isinstance(text, (bytes, bytearray)):
        table = _DNA_COMP_BYTES
    else:
        table = DNA_COMP
    return text.translate(table)[::-1]


def seq_file(file, format=None, revcomp=False, name="", gap=None, contig=None):
    """Create the sequence-file object that matches *format*.

    :param file: the file object to wrap; only consulted directly when the
        format has to be inferred or an error message is built
    :param format: ``"fasta"``, ``"nib"`` or ``"qdna"``; when ``None`` the
        format is determined by ``infer_format(file)``
    :param revcomp: passed through to the format-specific constructor
    :param name: passed through to the format-specific constructor
    :param gap: passed through to the format-specific constructor
    :param contig: passed through to ``fasta.FastaFile`` only; must be
        ``None`` for every other known format
    :return: a ``fasta.FastaFile``, ``nib.NibFile`` or ``qdna.QdnaFile``
    :raises ValueError: if *contig* is given for a non-FASTA format, or if
        the format is unknown or could not be inferred
    """
    if format is None:
        format = infer_format(file)
    if contig is not None and format not in ("fasta", None):
        raise ValueError(f"Contigs are not supported for format {format}")

    if format == "fasta":
        return fasta.FastaFile(file, revcomp=revcomp, name=name, gap=gap, contig=contig)
    if format == "nib":
        return nib.NibFile(file, revcomp=revcomp, name=name, gap=gap)
    if format == "qdna":
        return qdna.QdnaFile(file, revcomp=revcomp, name=name, gap=gap)

    # A format of None here means inference failed; leave it out of the message.
    suffix = "" if format is None else f" {format}"
    # Not every file object has a ``name`` (io.BytesIO does not); fall back to
    # its repr so the caller still gets the ValueError rather than an
    # AttributeError raised while building the message.
    source = getattr(file, "name", repr(file))
    raise ValueError(f"Unknown sequence format{suffix} in {source}")


def seq_reader(file, format=None, revcomp=False, name="", gap=None):
    """Create the reader object that matches *format*.

    :param file: the file object handed to the reader
    :param format: ``"fasta"``, ``"nib"`` or ``"qdna"``; when ``None`` the
        format is determined by ``infer_format(file)``
    :param revcomp: passed through to the reader constructor
    :param name: passed through to the reader constructor
    :param gap: passed through to the reader constructor
    :return: a ``fasta.FastaReader``, ``nib.NibReader`` or ``qdna.QdnaReader``
    :raises ValueError: if the format is not one of the three known names
    """
    if format is None:
        format = infer_format(file)

    # Every reader takes the same keyword options.
    options = {"revcomp": revcomp, "name": name, "gap": gap}

    if format == "fasta":
        return fasta.FastaReader(file, **options)
    if format == "nib":
        return nib.NibReader(file, **options)
    if format == "qdna":
        return qdna.QdnaReader(file, **options)
    raise ValueError(f"Unknown sequence format {format}")


def seq_writer(outfile, format=None, name=""):
    """Create the writer object that matches *format*.

    Unlike the reader factories, no format inference is attempted: *format*
    must be named explicitly.

    :param outfile: the output file object handed to the writer
    :param format: ``"fasta"``, ``"nib"`` or ``"qdna"``
    :param name: accepted but not used by this function
    :return: a ``fasta.FastaWriter``, ``nib.NibWriter`` or ``qdna.QdnaWriter``
    :raises ValueError: if *format* is anything else, including ``None``
    """
    if format == "fasta":
        return fasta.FastaWriter(outfile)
    if format == "nib":
        return nib.NibWriter(outfile)
    if format == "qdna":
        return qdna.QdnaWriter(outfile)
    raise ValueError(f"Unknown sequence format {format}")


def infer_format(file):
    """Guess a sequence file's format from its leading bytes.

    The first four bytes are read as a big-endian unsigned integer and
    compared with the nib and qdna magic numbers (either byte order).  If
    neither matches, the file is taken to be FASTA when its first byte is
    ``>``.  The file is left positioned at offset 0.

    :param file: a seekable file object; the comparison against ``b">"``
        means it is expected to be opened in binary mode
    :return: ``"nib"``, ``"qdna"``, ``"fasta"``, or ``None`` if the format
        is not recognised
    """
    detected = None
    header = file.read(4)
    # A file shorter than four bytes cannot hold a magic number; skipping the
    # unpack avoids a struct.error and lets the FASTA check below still run.
    if len(header) == 4:
        magic = struct.unpack(">L", header)[0]
        if magic in (nib.NIB_MAGIC_NUMBER, nib.NIB_MAGIC_NUMBER_SWAP):
            detected = "nib"
        elif magic in (qdna.qdnaMagic, qdna.qdnaMagicSwap):
            detected = "qdna"
    if detected is None:
        file.seek(0)
        if file.read(1) == b">":
            detected = "fasta"
    # Rewind so the caller can read the file from the start.
    file.seek(0)
    return detected