File: nib.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (87 lines) | stat: -rw-r--r-- 2,901 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
"""
Classes to support nib files.

:Author: James Taylor (james@bx.psu.edu), Bob Harris (rsharris@bx.psu.edu)

A nib sequence is a sequence of DNA, using the 10 character alphabet A,C,G,T,N
(upper and lower case).  The file is packed as 4 bits per character.

nib file format
---------------

Fields can be in big- or little-endian format;  they must match the endianess
of the magic number.

============ =========== ======================================================
offset 0x00: 6B E9 3D 3A big endian magic number (3A 3D E9 6B => little endian)
offset 0x04: xx xx xx xx length of data sequence (counted in characters)
offset 0x08:  ...        data sequence;  most significant nybble in each
                         byte is first in sequence
============ =========== ======================================================
"""

import math
import struct

from bx.seq.seq import (
    SeqFile,
    SeqReader,
)
from . import _nib

NIB_MAGIC_NUMBER = 0x6BE93D3A
NIB_MAGIC_NUMBER_SWAP = 0x3A3DE96B
NIB_MAGIC_SIZE = 4
NIB_LENGTH_SIZE = 4


class NibFile(SeqFile):
    def __init__(self, file, revcomp=False, name="", gap=None):
        SeqFile.__init__(self, file, revcomp, name, gap)

        self.byte_order = ">"
        magic = struct.unpack(">L", file.read(NIB_MAGIC_SIZE))[0]
        if magic != NIB_MAGIC_NUMBER:
            if magic == NIB_MAGIC_NUMBER_SWAP:
                self.byte_order = "<"
            else:
                raise Exception("Not a NIB file")
        self.magic = magic
        self.length = struct.unpack(f"{self.byte_order}L", file.read(NIB_LENGTH_SIZE))[0]

    def raw_fetch(self, start, length):
        # Check parameters
        assert start >= 0, "Start must be greater than 0"
        assert length >= 0, "Length must be greater than 0"
        assert start + length <= self.length, "Interval beyond end of sequence"
        # Read block of bytes containing sequence
        block_start = int(math.floor(start / 2))
        block_end = int(math.floor((start + length - 1) / 2))
        block_len = block_end + 1 - block_start
        self.file.seek(NIB_MAGIC_SIZE + NIB_LENGTH_SIZE + block_start)
        raw = self.file.read(block_len)
        # Unpack compressed block into a character string and return
        return _nib.translate_raw_data(raw, start, length)


class NibReader(SeqReader):
    def __init__(self, file, revcomp=False, name="", gap=None):
        SeqReader.__init__(self, file, revcomp, name, gap)

    def __next__(self):
        if self.seqs_read != 0:
            return  # nib files have just one sequence
        seq = NibFile(self.file, self.revcomp, self.name, self.gap)
        self.seqs_read += 1
        return seq


class NibWriter:
    def __init__(self, file):
        self.file = file

    def write(self, seq):
        assert False, "NibWriter.write() is not implemented yet"

    def close(self):
        self.file.close()