File: twobit_tests.py

package info (click to toggle)
python-bx 0.13.0-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 5,000 kB
  • sloc: python: 17,136; ansic: 2,326; makefile: 24; sh: 8
file content (52 lines) | stat: -rw-r--r-- 1,689 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import random

import pytest

from bx.seq import twobit


def quick_fasta_iter(f):
    """Yield ``(header, sequence)`` pairs from an iterable of FASTA lines.

    Lines starting with ``#`` are ignored.  A line starting with ``>`` opens
    a new record; its header is the stripped line without the leading ``>``.
    Every other line has all whitespace removed and is appended to the
    sequence of the current record.

    A record is only yielded once at least one sequence line has been
    collected for it, so a header immediately followed by another header
    (or by end of input) produces nothing.  Sequence lines that appear
    before any header are yielded with ``None`` as the header.
    """
    header = None
    chunks = []
    for raw in f:
        if raw.startswith("#"):
            continue
        if not raw.startswith(">"):
            # Sequence line: drop every whitespace character, keep the rest.
            chunks.append("".join(raw.split()))
            continue
        # Header line: flush the record collected so far, then start anew.
        if chunks:
            yield header, "".join(chunks)
            chunks = []
        header = raw.strip()[1:]
    # Flush the trailing record, if it collected any sequence lines.
    if chunks:
        yield header, "".join(chunks)


@pytest.mark.parametrize("filename", ["test", "testN", "testMask"])
def test_random_subseq_matches(filename):
    """Compare random windows of a 2bit file against the matching FASTA file.

    For the given base name, ``test_data/seq_tests/<filename>.fa`` is parsed
    with ``quick_fasta_iter`` and ``test_data/seq_tests/<filename>.2bit`` is
    opened with ``twobit.TwoBitFile``.  For every FASTA record, 100 random
    ``[start, end)`` windows are read from the 2bit sequence, once through
    ``.get(start, end)`` and once through slicing, and each result must
    equal the same slice of the FASTA sequence.
    """
    test_fa = f"test_data/seq_tests/{filename}.fa"
    test_twobit = f"test_data/seq_tests/{filename}.2bit"
    # Load Fasta data
    with open(test_fa) as f:
        expected = dict(quick_fasta_iter(f))
    # Open 2bit
    with open(test_twobit, "rb") as f:
        t = twobit.TwoBitFile(f)
        for k, s in expected.items():
            assert k in t.index, f"seq: {k} missing from 2bit index"
            # NOTE(review): the size check below is disabled in the original;
            # the reason is not visible from this file.
            # assert t.index[k].size == len(s)
            length = len(s)
            # Look the sequence up once; it does not change between windows.
            seq = t[k]
            for _ in range(100):
                # Assumes every sequence has at least 2 bases; random.randint
                # raises ValueError on an empty range otherwise.
                start = random.randint(0, length - 2)
                end = random.randint(start + 1, length)
                want = s[start:end]
                # Capture the actual values before asserting so the failure
                # message reports exactly what was compared and cannot itself
                # raise while being formatted.
                got = seq.get(start, end)
                assert got == want, (
                    f"seq: {k}, start: {start}, end: {end} (get)\n"
                    f"Expected:\n{want}\nActual:\n{got}\n"
                )
                got = seq[start:end]
                assert got == want, (
                    f"seq: {k}, start: {start}, end: {end} (slice)\n"
                    f"Expected:\n{want}\nActual:\n{got}\n"
                )