File: test_core_stringops.py

package info (click to toggle)
python-bioframe 0.4.1-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,000 kB
  • sloc: python: 5,860; makefile: 38; sh: 13
file content (70 lines) | stat: -rw-r--r-- 2,559 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pandas as pd
import numpy as np
import pytest

from bioframe.core import stringops
from bioframe.core.stringops import parse_region


def test_to_ucsc_string():
    """A ``(name, start, end)`` triple is rendered as ``name:start-end``."""
    region = ("chr21", 1, 4)
    rendered = stringops.to_ucsc_string(region)
    assert rendered == "chr21:1-4"


def test_parse_region():
    """Exercise ``parse_region`` on well-formed and on malformed region strings.

    Well-formed inputs are listed as ``(text, expected_triple)`` pairs and are
    checked in order; malformed inputs must raise ``ValueError``.
    """
    well_formed = (
        # UCSC-style names
        ("chr21", ("chr21", 0, None)),
        ("chr21:1000-2000", ("chr21", 1000, 2000)),
        ("chr21:1,000-2,000", ("chr21", 1000, 2000)),
        # Ensembl-style names
        ("6", ("6", 0, None)),
        ("6:1000-2000", ("6", 1000, 2000)),
        ("6:1,000-2,000", ("6", 1000, 2000)),
        # FASTA-style names
        ("gb|accession|locus", ("gb|accession|locus", 0, None)),
        ("gb|accession|locus:1000-2000", ("gb|accession|locus", 1000, 2000)),
        ("gb|accession|locus:1,000-2,000", ("gb|accession|locus", 1000, 2000)),
        # Punctuation inside names (anything but a colon)
        ("name-with-hyphens-", ("name-with-hyphens-", 0, None)),
        ("GL000207.1", ("GL000207.1", 0, None)),
        ("GL000207.1:1000-2000", ("GL000207.1", 1000, 2000)),
        # Trailing dash leaves the end open
        ("chr21:1000-", ("chr21", 1000, None)),
        # Humanized units
        ("6:1kb-2kb", ("6", 1000, 2000)),
        ("6:1k-2000", ("6", 1000, 2000)),
        ("6:1kb-2M", ("6", 1000, 2000000)),
        ("6:1Gb-", ("6", 1000000000, None)),
    )
    for text, expected_triple in well_formed:
        assert parse_region(text) == expected_triple

    malformed = (
        "chr1:2,000-1,000",  # reverse selection
        "chr1::1000-2000",  # more than one colon
    )
    for text in malformed:
        with pytest.raises(ValueError):
            parse_region(text)


def test_parse_region_string():
    """``parse_region_string`` handles unit suffixes, comma grouping and plain ints."""
    cases = (
        ("6:1kb-2kb", ("6", 1000, 2000)),
        ("6:1,000-2,000", ("6", 1000, 2000)),
        ("c6:1000-2000", ("c6", 1000, 2000)),
    )
    for text, expected_triple in cases:
        assert stringops.parse_region_string(text) == expected_triple


def test_is_complete_ucsc_string():
    """Only a string carrying both a name and a range is reported as complete.

    A bare name, a bare range, an integer and numpy arrays (even one holding a
    complete region string) must all yield exactly ``False``.
    """
    is_complete = stringops.is_complete_ucsc_string

    assert is_complete("chrX:1M-2M") is True

    rejected = (
        "chrX",
        "1M-2M",
        1000,
        np.array([100, 200]),
        np.array(["chr1:100-200"]),
    )
    for candidate in rejected:
        assert is_complete(candidate) is False