"""Tests for bx.motif.pwm frequency matrices and Stormo scoring matrices."""
from numpy import (
allclose,
isnan,
)
from bx.motif import pwm
def test_create():
    """Build a FrequencyMatrix from the CTCF rows and check its basic attributes.

    Verifies the sorted alphabet, the character-to-index lookup (including
    the -1 entry for a character outside the alphabet), and the first and
    last rows of stored values.
    """
    alphabet = ["A", "C", "G", "T"]
    matrix = pwm.FrequencyMatrix.from_rows(alphabet, get_ctcf_rows())

    # The alphabet is reported back in sorted order.
    assert matrix.sorted_alphabet == ["A", "C", "G", "T"]

    # Every alphabet symbol maps to its position in the sorted alphabet.
    for expected_index, symbol in enumerate("ACGT"):
        assert matrix.char_to_index[ord(symbol)] == expected_index
    # A character that is not in the alphabet maps to -1.
    assert matrix.char_to_index[ord("Q")] == -1

    # First and last rows of the matrix match the input rows.
    first_row = [2620, 2052, 3013, 2314]
    last_row = [3144, 3231, 3056, 567]
    assert allclose(matrix.values[0], first_row)
    assert allclose(matrix.values[19], last_row)
def test_scoring():
    """Check ungapped scoring with the CTCF Stormo scoring matrix.

    Scores four 20-character sequences at position 0 with the forward
    matrix and with its reverse complement, then checks that input with no
    valid characters and input that is too short yield only NaN scores.
    """
    freq_matrix = pwm.FrequencyMatrix.from_rows(["A", "C", "G", "T"], get_ctcf_rows())
    # Scoring matrix derived with the Stormo method.
    scoring = freq_matrix.to_stormo_scoring_matrix()

    # Forward strand: (sequence, expected score at position 0).
    forward_cases = [
        ("AATCACCACCTCCTGGCAGG", -156.8261261),
        ("TGCCTGCCTCTGTAGGCTCC", -128.8106842),
        ("GTTGCCAGTTGGGGGAAGCA", 4.65049839),
        ("GCAGACACCAGGTGGTTCAG", 1.60168743),
    ]
    for sequence, expected in forward_cases:
        assert allclose(scoring.score_string(sequence)[0], expected)

    # Reverse strand: same sequences scored with the reverse complement.
    revcomp = scoring.reverse_complement()
    reverse_cases = [
        ("AATCACCACCTCCTGGCAGG", 0.014178276062),
        ("TGCCTGCCTCTGTAGGCTCC", 0.723828315735),
        ("GTTGCCAGTTGGGGGAAGCA", -126.99407196),
        ("GCAGACACCAGGTGGTTCAG", -86.9560623169),
    ]
    for sequence, expected in reverse_cases:
        assert allclose(revcomp.score_string(sequence)[0], expected)

    # No valid characters at all: every score is NaN.
    # NOTE(review): this uses score_string_with_gaps although the rest of
    # this test exercises score_string -- confirm whether that is intended.
    all_invalid = scoring.score_string_with_gaps("XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX")
    assert isnan(all_invalid).all()

    # Sequence shorter than the motif: every score is NaN.
    too_short = scoring.score_string("TTTT")
    assert isnan(too_short).all()
def test_scoring_with_gaps():
    """Check gapped scoring with the CTCF Stormo scoring matrix.

    Scores sequences containing '-' characters and checks the score found
    at a given index of the result, then checks that inputs with nothing
    valid, or with too few characters, yield only NaN scores.
    """
    freq_matrix = pwm.FrequencyMatrix.from_rows(["A", "C", "G", "T"], get_ctcf_rows())
    # Scoring matrix derived with the Stormo method.
    scoring = freq_matrix.to_stormo_scoring_matrix()

    # Forward matches: (gapped sequence, index into the scores, expected score).
    forward_cases = [
        ("GTTGCCAGT----TGGGGGAAGCATTT---AA", 0, 4.65049839),
        ("GCAGA--CACCAGGTGG--TTCAG---", 0, 1.60168743),
        ("----GTTGCCAGTTGGGGGAAGCA", 4, 4.65049839),
        ("TTT--GTT--GCCA--GTTGGGG-G-A-A-G-C-A-", 5, 4.65049839),
    ]
    for sequence, index, expected in forward_cases:
        assert allclose(scoring.score_string_with_gaps(sequence)[index], expected)

    # In the last sequence above, index 4 holds NaN while index 5 holds the match.
    heavily_gapped = "TTT--GTT--GCCA--GTTGGGG-G-A-A-G-C-A-"
    assert isnan(scoring.score_string_with_gaps(heavily_gapped)[4])

    # Nothing valid: only invalid characters, or only gap characters.
    for sequence in (
        "XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX",
        "------------------------------------",
    ):
        assert isnan(scoring.score_string_with_gaps(sequence)).all()

    # Too short, with and without trailing gap characters.
    for sequence in ("TTTT", "TTTT----"):
        assert isnan(scoring.score_string_with_gaps(sequence)).all()
def get_ctcf_rows():
"""
The CTCF primary site motif
"""
return [
[2620, 2052, 3013, 2314],
[0, 3580, 1746, 4672],
[2008, 1790, 4497, 1703],
[3362, 0, 6637, 0],
[0, 10000, 0, 0],
[0, 10000, 0, 0],
[7467, 0, 1310, 1222],
[786, 4890, 4323, 0],
[1179, 6288, 829, 1703],
[10000, 0, 0, 0],
[0, 0, 10000, 0],
[4847, 0, 5152, 0],
[0, 0, 6200, 3799],
[0, 0, 10000, 0],
[0, 0, 10000, 0],
[1572, 7467, 0, 960],
[3842, 0, 5545, 611],
[0, 5895, 4104, 0],
[1615, 4192, 1397, 2794],
[3144, 3231, 3056, 567],
]
|