"""
Tests for the alignment helpers imported from sqt.align: edit distance,
global alignment, consensus and Hamming distance.
"""
from sqt.align import (edit_distance as ed, GlobalAlignment as GA, consensus,
hamming_distance)
from random import choice, seed, randint
import pytest
# Hand-written pairs of input strings shared by the tests in this file.
# They include empty strings, identical strings and sequences of very
# different lengths. Further pairs are appended to this list after it is
# defined.
STRING_PAIRS = [
    ('', ''),
    ('', 'A'),
    ('A', 'A'),
    ('AB', ''),
    ('AB', 'ABC'),
    ('TGAATCCC', 'CCTGAATC'),
    ('ANANAS', 'BANANA'),
    ('SISSI', 'MISSISSIPPI'),
    ('GGAATCCC', 'TGAGGGATAAATATTTAGAATTTAGTAGTAGTGTT'),
    ('TCTGTTCCCTCCCTGTCTCA', 'TTTTAGGAAATACGCC'),
    ('TGAGACACGCAACATGGGAAAGGCAAGGCACACAGGGGATAGG', 'AATTTATTTTATTGTGATTTTTTGGAGGTTTGGAAGCCACTAAGCTATACTGAGACACGCAACAGGGGAAAGGCAAGGCACA'),
    ('TCCATCTCATCCCTGCGTGTCCCATCTGTTCCCTCCCTGTCTCA', 'TTTTAGGAAATACGCCTGGTGGGGTTTGGAGTATAGTGAAAGATAGGTGAGTTGGTCGGGTG'),
    ('A', 'TCTGCTCCTGGCCCATGATCGTATAACTTTCAAATTT'),
    ('GCGCGGACT', 'TAAATCCTGG'),
]
# Seed the random module with a fixed value so that the choice()/randint()
# calls made while building the random test strings are repeatable.
seed(10)
def randstring():
    """
    Return a random string of 0 to 10 characters drawn from 'A' and 'C'.
    """
    # Draw the length first, then one character per position, so the
    # random number generator is consumed in a fixed order.
    length = randint(0, 10)
    letters = [choice('AC') for _ in range(length)]
    return ''.join(letters)
# Append 100000 pairs of random strings produced by randstring() to the
# hand-written pairs.
STRING_PAIRS.extend((randstring(), randstring()) for _ in range(100000))
def test_edit_distance():
    """
    Check ed() on a few fixed str inputs and on every pair in STRING_PAIRS.
    """
    # (first string, second string, expected distance)
    fixed_cases = [
        ('', '', 0),
        ('', 'A', 1),
        ('A', 'B', 1),
        ('A', 'A', 0),
        ('A', 'AB', 1),
        ('BA', 'AB', 2),
    ]
    for first, second, expected in fixed_cases:
        assert ed(first, second) == expected

    for left, right in STRING_PAIRS:
        # Against the empty string the distance is the string's length,
        # whichever argument is empty.
        assert ed(left, '') == len(left)
        assert ed('', left) == len(left)
        # Swapping the arguments must not change the result.
        assert ed(left, right) == ed(right, left)
def test_edit_distance_bytes():
    """
    Check ed() on bytes inputs: a few fixed cases plus every pair in
    STRING_PAIRS encoded as ASCII.
    """
    # (first bytes, second bytes, expected distance)
    fixed_cases = [
        (b'', b'', 0),
        (b'', b'A', 1),
        (b'A', b'B', 1),
        (b'A', b'A', 0),
        (b'A', b'AB', 1),
        (b'BA', b'AB', 2),
    ]
    for first, second, expected in fixed_cases:
        assert ed(first, second) == expected

    for text_s, text_t in STRING_PAIRS:
        left = text_s.encode('ascii')
        right = text_t.encode('ascii')
        # Against the empty byte string the distance is the length.
        assert ed(left, b'') == len(left)
        assert ed(b'', left) == len(left)
        # Swapping the arguments must not change the result.
        assert ed(left, right) == ed(right, left)
def assert_banded(s, t, maxdiff):
    """
    Assert that ed(s, t, maxdiff=maxdiff) is consistent with ed(s, t).

    When the unrestricted distance is at most maxdiff, both calls must
    return the same value. Otherwise the restricted call only has to
    return something greater than maxdiff.
    """
    limited = ed(s, t, maxdiff=maxdiff)
    exact = ed(s, t)
    if exact <= maxdiff:
        assert limited == exact
        return
    assert limited > maxdiff
def test_edit_distance_banded():
    """
    Run assert_banded() for maxdiff values 0 to 4 on a fixed case and on
    every pair in STRING_PAIRS, in both argument orders and against the
    empty string.
    """
    for band in range(5):
        assert_banded('ABC', '', band)
        for left, right in STRING_PAIRS:
            combinations = (
                (left, ''),
                ('', left),
                (left, right),
                (right, left),
            )
            for first, second in combinations:
                assert_banded(first, second, band)
def nongap_characters(row):
    r"""
    Return the non-gap characters (not '\0') of an alignment row.

    row is either a str or a bytes-like object with a replace() method
    (bytes, bytearray). The result is what row.replace() returns, so it
    has the same type as row.
    """
    # Pick the gap marker that matches the row type up front instead of
    # provoking a TypeError with the wrong type and retrying.
    if isinstance(row, str):
        return row.replace('\0', '')
    return row.replace(b'\0', b'')
def count_gaps(row):
    r"""
    Return the number of gap characters ('\0') in an alignment row.

    row is either a str or a bytes-like object with a count() method
    (bytes, bytearray).
    """
    # Choose the gap marker by row type rather than by catching the
    # TypeError that str.count() raises for a bytes argument.
    gap = '\0' if isinstance(row, str) else b'\0'
    return row.count(gap)
def count_mismatches(row1, row2):
    r"""
    Return the number of mismatching columns between two alignment rows.

    A column counts as a mismatch when the two characters differ and
    neither of them is the gap character ('\0'). The rows are walked in
    lockstep with zip(), so columns beyond the shorter row are ignored.

    row1 and row2 are either both str or both bytes-like.
    """
    # Iterating over bytes yields ints, so the gap marker is 0 there.
    # isinstance() (rather than an exact type check) keeps str subclasses
    # on the str branch.
    gap = '\0' if isinstance(row1, str) else 0
    return sum(
        1
        for c1, c2 in zip(row1, row2)
        if c1 != c2 and c1 != gap and c2 != gap
    )
def test_global_alignment():
    """
    Check the GlobalAlignment of every pair in STRING_PAIRS: both rows
    have equal length, the error count equals the edit distance, the rows
    reproduce the inputs once gaps are removed, and the error count is
    the number of gaps plus the number of mismatches.
    """
    for s, t in STRING_PAIRS:
        expected_errors = ed(s, t)
        alignment = GA(s, t)
        top = alignment.row1
        bottom = alignment.row2

        assert len(top) == len(bottom)
        assert alignment.errors == expected_errors
        assert nongap_characters(top) == s
        assert nongap_characters(bottom) == t

        gaps = count_gaps(top) + count_gaps(bottom)
        assert alignment.errors == gaps + count_mismatches(top, bottom)
def test_consensus():
    """
    Check consensus() on a dict of named sequences and on its values only.
    """
    sequences = {
        'a': 'AAA',
        'b': 'ACA',
        'c': 'AAG',
        'd': 'TAA',
        'e': 'AAA',
    }
    expected = 'AAA'
    assert consensus(sequences) == expected
    assert consensus(sequences.values()) == expected
def test_hamming_distance():
    """
    Check hamming_distance() on equal-length strings.
    """
    # (first string, second string, expected distance)
    cases = (
        ('', '', 0),
        ('A', 'A', 0),
        ('HELLO', 'HELLO', 0),
        ('ABC', 'DEF', 3),
    )
    for first, second, expected in cases:
        assert hamming_distance(first, second) == expected
def test_hamming_distance_incorrect_length():
    """
    Check that hamming_distance() raises IndexError when the two strings
    have different lengths.
    """
    shorter, longer = 'A', 'BC'
    with pytest.raises(IndexError):
        hamming_distance(shorter, longer)
|