File: test_substitution.py

package info (click to toggle)
python-skbio 0.6.2-4
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 9,312 kB
  • sloc: python: 60,482; ansic: 672; makefile: 224
file content (154 lines) | stat: -rw-r--r-- 5,841 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE.txt, distributed with this software.
# ----------------------------------------------------------------------------

from unittest import TestCase, main

import numpy as np
from numpy.testing import assert_array_equal

from skbio import SubstitutionMatrix


class TestSubstitutionMatrix(TestCase):
    """Unit tests for ``SubstitutionMatrix``.

    Covers construction from several alphabet and score formats, conversion
    to and from nested dictionaries, identity matrices, and retrieval of
    named matrices.
    """

    # NOTE: test methods are documented with comments rather than docstrings
    # so that verbose test-runner output keeps showing the method names.

    def setUp(self):
        # Shared fixture: a 5-character alphabet and a matching 5 x 5 integer
        # score matrix (1 on the diagonal and -2 off the diagonal among
        # A/C/G/T; 0 for every pair involving N).
        self.alphabet = 'ACGTN'
        self.scores = np.array([
            [1, -2, -2, -2, 0],
            [-2, 1, -2, -2, 0],
            [-2, -2, 1, -2, 0],
            [-2, -2, -2, 1, 0],
            [0, 0, 0, 0, 0]])

    def test_init(self):
        # typical usage
        # alphabet becomes tuple of characters
        alphabet = tuple(self.alphabet)
        obs = SubstitutionMatrix(self.alphabet, self.scores)
        self.assertTupleEqual(obs.alphabet, alphabet)

        # alphabet is an alias of ids
        self.assertTupleEqual(obs.alphabet, obs.ids)

        # matrix is ndarray (this is important for alignment efficiency)
        self.assertIsInstance(obs.scores, np.ndarray)
        self.assertTupleEqual(obs.shape, (5, 5))
        assert_array_equal(obs.scores, self.scores)

        # data type becomes float
        self.assertEqual(obs.dtype, np.float64)

        # scores is an alias of data
        assert_array_equal(obs.scores, obs.data)

        # character to index mapping
        self.assertDictEqual(obs._char_map, dict(zip(
            alphabet, range(len(alphabet)))))

        # alphabet can be encoded as ASCII characters
        self.assertTrue(obs._is_ascii)

        # hash table of ASCII characters: indexing it by a character's code
        # point must give that character's position in the alphabet
        self.assertIsInstance(obs._char_hash, np.ndarray)
        self.assertIs(obs._char_hash.dtype.type, np.uint8)
        for i, char in enumerate(alphabet):
            self.assertEqual(i, obs._char_hash[ord(char)])

    def test_init_alt_alphabet(self):
        # alternative formats of alphabet: list, dictionary (only keys matter),
        # and iterator
        alphabet = tuple(self.alphabet)
        for alp in (list(alphabet),
                    dict.fromkeys(alphabet),
                    iter(alphabet)):
            # subTest labels a failure with the offending input format
            with self.subTest(alphabet_type=type(alp).__name__):
                obs = SubstitutionMatrix(alp, self.scores)
                self.assertTupleEqual(obs.alphabet, alphabet)

    def test_init_alt_scores(self):
        # alternative format of scores: nested list
        obs = SubstitutionMatrix(self.alphabet, self.scores.tolist())
        assert_array_equal(obs.scores, self.scores)

        # condensed matrix (less likely because diagonal is zero)
        # 4 characters -> 6 pairwise values; the expected square form has -1
        # off the diagonal and 0 on it
        obs = SubstitutionMatrix('ACGT', [-1] * 6)
        assert_array_equal(obs.scores, np.identity(4) - 1)

    def test_to_dict(self):
        # the matrix is exported as a nested dictionary of pairwise scores
        mat = SubstitutionMatrix(self.alphabet, self.scores)
        obs = mat.to_dict()
        exp = {'A': {'A': 1., 'C': -2., 'G': -2., 'T': -2., 'N': 0.},
               'C': {'A': -2., 'C': 1., 'G': -2., 'T': -2., 'N': 0.},
               'G': {'A': -2., 'C': -2., 'G': 1., 'T': -2., 'N': 0.},
               'T': {'A': -2., 'C': -2., 'G': -2., 'T': 1., 'N': 0.},
               'N': {'A': 0., 'C': 0., 'G': 0., 'T': 0., 'N': 0.}}
        self.assertDictEqual(obs, exp)

    def test_from_dict(self):
        # typical usage: a well-formed nested dictionary
        d = {'a': {'a': 1, 'b': 0, 'c': 0},
             'b': {'a': 0, 'b': 1, 'c': 0},
             'c': {'a': 0, 'b': 0, 'c': 1}}
        obs = SubstitutionMatrix.from_dict(d)
        self.assertIsInstance(obs, SubstitutionMatrix)
        self.assertTupleEqual(obs.alphabet, tuple('abc'))
        exp = np.array([[1., 0., 0.],
                        [0., 1., 0.],
                        [0., 0., 1.]])
        assert_array_equal(obs.data, exp)

        # alphabet is inconsistent
        msg = ('The outer and inner layers of the dictionary must have the '
               'same set of keys.')
        # extra key in the outer layer only
        d['d'] = {'a': 0, 'b': 0, 'c': 0}
        with self.assertRaisesRegex(ValueError, msg):
            SubstitutionMatrix.from_dict(d)
        del d['d']
        # extra key in one inner layer only
        d['a']['d'] = 2
        with self.assertRaisesRegex(ValueError, msg):
            SubstitutionMatrix.from_dict(d)
        del d['a']['d']

        # scores are not numbers
        d['a']['b'] = 'hello'
        with self.assertRaises(ValueError):
            SubstitutionMatrix.from_dict(d)
        d['a']['b'] = None
        with self.assertRaises(TypeError):
            SubstitutionMatrix.from_dict(d)

    def test_identity(self):
        # match score 1 on the diagonal, mismatch score -2 elsewhere
        obs = SubstitutionMatrix.identity('ACGT', 1, -2)
        self.assertIsInstance(obs, SubstitutionMatrix)
        self.assertTupleEqual(obs.alphabet, tuple('ACGT'))
        exp = np.array([[1., -2., -2., -2.],
                        [-2., 1., -2., -2.],
                        [-2., -2., 1., -2.],
                        [-2., -2., -2., 1.]])
        assert_array_equal(obs.scores, exp)

    def test_by_name(self):
        obs = SubstitutionMatrix.by_name('NUC.4.4')
        self.assertEqual(len(obs.alphabet), 15)
        self.assertEqual(obs['A', 'T'], -4)

        # the same expectations must hold for the upper- and lower-case
        # spellings of the name
        for name in ('BLOSUM50', 'blosum50'):
            with self.subTest(name=name):
                obs = SubstitutionMatrix.by_name(name)
                self.assertEqual(len(obs.alphabet), 24)
                self.assertEqual(obs['M', 'K'], -2)

        # unknown name
        msg = 'Substitution matrix "hello" does not exist.'
        with self.assertRaisesRegex(ValueError, msg):
            SubstitutionMatrix.by_name('hello')

    def test_get_names(self):
        # a few well-known matrices must be among the available names
        obs = SubstitutionMatrix.get_names()
        self.assertIn('NUC.4.4', obs)
        self.assertIn('PAM250', obs)
        self.assertIn('BLOSUM62', obs)

# Run this module's tests through unittest's ``main`` when the file is
# executed directly as a script (not when it is imported).
if __name__ == "__main__":
    main()