File: test_blast6.py

package info (click to toggle)
python-skbio 0.5.1-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 16,556 kB
  • ctags: 7,222
  • sloc: python: 42,085; ansic: 670; makefile: 180; sh: 10
file content (119 lines) | stat: -rw-r--r-- 6,174 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
# ----------------------------------------------------------------------------
# Copyright (c) 2013--, scikit-bio development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file COPYING.txt, distributed with this software.
# ----------------------------------------------------------------------------

import unittest

import pandas as pd
import numpy as np

from skbio.util import get_data_path, assert_data_frame_almost_equal
from skbio.io.format.blast6 import _blast6_to_data_frame


class TestBlast6Reader(unittest.TestCase):
    def test_default_valid_single_line(self):
        fp = get_data_path('blast6_default_single_line')
        df = _blast6_to_data_frame(fp, default_columns=True)
        exp = pd.DataFrame([['query1', 'subject2', 75.0, 8.0, 2.0, 0.0, 1.0,
                             8.0, 2.0, 9.0, 0.06, 11.5]],
                           columns=['qseqid', 'sseqid', 'pident', 'length',
                                    'mismatch', 'gapopen', 'qstart', 'qend',
                                    'sstart', 'send', 'evalue', 'bitscore'])
        assert_data_frame_almost_equal(df, exp)

    def test_default_valid_multi_line(self):
        fp = get_data_path('blast6_default_multi_line')
        df = _blast6_to_data_frame(fp, default_columns=True)
        exp = pd.DataFrame([['query1', 'subject2', 100.00, 8.0, 0.0, 0.0, 1.0,
                             8.0, 3.0, 10.0, 9e-05, 16.9],
                            ['query1', 'subject2', 75.00, 8.0, 2.0, 0.0, 1.0,
                             8.0, 2.0, 9.0, 0.060, 11.5],
                            ['query2', 'subject1', 71.43, 7.0, 2.0, 0.0, 1.0,
                            7.0, 1.0, 7.0, 0.044, 11.9]],
                           columns=['qseqid', 'sseqid', 'pident', 'length',
                                    'mismatch', 'gapopen', 'qstart', 'qend',
                                    'sstart', 'send', 'evalue', 'bitscore'])
        assert_data_frame_almost_equal(df, exp)

    def test_custom_valid_single_line(self):
        fp = get_data_path('blast6_custom_single_line')
        df = _blast6_to_data_frame(fp, columns=['qacc', 'qseq', 'btop',
                                                'sframe', 'ppos',
                                                'positive', 'gaps'])
        exp = pd.DataFrame([['query1', 'PAAWWWWW', 8.0, 1.0, 100.00, 8.0,
                             0.0]], columns=['qacc', 'qseq', 'btop', 'sframe',
                                             'ppos', 'positive', 'gaps'])
        assert_data_frame_almost_equal(df, exp)

    def test_custom_valid_multi_line(self):
        fp = get_data_path('blast6_custom_multi_line')
        df = _blast6_to_data_frame(fp, columns=['sacc', 'score', 'gapopen',
                                                'qcovs', 'sblastnames',
                                                'sallacc', 'qaccver'])
        exp = pd.DataFrame([['subject2', 32.0, 0.0, 100.0, np.nan, 'subject2',
                             'query1'], ['subject2', 18.0, 0.0, 100.0, np.nan,
                                         'subject2', 'query1'],
                            ['subject1', 19.0, 0.0, 70.0, np.nan, 'subject1',
                             'query2']], columns=['sacc', 'score', 'gapopen',
                                                  'qcovs', 'sblastnames',
                                                  'sallacc', 'qaccver'])
        exp['sblastnames'] = exp['sblastnames'].astype(object)
        assert_data_frame_almost_equal(df, exp)

    def test_valid_nan_handling(self):
        fp = get_data_path('blast6_custom_mixed_nans')
        df = _blast6_to_data_frame(fp, columns=['qacc', 'qseq', 'btop',
                                                'sframe', 'ppos', 'positive',
                                                'gaps'])
        exp = pd.DataFrame([[np.nan, 'PAAWWWWW', 8.0, 1.0, 100.00, np.nan,
                             0.0], ['query1', np.nan, 8.0, 1.0, np.nan, 8.0,
                                    0.0]], columns=['qacc', 'qseq', 'btop',
                                                    'sframe', 'ppos',
                                                    'positive', 'gaps'])
        assert_data_frame_almost_equal(df, exp)

    def test_valid_minimal(self):
        fp = get_data_path('blast6_custom_minimal')
        df = _blast6_to_data_frame(fp, columns=['sacc'])
        exp = pd.DataFrame([['subject2']], columns=['sacc'])
        assert_data_frame_almost_equal(df, exp)

    def test_custom_and_default_passed_error(self):
        fp = get_data_path('blast6_default_single_line')
        with self.assertRaisesRegex(ValueError,
                                    "`columns` and `default_columns`"):
            _blast6_to_data_frame(fp, columns=['qseqid'], default_columns=True)

    def test_no_columns_passed_error(self):
        fp = get_data_path('blast6_default_single_line')
        with self.assertRaisesRegex(ValueError,
                                    "Either `columns` or `default_columns`"):
            _blast6_to_data_frame(fp)

    def test_wrong_amount_of_columns_error(self):
        fp = get_data_path('blast6_invalid_number_of_columns')
        with self.assertRaisesRegex(
                ValueError, "Specified number of columns \(12\).*\(10\)"):
            _blast6_to_data_frame(fp, default_columns=True)

    def test_different_data_in_same_column(self):
        fp = get_data_path('blast6_invalid_type_in_column')
        with self.assertRaises(ValueError):
            _blast6_to_data_frame(fp, default_columns=True)

    def test_wrong_column_name_error(self):
        fp = get_data_path('blast6_default_single_line')
        with self.assertRaisesRegex(ValueError,
                                    "Unrecognized column.*'abcd'"):
            _blast6_to_data_frame(fp, columns=['qseqid', 'sseqid', 'pident',
                                               'length', 'mismatch', 'gapopen',
                                               'qstart', 'qend', 'sstart',
                                               'send', 'abcd', 'bitscore'])

# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()