File: test_subsample.py

package info (click to toggle)
augur 24.4.0-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 25,312 kB
  • sloc: python: 14,253; sh: 227; makefile: 35
file content (158 lines) | stat: -rw-r--r-- 7,444 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
import pytest
import pandas as pd

import augur.filter.subsample
from augur.errors import AugurError


@pytest.fixture
def valid_metadata() -> pd.DataFrame:
    """Return a five-strain metadata table indexed by ``strain``.

    The remaining columns are ``date`` and ``country``. ``SEQ_1`` has an
    ambiguous day (``2020-01-XX``); every other date is fully specified.
    Two strains belong to country ``A`` and three to country ``B``.
    """
    header = ['strain', 'date', 'country']
    rows = [
        ("SEQ_1", "2020-01-XX", "A"),
        ("SEQ_2", "2020-02-01", "A"),
        ("SEQ_3", "2020-03-01", "B"),
        ("SEQ_4", "2020-04-01", "B"),
        ("SEQ_5", "2020-05-01", "B"),
    ]
    table = pd.DataFrame(rows, columns=header)
    return table.set_index('strain')


class TestSequencesPerGroup:
    """Checks for ``augur.filter.subsample._calculate_sequences_per_group``."""

    # Each case is (target_max_value, counts_per_group, expected result).
    @pytest.mark.parametrize(
        ("target_max_value", "counts_per_group", "expected_sequences_per_group"),
        [
            (3, [2, 2], 1),
            (3, [2, 1], 3),
            (9, [5, 5], 3),
            (9, [5, 4], 9),
            (9, [5, 3], 9),
        ],
    )
    def test_sequences_per_group(self, target_max_value, counts_per_group, expected_sequences_per_group):
        """The computed sequences-per-group value matches the expected one."""
        calculate = augur.filter.subsample._calculate_sequences_per_group
        observed = calculate(target_max_value, counts_per_group)
        assert observed == expected_sequences_per_group


class TestFilterGroupBy:
    """Tests for ``augur.filter.subsample.get_groups_for_subsampling``.

    Each test starts from the ``valid_metadata`` fixture and checks one of:
    the strain-to-group mapping returned, the ``AugurError`` message raised,
    or the warning text written to stderr.
    """

    def test_filter_groupby_strain_subset(self, valid_metadata: pd.DataFrame):
        """Without ``group_by``, only the requested strains are mapped, each to the dummy group."""
        requested = ['SEQ_1', 'SEQ_3', 'SEQ_5']
        table = valid_metadata.copy()
        observed = augur.filter.subsample.get_groups_for_subsampling(requested, table)
        expected = {
            'SEQ_1': ('_dummy',),
            'SEQ_3': ('_dummy',),
            'SEQ_5': ('_dummy',),
        }
        assert observed == expected

    def test_filter_groupby_dummy(self, valid_metadata: pd.DataFrame):
        """Without ``group_by``, every strain in the metadata maps to the dummy group."""
        table = valid_metadata.copy()
        every_strain = table.index.tolist()
        observed = augur.filter.subsample.get_groups_for_subsampling(every_strain, table)
        expected = {
            'SEQ_1': ('_dummy',),
            'SEQ_2': ('_dummy',),
            'SEQ_3': ('_dummy',),
            'SEQ_4': ('_dummy',),
            'SEQ_5': ('_dummy',),
        }
        assert observed == expected

    def test_filter_groupby_invalid_error(self, valid_metadata: pd.DataFrame):
        """Grouping only by a column that does not exist raises ``AugurError``."""
        table = valid_metadata.copy()
        every_strain = table.index.tolist()
        requested_groups = ['invalid']
        with pytest.raises(AugurError) as raised:
            augur.filter.subsample.get_groups_for_subsampling(every_strain, table, group_by=requested_groups)
        expected_message = "The specified group-by categories (['invalid']) were not found."
        assert str(raised.value) == expected_message

    def test_filter_groupby_invalid_warn(self, valid_metadata: pd.DataFrame, capsys):
        """A missing column mixed with valid ones yields ``'unknown'`` plus a stderr warning."""
        table = valid_metadata.copy()
        every_strain = table.index.tolist()
        requested_groups = ['country', 'year', 'month', 'invalid']
        observed = augur.filter.subsample.get_groups_for_subsampling(every_strain, table, group_by=requested_groups)
        expected = {
            'SEQ_1': ('A', 2020, (2020, 1), 'unknown'),
            'SEQ_2': ('A', 2020, (2020, 2), 'unknown'),
            'SEQ_3': ('B', 2020, (2020, 3), 'unknown'),
            'SEQ_4': ('B', 2020, (2020, 4), 'unknown'),
            'SEQ_5': ('B', 2020, (2020, 5), 'unknown'),
        }
        assert observed == expected
        expected_warning = (
            "WARNING: Some of the specified group-by categories couldn't be found: invalid\n"
            "Filtering by group may behave differently than expected!\n"
        )
        assert capsys.readouterr().err == expected_warning

    def test_filter_groupby_missing_year_error(self, valid_metadata: pd.DataFrame):
        """Grouping by ``year`` alone without a ``date`` column raises ``AugurError``."""
        table = valid_metadata.copy().drop(columns='date')
        every_strain = table.index.tolist()
        requested_groups = ['year']
        with pytest.raises(AugurError) as raised:
            augur.filter.subsample.get_groups_for_subsampling(every_strain, table, group_by=requested_groups)
        expected_message = "The specified group-by categories (['year']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."
        assert str(raised.value) == expected_message

    def test_filter_groupby_missing_month_error(self, valid_metadata: pd.DataFrame):
        """Grouping by ``month`` alone without a ``date`` column raises ``AugurError``."""
        table = valid_metadata.copy().drop(columns='date')
        every_strain = table.index.tolist()
        requested_groups = ['month']
        with pytest.raises(AugurError) as raised:
            augur.filter.subsample.get_groups_for_subsampling(every_strain, table, group_by=requested_groups)
        expected_message = "The specified group-by categories (['month']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."
        assert str(raised.value) == expected_message

    def test_filter_groupby_missing_year_and_month_error(self, valid_metadata: pd.DataFrame):
        """Grouping by ``year`` and ``month`` without a ``date`` column raises ``AugurError``."""
        table = valid_metadata.copy().drop(columns='date')
        every_strain = table.index.tolist()
        requested_groups = ['year', 'month']
        with pytest.raises(AugurError) as raised:
            augur.filter.subsample.get_groups_for_subsampling(every_strain, table, group_by=requested_groups)
        expected_message = "The specified group-by categories (['year', 'month']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."
        assert str(raised.value) == expected_message

    def test_filter_groupby_missing_date_warn(self, valid_metadata: pd.DataFrame, capsys):
        """With a valid column present, date-derived groups become ``'unknown'`` and a warning is printed."""
        table = valid_metadata.copy().drop(columns='date')
        every_strain = table.index.tolist()
        requested_groups = ['country', 'year', 'month']
        observed = augur.filter.subsample.get_groups_for_subsampling(every_strain, table, group_by=requested_groups)
        expected = {
            'SEQ_1': ('A', 'unknown', 'unknown'),
            'SEQ_2': ('A', 'unknown', 'unknown'),
            'SEQ_3': ('B', 'unknown', 'unknown'),
            'SEQ_4': ('B', 'unknown', 'unknown'),
            'SEQ_5': ('B', 'unknown', 'unknown'),
        }
        assert observed == expected
        expected_warning = (
            "WARNING: A 'date' column could not be found to group-by ['month', 'year'].\n"
            "Filtering by group may behave differently than expected!\n"
        )
        assert capsys.readouterr().err == expected_warning

    def test_filter_groupby_no_strains(self, valid_metadata: pd.DataFrame):
        """An empty strain list produces an empty mapping."""
        table = valid_metadata.copy()
        requested_groups = ['country', 'year', 'month']
        observed = augur.filter.subsample.get_groups_for_subsampling([], table, group_by=requested_groups)
        assert observed == {}

    def test_filter_groupby_only_year_provided(self, valid_metadata: pd.DataFrame):
        """Dates holding only a year still allow grouping by ``year``."""
        # assign() returns a new frame with every date replaced by '2020'.
        table = valid_metadata.assign(date='2020')
        every_strain = table.index.tolist()
        requested_groups = ['country', 'year']
        observed = augur.filter.subsample.get_groups_for_subsampling(every_strain, table, group_by=requested_groups)
        expected = {
            'SEQ_1': ('A', 2020),
            'SEQ_2': ('A', 2020),
            'SEQ_3': ('B', 2020),
            'SEQ_4': ('B', 2020),
            'SEQ_5': ('B', 2020),
        }
        assert observed == expected

    def test_filter_groupby_only_year_month_provided(self, valid_metadata: pd.DataFrame):
        """Dates holding only year and month allow grouping by ``year`` and ``month``."""
        # assign() returns a new frame with every date replaced by '2020-01'.
        table = valid_metadata.assign(date='2020-01')
        every_strain = table.index.tolist()
        requested_groups = ['country', 'year', 'month']
        observed = augur.filter.subsample.get_groups_for_subsampling(every_strain, table, group_by=requested_groups)
        expected = {
            'SEQ_1': ('A', 2020, (2020, 1)),
            'SEQ_2': ('A', 2020, (2020, 1)),
            'SEQ_3': ('B', 2020, (2020, 1)),
            'SEQ_4': ('B', 2020, (2020, 1)),
            'SEQ_5': ('B', 2020, (2020, 1)),
        }
        assert observed == expected