# Unit tests for augur.filter.subsample: per-group sequence counts and group assignment.
import pytest
import pandas as pd
import augur.filter.subsample
from augur.errors import AugurError
@pytest.fixture
def valid_metadata() -> pd.DataFrame:
    """Provide a small metadata table indexed by ``strain``.

    The table holds five strains (``SEQ_1`` .. ``SEQ_5``) with ``date`` and
    ``country`` columns. ``SEQ_1`` carries the date ``2020-01-XX``; the other
    four dates are written out in full. The first two strains have country
    ``A`` and the remaining three have country ``B``.
    """
    header = ['strain', 'date', 'country']
    records = [
        ("SEQ_1", "2020-01-XX", "A"),
        ("SEQ_2", "2020-02-01", "A"),
        ("SEQ_3", "2020-03-01", "B"),
        ("SEQ_4", "2020-04-01", "B"),
        ("SEQ_5", "2020-05-01", "B"),
    ]
    frame = pd.DataFrame.from_records(records, columns=header)
    return frame.set_index('strain')
class TestSequencesPerGroup:
    """Checks for ``augur.filter.subsample._calculate_sequences_per_group``."""

    @pytest.mark.parametrize(
        "target_max_value, counts_per_group, expected_sequences_per_group",
        [
            (3, [2, 2], 1),
            (3, [2, 1], 3),
            (9, [5, 5], 3),
            (9, [5, 4], 9),
            (9, [5, 3], 9),
        ],
    )
    def test_sequences_per_group(self, target_max_value, counts_per_group, expected_sequences_per_group):
        """Compare the computed sequences-per-group value with the expected one.

        Each parametrized case supplies a target maximum, a list of per-group
        counts, and the value the function is expected to return for them.
        """
        calculate = augur.filter.subsample._calculate_sequences_per_group
        observed = calculate(target_max_value, counts_per_group)
        assert observed == expected_sequences_per_group
class TestFilterGroupBy:
    """Checks for ``augur.filter.subsample.get_groups_for_subsampling``."""

    def test_filter_groupby_strain_subset(self, valid_metadata: pd.DataFrame):
        """A subset of strains with no ``group_by`` maps each one to ``('_dummy',)``."""
        metadata = valid_metadata.copy()
        strains = ['SEQ_1', 'SEQ_3', 'SEQ_5']
        expected = dict.fromkeys(strains, ('_dummy',))

        observed = augur.filter.subsample.get_groups_for_subsampling(strains, metadata)

        assert observed == expected

    def test_filter_groupby_dummy(self, valid_metadata: pd.DataFrame):
        """All strains with no ``group_by`` map to ``('_dummy',)``."""
        metadata = valid_metadata.copy()
        strains = metadata.index.tolist()
        expected = dict.fromkeys(
            ['SEQ_1', 'SEQ_2', 'SEQ_3', 'SEQ_4', 'SEQ_5'],
            ('_dummy',),
        )

        observed = augur.filter.subsample.get_groups_for_subsampling(strains, metadata)

        assert observed == expected

    def test_filter_groupby_invalid_error(self, valid_metadata: pd.DataFrame):
        """Grouping only by an unknown category raises ``AugurError``."""
        metadata = valid_metadata.copy()
        strains = metadata.index.tolist()
        categories = ['invalid']

        with pytest.raises(AugurError) as excinfo:
            augur.filter.subsample.get_groups_for_subsampling(strains, metadata, group_by=categories)

        # Checked after the ``with`` block so the assertion actually executes.
        message = str(excinfo.value)
        assert message == "The specified group-by categories (['invalid']) were not found."

    def test_filter_groupby_invalid_warn(self, valid_metadata: pd.DataFrame, capsys):
        """An unknown category next to valid ones yields ``'unknown'`` and a warning on stderr."""
        metadata = valid_metadata.copy()
        strains = metadata.index.tolist()
        categories = ['country', 'year', 'month', 'invalid']
        expected = {
            'SEQ_1': ('A', 2020, (2020, 1), 'unknown'),
            'SEQ_2': ('A', 2020, (2020, 2), 'unknown'),
            'SEQ_3': ('B', 2020, (2020, 3), 'unknown'),
            'SEQ_4': ('B', 2020, (2020, 4), 'unknown'),
            'SEQ_5': ('B', 2020, (2020, 5), 'unknown'),
        }

        observed = augur.filter.subsample.get_groups_for_subsampling(strains, metadata, group_by=categories)

        assert observed == expected
        stderr = capsys.readouterr().err
        assert stderr == "WARNING: Some of the specified group-by categories couldn't be found: invalid\nFiltering by group may behave differently than expected!\n"

    def test_filter_groupby_missing_year_error(self, valid_metadata: pd.DataFrame):
        """Grouping by ``year`` without a ``date`` column raises ``AugurError``."""
        metadata = valid_metadata.copy().drop(columns='date')
        strains = metadata.index.tolist()
        categories = ['year']

        with pytest.raises(AugurError) as excinfo:
            augur.filter.subsample.get_groups_for_subsampling(strains, metadata, group_by=categories)

        message = str(excinfo.value)
        assert message == "The specified group-by categories (['year']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."

    def test_filter_groupby_missing_month_error(self, valid_metadata: pd.DataFrame):
        """Grouping by ``month`` without a ``date`` column raises ``AugurError``."""
        metadata = valid_metadata.copy().drop(columns='date')
        strains = metadata.index.tolist()
        categories = ['month']

        with pytest.raises(AugurError) as excinfo:
            augur.filter.subsample.get_groups_for_subsampling(strains, metadata, group_by=categories)

        message = str(excinfo.value)
        assert message == "The specified group-by categories (['month']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."

    def test_filter_groupby_missing_year_and_month_error(self, valid_metadata: pd.DataFrame):
        """Grouping by ``year`` and ``month`` without a ``date`` column raises ``AugurError``."""
        metadata = valid_metadata.copy().drop(columns='date')
        strains = metadata.index.tolist()
        categories = ['year', 'month']

        with pytest.raises(AugurError) as excinfo:
            augur.filter.subsample.get_groups_for_subsampling(strains, metadata, group_by=categories)

        message = str(excinfo.value)
        assert message == "The specified group-by categories (['year', 'month']) were not found. Note that using any of ['month', 'week', 'year'] requires a column called 'date'."

    def test_filter_groupby_missing_date_warn(self, valid_metadata: pd.DataFrame, capsys):
        """With ``country`` present but no ``date``, date parts become ``'unknown'`` and a warning is printed."""
        metadata = valid_metadata.copy().drop(columns='date')
        strains = metadata.index.tolist()
        categories = ['country', 'year', 'month']
        expected = {
            'SEQ_1': ('A', 'unknown', 'unknown'),
            'SEQ_2': ('A', 'unknown', 'unknown'),
            'SEQ_3': ('B', 'unknown', 'unknown'),
            'SEQ_4': ('B', 'unknown', 'unknown'),
            'SEQ_5': ('B', 'unknown', 'unknown'),
        }

        observed = augur.filter.subsample.get_groups_for_subsampling(strains, metadata, group_by=categories)

        assert observed == expected
        stderr = capsys.readouterr().err
        assert stderr == "WARNING: A 'date' column could not be found to group-by ['month', 'year'].\nFiltering by group may behave differently than expected!\n"

    def test_filter_groupby_no_strains(self, valid_metadata: pd.DataFrame):
        """An empty strain list produces an empty mapping."""
        metadata = valid_metadata.copy()
        categories = ['country', 'year', 'month']
        no_strains = []

        observed = augur.filter.subsample.get_groups_for_subsampling(no_strains, metadata, group_by=categories)

        assert observed == {}

    def test_filter_groupby_only_year_provided(self, valid_metadata: pd.DataFrame):
        """Year-only dates (``'2020'``) can be grouped by ``country`` and ``year``."""
        metadata = valid_metadata.copy()
        metadata['date'] = '2020'
        strains = metadata.index.tolist()
        categories = ['country', 'year']
        expected = {
            'SEQ_1': ('A', 2020),
            'SEQ_2': ('A', 2020),
            'SEQ_3': ('B', 2020),
            'SEQ_4': ('B', 2020),
            'SEQ_5': ('B', 2020),
        }

        observed = augur.filter.subsample.get_groups_for_subsampling(strains, metadata, group_by=categories)

        assert observed == expected

    def test_filter_groupby_only_year_month_provided(self, valid_metadata: pd.DataFrame):
        """Year-month dates (``'2020-01'``) can be grouped by ``country``, ``year`` and ``month``."""
        metadata = valid_metadata.copy()
        metadata['date'] = '2020-01'
        strains = metadata.index.tolist()
        categories = ['country', 'year', 'month']
        expected = {
            'SEQ_1': ('A', 2020, (2020, 1)),
            'SEQ_2': ('A', 2020, (2020, 1)),
            'SEQ_3': ('B', 2020, (2020, 1)),
            'SEQ_4': ('B', 2020, (2020, 1)),
            'SEQ_5': ('B', 2020, (2020, 1)),
        }

        observed = augur.filter.subsample.get_groups_for_subsampling(strains, metadata, group_by=categories)

        assert observed == expected
|