File: test_parse.py

package info (click to toggle)
augur 24.4.0-3
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 25,312 kB
  • sloc: python: 14,253; sh: 227; makefile: 35
file content (80 lines) | stat: -rw-r--r-- 3,190 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import Bio.SeqIO
import Bio.Seq
from pathlib import Path
import sys

# make sure we get the local version of parse
sys.path.insert(0, (str(Path(__file__).parent.parent.parent)))
from augur import parse


class TestParse:
    def test_fix_dates(self, capsys):
        full_date = "4-5-2020"
        assert parse.fix_dates(full_date) == "2020-05-04"
        assert parse.fix_dates(full_date, dayfirst=False) == "2020-04-05"
        partial_date_no_month = "2020-04"
        assert parse.fix_dates(partial_date_no_month) == "2020-04-XX"
        assert parse.fix_dates(partial_date_no_month, dayfirst=False) == "2020-04-XX"

        partial_date_only_year = "2020"
        assert parse.fix_dates(partial_date_only_year) == "2020-XX-XX"

        # We should allow valid ambiguous dates to pass through the function
        # without a warning to the user.
        valid_ambiguous_date = "2020-XX-XX"
        assert parse.fix_dates(valid_ambiguous_date) == valid_ambiguous_date
        assert "WARNING" not in capsys.readouterr().err

        # We should warn the user when their dates cannot be parsed by pandas or
        # as a valid ambiguous date.
        malformed_date = "Aaasd123AS"
        assert(parse.fix_dates(malformed_date)) == malformed_date
        assert "WARNING" in capsys.readouterr().err

    def test_prettify(self):
        # test trim
        trim_string = "trim_here"
        assert parse.prettify(trim_string, trim=4) == trim_string[:4] + "..."
        trim_string_short = "trim"
        assert parse.prettify(trim_string_short, trim=4) == trim_string_short

        # test caps
        caps_string_pos = "usa"
        caps_string_neg = "usad"
        assert parse.prettify(caps_string_pos) == caps_string_pos.upper()
        assert parse.prettify(caps_string_neg) == caps_string_neg

        # test camelCase
        camel_case_string = "testing_one_two_three"
        assert parse.prettify(camel_case_string, camelCase=True) == "Testing One Two Three"

        # test remove comma
        remove_comma_string = "remove,the,commas"
        parse.prettify(remove_comma_string, removeComma=True)
        assert parse.prettify(remove_comma_string, removeComma=True) == "removethecommas"

        # test etal
        etal_lower_string = "testing string Et Al Et al"
        etal_strip_string = "nextstrain et al. et al Et Al."
        assert parse.prettify(etal_lower_string, etal='lower') == etal_lower_string.lower()
        assert parse.prettify(etal_strip_string, etal='strip') == "nextstrain   "

    def test_parse_sequence(self):
        sequence_record = Bio.SeqRecord.SeqRecord(
            seq=Bio.Seq.Seq("ATCG"),
            id="sequence_A",
            description="sequence_A|USA|2020-10-03|north_america"
        )
        fields = ["strain", "country", "date", "region"]
        sequence, metadata = parse.parse_sequence(
            sequence_record,
            fields=fields,
            strain_key="strain",
            prettify_fields=["region"]
        )

        assert sequence.id == metadata["strain"]
        assert sequence.id == "sequence_A"
        assert set(fields) == set(metadata.keys())
        assert metadata["region"] == "North America"