File: test_translate.py

package info (click to toggle)
python-cogent 2024.5.7a1%2Bdfsg-3
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 74,600 kB
  • sloc: python: 92,479; makefile: 117; sh: 16
file content (144 lines) | stat: -rw-r--r-- 5,294 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
from unittest import TestCase

from cogent3 import DNA, make_aligned_seqs, make_unaligned_seqs
from cogent3.app.composable import NotCompleted
from cogent3.app.translate import (
    best_frame,
    get_code,
    get_fourfold_degenerate_sets,
    select_translatable,
    translate_frames,
    translate_seqs,
)


class TestTranslatable(TestCase):
    """testing translation functions"""

    def test_best_frame(self):
        """correctly identify best frame with/without allowing rc"""
        make_seq = DNA.make_seq
        seq = make_seq("ATGCTAACATAAA", name="fake1")
        self.assertEqual(best_frame(seq), 1)
        self.assertEqual(best_frame(seq, require_stop=True), 1)

        # a challenging seq, translatable in 1 and 3 frames, ending on stop in
        # frame 1. Should return frame 1.
        # NOTE(review): only the default require_stop=False case is asserted
        # here; the require_stop=True case for this seq is not exercised.
        seq = make_seq("ATGTTACGGACGATGCTGAAGTCGAAGATCCACCGCGCCACGGTGACCTGCTGA")
        self.assertEqual(best_frame(seq), 1)

        # a seq and its reverse complement: the forward seq is best read in
        # frame 1 even when rc frames are considered ...
        seq = make_seq(
            "AATATAAATGCCAGCTCATTACAGCATGAGAACAGCAGTTTATTACTTCATAAAGTCATA", name="fake2"
        )
        self.assertEqual(best_frame(seq, allow_rc=True), 1)
        # ... but demanding a terminal stop is expected to fail for it
        with self.assertRaises(ValueError):
            best_frame(seq, allow_rc=True, require_stop=True)

        # ... and its reverse complement reports the matching negative frame
        rc = seq.rc()
        self.assertEqual(best_frame(rc, allow_rc=True), -1)

    def test_select_translatable(self):
        """correctly get translatable seqs"""
        data = {
            "a": "AATATAAATGCCAGCTCATTACAGCATGAGAACA" "GCAGTTTATTACTTCATAAAGTCATA",
            "rc": "TATGACTTTATGAAGTAATAAACTGCTGTTCTCA" "TGCTGTAATGAGCTGGCATTTATATT",
        }
        seqs = make_unaligned_seqs(data=data, moltype=DNA)

        # forward strand only: the reverse complemented seq is dropped
        trans = select_translatable(allow_rc=False)
        tr = trans(seqs)
        ex = data.copy()
        ex.pop("rc")
        self.assertEqual(tr.to_dict(), ex)

        # allowing rc: the "rc" seq is returned in the same orientation as "a"
        trans = select_translatable(allow_rc=True)
        tr = trans(seqs)
        ex = data.copy()
        ex["rc"] = data["a"]
        self.assertEqual(tr.to_dict(), ex)

        # if no seq is translatable the app returns NotCompleted.
        # Every forward frame of these seqs contains several in-frame stop
        # codons (TAA / TGA in the standard genetic code), none of which is
        # the only, terminal, stop of its frame.
        data = dict(a="TAAC" * 10, b="TGAC" * 10)
        seqs = make_unaligned_seqs(data=data, moltype=DNA)
        trans = select_translatable(allow_rc=False)
        # the app must be *called* on the seqs; the constructed app itself is
        # never a NotCompleted instance
        got = trans(seqs)
        self.assertIsInstance(got, NotCompleted)

    def test_translate_frames(self):
        """returns translated sequences"""
        seq = DNA.make_seq("ATGCTGACATAAA", name="fake1")
        tr = translate_frames(seq)
        self.assertEqual(tr, ["MLT*", "C*HK", "ADI"])
        # with the euplotid nuclear code the TGA codon in frame 2 is read as
        # C rather than as a stop
        tr = translate_frames(seq, gc="Euplotid Nuclear")
        self.assertEqual(tr, ["MLT*", "CCHK", "ADI"])

class TestTranslate(TestCase):
    """Tests for the translate_seqs app on collections and alignments."""

    def test_translate_seqcoll(self):
        """correctly translate a sequence collection"""
        seqs = dict(a="ATGAGG", b="ATGTAA")
        seqs = make_unaligned_seqs(seqs)
        # trim terminal stops
        translater = translate_seqs()
        aa = translater(seqs)
        self.assertEqual(aa.to_dict(), dict(a="MR", b="M"))
        self.assertEqual(aa.moltype.label, "protein")
        # don't trim terminal stops, returns NotCompleted
        translater = translate_seqs(trim_terminal_stop=False)
        aa = translater(seqs)
        self.assertIsInstance(aa, NotCompleted)

    def test_translate_aln(self):
        """correctly translates alignments"""
        data = dict(a="ATGAGGCCC", b="ATGTTT---")
        expect = dict(a="MRP", b="MF-")
        translater = translate_seqs()

        # an array alignment
        aln = make_aligned_seqs(data)
        aa = translater(aln)
        self.assertEqual(aa.to_dict(), expect)
        self.assertEqual(aa.moltype.label, "protein")
        # the translated result keeps the input alignment class
        self.assertIsInstance(aa, type(aln))

        # Alignment: request the non-array class explicitly, otherwise this
        # section would just repeat the array alignment case above
        aln = aln.to_type(array_align=False)
        aa = translater(aln)
        self.assertEqual(aa.to_dict(), expect)
        self.assertEqual(aa.moltype.label, "protein")
        self.assertIsInstance(aa, type(aln))


class TestFourFoldDegen(TestCase):
    """Tests for get_fourfold_degenerate_sets."""

    def test_get_fourfold_degenerate_sets(self):
        """correctly identify 4-fold degenerate codons"""
        # first two codon positions of the codon families expected to be
        # reported as 4-fold degenerate
        prefixes = ("GC", "GG", "CT", "CC", "TC", "CG", "AC", "GT")
        code_ids = (1, 2)

        # using straight characters
        expected_chars = {
            frozenset(prefix + base for base in "ACGT") for prefix in prefixes
        }
        for code_id in code_ids:
            observed = get_fourfold_degenerate_sets(
                get_code(code_id), as_indices=False
            )
            self.assertEqual(observed, expected_chars)

        # as_indices requires an alphabet
        with self.assertRaises(AssertionError):
            get_fourfold_degenerate_sets(get_code(1), as_indices=True)

        # same codon sets, each codon expressed as a tuple of alphabet indices
        to_indices = DNA.alphabet.to_indices
        expected_indices = {
            frozenset(tuple(to_indices(prefix + base)) for base in "ACGT")
            for prefix in prefixes
        }
        for code_id in code_ids:
            observed = get_fourfold_degenerate_sets(
                get_code(code_id), alphabet=DNA.alphabet, as_indices=True
            )
            self.assertEqual(observed, expected_indices)