File: test_raxml.py

package info (click to toggle)
q2-phylogeny 2024.5.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 936 kB
  • sloc: python: 3,580; makefile: 32; sh: 13
file content (376 lines) | stat: -rw-r--r-- 17,661 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
# ----------------------------------------------------------------------------
# Copyright (c) 2016-2023, QIIME 2 development team.
#
# Distributed under the terms of the Modified BSD License.
#
# The full license is in the file LICENSE, distributed with this software.
# ----------------------------------------------------------------------------

import os
import pkg_resources
import shutil
import unittest
import skbio
import tempfile

from qiime2.plugin.testing import TestPluginBase
from qiime2.util import redirected_stdio
from q2_types.feature_data import AlignedDNAFASTAFormat

from q2_phylogeny import raxml, raxml_rapid_bootstrap
from q2_phylogeny._raxml import (run_command, _build_rapid_bootstrap_command,
                                 _set_raxml_version)


class RaxmlTests(TestPluginBase):

    package = 'q2_phylogeny.tests'

    @classmethod
    def setUpClass(cls):
        super(TestPluginBase, cls).setUpClass()
        tmpdir = tempfile.mkdtemp()
        src = pkg_resources.resource_filename(cls.package, 'data')
        dst = os.path.join(tmpdir, 'data')
        shutil.copytree(src, dst)
        cls.data_dir = dst

    @classmethod
    def tearDownClass(cls):
        super(TestPluginBase, cls).setUpClass()
        shutil.rmtree(cls.data_dir)

    def get_data_path(self, filename):
        # Override TestPluginBase.get_data_path so that it returns paths to
        # temporary copies of test data.
        return os.path.join(self.data_dir, filename)

    def test_raxml(self):
        # Test that output tree is made.
        # Reads tree output and compares tip labels to expected labels.
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
        with redirected_stdio(stderr=os.devnull):
            obs = raxml(input_sequences)
        obs_tree = skbio.TreeNode.read(str(obs))
        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        tips = list(obs_tree.tips())
        tip_names = [t.name for t in tips]
        self.assertEqual(set(tip_names),
                         set(['GCA001510755', 'GCA001045515', 'GCA000454205',
                              'GCA000473545', 'GCA000196255', 'GCA000686145',
                              'GCA001950115', 'GCA001971985', 'GCA900007555']))

    def test_raxml_underscore_ids(self):
        # Test that output tree is made with underscores in tip IDs.
        # Some programs and python wrappers may strip underscores.
        # Reads tree output and compares tip labels to expected labels.
        input_fp = self.get_data_path('aligned-dna-sequences-4.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
        with redirected_stdio(stderr=os.devnull):
            obs = raxml(input_sequences)
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        tips = list(obs_tree.tips())
        tip_names = [t.name for t in tips]
        self.assertEqual(set(tip_names),
                         set(['GCA_001510755_1', 'GCA_001045515_1',
                              'GCA_000454205_1', 'GCA_000473545_1',
                              'GCA_000196255_1', 'GCA_002142615_1',
                              'GCA_000686145_1', 'GCA_001950115_1',
                              'GCA_001971985_1', 'GCA_900007555_1']))

    def test_set_raxml_version(self):
        obs_stand_1 = _set_raxml_version(raxml_version='Standard',
                                         n_threads=1)
        self.assertTrue('raxmlHPC' in str(obs_stand_1[0]))
        self.assertTrue(len(obs_stand_1) == 1)

        obs_sse3_1 = _set_raxml_version(raxml_version='SSE3', n_threads=1)
        self.assertTrue('raxmlHPC-SSE3' in str(obs_sse3_1[0]))
        self.assertTrue(len(obs_sse3_1) == 1)

        obs_avx2_1 = _set_raxml_version(raxml_version='AVX2', n_threads=1)
        self.assertTrue('raxmlHPC-AVX2' in str(obs_avx2_1[0]))
        self.assertTrue(len(obs_avx2_1) == 1)

        obs_stand_4 = _set_raxml_version(raxml_version='Standard',
                                         n_threads=4)
        self.assertTrue('raxmlHPC-PTHREADS' in str(obs_stand_4[0]))
        self.assertTrue('4' in str(obs_stand_4[1]))
        self.assertTrue(len(obs_stand_4) == 2)

        obs_sse3_4 = _set_raxml_version(raxml_version='SSE3', n_threads=4)
        self.assertTrue('raxmlHPC-PTHREADS-SSE3' in str(obs_sse3_4[0]))
        self.assertTrue('4' in str(obs_sse3_4[1]))
        self.assertTrue(len(obs_sse3_4) == 2)

        obs_avx2_4 = _set_raxml_version(raxml_version='AVX2', n_threads=4)
        self.assertTrue('raxmlHPC-PTHREADS-AVX2' in str(obs_avx2_4[0]))
        self.assertTrue('4' in str(obs_avx2_4[1]))
        self.assertTrue(len(obs_avx2_4) == 2)

    def test_raxml_version(self):
        # Test that an output tree is made when invoking threads.
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

        with redirected_stdio(stderr=os.devnull):
            obs = raxml(input_sequences, raxml_version='SSE3')
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)

        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        tips = list(obs_tree.tips())
        tip_names = [t.name for t in tips]

        self.assertEqual(set(tip_names),
                         set(['GCA001510755', 'GCA001045515', 'GCA000454205',
                              'GCA000473545', 'GCA000196255', 'GCA000686145',
                              'GCA001950115', 'GCA001971985', 'GCA900007555']))

    def test_raxml_n_threads(self):
        # Test that an output tree is made when invoking threads.
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

        with redirected_stdio(stderr=os.devnull):
            obs = raxml(input_sequences, n_threads=2)
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)

        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        tips = list(obs_tree.tips())
        tip_names = [t.name for t in tips]

        self.assertEqual(set(tip_names),
                         set(['GCA001510755', 'GCA001045515', 'GCA000454205',
                              'GCA000473545', 'GCA000196255', 'GCA000686145',
                              'GCA001950115', 'GCA001971985', 'GCA900007555']))

    def test_raxml_with_seed(self):
        # Test tip-to-tip dists are identical to manually run RAxML output.
        # This test is comparing an ordered series of tip-to-tip distances
        # to a tree output from a manual run of the default command:
        # raxmlHPC -m GTRGAMMA -p 1723 -s aligned-dna-sequences-3.fasta -n q2
        # NOTE: I cleanly rounded the tip-to-tip dists (i.e. `%.4f`) as RAxML
        # may return slightly different rounding errors on different
        # systems.
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

        with redirected_stdio(stderr=os.devnull):
            obs = raxml(input_sequences, seed=1723)
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
        obs_tl = list(obs_tree.tip_tip_distances().to_series())
        obs_series = set(['%.4f' % e for e in obs_tl])

        exp_tree = skbio.TreeNode.read(self.get_data_path('test.tre'))
        exp_tl = list(exp_tree.tip_tip_distances().to_series())
        exp_series = set(['%.4f' % e for e in exp_tl])

        self.assertEqual(obs_series, exp_series)

    def test_raxml_model_choice(self):
        # Tip to tip dists should NOT be identical under different models.
        # Default is GTRGAMMA, we'll compare ouput to GRTGAMMAI & GTRCAT.
        # This test is comparing an ordered series of tip-to-tip distances.
        # Take note, that for this comparison to work, all must have the same
        # seed value set.
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

        # default GTRGAMMA
        with redirected_stdio(stderr=os.devnull):
            gtrg = raxml(input_sequences, seed=1723)
            gtrg_tree = skbio.TreeNode.read(
                        str(gtrg), convert_underscores=False)
            gtrg_td = set(gtrg_tree.tip_tip_distances().to_series())

        # set GTRGAMMAI
        with redirected_stdio(stderr=os.devnull):
            gtrgi = raxml(input_sequences, seed=1723,
                          substitution_model='GTRGAMMAI')
            gtrgi_tree = skbio.TreeNode.read(
                         str(gtrgi), convert_underscores=False)
            gtrgi_td = set(gtrgi_tree.tip_tip_distances().to_series())

        # set GTRCAT
        with redirected_stdio(stderr=os.devnull):
            gtrcat = raxml(input_sequences, seed=1723,
                           substitution_model='GTRCAT')
            gtrcat_tree = skbio.TreeNode.read(
                          str(gtrcat), convert_underscores=False)
            gtrcat_td = set(gtrcat_tree.tip_tip_distances().to_series())

        # test pairs are not equivalent
        self.assertNotEqual(gtrg_td, gtrgi_td)
        self.assertNotEqual(gtrg_td, gtrcat_td)
        self.assertNotEqual(gtrgi_td, gtrcat_td)

    def test_raxml_num_searches(self):
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
        with redirected_stdio(stderr=os.devnull):
            obs = raxml(input_sequences, seed=1723, n_searches=5)
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
        obs_tl = list(obs_tree.tip_tip_distances().to_series())
        obs_series = set(['%.4f' % e for e in obs_tl])

        exp_tree = skbio.TreeNode.read(self.get_data_path('test3.tre'))
        exp_tl = list(exp_tree.tip_tip_distances().to_series())
        exp_series = set(['%.4f' % e for e in exp_tl])
        self.assertEqual(obs_series, exp_series)

    def test_rapid_bootstrap_command(self):
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
        with tempfile.TemporaryDirectory() as temp_dir:
            with redirected_stdio(stderr=os.devnull):
                obs = _build_rapid_bootstrap_command(input_sequences, 1723,
                                                     8752, 15, 'GTRGAMMA',
                                                     temp_dir, 'bs')
        self.assertTrue(str(input_sequences) in str(obs[11]))
        self.assertTrue('1723' in obs[5])
        self.assertTrue('8752' in obs[7])
        self.assertTrue('15' in obs[9])
        self.assertTrue('GTRGAMMA' in obs[3])
        self.assertTrue(str(temp_dir) in obs[13])
        self.assertTrue('bs' in obs[15])

    def test_raxml_rapid_bootstrap(self):
        # Test that output tree is made.
        # Reads tree output and compares tip labels to expected labels.
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
        with redirected_stdio(stderr=os.devnull):
            obs = raxml_rapid_bootstrap(input_sequences)
        obs_tree = skbio.TreeNode.read(str(obs))
        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        tips = list(obs_tree.tips())
        tip_names = [t.name for t in tips]
        self.assertEqual(set(tip_names),
                         set(['GCA001510755', 'GCA001045515', 'GCA000454205',
                              'GCA000473545', 'GCA000196255', 'GCA000686145',
                              'GCA001950115', 'GCA001971985', 'GCA900007555']))

    def test_raxml_rapid_bootstrap_n_threads(self):
        # Test that an output tree is made when invoking threads.
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

        with redirected_stdio(stderr=os.devnull):
            obs = raxml_rapid_bootstrap(input_sequences, n_threads=2)
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)

        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        tips = list(obs_tree.tips())
        tip_names = [t.name for t in tips]

        self.assertEqual(set(tip_names),
                         set(['GCA001510755', 'GCA001045515', 'GCA000454205',
                              'GCA000473545', 'GCA000196255', 'GCA000686145',
                              'GCA001950115', 'GCA001971985', 'GCA900007555']))

    def test_raxml_rapid_bootstrap_with_seed(self):
        # Test tip-to-tip dists are identical to manually run RAxML output.
        # This test is comparing an ordered series of tip-to-tip distances
        # to a tree output from a manual run of the default command:
        #     raxmlHPC -f a -m GTRGAMMA -p 1723 -x 3871 -N 10
        #         -s aligned-dna-sequences-3.fasta -n q2
        # NOTE: I cleanly rounded the tip-to-tip dists (i.e. `%.4f`) as RAxML
        # may return slightly different rounding errors on different
        # systems (and at times, between conda environments).
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')

        # test that branchlengths are identical
        with redirected_stdio(stderr=os.devnull):
            obs = raxml_rapid_bootstrap(input_sequences, seed=1723,
                                        rapid_bootstrap_seed=3871,
                                        bootstrap_replicates=10)
        obs_tree = skbio.TreeNode.read(str(obs), convert_underscores=False)
        # sometimes we lose the last set of numbers on long floats
        obs_tl = list(obs_tree.tip_tip_distances().to_series())
        obs_series = set(['%.4f' % e for e in obs_tl])

        exp_tree = skbio.TreeNode.read(self.get_data_path('test2.tre'),
                                       convert_underscores=True)
        exp_tl = list(exp_tree.tip_tip_distances().to_series())
        exp_series = set(['%.4f' % e for e in exp_tl])
        self.assertEqual(obs_series, exp_series)

        # test that bootstrap supports are identical
        obs_bs = [node.name for node in obs_tree.non_tips()].sort()
        exp_bs = [node.name for node in exp_tree.non_tips()].sort()
        self.assertEqual(obs_bs, exp_bs)

    def test_run_not_verbose(self):
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
        aligned_fp = str(input_sequences)

        with tempfile.TemporaryDirectory() as temp_dir:
            cmd = ['raxmlHPC',
                   '-m', 'GTRGAMMA',
                   '-p', '1723',
                   '-s', aligned_fp,
                   '-w', temp_dir,
                   '-n', 'q2']

            with redirected_stdio(stderr=os.devnull):
                run_command(cmd, verbose=False)
            obs_tree_fp = os.path.join(temp_dir, 'RAxML_bestTree.q2')
            obs_tree = skbio.TreeNode.read(str(obs_tree_fp),
                                           convert_underscores=False)
        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        tips = list(obs_tree.tips())
        tip_names = [t.name for t in tips]
        self.assertEqual(set(tip_names),
                         set(['GCA001510755', 'GCA001045515',
                              'GCA000454205', 'GCA000473545',
                              'GCA000196255', 'GCA000686145',
                              'GCA001950115', 'GCA001971985',
                              'GCA900007555']))

    def test_run_rapid_bs_not_verbose(self):
        input_fp = self.get_data_path('aligned-dna-sequences-3.fasta')
        input_sequences = AlignedDNAFASTAFormat(input_fp, mode='r')
        aligned_fp = str(input_sequences)

        with tempfile.TemporaryDirectory() as temp_dir:
            cmd = ['raxmlHPC',
                   '-m', 'GTRGAMMA',
                   '-p', '1723',
                   '-s', aligned_fp,
                   '-w', temp_dir,
                   '-n', 'q2',
                   '-f', 'a',
                   '-x', '9834',
                   '-N', '10']

            with redirected_stdio(stderr=os.devnull):
                run_command(cmd, verbose=False)

            obs_tree_fp = os.path.join(temp_dir, 'RAxML_bipartitions.q2')
            obs_tree = skbio.TreeNode.read(str(obs_tree_fp),
                                           convert_underscores=False)
        # load the resulting tree and test that it has the right number of
        # tips and the right tip ids
        tips = list(obs_tree.tips())
        tip_names = [t.name for t in tips]
        self.assertEqual(set(tip_names),
                         set(['GCA001510755', 'GCA001045515',
                              'GCA000454205', 'GCA000473545',
                              'GCA000196255', 'GCA000686145',
                              'GCA001950115', 'GCA001971985',
                              'GCA900007555']))


if __name__ == "__main__":
    unittest.main()