File: test_dialign.py

package info (click to toggle)
python-cogent 2023.2.12a1%2Bdfsg-2%2Bdeb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 12,416 kB
  • sloc: python: 89,165; makefile: 117; sh: 16
file content (132 lines) | stat: -rw-r--r-- 4,438 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env python

import unittest

from cogent3 import PROTEIN
from cogent3.parse.dialign import DialignParser, parse_data_line


__author__ = "Gavin Huttley"
__copyright__ = "Copyright 2007-2022, The Cogent Project"
__credits__ = ["Gavin Huttley"]
__license__ = "BSD-3"
__version__ = "2023.2.12a1"
__maintainer__ = "Gavin Huttley"
__email__ = "gavin.huttley@anu.edu.au"
__status__ = "Production"


data = """
                           DIALIGN 2.2.1 
                           *************

          Program code written by Burkhard Morgenstern and Said Abdeddaim 
             e-mail contact: dialign (at) gobics (dot) de 

          Published research assisted by DIALIGN 2 should cite:  

             Burkhard Morgenstern (1999).
             DIALIGN 2: improvement of the segment-to-segment
             approach to multiple sequence alignment.
             Bioinformatics 15, 211 - 218. 

          For more information, please visit the DIALIGN home page at 

             http://bibiserv.techfak.uni-bielefeld.de/dialign/ 

         ************************************************************



   program call:  dialign -fa -fn /tmp/ct/seq1.fasta /tmp/ct/seq1.txt  


   Aligned sequences:          length:
   ==================          =======

   1) HTL2                        57
   2) MMLV                        58
   3) HEPB                        62
   4) ECOL                        54

   Average seq. length:           57.8 


   Please note that only upper-case letters are considered to be aligned. 


   Alignment (DIALIGN format):
   ===========================

HTL2               1   ldtapC-LFS DGS------P QKAAYVL--- ----WDQTIL QQDITPLPSH 
MMLV               1   pdadhtw-YT DGSSLLQEGQ RKAGAAVtte teviWa---- KALDAG---T 
HEPB               1   rpgl-CQVFA DAT------P TGWGLVM--- ----GHQRMR GTFSAPLPIH 
ECOL               1   mlkqv-EIFT DGSCLGNPGP GGYGAIL--- ----RYRGRE KTFSAGytrT 

                       0000000588 8882222229 9999999000 0000666666 6666633334 

HTL2              37   ethSAQKGEL LALICGLRAa k--------- --- 
MMLV              43   ---SAQRAEL IALTQALKm- ---------- --- 
HEPB              37   t------AEL LAA-CFARSr sganiigtdn svv 
ECOL              43   ---TNNRMEL MAAIv----- ---------- --- 

                       0003333455 5533333300 0000000000 000 




   Sequence tree:
   ==============

Tree constructed using UPGMAbased on DIALIGN fragment weight scores

((HTL2        :0.130254MMLV        :0.130254):0.067788(HEPB        :0.120520ECOL        :0.120520):0.077521);



""".splitlines()


class TestDialign(unittest.TestCase):
    def setUp(self):
        aln_seqs = {
            "HTL2": "ldtapC-LFSDGS------PQKAAYVL-------WDQTILQQDITPLPSHethSAQKGELLALICGLRAak------------",
            "MMLV": "pdadhtw-YTDGSSLLQEGQRKAGAAVtteteviWa----KALDAG---T---SAQRAELIALTQALKm--------------",
            "HEPB": "rpgl-CQVFADAT------PTGWGLVM-------GHQRMRGTFSAPLPIHt------AELLAA-CFARSrsganiigtdnsvv",
            "ECOL": "mlkqv-EIFTDGSCLGNPGPGGYGAIL-------RYRGREKTFSAGytrT---TNNRMELMAAIv------------------",
        }
        self.aln_seqs = {}
        for name, seq in list(aln_seqs.items()):
            self.aln_seqs[name] = PROTEIN.make_seq(seq, name=name)
        self.QualityScores = "00000005888882222229999999900000006666666666633334000333345555333333000000000000000"

    def test_line_split(self):
        """test splitting of sequence record lines"""
        result = parse_data_line(
            "HTL2               1   ldtapcLFSD GS------PQ KAAYVLWDQT ILQQDITPLP SHethsaqkg "
        )
        self.assertEqual(
            result, ("HTL2", "ldtapcLFSDGS------PQKAAYVLWDQTILQQDITPLPSHethsaqkg")
        )
        result = parse_data_line(
            "                       1111111111 1000001111 1111033333 3333333333 3000000000 "
        )
        self.assertEqual(
            result, (None, "11111111111000001111111103333333333333333000000000")
        )

    def test_aligned_from_dialign(self):
        """test getting aligned seqs"""
        aligned_seq = dict(list(DialignParser(data, seq_maker=PROTEIN.make_seq)))
        assert aligned_seq == self.aln_seqs

    def test_quality_scores(self):
        """test quality scores correctly returned"""
        result = dict(
            list(DialignParser(data, seq_maker=PROTEIN.make_seq, get_scores=True))
        )
        assert result["QualityScores"] == self.QualityScores


if __name__ == "__main__":
    unittest.main()