File: mmseqs2.py

package info (click to toggle)
openstructure 2.11.1-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 206,240 kB
  • sloc: cpp: 188,571; python: 36,686; ansic: 34,298; fortran: 3,275; sh: 312; xml: 146; makefile: 29
file content (145 lines) | stat: -rwxr-xr-x 4,898 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
import subprocess
import ost
from ost import settings

def createdb(infasta, resultDB, exe_path=None):
    """
    Build a mmseqs2 database from a fasta file.

    Runs ``mmseqs2 createdb <infasta> <resultDB>`` as a subprocess. The
    command line is logged via :func:`ost.LogInfo` before it is executed.
    The exit status of the subprocess is not inspected.

    :param infasta: Fasta file holding the query and/or target sequences
                    that should be converted.
    :type infasta: :class:`str`

    :param resultDB: Location where the mmseqs2 database is written.
    :type resultDB: :class:`str`

    :param exe_path: Explicit location of the mmseqs2 executable. It is
                     handed to :func:`ost.settings.Locate` as
                     *explicit_file_name*.
    :type exe_path: :class:`str`

    """
    binary = settings.Locate('mmseqs2', explicit_file_name=exe_path)

    # Assemble the command: executable and sub-command first, then the
    # positional input/output arguments.
    cmd = [binary, 'createdb']
    cmd.extend((infasta, resultDB))

    ost.LogInfo(f"running MMseqs2 {' '.join(cmd)}")
    subprocess.run(cmd)



def create_index(trg_db, exe_path=None, directory=None):
    """
    Compute an index file of a mmseqs2 target database for fast read-in.

    It is recommended to compute the index if the target database is reused
    for several searches. Runs ``mmseqs2 createindex <trg_db> <directory>``
    as a subprocess; the command line is logged via :func:`ost.LogInfo`
    before it is executed. The exit status of the subprocess is not
    inspected.

    :param trg_db: The target database mmseqs2 file.
     (You need to initially convert a fasta file into a mmseqs2 database
     using createdb).
    :type trg_db: :class:`str`

    :param exe_path: Explicit location of the mmseqs2 executable. It is
                     handed to :func:`ost.settings.Locate` as
                     *explicit_file_name*.
    :type exe_path: :class:`str`

    :param directory: The directory for temporary files. It is recommended
                      to place it on a local drive. If None (the default),
                      a scratch directory is created with
                      :class:`tempfile.TemporaryDirectory` for the duration
                      of the call and removed afterwards.
    :type directory: :class:`str`

    """
    # Local import: only needed for the directory=None fallback.
    import tempfile

    mmseqs2_exe = settings.Locate('mmseqs2', explicit_file_name=exe_path)

    # Previously a missing directory put None into the argument list, which
    # made the ' '.join() below raise TypeError before mmseqs2 was started.
    scratch = None
    if directory is None:
        scratch = tempfile.TemporaryDirectory()
        directory = scratch.name

    try:
        args = [mmseqs2_exe, 'createindex', trg_db, directory]

        ost.LogInfo(f"running MMseqs2 {' '.join(args)}")
        subprocess.run(args)
    finally:
        # Only clean up a directory this function created itself; a
        # caller-supplied directory is left untouched.
        if scratch is not None:
            scratch.cleanup()



def alignment(query_db, trg_db, resultDB, directory, resultDB_m8, sen=None, exe_path=None,
              start_sens=None, sens_steps=None, fmt=None):
    """
    Search a query database against a target database and export the hits.

    Two mmseqs2 commands are run one after the other as subprocesses:

    1. ``mmseqs2 search <query_db> <trg_db> <resultDB> <directory> -a``
       (the search itself consists of two steps, the prefilter and the
       alignment).
    2. ``mmseqs2 convertalis <query_db> <trg_db> <resultDB> <resultDB_m8>``
       which converts the result database into a BLAST tab formatted file.

    Each command line is logged via :func:`ost.LogInfo` before it is
    executed. The exit status of neither subprocess is inspected, so the
    conversion is attempted even if the search failed.

    By default the converted file is formatted as a tab-separated list with
    12 columns:
    (1,2) identifiers for query and target sequences/profiles,
    (3) sequence identity,
    (4) alignment length,
    (5) number of mismatches,
    (6) number of gap openings,
    (7-8, 9-10) domain start and end-position in query and in target,
    (11) E-value,
    and (12) bit score.

    The option --format-output defines a custom output format.
    The fields that are supported can be found in the following link:
    https://github.com/soedinglab/mmseqs2/wiki#custom-alignment-format-with-convertalis

    :param query_db: The query database mmseqs2 file.
     (You need to initially convert a fasta file into a mmseqs2 database
     using createdb).
    :type query_db: :class:`str`

    :param trg_db: The target database mmseqs2 file.
     (You need to initially convert a fasta file into a mmseqs2 database
     using createdb).
    :type trg_db: :class:`str`

    :param resultDB: The output location of the search result database. It
                     is also the input of the conversion step.
    :type resultDB: :class:`str`

    :param directory: The directory for temporary files.
    :type directory: :class:`str`

    :param resultDB_m8: The output location of the converted, tab formatted
                        file.
    :type resultDB_m8: :class:`str`

    :param sen: It controls the speed and sensitivity of the search.
                A very fast search would use a sensitivity of 1.0,
                while a very sensitive search would use a sensitivity of up
                to 7.0. Passed as ``-s``; omitted if not set.
    :type sen: :class:`float`

    :param exe_path: Explicit location of the mmseqs2 executable. It is
                     handed to :func:`ost.settings.Locate` as
                     *explicit_file_name*.
    :type exe_path: :class:`str`

    :param start_sens: Best hit fast. The lowest sensitivity is defined
                       with --start-sens. Only used if *sens_steps* is set
                       as well.
    :type start_sens: :class:`int`

    :param sens_steps: Best hit fast. The number of steps to reach the
                       highest sensitivity can be defined with
                       --sens-steps. Only used if *start_sens* is set as
                       well.
    :type sens_steps: :class:`int`

    :param fmt: Format output type, if the default is not used. Passed as
                ``--format-output``.
    :type fmt: :class:`str`

    """
    mmseqs2_exe = settings.Locate('mmseqs2', explicit_file_name=exe_path)

    # Step 1: search (prefilter + alignment).
    command = [mmseqs2_exe, 'search', query_db, trg_db, resultDB, directory, '-a']

    if sen:
        command.extend(['-s', str(sen)])

    # The iterative sensitivity options are only added as a pair.
    if start_sens and sens_steps:
        command.extend(['--start-sens', str(start_sens),
                        '--sens-steps', str(sens_steps)])

    ost.LogInfo(f"running MMseqs2 {' '.join(command)}")
    subprocess.run(command)

    # Step 2: convert the result database into a tab formatted file.
    args = [mmseqs2_exe, 'convertalis', query_db, trg_db, resultDB, resultDB_m8]

    if fmt:
        args.extend(['--format-output', fmt])

    # Fixed: the original used parentheses instead of braces inside the
    # f-string and therefore logged the literal text "(' '.join(args))".
    ost.LogInfo(f"running MMseqs2 {' '.join(args)}")
    subprocess.run(args)