File: align.pyx

package info (click to toggle)
obitools 3.0.1~b26%2Bdfsg-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 26,788 kB
  • sloc: ansic: 24,299; python: 657; sh: 27; makefile: 20
file content (285 lines) | stat: -rw-r--r-- 12,645 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
#cython: language_level=3

from obitools3.apps.progress cimport ProgressBar  # @UnresolvedImport
from obitools3.dms import DMS
from obitools3.dms.view.view cimport View
from obitools3.uri.decode import open_uri
from obitools3.apps.optiongroups import addMinimalInputOption, addMinimalOutputOption, addNoProgressBarOption
from obitools3.dms.view import RollbackException
from obitools3.apps.config import logger
from obitools3.utils cimport tobytes, str2bytes

from obitools3.dms.capi.obilcsalign cimport obi_lcs_align_one_column, \
                                            obi_lcs_align_two_columns

from io import BufferedWriter
from cpython.exc cimport PyErr_CheckSignals

import time
import sys


__title__="Align one sequence column with itself or two sequence columns"


def addOptions(parser):
   
   addMinimalInputOption(parser)
   addMinimalOutputOption(parser)
   addNoProgressBarOption(parser)

   group=parser.add_argument_group('obi align specific options')

   group.add_argument('--input-2', '-I',
                      action="store", dest="align:inputuri2",
                      metavar='<INPUT URI>',
                      default="",
                      type=str,
                      help="Eventually, the URI of the second input to align with the first one.")
      
   group.add_argument('--threshold','-t',
                      action="store", dest="align:threshold",
                      metavar='<THRESHOLD>',
                      default=0.0,
                      type=float,
                      help="Score threshold. If the score is normalized and expressed in similarity (default),"
                           " it is an identity, e.g. 0.95 for an identity of 95%%. If the score is normalized"
                           " and expressed in distance, it is (1.0 - identity), e.g. 0.05 for an identity of 95%%."
                           " If the score is not normalized and expressed in similarity, it is the length of the"
                           " Longest Common Subsequence. If the score is not normalized and expressed in distance,"
                           " it is (reference length - LCS length)."
                           " Only sequence pairs with a similarity above <THRESHOLD> are printed. Default: 0.00"
                           " (no threshold).")
   
   group.add_argument('--longest-length','-L',
                      action="store_const", dest="align:reflength",
                      default=0,
                      const=1,
                      help="The reference length is the length of the longest sequence."
                           " Default: the reference length is the length of the alignment.")
   
   group.add_argument('--shortest-length','-l',
                      action="store_const", dest="align:reflength",
                      default=0,
                      const=2,
                      help="The reference length is the length of the shortest sequence."
                           " Default: the reference length is the length of the alignment.")
   
   group.add_argument('--raw','-r',
                      action="store_false", dest="align:normalize",
                      default=True,
                      help="Raw score, not normalized. Default: score is normalized with the reference sequence length.")
   
   group.add_argument('--distance','-D',
                      action="store_false", dest="align:similarity",
                      default=True,
                      help="Score is expressed in distance. Default: score is expressed in similarity.")
   
   group.add_argument('--print-seq','-s',
                      action="store_true", dest="align:printseq",
                      default=False,
                      help="The nucleotide sequences are written in the output view. Default: they are not written.")
   
   group.add_argument('--print-count','-n',
                      action="store_true", dest="align:printcount",
                      default=False,
                      help="Sequence counts are written in the output view. Default: they are not written.")
   
   group.add_argument('--thread-count','-p',   # TODO should probably be in a specific option group
                      action="store", dest="align:threadcount",
                      metavar='<THREAD COUNT>',
                      default=1,
                      type=int,
                      help="Number of threads to use for the computation. Default: one.")


cpdef align_columns(bytes dms_n, 
                    bytes input_view_1_n, 
                    bytes output_view_n,
                    bytes input_view_2_n=b"",
                    bytes input_column_1_n=b"", 
                    bytes input_column_2_n=b"",
                    bytes input_elt_1_n=b"", 
                    bytes input_elt_2_n=b"",
                    bytes id_column_1_n=b"", 
                    bytes id_column_2_n=b"",
                    double threshold=0.0, bint normalize=True, 
                    int reference=0, bint similarity_mode=True,
                    bint print_seq=False, bint print_count=False,
                    bytes comments=b"{}",
                    int thread_count=1) : 

    if input_view_2_n == b"" and input_column_2_n == b"" :
        if obi_lcs_align_one_column(dms_n, \
                                    input_view_1_n, \
                                    input_column_1_n, \
                                    input_elt_1_n, \
                                    id_column_1_n, \
                                    output_view_n, \
                                    comments, \
                                    print_seq, \
                                    print_count, \
                                    threshold, normalize, reference, similarity_mode,
                                    thread_count) < 0 :
            raise Exception("Error aligning sequences")        
            
    else:
        if obi_lcs_align_two_columns(dms_n, \
                                     input_view_1_n, \
                                     input_view_2_n, \
                                     input_column_1_n, \
                                     input_column_2_n, \
                                     input_elt_1_n, \
                                     input_elt_2_n, \
                                     id_column_1_n, \
                                     id_column_2_n, \
                                     output_view_n, \
                                     comments, \
                                     print_seq, \
                                     print_count, \
                                     threshold, normalize, reference, similarity_mode) < 0 :
            raise Exception("Error aligning sequences")        
 
 
def run(config):
    
    DMS.obi_atexit()
    
    logger("info", "obi align")

    # Open the input: only the DMS
    input = open_uri(config['obi']['inputURI'],
                     dms_only=True)
    if input is None:
        raise Exception("Could not read input")
    i_dms = input[0]
    i_dms_name = input[0].name
    i_uri = input[1]
    i_view_name = i_uri.split(b"/")[0]
    i_column_name = b""
    i_element_name = b""
    if len(i_uri.split(b"/")) >= 2:
        i_column_name = i_uri.split(b"/")[1]
    if len(i_uri.split(b"/")) == 3:
        i_element_name = i_uri.split(b"/")[2]
    if len(i_uri.split(b"/")) > 3:
        raise Exception("Input URI contains too many elements:", config['obi']['inputURI'])

    # Open the second input if there is one
    i_dms_2 = None
    i_dms_name_2 = b""
    original_i_view_name_2 = b""
    i_view_name_2 = b""
    i_column_name_2 = b""
    i_element_name_2 = b""
    if config['align']['inputuri2']:
        input_2 = open_uri(config['align']['inputuri2'],
                           dms_only=True)
        if input_2 is None:
            raise Exception("Could not read second input")
        i_dms_2 = input_2[0]
        i_dms_name_2 = i_dms_2.name
        i_uri_2 = input_2[1]
        original_i_view_name_2 = i_uri_2.split(b"/")[0]
        if len(i_uri_2.split(b"/")) >= 2:
            i_column_name_2 = i_uri_2.split(b"/")[1]
        if len(i_uri_2.split(b"/")) == 3:
            i_element_name_2 = i_uri_2.split(b"/")[2]
        if len(i_uri_2.split(b"/")) > 3:
            raise Exception("Input URI contains too many elements:", config['align']['inputuri2'])

        # If the 2 input DMS are not the same, temporarily import 2nd input view in first input DMS
        if i_dms != i_dms_2:
            temp_i_view_name_2 = original_i_view_name_2
            i=0
            while temp_i_view_name_2 in i_dms:  # Making sure view name is unique in input DMS
                temp_i_view_name_2 = original_i_view_name_2+b"_"+str2bytes(str(i))
                i+=1
            i_view_name_2 = temp_i_view_name_2
            View.import_view(i_dms_2.full_path[:-7], i_dms.full_path[:-7], original_i_view_name_2, i_view_name_2)

    # Open the output: only the DMS
    output = open_uri(config['obi']['outputURI'],
                      input=False,
                      dms_only=True)
    if output is None:
        raise Exception("Could not create output")
    o_dms = output[0]
    output_0 = output[0]
    o_dms_name = o_dms.name 
    final_o_view_name = output[1]
    o_view_name = final_o_view_name

    # If stdout output or the input and output DMS are not the same, align creating a temporary view in the input dms that will be exported to 
    # the right DMS and deleted in the other afterwards.
    if i_dms != o_dms or type(output_0)==BufferedWriter:
        if type(output_0)==BufferedWriter:
            o_dms = i_dms
        o_view_name = b"temp"
        while o_view_name in i_dms:   # Making sure view name is unique in input DMS
            o_view_name = o_view_name+b"_"+str2bytes(str(i))
            i+=1
        
    # Save command config in View comments
    command_line = " ".join(sys.argv[1:])
    
    i_dms_list = [i_dms_name]
    if i_dms_name_2:
        i_dms_list.append(i_dms_name_2)
    i_view_list = [i_view_name]
    if original_i_view_name_2:
        i_view_list.append(original_i_view_name_2)
    comments = View.print_config(config, "align", command_line, input_dms_name=i_dms_list, input_view_name=i_view_list)
    
    # Call cython alignment function
      # Using default ID columns of the view. TODO discuss adding option
    align_columns(i_dms.name_with_full_path,  \
                  i_view_name,  \
                  o_view_name,  \
                  input_view_2_n   = i_view_name_2,  \
                  input_column_1_n = i_column_name,  \
                  input_column_2_n = i_column_name_2, \
                  input_elt_1_n    = i_element_name,  \
                  input_elt_2_n    = i_element_name_2, \
                  id_column_1_n    = b"",  \
                  id_column_2_n    = b"", \
                  threshold        = config['align']['threshold'], \
                  normalize        = config['align']['normalize'],  \
                  reference        = config['align']['reflength'],  \
                  similarity_mode  = config['align']['similarity'],  \
                  print_seq        = config['align']['printseq'],  \
                  print_count      = config['align']['printcount'], \
                  comments         = comments, \
                  thread_count     = config['align']['threadcount'])

    # If the input and output DMS are not the same, export result view to output DMS
    if i_dms != o_dms:
        View.import_view(i_dms.full_path[:-7], o_dms.full_path[:-7], o_view_name, final_o_view_name)

    # Save command config in output DMS comments
    o_dms.record_command_line(command_line)

    #print("\n\nOutput view:\n````````````", file=sys.stderr)
    #print(repr(o_dms[final_o_view_name]), file=sys.stderr)

    # If the two input DMS are different, delete the temporary input view in the first input DMS
    if i_dms_2 and i_dms != i_dms_2:
        View.delete_view(i_dms, i_view_name_2)
        i_dms_2.close()

    # stdout output: write to buffer
    if type(output_0)==BufferedWriter:
        logger("info", "Printing to output...")
        o_view = o_dms[o_view_name]
        o_view.print_to_output(output_0, noprogressbar=config['obi']['noprogressbar'])
        o_view.close()

    # If the input and the output DMS are different, delete the temporary result view in the input DMS
    if i_dms != o_dms or type(output_0)==BufferedWriter:
        View.delete_view(i_dms, o_view_name)
        o_dms.close(force=True)
    
    i_dms.close(force=True)
    
    logger("info", "Done.")