File: make_deletions.py

package info (click to toggle)
mindthegap 2.3.0-5
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 4,076 kB
  • sloc: cpp: 4,482; python: 917; sh: 419; makefile: 5
file content (255 lines) | stat: -rwxr-xr-x 9,949 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
#!/usr/bin/python3
# -*- coding: utf-8 -*-
'''Program to generate random deletions in a given genome. C. Lemaitre'''


# 25/07/2013 modif pour avoir un format bed pour les positions de la délétion
import getopt, sys
import random
        

def usage():
    '''Usage'''
    print("-----------------------------------------------------------------------------")
    print(sys.argv[0]," : deletion simulator")
    print("-----------------------------------------------------------------------------")
    print("usage: ",sys.argv[0]," -g genome_file -o output [-n nb_del] [-m min_length] [-M max_length] [-s separator] [-N] [-b]")
    print("  -g: input fasta file containing the genome sequence(s)")
    print("  -o: file preffix for output files (.fasta : the new genome, .del.fasta : the deleted sequences, .del.txt : the positions of deletions)")
    print("  -n: number of deletions to generate, default=1")
    print("  -m: min size of the deletions (in bp), default = 100")
    print("  -M: max size of the deletions (in bp), default = 500")
    print("  -s: min distance between two consecutive deletions (in bp), default = 1, only positive separator are taken into account. For now it can not generate overlapping deletions.")
    print("  -N: autorize N inside the deletion (but still not in the borders)")
    print("  -b: bed format for output of the positions of deletions (instead of .del.txt will be .del.bed)")
    print("  -h: help")
    print("-----------------------------------------------------------------------------")
    sys.exit(2)


def main():
    try:
        opts, args = getopt.getopt(sys.argv[1:], "hg:n:o:m:M:s:Nb", ["help", "genome=", "num=", "output=", "min=", "max=", "sep=", "enableN", "bed"])
    except getopt.GetoptError as err:
        # print help information and exit:
        print(str(err))  # will print something like "option -a not recognized"
        usage()
        sys.exit(2)

    # Default parameters
    sep=1
    min_length=100
    max_length=500
    genome_file=0
    nb_del=1
    output=0
    enableN=0
    bed_format=0
    for opt, arg in opts:
        if opt in ("-h", "--help"):
            usage()
            sys.exit()
        elif opt in ("-o", "--output"):
            output = arg
        elif opt in ("-g", "--genome"):
            genome_file = arg
        elif opt in ("-n", "--num"):
            nb_del = int(arg)
        elif opt in ("-m", "--min"):
            min_length = int(arg)
        elif opt in ("-M", "--max"):
            max_length = int(arg)
        elif opt in ("-s", "--sep"):
            sep = int(arg)
            if sep<0:
                sep=0
        elif opt in ("-N", "--enableN"):
            enableN=1
        elif opt in ("-b", "--bed"):
            bed_format=1
        else:
            assert False, "unhandled option"

    if genome_file == 0 or output==0:
        print("Missing arguments")
        usage()
        sys.exit(2)
    elif min_length<=0 or max_length<min_length:
        print("Error in parameters : deletion length must respect the following condition : 0 < min_length <= max_length")
        sys.exit(2)
    elif nb_del<=0:
        print("Error in parameters : number of deletions should be greater than 0")
        sys.exit(2)
    else:
        
        # defining special characters :
        char_seq="$"
        char_begin="b"
        char_end="e"
        char_N="N"
        char_set="1234567890"+char_seq+char_begin+char_end;
        if enableN==0:
            char_set=char_set+char_N
                
        
        # Getting the genome sequence
        cumul_seq=""
        total_length=0
        names=[]
        nb_seq=0
        nb_col_fasta=0
        filin=open(genome_file,"r")
        for line in filin:
            if line[0] ==">":
                # header
                current_name=(line.lstrip(">")).rstrip("\n")
                names.append(current_name)
                if nb_seq>0:
                    cumul_seq+="$"
                nb_seq+=1
            else:
                current_seq=(line.rstrip("\n")).upper()
                le=len(current_seq)
                cumul_seq+=current_seq
                total_length+=le
                # we record the fasta format (width) to write output in the same format
                if le > nb_col_fasta:
                    nb_col_fasta=le
        filin.close()
        cumul_seq+="$"
        total_length+=nb_seq #  to account for the $


#        # To test :
#        test_pos=[25,5,26]
#        test_length=[5,12,20]
#        remaining_length=total_length
#        new_seq=cumul_seq
#        for i in range(len(test_pos)):
#            left_pos=test_pos[i]
#            del_length=test_length[i]


        # Placing the deletions
        nb_ok=0  # nb of placed deletions
        nb_boucle=0  # to limit the running time if deletions are too difficult to place
        remaining_length=total_length
        new_seq=cumul_seq
        while nb_ok < nb_del and nb_boucle < (20*nb_del):
            left_pos=random.randint(1,remaining_length)
            del_length=random.randint(min_length,max_length)            
            right_pos=left_pos+del_length
            # checking if deletion is possible
            # extend the region from both sides
            if right_pos+sep<remaining_length and left_pos-sep>=0:
                del_seq_ext=new_seq[(left_pos-sep):(right_pos+sep)]
                borders=new_seq[(left_pos-sep):(left_pos+sep)]+new_seq[(right_pos-sep):(right_pos+sep)]
                if not containsAny(del_seq_ext,char_set) and not char_N in borders:
                    # good deletion : does not contain any end of sequence (chr) or begin or end of previous deletions
                    # and near the borders does not have any N
                    nb_ok+=1
                    # inserting the deletion : replacing the delerted sequence by b125e (for an del of length 125)
                    tmp_seq=new_seq[0:left_pos]+char_begin+str(del_length)+char_end+new_seq[right_pos:]
                    new_seq=tmp_seq
                    remaining_length+=2+len(str(del_length))-del_length# new_seq length
            nb_boucle+=1
                
        if nb_ok < nb_del:
            print("Warning: too difficult to place ",str(nb_del)," deletions, only ",str(nb_ok)," placed")

#        print "placed deletions : "+str(nb_ok)


        # Printing the output
        filout_fasta=open(output+".fasta","w")
        filout_del=open(output+".del.fasta","w")
        if bed_format==1:
            filout_pos=open(output+".del.bed","w")
        else:
            filout_pos=open(output+".del.txt","w")

        # Reading the cumul sequence to get the positions and write the 2 fasta files
        # Note : positions are 0-based and begin and end positions are included in the deletion : [5-10] length = 11
        index_seq=0
        chr_seq=""
        old_pos=0 # position in initial sequence (without deletion)
        new_pos=0 # position in new sequence (with the deletions)
        del_begin=0
        inside_del=False
        compt_del=0
        current_name=names[index_seq]
        header=current_name
        del_length_str=""

        # position file header for txt format
        if bed_format==0:
            filout_pos.write("id\tname\tpos\tlength\tinit.inf\tinit.sup\n")
        for c in new_seq:
            if c==char_seq:
                # writing previous sequence :
                fasta_lines=writeFasta(chr_seq,header,nb_col_fasta)
                filout_fasta.write(fasta_lines)
                index_seq+=1
                if index_seq<nb_seq:  # if not the last $
                    current_name=names[index_seq]
                    header=current_name
                    cumul_seq=cumul_seq[old_pos+1:]
                    old_pos=0
                    new_pos=0
                    chr_seq=""
            elif c==char_begin:
                del_begin=old_pos
                del_length_str=""
            elif c==char_end:
                compt_del+=1
                del_length=int(del_length_str)
                del_end=del_begin+del_length
                # Getting the sequence to delete using the coordinates on the initial sequence
                del_seq=cumul_seq[del_begin:del_end]
                old_pos+=del_length
                # Writing the deleted sequence :
                del_header="deletion_"+str(compt_del)+" : "+current_name+"_"+str(new_pos)
                fasta_lines=writeFasta(del_seq,del_header,nb_col_fasta)
                filout_del.write(fasta_lines)
                # Writing the deletion positions in text files
                if bed_format==1:
                    ## ordre des champs : ch_name, pos_begin, pos_end, id, length, init.begin, init.end
                    filout_pos.write(current_name+"\t"+str(new_pos)+"\t"+str(new_pos+1)+"\t"+str(compt_del)+"\t"+str(del_end-del_begin)+"\t"+str(del_begin)+"\t"+str(del_end)+"\n")
                else:
                    filout_pos.write(str(compt_del)+"\t"+current_name+"\t"+str(new_pos)+"\t"+str(del_end-del_begin)+"\t"+str(del_begin)+"\t"+str(del_end)+"\n")
            elif c in "0123456789":
                del_length_str+=c
            else:
                chr_seq+=c
                old_pos+=1
                new_pos+=1

        filout_fasta.close()
        filout_del.close()
        filout_pos.close()

#        print "recovered and printed deletions : "+str(compt_del)

def containsAny(str, set):
    """Check whether 'str' contains ANY of the chars in 'set'"""
    return 1 in [c in str for c in set]

def writeFasta(sequence,name,ncol):
    fasta_lines=">"+name+"\n"
    if ncol>0:
        le=len(sequence)
        ind=0
        while ind<le:
            fasta_lines+=sequence[ind:(ind+ncol)]+"\n"
            ind+=ncol
    else:
        fasta_lines+=sequence+"\n"
    return fasta_lines
    

if __name__ == "__main__":
    main()

# exemple :
# ./make_deletions.py -g essai.fasta -n 1 -m 10 -M 12 -o output