File: InsertMutations.py

package info (click to toggle)
arden 1.0-6
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,128 kB
  • sloc: python: 1,555; javascript: 249; sh: 10; makefile: 2
file content (292 lines) | stat: -rw-r--r-- 12,668 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
#!/usr/bin/python3
"""
Created 2012
core Script for the generation of the artificial reference genome

The functions purpose is to to go through a list of positions and find balanced mutations
which fulfill the demands on the artificial reference genome. Once a initial start positions is
randomly selected all possible triplets with hamming distance 1 are generated and looked up 
in a dictionary which contains all triplet positions in the input genome. If a suitable partner
is found for the initial mutation the next start positions is chosen randomly. Else: try all other
triplets with hamming distance 1 until no one is left. This process can be accelerated by allowing
unbalanced mutations, but this will cause differences in the NUC/AA distribution and the AA neighborhood.


@author: Sven Giese
"""

import random as r
from . import Prep as INI



def getMutation(AA,Codon):
    """
    Returns a random mutation for a given AA and its Codon(DNA). The mutation is done in a way which supports the equilibrium 
    of the nucleotide distribution by only regarding hamming distance=1 Codons as possible mutations 

    @type  AA: string
    @param AA: Single AA.
    @type  Codon: string
    @param Codon: 3-letter dna code.
    @rtype:   list,list
    @return:  A list of all valid mutations (triplet) and the coresponding AA.
    """
    temp_mutationlist = []
    '''create a list of possible triplets within hamming distance 1 '''
    for item in list(INI.genetic_code.keys()):
        isvalid = INI.isvalidtriplet(item,Codon)
        ''' Hamming distance 1, AA is not equal to the given AA,forbid mutation to stopcodon '''
        if (isvalid == True and AA !=INI.genetic_code[item] and INI.genetic_code[item]!="*"):
            temp_mutationlist.append(item)
    
    
    aalist = []
    # generate a list of all possible amino acids resulting from the temp_mutationlist 
    for item in temp_mutationlist:
        if (item in INI.genetic_code):
            aalist.append(INI.genetic_code[item])
        else:
            aalist.append("n")
            
    return(temp_mutationlist,aalist)


def getdifference(triplet_old,triplet_new):
    """
    Given two triplets, returns the differences between them plus the position 

    @type  triplet_old: string
    @param triplet_old: AA triplet.
    @type  triplet_new: string
    @param triplet_new: AA triplet.
    @rtype:   Char,Char,int
    @return:  The new aminoacid, the old aminoacid and the position. 
    """
    for i in range(0,3):
        if (triplet_new[i]!=triplet_old[i]):
            
            return (triplet_new[i],triplet_old[i],i)
        
      
def isvalidposition(pdic,iprime,distance):
    """
    Checks if a position is valid for mutation. It queries all neighboring positions (iprime +-distance) to check whether there already was a mutation in pdic

    @type  pdic: dictionary
    @param pdic: Diciontary containing mutations and start/ stop codons..
    @type  iprime: int
    @param iprime: Position of the prospective mutation (DNA level)
    @type  distance: int
    @param distance: User defined parameter which limits the distance between two mutations.
    @rtype:   Bool
    @return: Boolean which decides if the position is valid (1= yes,0 = no) 
    """
    
    # deal with base shifts 
    distance = distance-2
    
    istforbidden = 0
    for o in range(-distance,distance+2,1):
        if (iprime+o in pdic):
            # E = end of orf
            # S = start of orf
            if((pdic[iprime+o]=="E") or (pdic[iprime+o]=="S")):
                if((o >3) or (o <-3)):
                    pass
                else:
                    istforbidden = 1
                    break
            else:
                istforbidden = 1
                break
    else:
            pass
                
    return(istforbidden)




def mutate_random(DNA,AminoAcid,distance,pdic,rev,header,Random,outputpath):
        """
        Mutates a given DNA(AminoAcid) Genomesequence on several positions (distance based on DISTANCE var. If one mutation is done
        a compareable Triplet is searched to "reverse" the  changes made in AA distribution, N distribution, AA neighborhood
    
        @type  DNA: list
        @param DNA: DNA sequence of the reference genome.
        @type  AminoAcid: list
        @param AminoAcid: AA sequence of the reference genome.
        @type  rev: Bool
        @param rev: Boolean which decides if unbalanced mutations are allowed (only initial mutation is performed)
        @type  pdic: dictionary
        @param pdic: Diciontary containing mutations and start/ stop codons..
        @type  header: string
        @param header: Header for the resulting artificial reference file (fasta format).
        @type  Random: Bool
        @param Random: Boolean for choosing on of the mutation modes (linear = 0,random = 1)
        @type  distance: int
        @param distance: User defined parameter which limits the distance between two mutations.
        @rtype:   list
        @return: Artificial reference genome sequence.
        """
        ##debug vals 
        start = [] # list of start positions of mutations ( start means first mutation in balanced case)
        both = []  # start and end position
        fobj2= open(outputpath+header+"_CompleteLog.txt","a")
        fobj2.write("BalancedMutation"+"\t"+"NewAA" + "\t" + "OldAA"+"\t"+"NewAAPos"+"\t"+"OldAAPos" +"\t"+ "NewDNA"+"\t"+ "OldDNA"+ "\t"+"NewDNAPos"+"\t"+"OldDNAPos"+"\n")
        fobj2.close()
        
        
        # generate start positions for mutation (the samplespace)
        samplespace = []
        for i in range (2,len(AminoAcid),int(distance/3)):
            samplespace.append(i)
        
        
        ##random_modification
        if (Random ==1):
            r.shuffle(samplespace)
        else:
            pass
     
        DNA = str(DNA, 'utf-8') 
        dna_list = list(DNA)
        AminoAcid_list = list(AminoAcid)
       
        '''the lookup dictionary for the aa triplets '''
        lookup_dic = INI.createdic(AminoAcid)

        #gotit indicator if a possibility was found to revert the initial changes (start of mutation)
        gotit=False
        # stat variables
        succ_counter = 0
        fail_counter = 0 
        skip = 0
        new_AA_triplet = ''
        new_triplet = ''
        position = 0
       
        ''' Main loop over the AminoAcid'''
        for i in samplespace:
            ''' no triplet left --> break '''
            if(i+2 >len(AminoAcid)):
                print("\t(finished...exceeded length of AA)")
                continue
            
            ''' AA which is going to be mutated'''
            AA = AminoAcid_list[i]
        
            '''index for dna : i*3 --> AminoAcid --> DNA
            #not i*3+3 because i starts at AA 2 since we need a right and left neighbor'''
            iprime = i*3
            
            '''AA and corresponding DNA triplet for the middle AA '''
            AA_triplet= AminoAcid_list[i-1]+AminoAcid_list[i]+AminoAcid_list[i+1]
            DNA_triplet = DNA[iprime:iprime+3]

            # get temporary list of all mutations. Iterate over it to find best possible substitution
            mutationsliste,aaliste = getMutation(AA,DNA_triplet)
            
            
            # isvalidposition returns 1 if the position isforbidden, else 0
            val = isvalidposition(pdic, iprime, distance)
            if (val ==1):
                skip+=1
                fobj2= open(outputpath+header+"_CompleteLog.txt","a")
                fobj2.write(str(0)+"\t"+new_AA_triplet + "\t" + "' '"+"\t"+str(i)+"\t"+"' '" +"\t"+ new_triplet+"\t"+ "' '"+ "\t"+str(iprime+position)+"\t"+"'(skipped)'"+"\n")
                fobj2.close()
                continue
                    
            else:
                pass
            

            for q,item in enumerate(mutationsliste):
                
                if gotit==True:
                    break
                else:
                    pass
                
                ''' old and new variables for before/after the mutation '''
                new_triplet                 = mutationsliste[q]
                new_AA                      = aaliste[q]
                new_N,old_N,position        = getdifference(DNA_triplet,new_triplet)
                new_AA_triplet              = AA_triplet[0]+new_AA+AA_triplet[2]
                tempdic = pdic
                tempdic[iprime+position]="M"
                
                if (new_AA_triplet in lookup_dic):
                    '''templist--> contains all starting positions of the "new_AA_triplet" which we want to substitute back '''
                    templist = lookup_dic[new_AA_triplet]
                    
                    
                    # add potential mutation to dictionary
                    tempposition = [iprime+position,"M"]
                    for l in range(0,len(templist)):
                        posi = templist[l]
                        # i*3 --> protein nach DNA, +3 betrachten IMMER mittlere AA
                        ''' suitable dna position found? '''
                        if (new_triplet == dna_list[posi*3+3]+dna_list[posi*3+3+1]+dna_list[posi*3+3+2]):
                            val = isvalidposition(tempdic, posi*3+3+position, distance)
                            
                            if (val ==1):
                                skip+=1
                                continue
                            else:
                                pass
                            
                            '''back substitution & do subs on 1st position'''
                            pdic[posi*3+3+position]="R"
                            dna_list[posi*3+3+position]= old_N
                            
                            pdic[iprime+position]="M"
                            dna_list[iprime+position]= new_N
                            
                            AminoAcid_list[i]= new_AA
                            AminoAcid_list[posi+1]= AA
                            
                            gotit = True
                            succ_counter+=1
                            #lookup_dic[new_AA_triplet]  = [i for i in lookup_dic[new_AA_triplet] if i!=posi]
                            lookup_dic[new_AA_triplet].remove(posi)
                            
                            '''writing the log file '''
                            fobj= open(outputpath+header+"_CompleteLog.txt","a")
                            fobj.write(str(1)+"\t"+AA_triplet + "\t" + new_AA_triplet+"\t"+str(i)+"\t"+str(posi) +"\t"+ DNA_triplet+"\t"+ str(new_triplet)+ "\t"+str(iprime+position)+"\t"+str(posi*3+3+position)+"\n")
                            fobj.close()
                            
                            ## statistics
                            start.append(iprime+position)
                            both.extend([iprime+position,posi*3+3+position])
                            break
                
                # no possible triplet positions for back substitution in lookup_dic    
                else:
                    continue
                 
            # after loop 
            if (gotit==False):
                fobj2= open(outputpath+header+"_CompleteLog.txt","a")
                fobj2.write(str(0)+"\t"+new_AA_triplet + "\t" + "' '"+"\t"+str(i)+"\t"+"' '" +"\t"+ new_triplet+"\t"+ "' '"+ "\t"+str(iprime+position)+"\t"+"'(tried)'"+"\n")
                fobj2.close()
                fail_counter+=1
                # reverse substitutions on? (=1) off (=0). If one dont change first mutation in the first place. Else: just change it.. 
                if (rev==0):
                    pdic[iprime+position]="M"
                    dna_list[iprime+position]= new_N
                    AminoAcid_list[i]= new_AA
                    start.append(iprime+position)
                    both.extend([iprime+position])        
            elif (gotit==True):
                    gotit = False
            
        # stats (INI.savepickle(pdic,header+"_pdic_e"))
        print("\r\n########Some stats:########")
        print("DNA length:\t" + str(len(DNA)))
        print("max substitutions:\t" + str(len(DNA)/distance))
        print("#Balanced Mutations:\t" + str(succ_counter))
        
        
        return ("".join(dna_list))