1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
# -*- coding: cp1252 -*-
'''
Created 2012
Contains various helper functions which initialize, translate and preprocess the data.
@author: Sven Giese'''
import pickle as pickle
import random
''' INIT DICTIONARIES '''
# Codon -> one-letter amino-acid lookup table.
# '*' marks the three stop codons; the placeholder codon 'NNN' maps to 'n'.
# The table is expanded from (residue, codons) pairs, so the resulting
# dictionary holds one entry per codon, in the order the codons are listed.
genetic_code = {
    codon: residue
    for residue, codons in (
        ('A', 'GCT GCC GCA GCG'),
        ('R', 'CGT CGC CGA CGG AGA AGG'),
        ('N', 'AAT AAC'),
        ('D', 'GAT GAC'),
        ('C', 'TGT TGC'),
        ('Q', 'CAA CAG'),
        ('E', 'GAA GAG'),
        ('G', 'GGT GGC GGA GGG'),
        ('H', 'CAT CAC'),
        ('I', 'ATT ATC ATA'),
        ('M', 'ATG'),
        ('L', 'TTA TTG CTT CTC CTA CTG'),
        ('K', 'AAA AAG'),
        ('F', 'TTT TTC'),
        ('P', 'CCT CCC CCA CCG'),
        ('S', 'TCT TCC TCA TCG AGT AGC'),
        ('T', 'ACT ACC ACA ACG'),
        ('W', 'TGG'),
        ('Y', 'TAT TAC'),
        ('V', 'GTT GTC GTA GTG'),
        ('*', 'TAA TGA TAG'),
        ('n', 'NNN'),
    )
    for codon in codons.split()
}
def createdic(AAsequence):
    """
    Build a lookup table from aminoacid triplets to their start positions.

    Every 3-letter combination over the 20 aminoacid symbols plus the stop
    symbol '*' receives an (initially empty) list. The sequence is then read
    in consecutive, non-overlapping windows of three residues beginning at
    index 1, and the start index of each window is stored under its triplet.
    Windows containing a symbol outside that alphabet are ignored, as is a
    trailing window shorter than three residues.

    NOTE(review): scanning starts at index 1, so the first residue is never
    part of any window - confirm that this offset is intended.

    @type AAsequence: string
    @param AAsequence: aminoacid sequence
    @rtype: dictionary
    @return: Dictionary mapping each triplet to the list of its start indices
    """
    alphabet = ["A", "R", "N", "D", "C", "E", "Q", "G", "H", "I", "L",
                "K", "M", "F", "P", "S", "T", "W", "Y", "V", "*"]

    # One empty position list per possible triplet.
    positions = {
        first + second + third: []
        for first in alphabet
        for second in alphabet
        for third in alphabet
    }

    # The upper bound len - 2 keeps only windows that hold three residues.
    for start in range(1, len(AAsequence) - 2, 3):
        window = AAsequence[start:start + 3]
        if window in positions:
            positions[window].append(start)

    return positions
def isvalidtriplet(codon,dictentry):
    """
    Check whether two triplets differ in exactly one of their three positions.

    Only the first three characters of each argument are compared. Identical
    triplets (no differing position) are rejected as well as triplets that
    differ in two or three positions. Used for generation of possible
    substitution triplets.

    @type codon: string
    @param codon: nucleotide triplet
    @type dictentry: string
    @param dictentry: nucleotide triplet
    @rtype: bool
    @return: True if exactly two of the three positions match, else False.
    """
    matching = sum(1 for pos in range(3) if codon[pos] == dictentry[pos])
    return matching == 2
def trans_seq(DNA):
    """
    Translate a nucleotide sequence into an aminoacid sequence.

    The sequence is read in consecutive codons starting at index 0. A codon
    containing an 'N' is translated to 'n'; every other codon is looked up in
    genetic_code. Trailing nucleotides that do not fill a whole codon are
    returned untranslated.

    @type DNA: bytes or string
    @param DNA: nucleotide sequence; bytes-like input is decoded as UTF-8,
    a string is used as it is
    @rtype: tuple
    @return: (translated aminoacid sequence, untranslated nucleotide sequence)
    @raise KeyError: if a codon without 'N' is not a key of genetic_code
    """
    # str(text, 'utf-8') raises TypeError when the input already is a string,
    # so only non-string (bytes-like) input is decoded.
    if not isinstance(DNA, str):
        DNA = str(DNA, 'utf-8')

    # Length of the part that consists of complete codons only.
    coding_length = len(DNA) - len(DNA) % 3

    protein = []
    for i in range(0, coding_length, 3):
        codon = DNA[i:i + 3]
        if "N" in codon:
            # ambiguous nucleotide(s) in the codon
            protein.append("n")
        else:
            # standard triplet translation
            protein.append(genetic_code[codon])

    return ("".join(protein), DNA[coding_length:])
''' DEBUG HELP FUNCTIONS '''
def savepickle(dictionary,outputname):
    """
    Pickle an object to the file "<outputname>.p".

    Basic pickle function, meant for debugging and to speed up multiple
    simulations (possible to store orf lists). The file is written in binary
    mode and closed before the function returns, so the data is flushed to
    disk even if the caller reads it back immediately.

    @type dictionary: dictionary
    @param dictionary: Dictionary containing start and end positions of ORFs.
    @type outputname: string
    @param outputname: Filename for saving; the suffix ".p" is appended.
    """
    target = outputname + ".p"
    # The context manager guarantees the handle is closed, also on errors.
    with open(target, "wb") as handle:
        pickle.dump(dictionary, handle)
    print(("Saved .pickle to: " + outputname +".p"))
def loadpickle(inputname):
    """
    Load a pickled object from the file inputname.

    Basic pickle function, meant for debugging and to speed up multiple
    simulations (possible to load orf lists). The name is used exactly as
    given: no ".p" suffix is appended, so a file written by savepickle has to
    be addressed with its full name including ".p".

    NOTE: unpickling can execute arbitrary code - only load files from a
    trusted source.

    @type inputname: string
    @param inputname: Filename for loading.
    @rtype: dictionary
    @return: Dictionary containing start and end positions of ORFs.
    """
    # pickle data is binary: the file must be opened with "rb", a text-mode
    # handle makes pickle.load fail under Python 3.
    with open(inputname, "rb") as handle:
        dictionary = pickle.load(handle)
    print(("Loaded "+inputname+" pickle!"))
    return dictionary
|