File: Prep.py

package info (click to toggle)
arden 1.0-6
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 2,128 kB
  • sloc: python: 1,555; javascript: 249; sh: 10; makefile: 2
file content (162 lines) | stat: -rw-r--r-- 4,944 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
# -*- coding: cp1252 -*-
'''
Created 2012

Contains various helper functions which initialize / translate / preprocess the data


@author: Sven Giese'''

import pickle as pickle
import random

''' INIT DICTIONARIES '''
# Codon -> one-letter amino acid lookup, built from a table grouped by amino
# acid. '*' marks the three stop codons; the placeholder codon 'NNN' maps to
# the lowercase 'n'.
genetic_code = {
    codon: amino_acid
    for amino_acid, codons in (
        ('A', 'GCT GCC GCA GCG'),
        ('R', 'CGT CGC CGA CGG AGA AGG'),
        ('N', 'AAT AAC'),
        ('D', 'GAT GAC'),
        ('C', 'TGT TGC'),
        ('Q', 'CAA CAG'),
        ('E', 'GAA GAG'),
        ('G', 'GGT GGC GGA GGG'),
        ('H', 'CAT CAC'),
        ('I', 'ATT ATC ATA'),
        ('M', 'ATG'),
        ('L', 'TTA TTG CTT CTC CTA CTG'),
        ('K', 'AAA AAG'),
        ('F', 'TTT TTC'),
        ('P', 'CCT CCC CCA CCG'),
        ('S', 'TCT TCC TCA TCG AGT AGC'),
        ('T', 'ACT ACC ACA ACG'),
        ('W', 'TGG'),
        ('Y', 'TAT TAC'),
        ('V', 'GTT GTC GTA GTG'),
        ('*', 'TAA TGA TAG'),
        ('n', 'NNN'),
    )
    for codon in codons.split()
}



def createdic(AAsequence):
    """
    Builds a lookup table of amino acid triplets and records where each
    triplet starts in the given amino acid sequence.

    Every combination of three symbols taken from the 20 amino acids plus the
    stop symbol "*" is a key of the table. The sequence is read in
    consecutive, non-overlapping windows of three symbols beginning at
    index 1; a window that is not a key of the table (for example one
    containing "n") is ignored.

    NOTE(review): index 0 is never part of a window - confirm this offset is
    intended.

    @type  AAsequence: string
    @param AAsequence: amino acid sequence
    @rtype:   dictionary
    @return:  Maps every triplet to the list of start indices of the windows
              that equal it (empty list if it never occurs)
    """
    alphabet = "ARNDCEQGHILKMFPSTWYV*"
    positions = {first + second + third: []
                 for first in alphabet
                 for second in alphabet
                 for third in alphabet}

    # The upper bound excludes any window that would run past the end of
    # the sequence, so only complete triplets are looked at.
    for start in range(1, len(AAsequence) - 2, 3):
        hits = positions.get(AAsequence[start:start + 3])
        if hits is not None:
            hits.append(start)
    return positions




def isvalidtriplet(codon,dictentry):
    """
    Checks whether two triplets differ in exactly one of their three
    positions (hamming distance of exactly 1). Identical triplets are
    rejected. Used for generation of possible substitution triplets.

    @type  codon: string
    @param codon: nucleotide triplet
    @type  dictentry: string
    @param dictentry: nucleotide triplet
    @rtype:   bool
    @return:  True if exactly two of the first three positions agree,
              else False.
    """
    matches = sum(1 for pos in range(3) if codon[pos] == dictentry[pos])
    return matches == 2

def trans_seq(DNA):
    """
    Function which translates DNA to AA.

    The sequence is read codon by codon from index 0. A codon that contains
    an "N" is translated to "n"; every other codon is looked up in
    genetic_code. Trailing nucleotides that do not fill a whole codon are
    returned untranslated.

    @type  DNA: bytes-like object or string
    @param DNA: nucleotide sequence (bytes-like input is decoded as UTF-8)
    @rtype:   prot,rest
    @return:  Translated aminoacid sequence,untranslated nucleotide sequence
    @raise KeyError: if a codon without "N" is not a key of genetic_code
    """
    # str(x, 'utf-8') only works for bytes-like objects and raises TypeError
    # for text, so an already decoded sequence is used as it is.
    if not isinstance(DNA, str):
        DNA = str(DNA, 'utf-8')

    # Length of the part that consists of complete codons only.
    complete = len(DNA) - len(DNA) % 3

    protein = []
    for i in range(0, complete, 3):
        codon = DNA[i:i+3]
        if "N" in codon:
            # found Ns in nucleotide string
            protein.append("n")
        else:
            # standard triplet translation
            protein.append(genetic_code[codon])

    return ("".join(protein), DNA[complete:])

''' DEBUG HELP FUNCTIONS '''


def savepickle(dictionary,outputname):
    """
    Basic pickle function, mainly for debugging and to speed up multiple
    simulations (possible to store ORF lists).

    The object is written to the file outputname + ".p".

    @type  dictionary: dictionary
    @param dictionary: Dictionary containing start and end positions of ORFs.
    @type  outputname: string
    @param outputname: Filename for saving (the suffix ".p" is appended).
    """
    # Context manager guarantees the data is flushed and the handle closed,
    # even if pickling fails.
    with open(outputname + ".p", "wb") as handle:
        pickle.dump(dictionary, handle)
    print(("Saved .pickle to: " + outputname +".p"))

def loadpickle(inputname):
    """
    Basic pickle function, mainly for debugging and to speed up multiple
    simulations (possible to load ORF lists).

    The file name is used exactly as given; no ".p" suffix is appended.

    @type  inputname: string
    @param inputname: Filename for loading.
    @rtype:   dictionary
    @return:  Dictionary containing start and end positions of ORFs.
    """
    # pickle data is binary: the file must be opened with "rb", a text-mode
    # handle makes pickle.load fail on Python 3.
    # WARNING: unpickling can execute arbitrary code - only load files that
    # come from a trusted source.
    with open(inputname, "rb") as handle:
        dictionary = pickle.load(handle)
    print(("Loaded "+inputname+" pickle!"))
    return (dictionary)