File: parse_splice_sites.py

package info (click to toggle)
kissplice 2.6.7-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 16,752 kB
  • sloc: cpp: 8,783; python: 1,618; perl: 389; sh: 72; makefile: 18
file content (283 lines) | stat: -rw-r--r-- 12,063 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
import os

# Once the first phase of the program was run, we test the results
filin = open('results_with_validation_no_coma.txt', 'r') # used to get the bcc numbers only...
no_mapping=0
seen=0
valid_splice_sites=0

GT_AG_non_confirmed=0
GC_AG_non_confirmed=0
ATATC_A_non_confirmed=0
GTATC_AT_non_confirmed=0

GT_AG_confirmed=0
GC_AG_confirmed=0
ATATC_A_confirmed=0
GTATC_AT_confirmed=0


# returns 1 if the couple site_left, site_right is in {GT*** ***AG}|{GC*** ***AG}|{ATATC ****A}|{GTATC ***AT}  --> Direct
# returns 2 if the couple site_left, site_right is in {CT*** ***AC}|{CT*** ***GC}|{T**** GATAT}|{AT*** GATAC}  --> Reverse
# else returns 0



def is_splicing_event(site_left, site_right, confirmed):
    global GT_AG_non_confirmed
    global GC_AG_non_confirmed
    global ATATC_A_non_confirmed
    global GTATC_AT_non_confirmed
    
    global GT_AG_confirmed
    global GC_AG_confirmed
    global ATATC_A_confirmed
    global GTATC_AT_confirmed


    
    # try Direct:
    if site_left.startswith("GT") and site_right.endswith("AG"):
        if confirmed==1: GT_AG_confirmed+=1
        else: GT_AG_non_confirmed+=1
        return 1
    if site_left.startswith("GC") and site_right.endswith("AG"):
        if confirmed==1: GC_AG_confirmed+=1
        else: GC_AG_non_confirmed+=1
        return 1
    if site_left.startswith("ATATC") and site_right.endswith("A"):
        if confirmed==1: ATATC_A_confirmed+=1
        else: ATATC_A_non_confirmed+=1
        return 1
    if site_left.startswith("GTATC") and site_right.endswith("AT"):
        if confirmed==1: GTATC_AT_confirmed+=1
        else: GTATC_AT_non_confirmed+=1
        return 1

    # try reverse
    if site_left.startswith("CT") and site_right.endswith("AC"):
        if confirmed==1: GT_AG_confirmed+=1
        else: GT_AG_non_confirmed+=1
        return 2
    if site_left.startswith("CT") and site_right.endswith("GC"):
        if confirmed==1: GC_AG_confirmed+=1
        else: GC_AG_non_confirmed+=1
        return 2
    if site_left.startswith("T") and site_right.endswith("GATAT"):
        if confirmed==1: ATATC_A_confirmed+=1
        else: ATATC_A_non_confirmed+=1
        return 2
    if site_left.startswith("AT") and site_right.endswith("GATAC"):
        if confirmed==1: GTATC_AT_confirmed+=1
        else: GTATC_AT_non_confirmed+=1
        return 2

    # did not find any 
    return 0
        
twice_one_block=0
one_block_valid_confirmed=0
one_block_valid_non_confirmed=0

# test if a line contains a succession of {CT AC}|{CT GC}|{ATATC A}|{GTATC AT}  --> Direct
# or a successsion of {GT AG}|{GC AG}|{T GATAT}|{AT GATAC}  --> Reverse
def test_line (line, confirmed):
    global IR_confirmed_valid
    global IR_non_confirmed_valid
    global IR_confirmed_invalid
    global IR_non_confirmed_invalid
    global twice_one_block
    global one_block_valid_non_confirmed
    global one_block_valid_confirmed
    
    array_line=line.split(" ")
    
    if array_line[0]=="\n":
        return False

#    print array_line
    if array_line[0].endswith("IR") and len(array_line) < 3: # INTRON RETENTION# check if its a direct or reverve transcript (or nothing)
        # spurious case: as results/bcc_34093_sequences_25_unsorted.fa.toblat.fasta
                                #chr7 130154452 130154476 1 2 130154452  24  130154452  24  3
        twice_one_block+=1
        print("BCC: ",bcc,": Twice one mapped block")
        return False
            
    
    # check if its a direct or reverve transcript (or nothing)
    direct=is_splicing_event(array_line[1],array_line[2], confirmed)
    if direct==0: 
        return False
    # in this case the positions starts on odd indices as the first is stolen by "IR"
    for i in range(3,len(array_line)-1,2):
#         if is_splicing_event(array_line[i],array_line[i+1], confirmed) == 0:
        if is_splicing_event(array_line[i],array_line[i+1], confirmed) != direct: # either the direction changed or this is not a splice site
            return False

    return True

    
    

confirmed=0

valid_confirmed_LAIR=0
confirmed_LAIR=0
valid_confirmed_SM=0
confirmed_SM=0
valid_confirmed_LA=0
confirmed_LA=0
valid_confirmed_SMIR=0
confirmed_SMIR=0
valid_confirmed_others=0
confirmed_others=0

valid_non_confirmed_LAIR=0
non_confirmed_LAIR=0
valid_non_confirmed_SM=0
non_confirmed_SM=0
valid_non_confirmed_LA=0
non_confirmed_LA=0
valid_non_confirmed_SMIR=0
non_confirmed_SMIR=0
valid_non_confirmed_others=0
non_confirmed_others=0



# one_block_small_confirmed=0
# one_block_small_non_confirmed=0
# others_confirmed=0
# others_non_confirmed=0

nb_confirmed=0
nb_confirmed_valid=0
nb_non_confirmed=0
nb_non_confirmed_valid=0
# nb_with_only_small_among_confirmed=0
# nb_with_only_small_among_non_confirmed=0
# others_valid_confirmed=0
# others_valid_non_confirmed=0


for line in filin.readlines():
    if line.startswith("results"): # a new result
        seen+=1
        bcc=line.split("_")[1]
        this_bcc_valid=False
        this_bcc_only_SM=True
        this_bcc_only_SMIR=True
        this_bcc_only_LAIR=True
        this_bcc_only_LA=True
        if os.path.isfile("bcc_"+bcc):
            file=open("bcc_"+bcc) 
            confirmed=0
            nb_non_confirmed+=1
        elif os.path.isfile("bcc_"+bcc+"_confirmed"):
            file=open("bcc_"+bcc+"_confirmed")
            confirmed=1
            nb_confirmed+=1
        else: 
            no_mapping+=1
            continue
        
        for sites in file.readlines():
            if test_line(sites.upper(), confirmed):
                if this_bcc_valid==False:
                    valid_splice_sites+=1
                this_bcc_valid=True
            if sites.startswith("SM ") == False: this_bcc_only_SM=False
            if sites.startswith("LA ") == False: this_bcc_only_LA=False
            if sites.startswith("SMIR") == False: this_bcc_only_SMIR=False
            if sites.startswith("LAIR") == False: this_bcc_only_LAIR=False
            
                    
        # compute the type of bcc:
        if confirmed:
            if this_bcc_only_SM: confirmed_SM+=1
            elif this_bcc_only_LA: confirmed_LA+=1
            elif this_bcc_only_SMIR:confirmed_SMIR+=1
            elif this_bcc_only_LAIR:confirmed_LAIR+=1
            else:confirmed_others+=1
        else:
            if this_bcc_only_SM:non_confirmed_SM+=1
            elif this_bcc_only_LA: non_confirmed_LA+=1
            elif this_bcc_only_SMIR:non_confirmed_SMIR+=1
            elif this_bcc_only_LAIR:non_confirmed_LAIR+=1
            else: non_confirmed_others+=1

        # FOR VINCENT: BCC MULTIPLE BLOCKS, NON VALID:
        if this_bcc_valid == False and this_bcc_only_LA:
            print("BCC, ",bcc, end=' ')  
            if confirmed: print(" confirmed ", end=' ')
            else: print(" non confirmed ")
            file.seek(0)
            for sites in file.readlines():
                print(sites, end=' ')
                
                
    
        # compute the type of bcc that are splice site coherent
        if this_bcc_valid:
            if confirmed:
                nb_confirmed_valid+=1
                if this_bcc_only_SM: valid_confirmed_SM+=1
                elif this_bcc_only_LA: valid_confirmed_LA+=1
                elif this_bcc_only_SMIR: valid_confirmed_SMIR+=1
                elif this_bcc_only_LAIR:valid_confirmed_LAIR+=1
                else: valid_confirmed_others+=1
            else:
                nb_non_confirmed_valid+=1
                if this_bcc_only_SM:valid_non_confirmed_SM+=1
                elif this_bcc_only_LA: valid_non_confirmed_LA+=1
                elif this_bcc_only_SMIR:valid_non_confirmed_SMIR+=1
                elif this_bcc_only_LAIR:valid_non_confirmed_LAIR+=1
                else: valid_non_confirmed_others+=1
        
                
            
                
file.close()
            


print("\n\n **************** RESULTS *****************")

print(seen,"BCC seen, among which ",no_mapping," are not mapped on genome and there was also ",twice_one_block,"paths having one unique block mapped both upper and lower")
print(valid_splice_sites," are coherent with respect to splice sites.")



total_confirmed=confirmed_SMIR+ confirmed_LAIR+confirmed_SM+confirmed_LA+confirmed_others
total_valid_confirmed=valid_confirmed_SMIR+ valid_confirmed_LAIR+valid_confirmed_SM+valid_confirmed_LA+valid_confirmed_others
total_non_confirmed=non_confirmed_SMIR+ non_confirmed_LAIR+non_confirmed_SM+non_confirmed_LA+non_confirmed_others
total_valid_non_confirmed=valid_non_confirmed_SMIR+ valid_non_confirmed_LAIR+valid_non_confirmed_SM+valid_non_confirmed_LA+valid_non_confirmed_others
print("\n\n******** too sum up : *********")
print("\t\t\t Total \t Slice site coherent \t percentage splice site coherent ")
print(" FP \t\t\t", no_mapping, "\t0 \t0")
print(" **** Among Confirmed ****")
print(" only [1 bloc/small] \t", confirmed_SMIR, "\t", valid_confirmed_SMIR, "\t NIL")#, "\t %.2f"%(valid_confirmed_SMIR*100/confirmed_SMIR)
print(" only [1 bloc/long] \t", confirmed_LAIR, "\t", valid_confirmed_LAIR, "\t %.2f"%(valid_confirmed_LAIR*100/float(confirmed_LAIR)))
print(" only [n blocs/small] \t", confirmed_SM, "\t", valid_confirmed_SM, "\t  %.2f"%(valid_confirmed_SM*100/float(confirmed_SM)))
print(" only [n blocs/long] \t", confirmed_LA, "\t", valid_confirmed_LA, "\t  %.2f"%(valid_confirmed_LA*100/float(confirmed_LA)))
print(" all others \t\t", confirmed_others, "\t", valid_confirmed_others, "\t  %.2f"%(valid_confirmed_others*100/float(confirmed_others)))
print(" total confirmed \t", total_confirmed, "\t", total_valid_confirmed, "\t  %.2f"%(total_valid_confirmed*100/float(total_confirmed)))
#print " validation total confirmed \t",  nb_confirmed , "\t", nb_confirmed_valid
print(" **** Among Non Confirmed ****")
print(" only [1 bloc/small] \t", non_confirmed_SMIR, "\t", valid_non_confirmed_SMIR, "\t %.2f"%(valid_non_confirmed_SMIR*100/float(non_confirmed_SMIR)))
print(" only [1 bloc/long] \t", non_confirmed_LAIR, "\t", valid_non_confirmed_LAIR, "\t %.2f"%(valid_non_confirmed_LAIR*100/float(non_confirmed_LAIR)))
print(" only [n blocs/small] \t", non_confirmed_SM, "\t", valid_non_confirmed_SM, "\t  %.2f"%(valid_non_confirmed_SM*100/float(non_confirmed_SM)))
print(" only [n blocs/long] \t", non_confirmed_LA, "\t", valid_non_confirmed_LA, "\t  %.2f"%(valid_non_confirmed_LA*100/float(non_confirmed_LA)))
print(" all others \t\t", non_confirmed_others, "\t", valid_non_confirmed_others, "\t  %.2f"%(valid_non_confirmed_others*100/float(non_confirmed_others)))
print(" total non confirmed \t", total_non_confirmed, "\t", total_valid_non_confirmed, "\t  %.2f"%(total_valid_non_confirmed*100/float(total_non_confirmed)))
#print " validation total non confirmed \t",  nb_non_confirmed , "\t", nb_non_confirmed_valid
print(" **** Grand total (including FP) ****\t")
print(" total \t\t\t", (total_non_confirmed+total_confirmed+no_mapping), "\t",  (total_valid_non_confirmed+total_valid_confirmed+no_mapping), "\t  %.2f"%(100*(total_valid_non_confirmed+total_valid_confirmed+no_mapping)/float(total_non_confirmed+total_confirmed+no_mapping)))


print("\n\n******** Splice sites repartition: *********")
print("\t\t{GT*** ***AG}\t{GC*** ***AG}\t{ATATC ****A}\t{GTATC ***AT} \t total")
print("total\t\t ",GT_AG_non_confirmed+GT_AG_confirmed ,"\t",  GC_AG_non_confirmed+GC_AG_confirmed ,"\t", ATATC_A_non_confirmed+ATATC_A_confirmed ,"\t", GTATC_AT_non_confirmed+GTATC_AT_confirmed ,"\t", GT_AG_non_confirmed+GT_AG_confirmed + GC_AG_non_confirmed+GC_AG_confirmed + ATATC_A_non_confirmed+ATATC_A_confirmed + GTATC_AT_non_confirmed+GTATC_AT_confirmed)
print("Among confirmed\t\t", GT_AG_confirmed ,"\t", GC_AG_confirmed ,"\t", ATATC_A_confirmed ,"\t",GTATC_AT_confirmed ,"\t", GT_AG_confirmed + GC_AG_confirmed + ATATC_A_confirmed + GTATC_AT_confirmed)
print("Among non confirmed\t", GT_AG_non_confirmed ,"\t", GC_AG_non_confirmed ,"\t", ATATC_A_non_confirmed ,"\t",GTATC_AT_non_confirmed ,"\t", GT_AG_non_confirmed + GC_AG_non_confirmed + ATATC_A_non_confirmed + GTATC_AT_non_confirmed)