1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283
|
import os
# Once the first phase of the program was run, we test the results
filin = open('results_with_validation_no_coma.txt', 'r') # used to get the bcc numbers only...
no_mapping=0
seen=0
valid_splice_sites=0
GT_AG_non_confirmed=0
GC_AG_non_confirmed=0
ATATC_A_non_confirmed=0
GTATC_AT_non_confirmed=0
GT_AG_confirmed=0
GC_AG_confirmed=0
ATATC_A_confirmed=0
GTATC_AT_confirmed=0
# returns 1 if the couple site_left, site_right is in {GT*** ***AG}|{GC*** ***AG}|{ATATC ****A}|{GTATC ***AT} --> Direct
# returns 2 if the couple site_left, site_right is in {CT*** ***AC}|{CT*** ***GC}|{T**** GATAT}|{AT*** GATAC} --> Reverse
# else returns 0
def is_splicing_event(site_left, site_right, confirmed):
global GT_AG_non_confirmed
global GC_AG_non_confirmed
global ATATC_A_non_confirmed
global GTATC_AT_non_confirmed
global GT_AG_confirmed
global GC_AG_confirmed
global ATATC_A_confirmed
global GTATC_AT_confirmed
# try Direct:
if site_left.startswith("GT") and site_right.endswith("AG"):
if confirmed==1: GT_AG_confirmed+=1
else: GT_AG_non_confirmed+=1
return 1
if site_left.startswith("GC") and site_right.endswith("AG"):
if confirmed==1: GC_AG_confirmed+=1
else: GC_AG_non_confirmed+=1
return 1
if site_left.startswith("ATATC") and site_right.endswith("A"):
if confirmed==1: ATATC_A_confirmed+=1
else: ATATC_A_non_confirmed+=1
return 1
if site_left.startswith("GTATC") and site_right.endswith("AT"):
if confirmed==1: GTATC_AT_confirmed+=1
else: GTATC_AT_non_confirmed+=1
return 1
# try reverse
if site_left.startswith("CT") and site_right.endswith("AC"):
if confirmed==1: GT_AG_confirmed+=1
else: GT_AG_non_confirmed+=1
return 2
if site_left.startswith("CT") and site_right.endswith("GC"):
if confirmed==1: GC_AG_confirmed+=1
else: GC_AG_non_confirmed+=1
return 2
if site_left.startswith("T") and site_right.endswith("GATAT"):
if confirmed==1: ATATC_A_confirmed+=1
else: ATATC_A_non_confirmed+=1
return 2
if site_left.startswith("AT") and site_right.endswith("GATAC"):
if confirmed==1: GTATC_AT_confirmed+=1
else: GTATC_AT_non_confirmed+=1
return 2
# did not find any
return 0
twice_one_block=0
one_block_valid_confirmed=0
one_block_valid_non_confirmed=0
# test if a line contains a succession of {CT AC}|{CT GC}|{ATATC A}|{GTATC AT} --> Direct
# or a successsion of {GT AG}|{GC AG}|{T GATAT}|{AT GATAC} --> Reverse
def test_line (line, confirmed):
global IR_confirmed_valid
global IR_non_confirmed_valid
global IR_confirmed_invalid
global IR_non_confirmed_invalid
global twice_one_block
global one_block_valid_non_confirmed
global one_block_valid_confirmed
array_line=line.split(" ")
if array_line[0]=="\n":
return False
# print array_line
if array_line[0].endswith("IR") and len(array_line) < 3: # INTRON RETENTION# check if its a direct or reverve transcript (or nothing)
# spurious case: as results/bcc_34093_sequences_25_unsorted.fa.toblat.fasta
#chr7 130154452 130154476 1 2 130154452 24 130154452 24 3
twice_one_block+=1
print("BCC: ",bcc,": Twice one mapped block")
return False
# check if its a direct or reverve transcript (or nothing)
direct=is_splicing_event(array_line[1],array_line[2], confirmed)
if direct==0:
return False
# in this case the positions starts on odd indices as the first is stolen by "IR"
for i in range(3,len(array_line)-1,2):
# if is_splicing_event(array_line[i],array_line[i+1], confirmed) == 0:
if is_splicing_event(array_line[i],array_line[i+1], confirmed) != direct: # either the direction changed or this is not a splice site
return False
return True
confirmed=0
valid_confirmed_LAIR=0
confirmed_LAIR=0
valid_confirmed_SM=0
confirmed_SM=0
valid_confirmed_LA=0
confirmed_LA=0
valid_confirmed_SMIR=0
confirmed_SMIR=0
valid_confirmed_others=0
confirmed_others=0
valid_non_confirmed_LAIR=0
non_confirmed_LAIR=0
valid_non_confirmed_SM=0
non_confirmed_SM=0
valid_non_confirmed_LA=0
non_confirmed_LA=0
valid_non_confirmed_SMIR=0
non_confirmed_SMIR=0
valid_non_confirmed_others=0
non_confirmed_others=0
# one_block_small_confirmed=0
# one_block_small_non_confirmed=0
# others_confirmed=0
# others_non_confirmed=0
nb_confirmed=0
nb_confirmed_valid=0
nb_non_confirmed=0
nb_non_confirmed_valid=0
# nb_with_only_small_among_confirmed=0
# nb_with_only_small_among_non_confirmed=0
# others_valid_confirmed=0
# others_valid_non_confirmed=0
for line in filin.readlines():
if line.startswith("results"): # a new result
seen+=1
bcc=line.split("_")[1]
this_bcc_valid=False
this_bcc_only_SM=True
this_bcc_only_SMIR=True
this_bcc_only_LAIR=True
this_bcc_only_LA=True
if os.path.isfile("bcc_"+bcc):
file=open("bcc_"+bcc)
confirmed=0
nb_non_confirmed+=1
elif os.path.isfile("bcc_"+bcc+"_confirmed"):
file=open("bcc_"+bcc+"_confirmed")
confirmed=1
nb_confirmed+=1
else:
no_mapping+=1
continue
for sites in file.readlines():
if test_line(sites.upper(), confirmed):
if this_bcc_valid==False:
valid_splice_sites+=1
this_bcc_valid=True
if sites.startswith("SM ") == False: this_bcc_only_SM=False
if sites.startswith("LA ") == False: this_bcc_only_LA=False
if sites.startswith("SMIR") == False: this_bcc_only_SMIR=False
if sites.startswith("LAIR") == False: this_bcc_only_LAIR=False
# compute the type of bcc:
if confirmed:
if this_bcc_only_SM: confirmed_SM+=1
elif this_bcc_only_LA: confirmed_LA+=1
elif this_bcc_only_SMIR:confirmed_SMIR+=1
elif this_bcc_only_LAIR:confirmed_LAIR+=1
else:confirmed_others+=1
else:
if this_bcc_only_SM:non_confirmed_SM+=1
elif this_bcc_only_LA: non_confirmed_LA+=1
elif this_bcc_only_SMIR:non_confirmed_SMIR+=1
elif this_bcc_only_LAIR:non_confirmed_LAIR+=1
else: non_confirmed_others+=1
# FOR VINCENT: BCC MULTIPLE BLOCKS, NON VALID:
if this_bcc_valid == False and this_bcc_only_LA:
print("BCC, ",bcc, end=' ')
if confirmed: print(" confirmed ", end=' ')
else: print(" non confirmed ")
file.seek(0)
for sites in file.readlines():
print(sites, end=' ')
# compute the type of bcc that are splice site coherent
if this_bcc_valid:
if confirmed:
nb_confirmed_valid+=1
if this_bcc_only_SM: valid_confirmed_SM+=1
elif this_bcc_only_LA: valid_confirmed_LA+=1
elif this_bcc_only_SMIR: valid_confirmed_SMIR+=1
elif this_bcc_only_LAIR:valid_confirmed_LAIR+=1
else: valid_confirmed_others+=1
else:
nb_non_confirmed_valid+=1
if this_bcc_only_SM:valid_non_confirmed_SM+=1
elif this_bcc_only_LA: valid_non_confirmed_LA+=1
elif this_bcc_only_SMIR:valid_non_confirmed_SMIR+=1
elif this_bcc_only_LAIR:valid_non_confirmed_LAIR+=1
else: valid_non_confirmed_others+=1
file.close()
print("\n\n **************** RESULTS *****************")
print(seen,"BCC seen, among which ",no_mapping," are not mapped on genome and there was also ",twice_one_block,"paths having one unique block mapped both upper and lower")
print(valid_splice_sites," are coherent with respect to splice sites.")
total_confirmed=confirmed_SMIR+ confirmed_LAIR+confirmed_SM+confirmed_LA+confirmed_others
total_valid_confirmed=valid_confirmed_SMIR+ valid_confirmed_LAIR+valid_confirmed_SM+valid_confirmed_LA+valid_confirmed_others
total_non_confirmed=non_confirmed_SMIR+ non_confirmed_LAIR+non_confirmed_SM+non_confirmed_LA+non_confirmed_others
total_valid_non_confirmed=valid_non_confirmed_SMIR+ valid_non_confirmed_LAIR+valid_non_confirmed_SM+valid_non_confirmed_LA+valid_non_confirmed_others
print("\n\n******** too sum up : *********")
print("\t\t\t Total \t Slice site coherent \t percentage splice site coherent ")
print(" FP \t\t\t", no_mapping, "\t0 \t0")
print(" **** Among Confirmed ****")
print(" only [1 bloc/small] \t", confirmed_SMIR, "\t", valid_confirmed_SMIR, "\t NIL")#, "\t %.2f"%(valid_confirmed_SMIR*100/confirmed_SMIR)
print(" only [1 bloc/long] \t", confirmed_LAIR, "\t", valid_confirmed_LAIR, "\t %.2f"%(valid_confirmed_LAIR*100/float(confirmed_LAIR)))
print(" only [n blocs/small] \t", confirmed_SM, "\t", valid_confirmed_SM, "\t %.2f"%(valid_confirmed_SM*100/float(confirmed_SM)))
print(" only [n blocs/long] \t", confirmed_LA, "\t", valid_confirmed_LA, "\t %.2f"%(valid_confirmed_LA*100/float(confirmed_LA)))
print(" all others \t\t", confirmed_others, "\t", valid_confirmed_others, "\t %.2f"%(valid_confirmed_others*100/float(confirmed_others)))
print(" total confirmed \t", total_confirmed, "\t", total_valid_confirmed, "\t %.2f"%(total_valid_confirmed*100/float(total_confirmed)))
#print " validation total confirmed \t", nb_confirmed , "\t", nb_confirmed_valid
print(" **** Among Non Confirmed ****")
print(" only [1 bloc/small] \t", non_confirmed_SMIR, "\t", valid_non_confirmed_SMIR, "\t %.2f"%(valid_non_confirmed_SMIR*100/float(non_confirmed_SMIR)))
print(" only [1 bloc/long] \t", non_confirmed_LAIR, "\t", valid_non_confirmed_LAIR, "\t %.2f"%(valid_non_confirmed_LAIR*100/float(non_confirmed_LAIR)))
print(" only [n blocs/small] \t", non_confirmed_SM, "\t", valid_non_confirmed_SM, "\t %.2f"%(valid_non_confirmed_SM*100/float(non_confirmed_SM)))
print(" only [n blocs/long] \t", non_confirmed_LA, "\t", valid_non_confirmed_LA, "\t %.2f"%(valid_non_confirmed_LA*100/float(non_confirmed_LA)))
print(" all others \t\t", non_confirmed_others, "\t", valid_non_confirmed_others, "\t %.2f"%(valid_non_confirmed_others*100/float(non_confirmed_others)))
print(" total non confirmed \t", total_non_confirmed, "\t", total_valid_non_confirmed, "\t %.2f"%(total_valid_non_confirmed*100/float(total_non_confirmed)))
#print " validation total non confirmed \t", nb_non_confirmed , "\t", nb_non_confirmed_valid
print(" **** Grand total (including FP) ****\t")
print(" total \t\t\t", (total_non_confirmed+total_confirmed+no_mapping), "\t", (total_valid_non_confirmed+total_valid_confirmed+no_mapping), "\t %.2f"%(100*(total_valid_non_confirmed+total_valid_confirmed+no_mapping)/float(total_non_confirmed+total_confirmed+no_mapping)))
print("\n\n******** Splice sites repartition: *********")
print("\t\t{GT*** ***AG}\t{GC*** ***AG}\t{ATATC ****A}\t{GTATC ***AT} \t total")
print("total\t\t ",GT_AG_non_confirmed+GT_AG_confirmed ,"\t", GC_AG_non_confirmed+GC_AG_confirmed ,"\t", ATATC_A_non_confirmed+ATATC_A_confirmed ,"\t", GTATC_AT_non_confirmed+GTATC_AT_confirmed ,"\t", GT_AG_non_confirmed+GT_AG_confirmed + GC_AG_non_confirmed+GC_AG_confirmed + ATATC_A_non_confirmed+ATATC_A_confirmed + GTATC_AT_non_confirmed+GTATC_AT_confirmed)
print("Among confirmed\t\t", GT_AG_confirmed ,"\t", GC_AG_confirmed ,"\t", ATATC_A_confirmed ,"\t",GTATC_AT_confirmed ,"\t", GT_AG_confirmed + GC_AG_confirmed + ATATC_A_confirmed + GTATC_AT_confirmed)
print("Among non confirmed\t", GT_AG_non_confirmed ,"\t", GC_AG_non_confirmed ,"\t", ATATC_A_non_confirmed ,"\t",GTATC_AT_non_confirmed ,"\t", GT_AG_non_confirmed + GC_AG_non_confirmed + ATATC_A_non_confirmed + GTATC_AT_non_confirmed)
|