1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
|
import os
#cat results_with_validation.txt | tr "," " " > results_with_validation_no_coma.txt
################## FILE results_with_validation.txt ###########################
#
# results/bcc_11983_sequences_25_unsorted.fa.toblat.fasta : a result for one BCC
# chr9 97843039 97848985 3 145 97843039,97844856,97848963, 23,145,22, 97843039,97848963, 23,22, 0 informations about mapped blocks. several such line may exist for a unique BCC
# chr start stop nb_block length_longer-length_shorter start_longer_bloc1 start_longer_block_2 start_longer_block3 ... length_longer_bloc1 length_longer_bloc2 length_longer_bloc3 -- and the same for shorter blocks
# Confirmed Exon Skipping: comment present or not, confirming existing annotations
# This program enables to for each mapped block from longer paths, the left and right dimer present in the sequences to test if they are splice sites
filin = open('results_with_validation_no_coma.txt', 'r')
#filin = open('test.fa', 'r')
bcc=-1
confirmed=0
id=0
for line in filin.readlines():
# print line
if line.startswith("results"): # a new result
if confirmed==1 and bcc != -1: # we move the previous result as confirmed
os.system("mv bcc_"+bcc+" bcc_"+bcc+"_confirmed")
bcc=line.split("_")[1] # get the current bcc number
id+=1
id_for_a_bcc=0 # several possible mapping positions
confirmed=0
if line.startswith("Confirmed") or line.startswith("Intron"):
confirmed=1
if line.startswith("chr"): # a new mapping
array_line=line.split(" ")
number_blocks=int(array_line[3])
diff_size=int(array_line[4])
# print array_line
# SMALL DIFFERENCE
if diff_size<= 5:
os.system("echo -e \"SM\c\" >> bcc_"+bcc)
else:
os.system("echo -e \"LA\c\" >> bcc_"+bcc)
if number_blocks==1:
# This is a special case: only one block in the upper path: its an intron retention in the lower. In this case we grab the splice sites from the lower case.
# we have to determine the number of blocks in the shorter path in case of multiple intron retention:
# with only two block in the lower path on has:
# ['chr3', '75263683', '75263792', '1', '66', '75263683', '', '109', '', '75263683', '75263772', '', '23', '20', '', '0\n']
# this has 16 entries.
# with more blocks in the lower path on has:
# ['chr2', '133018839', '133019035', '1', '148', '133018839', '', '196', '', '133018839', '133018981', '133019016', '', '24', '5', '19', '', '0\n']
# this has 18 entries.
# number of blocks = (number-12)/2
number_lower_blocks= (len(array_line)-12)/2
os.system("echo -e \"IR \c\" >> bcc_"+bcc) # IR = INTRON RETENTION (Guess)
for i in range(number_lower_blocks):
start=int(array_line[9+i]) # starting position of the considered block
stop=start+int(array_line[10+number_lower_blocks+i]) # ending position of the considered block
if i>0: # avoid the first left characters that are meaningless
os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(start-5)+"-"+str(start)+" afac"); # get the fivemer on the left
os.system("grep -v \">\" afac | tr \"\n\" \" \">> bcc_"+bcc) # add it to the result file
if i != number_lower_blocks-1: # Avoid the last characters
os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(stop)+"-"+str(stop+5)+" afac"); # get the fivemer on the right
os.system("grep -v \">\" afac | tr \"\n\" \" \">> bcc_"+bcc) # add it to the result file
# "BIG" DIFFERENCE AND MORE THAN ONE BLOCK
else:
os.system("echo -e \" \c\" >> bcc_"+bcc) # A SPACE
for i in range(number_blocks): #for each block
start=int(array_line[5+i]) # starting position of the considered block
stop=start+int(array_line[6+number_blocks+i]) # ending position of the considered block
if i>0: # avoid the first left characters that are meaningless
os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(start-5)+"-"+str(start)+" afac"); # get the fivemer on the left
os.system("grep -v \">\" afac | tr \"\n\" \" \">> bcc_"+bcc) # add it to the result file
if i != number_blocks-1: # Avoid the last characters
os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(stop)+"-"+str(stop+5)+" afac"); # get the fivemer on the right
os.system("grep -v \">\" afac | tr \"\n\" \" \">> bcc_"+bcc) # add it to the result file
os.system("echo "" >> bcc_"+bcc) # end line
filin.close()
###### VERSION AVEC DEUX FICHIERS left et right par BCC ########
# for line in filin.readlines():
# # print line
# if line.startswith("results"): # a new result
# if confirmed==1 and bcc != -1: # we move the previous result as confirmed
# os.system("mv bcc_"+bcc+"_left bcc_"+bcc+"_left_confirmed")
# os.system("mv bcc_"+bcc+"_right bcc_"+bcc+"_right_confirmed")
# bcc=line.split("_")[1] # get the current bcc number
# id+=1
# id_for_a_bcc=0 # several possible mapping positions
# confirmed=0
# if line.startswith("Confirmed") or line.startswith("Intron"):
# confirmed=1
# if line.startswith("chr"):
# array_line=line.split(" ")
# number_blocks=int(array_line[3])
# for i in range(number_blocks): #for each block
# start=int(array_line[5+i]) # starting position of the considered block
# stop=start+int(array_line[6+number_blocks+i]) # ending position of the considered block
# os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(start-2)+"-"+str(start)+" afac"); # get the dimer on the left
# if i>0: # avoid the first left character that are meaningless
# os.system("grep -v \">\" afac >> bcc_"+bcc+"_left") # add it to the left result file
# os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(stop)+"-"+str(stop+2)+" afac"); # get the dimer on the right
# if i != number_blocks-1:
# os.system("grep -v \">\" afac >> bcc_"+bcc+"_right") # add it to the left result file
# filin.close()
|