File: parse_result_file.py

package info (click to toggle)
kissplice 2.6.7-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 16,752 kB
  • sloc: cpp: 8,783; python: 1,618; perl: 389; sh: 72; makefile: 18
file content (117 lines) | stat: -rw-r--r-- 6,582 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import os 

#cat results_with_validation.txt | tr "," " " > results_with_validation_no_coma.txt


################## FILE results_with_validation.txt  ###########################
#
# results/bcc_11983_sequences_25_unsorted.fa.toblat.fasta : a result for one BCC
# chr9 97843039 97848985 3 145 97843039,97844856,97848963, 23,145,22, 97843039,97848963, 23,22, 0 informations about mapped blocks. several such line may exist for a unique BCC
# chr  start stop nb_block length_longer-length_shorter start_longer_bloc1 start_longer_block_2 start_longer_block3 ... length_longer_bloc1  length_longer_bloc2  length_longer_bloc3 -- and the same for shorter blocks
# Confirmed Exon Skipping: comment present or not, confirming existing annotations


# This program enables to for each mapped block from longer paths, the left and right dimer present in the sequences to test if they are splice sites

filin = open('results_with_validation_no_coma.txt', 'r')
#filin = open('test.fa', 'r')
bcc=-1
confirmed=0
id=0

for line in filin.readlines():
#    print line
    if line.startswith("results"): # a new result
        if confirmed==1 and bcc != -1: # we move the previous result as confirmed
            os.system("mv bcc_"+bcc+" bcc_"+bcc+"_confirmed")

        bcc=line.split("_")[1] # get the current bcc number
        id+=1
        id_for_a_bcc=0 # several possible mapping positions
        confirmed=0
    if line.startswith("Confirmed") or line.startswith("Intron"):
        confirmed=1
    if line.startswith("chr"):                            # a new mapping
        array_line=line.split(" ")
        number_blocks=int(array_line[3])
        diff_size=int(array_line[4])
  #      print array_line
        
        # SMALL DIFFERENCE 
        if diff_size<= 5:
            os.system("echo -e \"SM\c\" >> bcc_"+bcc)
        else:
            os.system("echo -e \"LA\c\" >> bcc_"+bcc)

        if number_blocks==1:
            # This is a special case: only one block in the upper path: its an intron retention in the lower. In this case we grab the splice sites from the lower case.
            # we have to determine the number of blocks in the shorter path in case of multiple intron retention:
            # with only two block in the lower path on has:
            # ['chr3', '75263683', '75263792', '1', '66', '75263683', '', '109', '', '75263683', '75263772', '', '23', '20', '', '0\n']
            # this has 16 entries.
            # with more blocks in the lower path on has:
            # ['chr2', '133018839', '133019035', '1', '148', '133018839', '', '196', '', '133018839', '133018981', '133019016', '', '24', '5', '19', '', '0\n']
            # this has 18 entries.
            # number of blocks = (number-12)/2
            number_lower_blocks= (len(array_line)-12)/2
            os.system("echo -e \"IR \c\" >> bcc_"+bcc) # IR = INTRON RETENTION (Guess)
            for i in range(number_lower_blocks):
                start=int(array_line[9+i])                    # starting position of the considered block
                stop=start+int(array_line[10+number_lower_blocks+i]) # ending position of the considered block
                if i>0: # avoid the first left characters that are meaningless
                    os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(start-5)+"-"+str(start)+" afac"); # get the fivemer on the left
                    os.system("grep -v \">\" afac | tr \"\n\" \" \">> bcc_"+bcc) # add it to the result file
                if i != number_lower_blocks-1: # Avoid the last characters
                    os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(stop)+"-"+str(stop+5)+" afac"); # get the fivemer on the right
                    os.system("grep -v \">\" afac | tr \"\n\" \" \">> bcc_"+bcc) # add it to the result file
            
            
            

        # "BIG" DIFFERENCE AND MORE THAN ONE BLOCK
        else: 
            os.system("echo -e \" \c\" >> bcc_"+bcc) # A SPACE
            for i in range(number_blocks):                    #for each block
                start=int(array_line[5+i])                    # starting position of the considered block
                stop=start+int(array_line[6+number_blocks+i]) # ending position of the considered block
                if i>0: # avoid the first left characters that are meaningless
                    os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(start-5)+"-"+str(start)+" afac"); # get the fivemer on the left
                    os.system("grep -v \">\" afac | tr \"\n\" \" \">> bcc_"+bcc) # add it to the result file
                if i != number_blocks-1: # Avoid the last characters
                    os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(stop)+"-"+str(stop+5)+" afac"); # get the fivemer on the right
                    os.system("grep -v \">\" afac | tr \"\n\" \" \">> bcc_"+bcc) # add it to the result file
        os.system("echo "" >> bcc_"+bcc) # end line
filin.close()


###### VERSION AVEC DEUX FICHIERS left et right par BCC ########

# for line in filin.readlines():
# #    print line
#     if line.startswith("results"): # a new result
#         if confirmed==1 and bcc != -1: # we move the previous result as confirmed
#             os.system("mv bcc_"+bcc+"_left bcc_"+bcc+"_left_confirmed")
#             os.system("mv bcc_"+bcc+"_right bcc_"+bcc+"_right_confirmed")

#         bcc=line.split("_")[1] # get the current bcc number
#         id+=1
#         id_for_a_bcc=0 # several possible mapping positions
#         confirmed=0
#     if line.startswith("Confirmed") or line.startswith("Intron"):
#         confirmed=1
#     if line.startswith("chr"):
#         array_line=line.split(" ")
#         number_blocks=int(array_line[3])
#         for i in range(number_blocks): #for each block
#             start=int(array_line[5+i]) # starting position of the considered block
#             stop=start+int(array_line[6+number_blocks+i]) # ending position of the considered block
#             os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(start-2)+"-"+str(start)+" afac"); # get the dimer on the left
#             if i>0: # avoid the first left character that are meaningless
#                 os.system("grep -v \">\" afac >> bcc_"+bcc+"_left") # add it to the left result file
#             os.system("twoBitToFa hg19.2bit:"+array_line[0]+":"+str(stop)+"-"+str(stop+2)+" afac"); # get the dimer on the right
#             if i != number_blocks-1:
#                 os.system("grep -v \">\" afac >> bcc_"+bcc+"_right") # add it to the left result file
# filin.close()