File: filter_out_using_MAF.py

package info (click to toggle)
discosnp 1%3A2.6.2-5
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,656 kB
  • sloc: python: 5,893; sh: 2,966; cpp: 2,692; makefile: 14
file content (61 lines) | stat: -rw-r--r-- 2,239 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import sys
import gzip

if len(sys.argv)<3:
    print("This tool filters out discoSnp prediction having a minor allele frequency lower than a provided threshold for ALL datasets.")
    print("python3 filter_out_using_MAF.py \".fa from discoSnp\" \"MAF threshold\"")
    sys.exit()


if "gz" in sys.argv[1]:
    coherent_file=gzip.open(sys.argv[1],"r")
else: 
    coherent_file=open(sys.argv[1],"r")
maf_threshold = float(sys.argv[2])
#maf_threshold_low = float(sys.argv[3])
first_coverage_field_id=0
while True:
    #>SNP_higher_path_1384|P_1:30_A/G|high|nb_pol_1|left_unitig_length_108|right_unitig_length_1156|C1_0|C2_0|C3_0|C4_0|C5_183|Q1_0|Q2_0|Q3_0|Q4_0|Q5_70|G1_1/1:19724,2972,47|G2_1/1:21024,3168,50|G3_1/1:17124,2581,42|G4_1/1:19564,2948,47|G5_0/1:16063,1163,1575|rank_0.36839
    
    comment1 = coherent_file.readline()
    if not comment1: break
    splitted_comment1 = comment1.split("|")
    path1 = coherent_file.readline()
    comment2 = coherent_file.readline()
    splitted_comment2 = comment2.split("|")
    path2 = coherent_file.readline()

    coverage_high=[]
    coverage_low= []    
    if first_coverage_field_id==0:
        while True:
            if splitted_comment1[first_coverage_field_id][0]=="C": break
            first_coverage_field_id+=1
    
    i=first_coverage_field_id
    while True:
        if splitted_comment1[i][0]!="C": break # no more a coverage
        coverage_high.append(int(splitted_comment1[i].split("_")[1]))
        coverage_low.append( int(splitted_comment2[i].split("_")[1]))
        i+=1
    # print comment1,
    #print coverage_high
    #print coverage_low
    to_output=False
    for i in range(len(coverage_high)):
        if coverage_high[i]==0 and coverage_low[i]==0: continue
        
        # print (min(coverage_high[i],coverage_low[i]) / float(max(coverage_high[i],coverage_low[i])))
        if (min(coverage_high[i],coverage_low[i]) / float(int(coverage_high[i])+int(coverage_low[i]))) >= maf_threshold:
            #print (min(coverage_high[i],coverage_low[i]) / float(int(coverage_high[i])+int(coverage_low[i])))
            to_output=True
            break
    
    if to_output:
        print((comment1,path1,comment2,path2,))