1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61
|
import sys
import gzip
if len(sys.argv)<3:
print("This tool filters out discoSnp prediction having a minor allele frequency lower than a provided threshold for ALL datasets.")
print("python3 filter_out_using_MAF.py \".fa from discoSnp\" \"MAF threshold\"")
sys.exit()
if "gz" in sys.argv[1]:
coherent_file=gzip.open(sys.argv[1],"r")
else:
coherent_file=open(sys.argv[1],"r")
maf_threshold = float(sys.argv[2])
#maf_threshold_low = float(sys.argv[3])
first_coverage_field_id=0
while True:
#>SNP_higher_path_1384|P_1:30_A/G|high|nb_pol_1|left_unitig_length_108|right_unitig_length_1156|C1_0|C2_0|C3_0|C4_0|C5_183|Q1_0|Q2_0|Q3_0|Q4_0|Q5_70|G1_1/1:19724,2972,47|G2_1/1:21024,3168,50|G3_1/1:17124,2581,42|G4_1/1:19564,2948,47|G5_0/1:16063,1163,1575|rank_0.36839
comment1 = coherent_file.readline()
if not comment1: break
splitted_comment1 = comment1.split("|")
path1 = coherent_file.readline()
comment2 = coherent_file.readline()
splitted_comment2 = comment2.split("|")
path2 = coherent_file.readline()
coverage_high=[]
coverage_low= []
if first_coverage_field_id==0:
while True:
if splitted_comment1[first_coverage_field_id][0]=="C": break
first_coverage_field_id+=1
i=first_coverage_field_id
while True:
if splitted_comment1[i][0]!="C": break # no more a coverage
coverage_high.append(int(splitted_comment1[i].split("_")[1]))
coverage_low.append( int(splitted_comment2[i].split("_")[1]))
i+=1
# print comment1,
#print coverage_high
#print coverage_low
to_output=False
for i in range(len(coverage_high)):
if coverage_high[i]==0 and coverage_low[i]==0: continue
# print (min(coverage_high[i],coverage_low[i]) / float(max(coverage_high[i],coverage_low[i])))
if (min(coverage_high[i],coverage_low[i]) / float(int(coverage_high[i])+int(coverage_low[i]))) >= maf_threshold:
#print (min(coverage_high[i],coverage_low[i]) / float(int(coverage_high[i])+int(coverage_low[i])))
to_output=True
break
if to_output:
print((comment1,path1,comment2,path2,))
|