1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122
|
import sys
#usage:
### first create connected components from disco (-A option)
#sh from_phased_alleles_to_clusters.sh phased_alleles_read_set_id_1.txt # creates file connected_components_phased_alleles_read_set_id_1.txt
### them from the .fa file, the id of the set your interested in (e.g. 1 for phased_alleles_read_set_id_1.txt, this will correspond to C1 coverage in the fa file), the file containing the connected components, and the phased_alleles_read_set_id_X.txt file, generate the fact file
#python3 format_phased_variants_for_haplotyping.py mapping_k_31_c_auto_D_100_P_10_b_0_coherent.fa 1 connected_components_phased_alleles_read_set_id_1.txt phased_alleles_read_set_id_1.txt > phased_alles_read_set_1_facts.txt
if not len(sys.argv)==5:
print ("usage: python3 format_phased_variants_for_haplotyping.py <file coherent.fa> <id number><connected_component_file><phased_allele_file>")
print (" * coherent.fa file: the file generated by discoSnp")
print (" * id number is the id of the read set, for which variants are phased. With i, this corresponds to Ci in the .fa file headers.")
print (" * connected_component_file: file obtained from \"from_phased_alleles_to_clusters.sh phased_alleles_read_set_id_1.txt\" continaing connected component of phased alleles")
print (" * phased_alleles_read_set_id_1.txt: file generated by discoSnp (with the hidden -A option. The value 1 here shoud correspond to \"id number\"")
sys.exit(0)
coherent_fa_file = open(sys.argv[1])
set_id = sys.argv[2]
cc_file = open(sys.argv[3])
phased_alleles_file = open(sys.argv[4])
def store_abundances(coherent_fa_file,set_id):
pos_coverage_determined=False
pos_coverage=-1
coverages={}
for oline in coherent_fa_file: #>SNP_higher_path_991|P_1:30_C/G|high|nb_pol_1|C1_38|C2_0|Q1_0|Q2_0|G1_0/0:6,119,764|G2_1/1:664,104,6|rank_1
if oline[0] != '>': continue
line=oline.rstrip().split('|')
id=line[0].split('_')[-1] #here 991
id+=line[0].split('_')[1][0] #'h' or 'l'
if not pos_coverage_determined:
for pos_coverage in range(len(line)):
if line[pos_coverage][0]=='C':
value=line[pos_coverage][1:].split('_')[0]
if value==set_id:
pos_coverage_determined=True
break
if not pos_coverage_determined:
print ("Set id", set_id, "not findable in header like ", oline.rstrip())
print ("ciao")
sys.exit(0)
coverages[id]=line[pos_coverage].split('_')[1] # get the right coverage corresponding to the searche read set
return coverages
def store_cc(cc_file):
cc={}
for i,oline in enumerate (cc_file): # 852 1891 3484 2641 5758 3247
oline=oline.rstrip().split()
for idf in oline:
idf=int(idf)
if idf in cc:
print("ERROR, idf is in more than one connected component")
cc[idf]=i
return cc
def store_phased_alleles(phased_alleles_file):
phased_alleles={}
for oline in phased_alleles_file: #-1187h;1001h;2178h; => 5
oline=oline.lstrip().rstrip()
if oline[0]=='#': continue
ids = oline.split(' ')[0].split(';')[:-1]
abundance = int(oline.split(' ')[-1])
idlist=[]
for aid in ids:
# if aid[0]=='-': # remove the '-'
# aid=aid[1:]
idlist.append(aid)
# canonical representation: smallest first (removing with the strip function the eventual first '-' sign):
if int(idlist[0].strip('-')[:-1])>int(idlist[-1].strip('-')[:-1]):
idlist.reverse()
# change the ortientation = change the sign:
for i in range(len(idlist)):
if idlist[i][0]=='-': idlist[i]=idlist[i][1:]
else: idlist[i]='-'+idlist[i]
list_as_string = ""
for aid in idlist:
list_as_string+=aid+';'
# add the list to the phased_alleles or increase its count if not existing:
if list_as_string in phased_alleles:
phased_alleles[list_as_string]+=abundance
else:
phased_alleles[list_as_string]=abundance
return phased_alleles
def print_djack_formated_phased_variants(coverages,cc,phased_alleles):
for aid in coverages:
current_snp_id=int(aid[:-1])
if current_snp_id in cc:
print("snp(cc"+str(cc[current_snp_id])+","+str(current_snp_id)+","+aid[-1]+","+str(coverages[aid])+").")
for i,list_as_string in enumerate(phased_alleles):#'2686l;4324h;5375h;': 3
# get the CC:
ids=list_as_string.split(';')[:-1]
abundance = phased_alleles[list_as_string]
first_id=abs(int(ids[0][:-1]))
if first_id not in cc: continue
this_cc=cc[first_id]
for j in range(1,len(ids)):
if abs(int(ids[j][:-1])) in cc and cc[abs(int(ids[j][:-1]))] != this_cc:
print("impossible all variants from ",list_as_string, "are not in the same CC")
sys.exit(0)
for node_order,aid in enumerate(ids):
print("fact(cc"+str(this_cc)+","+str(i)+","+str(node_order+1)+","+aid[:-1]+","+aid[-1]+").")
print("count("+str(i)+","+str(abundance)+").")
coverages=store_abundances(coherent_fa_file,set_id)
cc=store_cc(cc_file)
phased_alleles=store_phased_alleles(phased_alleles_file)
print_djack_formated_phased_variants(coverages,cc,phased_alleles)
|