1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248
|
import sys
import K3000_common as kc
"""
Given an input phased variants, select and output only "working zones"
A working zone is defined by
1/ Coverage XXX todo
2/ Topology: conserve only parts defined with X patterns.
A node is linked to at most two other nodes.
If a node n1 is linked to two nodes n2 and n3, then n2 and n3 have at most two sons, n4 and n5
"""
def get_reverse_fact(x):
''' reverse of a fact x. Example x = "4l_0;2h_3;-6h_21", reverse(x) = "6h_0;-2h_21;-4l_3"'''
res =""
x=x.strip(";").split(";")
for i in range (len(x)):
original_id = x[-i-1]
if original_id[0]=="-": reversed_id = original_id[1:]
else: reversed_id = "-"+original_id
if i==0:
value=kc.string_allele_value(reversed_id)+"_0" # With i=0, add '_0' to the value
else:
value=kc.string_allele_value(reversed_id)+"_"+kc.distance_string_value(x[-i]) # With i=1, add '_21' to the value
res+=value+";"
return res
def store_nodes(fact_file_name):
"""
Store nodes with their min, max and mean coverages
"""
nodes = {} # key= (int) node id, value variant_id
with open(fact_file_name) as fact_file:
for line in fact_file.readlines():
if not line: break
line=line.strip()
if line[0]=="#": continue
" -1145l_0;4780h_-60;-248h_-63;-7035l_-60;4077l_-59;-6265l_-60;-2978h_-69;4958h_-59;-4221h_-60;-1376l_-60;-4664l_-59;-2207h_-60;-6257l_-60;-5529l_-59;-5106h_-59;1310h_-59;-4514h_-59;3933h_-60;-5727h_-60;2325l_-60;-3976l_-59;-3363h_-60;-716l_-59;-3526h_-51;-5819h_-57;2927l_-59;-6146l_-60;-4374h_-53; => 1"
coverage = int(line.split()[-1])
for fact_id in range(len(line.split())-2): # In case of pairend facts, deal with the two facts (in this case len((line.split()) is 4
original_fact = line.split()[fact_id]
for fact in original_fact, get_reverse_fact(original_fact):
# print ("fact", fact)
for variant in fact.strip(";").split(";"):
# print(variant, fact)
signed_variant_id=variant.split("_")[0]
if signed_variant_id not in nodes: nodes[signed_variant_id] = 0
assert len(signed_variant_id)>0,line+"\n"+str(line.split()[fact_id])
nodes[signed_variant_id] += coverage
return nodes
def f(tuple, el):
return tuple[0]==el
def isin(list, el):
for tuple in list:
if f(tuple, el): return True
return False
def exists_edge(edges, rev_edges, node_id_1, node_id_2):
for current_edges in edges, rev_edges:
if node_id_1 in current_edges:
if isin(current_edges[node_id_1], node_id_2): return True
if node_id_2 in current_edges:
if isin(current_edges[node_id_2], node_id_1): return True
return False
def store_ordered_edges(fact_file_name):
"""
Store edges
Conserve only the closest edges. In case of equalities of minimal distance edges, conserve the closest ones.
"""
edges = {} # key = (signed int) node id, value = set of (signed int) target node ids with their distance (signed int). This distance may be negative as it reflects the position of the second variant wrt the end of the bubble sequence of the first.
# Hence a key is a couple (signed_int, signed int).
rev_edges = {} # key = (signed int) node id, value = set of (signed int) target node ids (idem)
with open(fact_file_name) as fact_file:
for line in fact_file.readlines():
if not line: break
line=line.strip()
if line[0]=="#": continue
# print(line)
# print(len(line.split()))
for fact_id in range(len(line.split())-2): # In case of pairend facts, deal with the two facts (in this case len((line.split()) is 4
original_fact = line.split()[fact_id]
for used_fact in original_fact, get_reverse_fact(original_fact):
comapped_variants = used_fact.strip(";").split(";")
# print(comapped_variants)
for i in range(len(comapped_variants)-1):
from_node_id = comapped_variants[i].split("_")[0]
to_node_id = comapped_variants[i+1].split("_")[0]
distance = int(comapped_variants[i+1].split("_")[1])
# add edge:
if from_node_id not in edges: edges[from_node_id] = set()
edges[from_node_id].add((to_node_id, distance))
if to_node_id not in rev_edges: rev_edges[to_node_id] = set()
rev_edges[to_node_id].add((from_node_id, distance))
# print(edges)
## Sort each list wrt distance:
for current_edges in edges, rev_edges:
for from_variant_id in current_edges:
current_edges[from_variant_id] = sorted(current_edges[from_variant_id], key=lambda x : x[1])
sys.stderr.write(f"sorted edges {edges}\n")
## remove transitive edges:
nb_removed = 0
nb_total = 0
new_edges = [{},{}]
i=-1
for current_edges in edges, rev_edges:
i+=1
# new_edges[i] = {}
for from_variant_id in current_edges:
# if an ordered list contains more than 2 children (A->B and A->C for instance), remove A->C if B->C exists somewhere else
updated_list = set()
updated_list.add(current_edges[from_variant_id][0]) # the first one stays, whatever.
for id_in_current_edges in range(len(current_edges[from_variant_id])-1):
nb_total+=1
cur_var = current_edges[from_variant_id][id_in_current_edges]
next_var = current_edges[from_variant_id][id_in_current_edges+1]
if cur_var[1]==next_var[1]: # same distance to start, we keep it
updated_list.add(current_edges[from_variant_id][id_in_current_edges+1])
continue
if not exists_edge(edges, rev_edges, cur_var[0], next_var[0]): # this is a not a transitive edge, we keep it
updated_list.add(current_edges[from_variant_id][id_in_current_edges+1])
else:
sys.stderr.write(f"removed {from_variant_id} -> {current_edges[from_variant_id][id_in_current_edges+1]}\n")
nb_removed+=1
updated_list = sorted(updated_list, key=lambda x : x[1])
# sys.stderr.write(f"avant {from_variant_id} -> {current_edges[from_variant_id]}\n")
new_edges[i][from_variant_id]=updated_list
# sys.stderr.write(f"apres {from_variant_id} -> {new_edges[i][from_variant_id]}\n")
sys.stderr.write(f"Removed {nb_removed} among {nb_total}\n")
edges = new_edges[0]
rev_edges = new_edges[1]
## remove distances:
for current_edges in edges, rev_edges:
for from_variant_id in current_edges:
id_only_list = set()
# print("HO avant", current_edges[from_variant_id])
for to_variant_id, distance in current_edges[from_variant_id]:
id_only_list.add(to_variant_id)
current_edges[from_variant_id] = id_only_list
# print("HO apres", current_edges[from_variant_id])
## filter edges.
# for current_edges in edges, rev_edges:
# for from_variant_id in current_edges:
# # print(from_variant_id, current_edges[from_variant_id])
# ## determine min dist
# min_dist= sys.maxsize
# for to_variant_id, distance in current_edges[from_variant_id]:
# print (from_variant_id, to_variant_id, distance)
# if distance < min_dist: min_dist = distance
# print()
# ## conserve only edges equal to min dist:
# to_variants = set()
# for to_variant_id, distance in current_edges[from_variant_id]:
# if distance == min_dist:
# to_variants.add(to_variant_id)
# current_edges[from_variant_id] = to_variants
# # print(from_variant_id, current_edges[from_variant_id])
return edges,rev_edges
def store_edges(fact_file_name):
"""
Store edges
"""
edges = {} # key = (signed int) node id, value = set of (signed int) target node ids.
rev_edges = {} # key = (signed int) node id, value = set of (signed int) target node ids.
with open(fact_file_name) as fact_file:
for line in fact_file.readlines():
if not line: break
line=line.strip()
if line[0]=="#": continue
# print(line)
# print(len(line.split()))
for fact_id in range(len(line.split())-2): # In case of pairend facts, deal with the two facts (in this case len((line.split()) is 4
comapped_variants = line.split()[fact_id].strip(";").split(";")
for i in range(len(comapped_variants)-1):
from_node_id = comapped_variants[i].split("_")[0]
to_node_id = comapped_variants[i+1].split("_")[0]
# add edge:
if from_node_id not in edges: edges[from_node_id] = set()
edges[from_node_id].add(to_node_id)
if to_node_id not in rev_edges: rev_edges[to_node_id] = set()
rev_edges[to_node_id].add(from_node_id)
return edges,rev_edges
def remove_nodes_with_high_io_degree(edges, rev_edges, nodes, max_io=2):
"""
remove any node that has two or more childre and remove all the corresponding children
"""
for current_edges in edges, rev_edges:
for from_variant_id in current_edges:
if len(current_edges[from_variant_id]) > max_io:
if from_variant_id in nodes: del nodes[from_variant_id]
for to_variant_id in current_edges[from_variant_id]:
if to_variant_id in nodes: del nodes[to_variant_id]
def print_as_gfa(nodes, edges):
# cov_threshold=10
# index nodes:
node_id_to_variant_id = {} # key = node_id (1,2, ..., n) value = variant_id (eg. -14h)
variant_id_to_node_id = {} # key = variant_id (eg. -14h) value = node_id (eg. 2)
node_id=1
for node in nodes:
node_id_to_variant_id[node_id] = node
variant_id_to_node_id[node] = node_id
node_id+=1
for node_id in node_id_to_variant_id:
# if nodes[node_id_to_variant_id[node_id]] < cov_threshold: continue
print("S\t"+str(node_id)+"\t"+str(node_id_to_variant_id[node_id])+"\tRC:i:"+str(nodes[node_id_to_variant_id[node_id]]))
for from_variant_id in edges:
if from_variant_id not in nodes: continue # had been removed
# if nodes[from_variant_id] < cov_threshold: continue # too low cove
for to_variant_id in edges[from_variant_id]:
if to_variant_id not in nodes: continue # had been removed
# if nodes[to_variant_id] < cov_threshold: continue # too low cove
print("L\t"+str(variant_id_to_node_id[from_variant_id])+"\t+\t"+str(variant_id_to_node_id[to_variant_id])+"\t+\t0M")
def main():
nodes = store_nodes(sys.argv[1])
# print (nodes)
edges,rev_edges = store_ordered_edges(sys.argv[1])
remove_nodes_with_high_io_degree(edges, rev_edges, nodes, 2000)
print_as_gfa (nodes, edges)
if __name__ == '__main__':
main()
|