1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340
|
import sys
import K3000_common as kc
"""
Given an input phased variants, select and output only "working zones"
A working zone is defined by
1/ Coverage XXX todo
2/ Topology: conserve only parts defined with X patterns.
A node is linked to at most two other nodes.
If a node n1 is linked to two nodes n2 and n3, then n2 and n3 have at most two sons, n4 and n5
"""
def get_reverse_fact(x):
''' reverse of a fact x. Example x = "4l_0;2h_3;-6h_21", reverse(x) = "6h_0;-2h_21;-4l_3"'''
res = ""
x = x.strip(";").split(";")
for i in range(len(x)):
original_id = x[-i-1]
if original_id[0] == "-":
reversed_id = original_id[1:]
else:
reversed_id = "-"+original_id
if i == 0:
value=kc.string_allele_value(reversed_id)+"_0" # With i=0, add '_0' to the value
else:
value=kc.string_allele_value(reversed_id)+"_"+kc.distance_string_value(x[-i]) # With i=1, add '_21' to the value
res+=value+";"
return res
def store_nodes(fact_file_name):
"""
Store nodes with their min, max and mean coverages
"""
nodes = {} # key= (int) node id, value variant_id
with open(fact_file_name) as fact_file:
for line in fact_file.readlines():
if not line: break
line=line.strip()
if line[0]=="#": continue
" -1145l_0;4780h_-60;-248h_-63;-7035l_-60;4077l_-59;-6265l_-60;-2978h_-69;4958h_-59;-4221h_-60;-1376l_-60;-4664l_-59;-2207h_-60;-6257l_-60;-5529l_-59;-5106h_-59;1310h_-59;-4514h_-59;3933h_-60;-5727h_-60;2325l_-60;-3976l_-59;-3363h_-60;-716l_-59;-3526h_-51;-5819h_-57;2927l_-59;-6146l_-60;-4374h_-53; => 1"
coverage = int(line.split()[-1])
for fact_id in range(len(line.split())-2): # In case of pairend facts, deal with the two facts (in this case len((line.split()) is 4
original_fact = line.split()[fact_id]
for fact in original_fact, get_reverse_fact(original_fact):
# print ("fact", fact)
for variant in fact.strip(";").split(";"):
# print(variant, fact)
signed_variant_id=variant.split("_")[0]
if signed_variant_id not in nodes: nodes[signed_variant_id] = 0
assert len(signed_variant_id)>0,line+"\n"+str(line.split()[fact_id])
nodes[signed_variant_id] += coverage
return nodes
def f(tuple, el):
return tuple[0]==el
def isin(list, el):
for tuple in list:
if f(tuple, el): return True
return False
def exists_edge(edges, node_id_1, node_id_2):
if node_id_1 in edges:
if isin(edges[node_id_1], node_id_2): return True
if node_id_2 in edges:
if isin(edges[node_id_2], node_id_1): return True
return False
def store_ordered_edges(fact_file_name, nodes):
"""
Store edges
"""
edges = {} # key = (signed int) node id, value = set of (signed int) target node ids with their distance (signed int). This distance may be negative as it reflects the position of the second variant wrt the end of the bubble sequence of the first.
# Hence a key is a couple (signed_int, signed int).
# rev_edges = {} # key = (signed int) node id, value = set of (signed int) target node ids (idem)
with open(fact_file_name) as fact_file:
for line in fact_file.readlines():
if not line: break
line=line.strip()
if line[0]=="#": continue
# print(line)
# print(len(line.split()))
for fact_id in range(len(line.split())-2): # In case of pairend facts, deal with the two facts (in this case len((line.split()) is 4
original_fact = line.split()[fact_id]
for used_fact in original_fact, get_reverse_fact(original_fact):
# sys.stderr.write(f"used fact {used_fact}\n")
comapped_variants = used_fact.strip(";").split(";")
# print(comapped_variants)
for i in range(len(comapped_variants)-1):
from_node_id = comapped_variants[i].split("_")[0]
to_node_id = comapped_variants[i+1].split("_")[0]
distance = int(comapped_variants[i+1].split("_")[1])
# add edge:
if from_node_id not in edges: edges[from_node_id] = set()
edges[from_node_id].add((to_node_id, distance))
# if to_node_id not in rev_edges: rev_edges[to_node_id] = set()
# rev_edges[to_node_id].add((from_node_id, distance))
# sys.stderr.write(f"{edges}\n")
## Sort each list wrt distance:
for from_variant_id in edges:
edges[from_variant_id] = sorted(edges[from_variant_id], key=lambda x : x[1])
return edges
def remove_transitive_redundant(edges):
nb_removed = 0
nb_total = 0
nb_removed = 0
nb_total = 0
# new_edges = {}
for from_variant_id in edges:
# if an ordered list contains more than 2 children (A->B and A->C for instance), remove A->C if B->C exists somewhere else
updated_list = set()
updated_list.add(edges[from_variant_id][0]) # the first one stays, whatever.
for id_in_edges in range(len(edges[from_variant_id])-1):
nb_total+=1
cur_var = edges[from_variant_id][id_in_edges]
next_var = edges[from_variant_id][id_in_edges+1]
if cur_var[1]==next_var[1]: # same distance to start, we keep it
updated_list.add(edges[from_variant_id][id_in_edges+1])
continue
if not exists_edge(edges, cur_var[0], next_var[0]): # this is a not a transitive edge, we keep it
updated_list.add(edges[from_variant_id][id_in_edges+1])
else:
# sys.stderr.write(f"removed {from_variant_id} -> {edges[from_variant_id][id_in_edges+1]}\n")
nb_removed+=1
updated_list = sorted(updated_list, key=lambda x : x[1])
# sys.stderr.write(f"avant {from_variant_id} -> {edges[from_variant_id]}\n")
edges[from_variant_id]=updated_list
# sys.stderr.write(f"apres {from_variant_id} -> {new_edges[i][from_variant_id]}\n")
sys.stderr.write(f"Removed {nb_removed} transitive redundant edges among {nb_total}\n")
return nb_removed
# edges = new_edges
def keep_only_consecutive_edges(edges):
nb_total = 0
nb_removed = 0
for from_variant_id in edges:
# if an ordered list contains more than 2 children (A->B and A->C for instance), conserve the closest one
updated_list = set()
updated_list.add(edges[from_variant_id][0]) # the first one stays, whatever.
next_id = 0
while True: # add all soon with the same distance
next_id += 1
if next_id == len(edges[from_variant_id]): break
if edges[from_variant_id][next_id][1] == edges[from_variant_id][0][1]:
updated_list.add(edges[from_variant_id][next_id])
else: break
nb_total += len(edges[from_variant_id])
nb_removed += len(edges[from_variant_id])-next_id
updated_list = sorted(updated_list, key=lambda x : x[1])
# sys.stderr.write(f"avant {from_variant_id} -> {edges[from_variant_id]}\n")
edges[from_variant_id]=updated_list
# sys.stderr.write(f"apres {from_variant_id} -> {new_edges[i][from_variant_id]}\n")
sys.stderr.write(f"Removed {nb_removed} among {nb_total}\n")
def remove_distances(edges):
## remove distances:
for from_variant_id in edges:
id_only_list = set()
for to_variant_id, distance in edges[from_variant_id]:
id_only_list.add(to_variant_id)
edges[from_variant_id] = id_only_list
def remove_tips(edges, nodes):
iodegree = {} # key = signed variant id, value = (At least one incoming edge, at least one ougoing edge)
## Stores for all variants, if they have at least one incoming edge and at least one ougoing edge
for source_id, target_ids in edges.items():
if source_id not in nodes: continue # were removed
if source_id not in iodegree: iodegree[source_id] = [False,True]
else: iodegree[source_id][1] = True
for target_id in target_ids:
if target_id not in nodes: continue # were removed
if target_id not in iodegree: iodegree[target_id] = [True,False]
else: iodegree[target_id][0] = True
nb_tips_removed = 0
to_remove = set()
for node in nodes:
if node not in iodegree:
to_remove.add(node)
nb_tips_removed+=1
continue
if iodegree[node] != [True, True]:
to_remove.add(node)
nb_tips_removed+=1
continue
for node_to_remove in to_remove:
del nodes[node_to_remove]
sys.stderr.write(f"Removed {nb_tips_removed} tips\n")
return nb_tips_removed
# sys.exit()
def working_zones(edges, nodes):
"""
if a node has 3 or more children or 3 or more ancestors, remove all of them
"""
removed=0
for from_variant_id in edges:
if len(edges[from_variant_id]) > 2:
for to_variant_id in edges[from_variant_id]:
if to_variant_id in nodes:
removed+=1
del nodes[to_variant_id]
sys.stderr.write(f"Working zones, children removed {removed} non bi-allelic nodes\n")
### ancestors. We first have to store them
input_edges = {} # key = son, value = list of ancestors
for source_id, target_ids in edges.items():
if source_id not in nodes : continue
for target_id in target_ids:
if target_id not in nodes: continue
if target_id not in input_edges: input_edges[target_id] = set()
input_edges[target_id].add(source_id)
removed=0
for from_variant_id in input_edges:
if len(input_edges[from_variant_id]) > 2:
for to_variant_id in input_edges[from_variant_id]:
if to_variant_id in nodes:
removed+=1
del nodes[to_variant_id]
sys.stderr.write(f"Working zones, ancestor removed {removed} non bi-allelic nodes\n")
def existing_fact(fact, nodes, edges):
"""
Given a fact as:
-1145l_0;4780h_-60;-248h_-63;-7035l_-60;4077l_-59;-6265l_-60;-2978h_-69;4958h_-59;-4221h_-60;-1376l_-60;
returns True if
1/ all nodes exist in the gfa
2/ all edges exist in the gfa
else return False
"""
s_fact = fact.strip(";").split(";")
### Nodes
for shift_node in s_fact:
if shift_node.split('_')[0] not in nodes:
# sys.stderr.write(f"{shift_node.split('_')[0]} not in nodes\n")
return False
### Edges
for i in range(len(s_fact)-1):
source = s_fact[i].split('_')[0]
target = s_fact[i+1].split('_')[0]
if source in edges and target in edges[source]: continue
if target in edges and source in edges[target]: continue
return False
return True
def print_as_facts(nodes, edges, fact_file_name):
"""
Store nodes with their min, max and mean coverages
"""
with open(fact_file_name) as fact_file:
for line in fact_file.readlines():
if not line: break
line=line.strip()
if line[0]=="#": continue
" -1145l_0;4780h_-60;-248h_-63;-7035l_-60;4077l_-59;-6265l_-60;-2978h_-69;4958h_-59;-4221h_-60;-1376l_-60;-4664l_-59;-2207h_-60;-6257l_-60;-5529l_-59;-5106h_-59;1310h_-59;-4514h_-59;3933h_-60;-5727h_-60;2325l_-60;-3976l_-59;-3363h_-60;-716l_-59;-3526h_-51;-5819h_-57;2927l_-59;-6146l_-60;-4374h_-53; => 1"
coverage = int(line.split()[-1])
printed_something = False
for fact_id in range(len(line.split())-2): # In case of pairend facts, deal with the two facts (in this case len((line.split()) is 4
original_fact = line.split()[fact_id]
if existing_fact(original_fact, nodes, edges):
print(original_fact, ' ', end='')
printed_something = True
if printed_something:
print ("=> ", line.split()[-1])
def print_as_gfa(nodes, edges):
# cov_threshold=10
# index nodes:
node_id_to_variant_id = {} # key = node_id (1,2, ..., n) value = variant_id (eg. -14h)
variant_id_to_node_id = {} # key = variant_id (eg. -14h) value = node_id (eg. 2)
node_id=1
for node in nodes:
node_id_to_variant_id[node_id] = node
variant_id_to_node_id[node] = node_id
node_id+=1
nb_nodes = 0
for node_id in node_id_to_variant_id:
nb_nodes+=1
# if nodes[node_id_to_variant_id[node_id]] < cov_threshold: continue
print("S\t"+str(node_id)+"\t"+str(node_id_to_variant_id[node_id])+"\tRC:i:"+str(nodes[node_id_to_variant_id[node_id]]))
nb_edges = 0
for from_variant_id in edges:
if from_variant_id not in nodes: continue # had been removed
# if nodes[from_variant_id] < cov_threshold: continue # too low cove
for to_variant_id in edges[from_variant_id]:
if to_variant_id not in nodes: continue # had been removed
# if nodes[to_variant_id] < cov_threshold: continue # too low cove
nb_edges+=1
print("L\t"+str(variant_id_to_node_id[from_variant_id])+"\t+\t"+str(variant_id_to_node_id[to_variant_id])+"\t+\t0M")
sys.stderr.write(f"Printed {nb_nodes} nodes and {nb_edges} edges\n")
def main():
outgfa = False
if len(sys.argv) > 2 and sys.argv[2]=="--gfa":
outgfa=True
sys.stderr.write(f"Output data as a gfa graph\n")
nodes = store_nodes(sys.argv[1])
edges = store_ordered_edges(sys.argv[1], nodes)
keep_only_consecutive_edges(edges)
remove_distances(edges)
# remove_tips(edges, nodes)
working_zones(edges, nodes)
# remove_tips(edges, nodes)
if outgfa:
print_as_gfa(nodes, edges)
else:
print_as_facts(nodes, edges, sys.argv[1])
if __name__ == '__main__':
main()
|