File: gps.py (from the microbegps 1.0.0 package)

# -*- coding: utf-8 -*-
"""
Analysis tools used by MicrobeGPS
"""

import glob
import numpy as np
import valcov
import re
import os
import gzip
import taxonomy

class MappedRead:
	def __init__(self):
		self.matches = set()

class Target:
	def __init__(self, name, glen):
		self.name = name
		self.length = glen
		self.cov_homog = 1.
		self.map_qual = 0.
		self.reads = dict()
		self.unique = 0


class Reference:
	def __init__(self, name):
		self.name = name
		self.reads = 0
		self.length = 0
		self.coverage = 0.
		self.validity = 0.
		self.cov_homog = 1.
		self.map_qual = 0.
		self.targets = dict()
		self.unique = 0

class Group:
	def __init__(self):
		self.members = []
		self.usrreads = False
		self.allreads = False

class EGroup:
	def __init__(self):
		self.members = dict()
		self.reads = 0
		self.unique = 0
		self.max_val = 0
		self.cov = 0
		self.map_qual = 0
		self.cov_homog = 0
		

def gi_from_name(name):
	""" Extract a gi, if it is hidden in the name. 
	    Looks for the pattern gi| followed by a number.
	    Returns -1 if not found."""
	gi_start = name.find('gi|') + 3 # get the first position after gi|
	if gi_start >= 3:
		gi_end = name.find('|',gi_start)
		if gi_end == -1:
			gi_end = None
		gi = name[gi_start:gi_end]
	else:
		gi = -1
	return gi
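
# Example (illustrative): gi_from_name extracts the numeric GI from a typical
# RefSeq/FASTA-style identifier. The header strings below are made up for
# demonstration only.
#
#   gi_from_name('gi|12345|ref|NC_000001.1|')  # -> '12345'
#   gi_from_name('contig_17')                  # -> -1 (no 'gi|' pattern)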


def silent(text):
	pass


#def __calculate_mapping_score(read):
#	""" calculate mapping score for a mapped read. """
#	score = 0
#	# get mismatches from SAM tag 'MD'
#	score += sum(map(lambda x: x[1] if x[0]=='NM' else 0, read.tags ))
#	# get insertions/deletions from cigar (no clipping)
#	score += sum(map(lambda x: x[1] if 0<x[0]<3 else 0, read.cigar ))
#	return score/float(read.qlen)


def __calculate_mapping_score_no_pysam(cigar,nm,rlen):
	""" Calculate the mapping error score (errors per base) for a mapped read. """
	score = 0.
	# mismatches/edit distance from the SAM 'NM' tag (passed in as nm)
	score += nm
	# add inserted/deleted bases from the CIGAR string
	for ln,er in re.findall(r'(\d+)([ID])',cigar):
		score += float(ln)
	return score/float(rlen)
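
# Example (illustrative): for an alignment with CIGAR '50M2I48M', an NM value
# of 3 and a read length of 100, the 2 inserted bases from the CIGAR are added
# to NM and the sum is normalized by the read length:
#
#   __calculate_mapping_score_no_pysam('50M2I48M', 3, 100)  # -> (3 + 2) / 100 = 0.05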
	

def read_sam_files_no_pysam(files, status=None):
	""" Extract mapping information from SAM files. 'files' can be either
	a string (interpreted as single file name or glob pattern) or a list of
	strings (interpreted as list of file names).
	"""
	if type(files) == str:
		fns = glob.glob(files)
	elif type(files) == list:
		fns = files
	else:
		raise Exception('Could not identify format of file names:',files)
	d = len(fns)
	
	data_length = sum([os.path.getsize(f) for f in fns])
	data_position = 0
	
	# data structure to store all mapping information
	read_table = dict()
	target_table = dict()
	
	# read SAM files one by one
	for i,fn in enumerate(fns):
		if status: status(i+1,d)
		sf = open(fn,'rU')
		
		ref_len = dict() # stores all reference sequence lengths
		# first read the header of the samfile
		for line in sf:
			data_position += len(line)
			if line[0] == '@':
				frag = line.rstrip().split('\t')
				if not frag[0] == '@SQ':
					continue
				# assume sequence name tag SN in first field
				name = frag[1].split(':')[1]
				# assume sequence length tag LN in second field
				glen = int(frag[2].split(':')[1])
				ref_len[name] = glen
			else:
				break
		if line[0] == '@':
			continue # there are no alignments in this file
		
		# process the one line that we read too much
		frag = line.split()
		r_name = frag[0]
		t_name = frag[2]
		pos = int(frag[3])-1
		cigar = frag[5]
		nm = 0
		for tag in frag[11:]: # optional tags start at column 12
			if tag.startswith('NM:i:'):
				nm = int(tag[5:])
				break
		r_len = len(frag[9])
		score = __calculate_mapping_score_no_pysam(cigar,nm,r_len)
		rd = read_table.setdefault(r_name,MappedRead())
		rd.matches.add(t_name)
		if not t_name in target_table:
			target_table[t_name] = Target(t_name,ref_len[t_name])
		gn = target_table[t_name]
		gn.reads[r_name] = [pos, r_len, score]
		
		last_status = 0
		# process all other alignments
		for line in sf:
			data_position += len(line)
			if status:
				last_status += 1
				if last_status >= 5000: # update every 5000 reads
					status((data_position*100)/data_length,100)
					last_status = 0
			frag = line.rstrip().split('\t')
			if int(frag[1])&4:
				continue # the unmapped flag is set
			else:
				r_name = frag[0]
				t_name = frag[2]
				pos = int(frag[3])-1
				cigar = frag[5]
				nm = 0
				for tag in frag[11:]: # optional tags start at column 12
					if tag.startswith('NM:i:'):
						nm = int(tag[5:])
						break
				r_len = len(frag[9])
				score = __calculate_mapping_score_no_pysam(cigar,nm,r_len)
				rd = read_table.setdefault(r_name,MappedRead())
				rd.matches.add(t_name)
				gn = target_table.setdefault(t_name, Target(t_name,ref_len[t_name]))
				gn.reads[r_name] = [pos, r_len, score]
	if status: status(100,100)
	return target_table,read_table
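
# Example (illustrative sketch): reading one or more SAM files without pysam.
# The file names below are placeholders.
#
#   targets, reads = read_sam_files_no_pysam('mapping.sam')
#   # or, with a glob pattern / list of file names:
#   targets, reads = read_sam_files_no_pysam('results/*.sam')
#
# 'targets' maps reference sequence names to Target objects (read positions,
# lengths and error scores); 'reads' maps read names to MappedRead objects
# holding the set of targets each read matched.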


def filter_raw(target_table,read_table,max_matches=None,min_support=1, max_error=None, qual_percentile=None, pr=silent):
	""" Filter the raw mapping information according to the following criteria:
		max_matches: Reads matching to more than max_matches genomes are discarded
		min_support: Genomes with less than min_support reads are discarded
		max_error: Read alignments with more than max_error are discarded
	"""
	# discard reads with more than max_matches matches and count unique reads
	unique_reads = 0
	del_reads = []
	if max_matches:
		for mp in read_table:
			lrt = len(read_table[mp].matches)
			if lrt > max_matches:
				del_reads.append(mp)
			if lrt == 1:
				target_table[iter(read_table[mp].matches).next()].unique += 1
				unique_reads += 1
	for d in del_reads:
		for gn in read_table[d].matches:
			del target_table[gn].reads[d]
		del read_table[d]
	pr("--- Discarded %i reads with > %i matches."%(len(del_reads),int(max_matches)))
	pr("--- Found %i reads with unique matches."%(unique_reads))
	
	# discard genomes with insufficient support
	del_genomes = []
	if min_support:
		for gn in target_table:
			if len(target_table[gn].reads) < min_support:
				del_genomes.append(gn)
	for g in del_genomes:
		for rd in target_table[g].reads:
			read_table[rd].matches.remove(g)
			if len(read_table[rd].matches) == 0:
				del read_table[rd]
		del target_table[g]
	pr("--- Discarded %i targets with less than %i reads."%(len(del_genomes),int(min_support)))


def calculate_mapping_statistics(target_table):
	""" calculate coverage homogeneity and average mapping error """
	for gn in target_table:
		start_pos = []
		scores = []
		for r in target_table[gn].reads:
			start_pos.append(target_table[gn].reads[r][0])
			scores.append(target_table[gn].reads[r][2])
		target_table[gn].map_qual = np.mean(scores)
		
		start_pos.sort()
		g_len = float(target_table[gn].length)
		start_pos.append(g_len)
		n_rds = float(len(start_pos))
		
		max_dev = 0.
		for i,p in enumerate(start_pos):
			dev1 = abs(((p-1)/g_len) - (i/n_rds))
			dev2 = abs((p/g_len) - ((i+1)/n_rds))
			max_dev = max(max_dev,dev1,dev2)
		target_table[gn].cov_homog = max_dev
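
# Note on cov_homog (illustrative): the loop above computes a Kolmogorov-
# Smirnov-like statistic, i.e. the maximum deviation between the empirical
# distribution of read start positions and a uniform distribution over the
# genome. Values near 0 indicate evenly spread reads, values near 1 indicate
# strongly clumped reads. For example, with genome length 100 and read starts
# [10, 50, 90], the maximum deviation evaluates to 0.39.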



def get_reference_table(target_table, read_table, gmap=None):
	""" Put targets in biologically meaningful groups.
	'gmap' can be either a dict, mapping target names to genome names, or a 
	file name pointing to a tab delimited file."""
	# first create the mapping function: target name --> genome name
	if not gmap:
		# use the identity mapping
		mp = lambda x: x
	elif hasattr(gmap,'__call__'):
		# gmap is a function. Expect that gmap takes only a reference name as input
		mp = gmap
	else:
		if type(gmap) == str:
			d = dict()
			f = open(gmap)
			for line in f:
				contig_name,ref_name = line.rstrip().split('\t')
				d[contig_name] = ref_name
			f.close()
		elif type(gmap) == dict:
			d = gmap
		else:
			raise Exception('gmap type must be either dict or string (filename).')
		mp = lambda x: d.get(x,x)
	
	# create the reference table
	ref_table = dict()
	for t in target_table:
		r = mp(t)
		ref = ref_table.setdefault(r,Reference(r))
		ref.targets[t] = target_table[t]

	# update read_table, assign reads to References, not Targets
	for rd in read_table:
		renamed_set = set()
		for match in read_table[rd].matches:
			renamed_set.add(mp(match))
		read_table[rd].matches = renamed_set

	# calculate simple metrics over references
	for r in ref_table:
		lengths = [x.length for x in ref_table[r].targets.itervalues()]
		homog = [x.cov_homog for x in ref_table[r].targets.itervalues()]
		qual = [x.map_qual for x in ref_table[r].targets.itervalues()]
		unique = sum([x.unique for x in ref_table[r].targets.itervalues()])
		t_len = float(sum(lengths))
		
		readnames = set()
		for t in ref_table[r].targets.itervalues():
			readnames.update(t.reads.iterkeys())
		reads = len(readnames)
		
		t_homog = 0.
		t_qual = 0.
		for l,h,q in zip(lengths,homog,qual):
			t_homog += h*l/t_len
			t_qual += q*l/t_len
		ref_table[r].cov_homog = t_homog
		ref_table[r].map_qual = t_qual
		ref_table[r].unique = unique
		ref_table[r].reads = reads
		ref_table[r].length = sum(lengths)

	return ref_table
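
# Example (illustrative): grouping targets into references. The contig and
# genome names below are placeholders.
#
#   # identity mapping (every target becomes its own reference):
#   refs = get_reference_table(targets, reads)
#
#   # explicit mapping from contig/target names to genome names:
#   gmap = {'contig_1': 'Genome_A', 'contig_2': 'Genome_A'}
#   refs = get_reference_table(targets, reads, gmap)
#
#   # or a tab delimited file with "target<TAB>genome" lines:
#   refs = get_reference_table(targets, reads, 'contig_to_genome.tsv')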


def get_reference_table_NCBI(target_table, read_table, catalog, pr=silent):
	""" Put targets in biologically meaningful groups using NCBI taxonomy.
	catalog specifies the file name of the RefSeq-releaseXX.catalog file
	"""
	
	# first collect all gis
	gis = set()
	for trg in target_table:
		gis.add(gi_from_name(trg))
	
	# read the NCBI files
	gi_to_taxid,gi_to_name = taxonomy.gis_to_taxid_and_name(gis,catalog)
	
	# create the reference table
	ref_table = dict()
	trg_to_ref = dict()
	not_found = 0
	for t in target_table:
		gi = gi_from_name(t)
		taxid = gi_to_taxid.get(gi,'[not found]')
		if taxid == '[not found]':
			not_found += 1
		ref_name = gi_to_name.get(gi,t)
		ref = ref_table.setdefault(ref_name,Reference(taxid))
		ref.targets[t] = target_table[t]
		trg_to_ref[t] = ref_name
	pr('--- Found %i IDs for %i targets'%(len(target_table)-not_found,len(target_table)))

	# update read_table, assign reads to References, not Targets
	for rd in read_table:
		renamed_set = set()
		for match in read_table[rd].matches:
			renamed_set.add(trg_to_ref[match])
		read_table[rd].matches = renamed_set

	# calculate simple metrics over references
	for r in ref_table:
		lengths = [x.length for x in ref_table[r].targets.itervalues()]
		homog = [x.cov_homog for x in ref_table[r].targets.itervalues()]
		qual = [x.map_qual for x in ref_table[r].targets.itervalues()]
		unique = sum([x.unique for x in ref_table[r].targets.itervalues()])
		t_len = float(sum(lengths))
		
		readnames = set()
		for t in ref_table[r].targets.itervalues():
			readnames.update(t.reads.iterkeys())
		reads = len(readnames)
		
		t_homog = 0.
		t_qual = 0.
		for l,h,q in zip(lengths,homog,qual):
			t_homog += h*l/t_len
			t_qual += q*l/t_len
		ref_table[r].cov_homog = t_homog
		ref_table[r].map_qual = t_qual
		ref_table[r].unique = unique
		ref_table[r].reads = reads
		ref_table[r].length = sum(lengths)

	return ref_table
	


def filter_ref_table(ref_table,read_table,filt=lambda x:True, pr=silent):
	"""Delete entries in ref_table for which the filter function filt returns
	False. The unary function filt must be able to handle Reference objects."""
	del_ref = []
	# create the set of genomes to be removed
	for rf in ref_table:
		if not filt(ref_table[rf]):
			del_ref.append(rf)
	pr('--- Removing %i of %i references.'%(len(del_ref),len(ref_table)))
	# remove the selected genomes
	for d in del_ref:
		for t in ref_table[d].targets:
			for rd in ref_table[d].targets[t].reads:
				if not rd in read_table:
					continue  # this happens if one read matched to 2 targets in one reference --> read_table[rd] was already removed when reaching the second target
				read_table[rd].matches.discard(d)
				if len(read_table[rd].matches) == 0:
					del read_table[rd]
		del ref_table[d]
	# update unique reads information
	pr('--- Updating Unique Reads information.')
	for rf in ref_table:
		ref_table[rf].unique = 0
	for rd in read_table.itervalues():
		if len(rd.matches) == 1:
			ref_table[iter(rd.matches).next()].unique += 1
				


def calculate_valcov(ref_table, status=None):
	"""Calculate validity and coverage for all References in ref_table."""
	count = 0
	for rf in ref_table:
		count += 1
		if status: status('Calculate val/cov (%i/%i): %s'%(count,len(ref_table),rf))
		dsc = valcov.DS_cov(ref_table[rf])
		val,cov = valcov.validity_from_coverage(dsc)
		if cov < 0.8:
			# use distance based method if coverage is too low
			dsd = valcov.DS_dst(ref_table[rf])
			if len(dsd.value) > 0:
				val,cov = valcov.validity_from_spaces(dsd)
			else:
				val,cov = 0.,0.
		ref_table[rf].coverage = cov
		ref_table[rf].validity = val

def calculate_valcov_one(ref):
	"""Calculate validity and coverage for one reference."""
	if ref.coverage > 0. and ref.validity > 0.:
		# validity and coverage were already calculated
		return ref.validity,ref.coverage
	dsc = valcov.DS_cov(ref)
	val,cov = valcov.validity_from_coverage(dsc)
	if cov < 0.8:
		# use distance based method if coverage is too low
		dsd = valcov.DS_dst(ref)
		if len(dsd.value) > 0:
			val,cov = valcov.validity_from_spaces(dsd)
		else:
			val,cov = 0.,0.
	ref.coverage = cov
	ref.validity = val
	return val,cov


def extract_USR(ref_table, read_table, csim=0.2, pr=silent):
	"""extract Unique Source Reads from the read_table, i.e. reads mapping to
	genomes with comparable coverage."""
	usr_table = dict()
	for rd in read_table:
		covs = np.array([ref_table[mt].coverage for mt in read_table[rd].matches])
		#vals = np.array([ref_table[mt].validity for mt in read_table[rd].matches])
		# TODO: also take the validity into account here
		# relative coverage spread: (max - min) / mean over all matched references
		diff = (np.max(covs)-np.min(covs))/(np.sum(covs)/len(covs))
		if diff < csim:
			usr_table[rd] = read_table[rd]
	pr('--- Found %i Unique Source Reads (total %i reads)'%(len(usr_table),len(read_table)))
	return usr_table
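
# Example (illustrative): with csim=0.2, a read matching two references with
# coverages 1.0 and 1.1 gives diff = (1.1 - 1.0) / 1.05 ~= 0.095 < 0.2, so the
# read counts as a Unique Source Read. Coverages 1.0 and 2.0 give
# diff = 1.0 / 1.5 ~= 0.67, and the read is not a USR.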


def get_read_matrix(reads, targets):
	"""Create a numpy matrix assigning reads to targets (or references).
	Returns the matrix and dictionaries mapping target name -> index and
     index -> target name."""
	mat = np.zeros((len(reads),len(targets)),dtype=np.uint8)
	name_to_index = dict()
	index_to_name = dict()
	for i,t in enumerate(targets):
		name_to_index[t] = i
		index_to_name[i] = t
	for i,r in enumerate(reads):
		for t in reads[r].matches:
			mat[i,name_to_index[t]] = 1
	return mat, name_to_index, index_to_name
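
# Example (illustrative): building the read/reference incidence matrices that
# feed the grouping functions. 'usr_table' is the result of extract_USR and
# 'reads' the full read_table; iterating the same 'refs' dict in both calls
# keeps the reference columns consistent.
#
#   usrmat, n2i, i2n = get_read_matrix(usr_table, refs)
#   allmat, _, _     = get_read_matrix(reads, refs)
#
# Each row corresponds to one read, each column to one reference; an entry of 1
# means the read mapped to that reference.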
		

def create_groups(usrmat, thr=0.1, status=None):
	"""create groups of references from matrix mat.
	Returns a dict with lists of reference IDs as enties (=columns of mat) and
	a dict of weak links between genomes."""	

	groups = dict()
	rds,rfs = usrmat.shape
	
	for i in range(rfs):
		if status: status('--- Grouping reference %i of %i'%(i+1,rfs))
		# calculate shared reads with existing groups
		mapped = np.sum(usrmat[:,i])
		shared = {gr:np.sum(np.logical_and(groups[gr].usrreads,usrmat[:,i])) for gr in groups}
		sig = {gr:shared[gr] for gr in shared if shared[gr]>mapped*thr}
		N = len(sig)
		print "Current reference:",i," Number of groups:",N
		if N == 0:
			# no significant similarity to existing group --> create new group
			gr = Group()
			gr.members.append(i)
			gr.usrreads = np.logical_or(gr.usrreads,usrmat[:,i])
			grID = len(groups)
			groups[grID] = gr
		elif N == 1:
			# significant similarity with exactly 1 group --> assign to that group
			grID = sig.keys()[0]
			groups[grID].members.append(i)
			groups[grID].usrreads = np.logical_or(groups[grID].usrreads,usrmat[:,i])
		else:
			# significant similarity with > 1 groups: assign to first group
			grID = sig.keys()[0]
			groups[grID].members.append(i)
			groups[grID].usrreads = np.logical_or(groups[grID].usrreads,usrmat[:,i])
			# ... and add other groups
			for otherID in sig.keys()[1:]:
				for m in groups[otherID].members:
					groups[grID].members.append(m)
				groups[grID].usrreads = np.logical_or(groups[grID].usrreads,groups[otherID].usrreads)
				groups[otherID].usrreads = False
				groups[otherID].members = []
	g_dict = {g:groups[g].members for g in groups if len(groups[g].members)>0}
	
	return g_dict
	

def create_groups_2(usrmat, allmat, thr=0.1, thr_all=0.6, status=None):
	"""create groups of references from matrix mat.
	Returns a dict with lists of reference IDs as enties (=columns of mat)"""	

	groups = dict()
	rds,rfs = usrmat.shape
	
	for i in range(rfs):
		if status: status('--- Grouping reference %i of %i'%(i+1,rfs))
		print "Current reference:",i," Number of groups:",len(groups)
		sig = dict()

		usr_reads = np.sum(usrmat[:,i])
		all_reads = np.sum(allmat[:,i])
		for gr in groups:
			if type(groups[gr].usrreads) == bool:
				continue # skip groups without usrreads/allreads information
			usr_shared = np.sum(np.logical_and(groups[gr].usrreads,usrmat[:,i]))
			all_shared = np.sum(np.logical_and(groups[gr].allreads,allmat[:,i]))
			gr_usr_reads = np.sum(groups[gr].usrreads)
			gr_all_reads = np.sum(groups[gr].allreads)
			if usr_shared > thr*usr_reads or usr_shared > thr*gr_usr_reads:
				sig[gr] = usr_shared
			if all_shared > thr_all*all_reads or all_shared > thr_all*gr_all_reads:
				sig[gr] = usr_shared

		N = len(sig)
		if N == 0:
			# no significant similarity to existing group --> create new group
			gr = Group()
			gr.members.append(i)
			gr.usrreads = np.logical_or(gr.usrreads,usrmat[:,i])
			gr.allreads = np.logical_or(gr.allreads,allmat[:,i])
			grID = len(groups)
			groups[grID] = gr
		elif N == 1:
			# significant similarity with exactly 1 group --> assign to that group
			grID = sig.keys()[0]
			groups[grID].members.append(i)
			groups[grID].usrreads = np.logical_or(groups[grID].usrreads,usrmat[:,i])
			groups[grID].allreads = np.logical_or(groups[grID].allreads,allmat[:,i])
		else:
			# significant similarity with > 1 groups: assign to best group
			grID = sig.keys()[np.argmax(sig.values())]
			groups[grID].members.append(i)
			groups[grID].usrreads = np.logical_or(groups[grID].usrreads,usrmat[:,i])
			groups[grID].allreads = np.logical_or(groups[grID].allreads,allmat[:,i])

	g_dict = {g:groups[g].members for g in groups if len(groups[g].members)>0}
	
	return g_dict
	
def create_groups_dc(usrmat, allmat, n2i, thr=0.1, thr_all=0.6, status=None):
	"""create groups of references from matrix mat using a divide&conquer strategy
	Returns a dict with lists of reference IDs as enties (=columns of mat)"""	

	groups = dict()
	rds,rfs = usrmat.shape
	
	# Two stage strategy
	# First stage: search for groups locally among all references sharing the first 3 letters
	if status: status('--- Clustering stage 1')
	level1 = dict()
	for name in n2i:
		item = level1.setdefault(name[0:3], list())
		item.append(n2i[name])
	level2 = dict()
	for b,bucket in enumerate(level1.itervalues()):
		l1_groups = dict()
		for i in bucket:
			sig = dict()
	
			usr_reads = np.sum(usrmat[:,i])
			all_reads = np.sum(allmat[:,i])
			for gr in l1_groups:
				if type(l1_groups[gr].usrreads) == bool:
					continue # skip l1_groups without usrreads/allreads information
				usr_shared = np.sum(np.logical_and(l1_groups[gr].usrreads,usrmat[:,i]))
				all_shared = np.sum(np.logical_and(l1_groups[gr].allreads,allmat[:,i]))
				gr_usr_reads = np.sum(l1_groups[gr].usrreads)
				gr_all_reads = np.sum(l1_groups[gr].allreads)
				if usr_shared > thr*usr_reads or usr_shared > thr*gr_usr_reads:
					sig[gr] = usr_shared
				if all_shared > thr_all*all_reads or all_shared > thr_all*gr_all_reads:
					sig[gr] = usr_shared
	
			N = len(sig)
			if N == 0:
				# no significant similarity to existing group --> create new group
				gr = Group()
				gr.members.append(i)
				gr.usrreads = np.logical_or(gr.usrreads,usrmat[:,i])
				gr.allreads = np.logical_or(gr.allreads,allmat[:,i])
				l1ID = 'bucket%i_%i'%(b,len(l1_groups))
				l1_groups[l1ID] = gr
			else:
				# significant similarity with at least one group: assign to best group
				l1ID = sig.keys()[np.argmax(sig.values())]
				l1_groups[l1ID].members.append(i)
				l1_groups[l1ID].usrreads = np.logical_or(l1_groups[l1ID].usrreads,usrmat[:,i])
				l1_groups[l1ID].allreads = np.logical_or(l1_groups[l1ID].allreads,allmat[:,i])		
		level2.update(l1_groups)
		
	# Second stage: merge all level 1 groups that satisfy the conditions
	if status: status('--- Clustering stage 2')
	for l1ID in level2:
		sig = dict()

		usr_reads = np.sum(level2[l1ID].usrreads)
		all_reads = np.sum(level2[l1ID].allreads)
		for gr in groups:
			if type(groups[gr].usrreads) == bool:
				continue # skip groups without usrreads/allreads information
			usr_shared = np.sum(np.logical_and(groups[gr].usrreads,level2[l1ID].usrreads))
			all_shared = np.sum(np.logical_and(groups[gr].allreads,level2[l1ID].allreads))
			gr_usr_reads = np.sum(groups[gr].usrreads)
			gr_all_reads = np.sum(groups[gr].allreads)
			if usr_shared > thr*usr_reads or usr_shared > thr*gr_usr_reads:
				sig[gr] = usr_shared
			if all_shared > thr_all*all_reads or all_shared > thr_all*gr_all_reads:
				sig[gr] = usr_shared

		N = len(sig)
		if N == 0:
			# no significant similarity to existing group --> create new group
			grID = len(groups)
			groups[grID] = level2[l1ID]
		else:
			# significant similarity with at least one group: assign to best group
			grID = sig.keys()[np.argmax(sig.values())]
			groups[grID].members.extend(level2[l1ID].members)
			groups[grID].usrreads = np.logical_or(groups[grID].usrreads,level2[l1ID].usrreads)
			groups[grID].allreads = np.logical_or(groups[grID].allreads,level2[l1ID].allreads)

	g_dict = {g:groups[g].members for g in groups if len(groups[g].members)>0}
	
	return g_dict	
	


def enrich_groups(groups,references,reads,i2n):
	""" create list containing groups enriched with names and reference information """
	egroups = dict()			
	for g in groups:
		egroups[g] = EGroup()
		readnames = set()
		t_len = 0.
		for m in groups[g]:
			egroups[g].members[i2n[m]] = references[i2n[m]]
			for t in references[i2n[m]].targets:
				readnames.update(references[i2n[m]].targets[t].reads.iterkeys())
			t_len += float(references[i2n[m]].length)
		
		egroups[g].reads = len(readnames)
		egroups[g].max_val = max([m.validity for m in egroups[g].members.itervalues()])
		egroups[g].cov = sum([m.coverage*m.length/t_len for m in egroups[g].members.itervalues()])
		egroups[g].cov_homog = sum([m.cov_homog*m.length/t_len for m in egroups[g].members.itervalues()])
		egroups[g].map_qual = sum([m.map_qual*m.length/t_len for m in egroups[g].members.itervalues()])
		
		# count the group-unique reads, i.e. reads mapping only to that group
		group_unique = set()
		for mbm in egroups[g].members.itervalues():
			for trg in mbm.targets.itervalues():
				for rd in trg.reads:
					if rd in group_unique:
						continue
					add = True
					# check if a read has other matches than in this group
					for mtc in reads[rd].matches:
						if not mtc in egroups[g].members:
							add = False
							break
					if add: group_unique.add(rd)
		
		egroups[g].unique = len(group_unique)
	
	sgroups = sorted([egroups[g] for g in egroups], key=lambda x: -x.unique)
	return sgroups
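
# Example (illustrative end-to-end sketch): how the functions in this module
# fit together. File names and thresholds are placeholders only.
#
#   targets, reads = read_sam_files_no_pysam('mapping.sam')
#   filter_raw(targets, reads, max_matches=50, min_support=10)
#   calculate_mapping_statistics(targets)
#   refs = get_reference_table(targets, reads)
#   calculate_valcov(refs)
#   usr = extract_USR(refs, reads, csim=0.2)
#   usrmat, n2i, i2n = get_read_matrix(usr, refs)
#   allmat, _, _     = get_read_matrix(reads, refs)
#   groups = create_groups_2(usrmat, allmat, thr=0.1, thr_all=0.6)
#   result = enrich_groups(groups, refs, reads, i2n)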