File: samsetop.py

package info (click to toggle)
python-sqt 0.8.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 824 kB
  • sloc: python: 5,964; sh: 38; makefile: 10
file content (154 lines) | stat: -rw-r--r-- 4,892 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env python3
"""
Perform set operation on two SAM/BAM files.

The output BAM file will have the same header as file A.

WARNING: Implementation is neither very fast nor memory efficient.

Possible operations:
  union:        Output union of A and B, abort with error if
                different lines for same read are encountered.
  intersection: Output intersection of A and B, abort with error if
                different lines for same read are encountered.
  setminus:     Outputs all read in A that are not in B.
  symdiff:      Output all reads in A or B but not in both.
"""
from sqt import HelpfulArgumentParser
import sys
from pysam import Samfile

__author__ = "Tobias Marschall"

def add_arguments(parser):
	arg = parser.add_argument
	arg("-s", action="store_true", dest="sam_output", default=False,
		help="Output SAM file instead of BAM file")
	arg("-U", action="store_true", dest="exclude_unmapped_A", default=False,
		help="Exclude unmapped reads from file A")
	arg("-V", action="store_true", dest="exclude_unmapped_B", default=False,
		help="Exclude unmapped reads from file B")
	arg("-r", action="store_true", dest="remove_name_suffix", default=False,
		help="Remove trailing \"/*\" from read names. Useful if one mapper appends \"/1\" and another does not.")
	arg('bampath1', help='First BAM or SAM file')
	arg('operation', choices=('union','intersection','setminus','symdiff'))
	arg('bampath2', help='Second BAM or SAM file')
	arg('outputpath', nargs='?',
		help='Output BAM or SAM file. If omitted, only print the number of reads '
		'that would be written.')


def SamOrBam(name, mode='r'):
	if name.endswith('.bam'):
		mode += 'b'
	return Samfile(name, mode)


def remove_suffix(s):
	i = s.rfind('/')
	if i == -1: return s
	else: return s[:i]


def nop(s):
	return s


def dict_of_reads(reads, exclude_unmapped, rename):
	d = dict()
	for read in reads:
		if exclude_unmapped and read.is_unmapped: continue
		name = rename(read.qname)
		if d.has_key(name):
			raise Exception("Duplicate read in input file (%s)"%name)
		d[name] = read
	return d


def union(A, B,outfile, exclude_unmapped_A, exclude_unmapped_B, rename):
	readsB = dict_of_reads(B, exclude_unmapped_B, rename)
	readnamesA = set()
	for read in A:
		if exclude_unmapped_A and read.is_unmapped: continue
		name = rename(read.qname)
		if name in readnamesA:
			raise Error("Duplicate read in input file (%s)"%name)
		readnamesA.add(name)
		if readsB.has_key(name):
			if read.compare(readsB[name]) != 0:
				print('Content mismatch for read %s:'%name, file=sys.stderr)
				print('File A:',read, file=sys.stderr)
				print('File B:',readsB[name], file=sys.stderr)
				sys.exit(1)
			readsB.pop(name)
		outfile.write(read)
	for read in readsB.itervalues():
		outfile.write(read)


def intersection(A,B,outfile,exclude_unmapped_A,exclude_unmapped_B,rename):
	readsB = dict_of_reads(B, exclude_unmapped_B, rename)
	readnamesA = set()
	for read in A:
		if exclude_unmapped_A and read.is_unmapped: continue
		name = rename(read.qname)
		if name in readnamesA:
			raise Error("Duplicate read in input file (%s)"%name)
		readnamesA.add(name)
		if readsB.has_key(name):
			if read.compare(readsB[name])!=0:
				print('Content mismatch for read %s:'%name, file=sys.stderr)
				print('File A:',read, file=sys.stderr)
				print('File B:',readsB[name], file=sys.stderr)
				sys.exit(1)
			outfile.write(read)


def setminus(A,B,outfile,exclude_unmapped_A,exclude_unmapped_B,rename):
	if exclude_unmapped_B: readnamesB = set((rename(read.qname) for read in B if not read.is_unmapped))
	else: readnamesB = set((rename(read.qname) for read in B))
	for read in A:
		if exclude_unmapped_A and read.is_unmapped: continue
		if not rename(read.qname) in readnamesB:
			outfile.write(read)


def symdiff(A,B,outfile,exclude_unmapped_A,exclude_unmapped_B,rename):
	if exclude_unmapped_B: readsB = dict(((rename(read.qname),read) for read in B if not read.is_unmapped))
	else: readsB = dict(((rename(read.qname),read) for read in B))
	for read in A:
		if exclude_unmapped_A and read.is_unmapped: continue
		name = rename(read.qname)
		if readsB.has_key(name):
			readsB.pop(name)
		else:
			outfile.write(read)
	for read in readsB.itervalues():
		outfile.write(read)


class Counter:
	count = 0
	def write(self, x):
		self.count += 1


def main(args=None):
	if args is None:
		parser = HelpfulArgumentParser(description=__doc__)
		add_arguments(parser)
		args = parser.parse_args()
	operation = args.operation
	A = SamOrBam(args.bampath1)
	B = SamOrBam(args.bampath2)
	if args.outputpath:
		outfile = Samfile(args[3], 'wh' if args.sam_output else 'wb', template=A)
	else:
		outfile = Counter()
	globals()[operation](A, B, outfile, args.exclude_unmapped_A, args.exclude_unmapped_B, remove_suffix if args.remove_name_suffix else nop)
	if args.outputpath is None:
		print(outfile.count)


if __name__ == '__main__':
	main()