File: samfixn.py

package info (click to toggle)
python-sqt 0.8.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 824 kB
  • sloc: python: 5,964; sh: 38; makefile: 10
file content (35 lines) | stat: -rw-r--r-- 845 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#!/usr/bin/env python3
"""
Read a SAM file from standard input, replace all characters in all reads
that are not one of {a, c, g, t, n, A, C, G, T, N} with the character 'N'.
Write the modified SAM file to standard output.

This is approx. 8 times faster than an equivalent awk line using the gsub()
function.
"""
import sys
from os.path import join, dirname, realpath, isfile

from sqt import HelpfulArgumentParser


def main():
	"""
	Filter a SAM stream from standard input to standard output.

	Header lines (those starting with '@') are copied verbatim. For every
	other line, the tenth tab-separated column (SEQ in the SAM format) is
	rewritten so that each byte other than A, C, G, T, N (either case)
	becomes 'N'. All other columns are written back unchanged.

	Special cases:
	- Lines with fewer than ten columns (for example blank lines) carry no
	  sequence column and are copied through unchanged instead of raising
	  an IndexError.
	- A sequence column consisting only of '*' (the SAM placeholder for
	  "sequence not stored") is kept as is, since turning it into 'N'
	  would create a one-base read that contradicts the other columns.
	- If the sequence is the last column on the line, its line terminator
	  is preserved rather than being translated to 'N'.
	"""
	parser = HelpfulArgumentParser(usage=__doc__)
	# There are no options; parsing still provides --help and rejects
	# unexpected command-line arguments.
	parser.parse_args()

	# 256-entry translation table: identity for the allowed nucleotide
	# bytes, 'N' for everything else.
	keep = frozenset(b'ACGTNacgtn')
	fallback = ord('N')
	trans = bytes(c if c in keep else fallback for c in range(256))

	write = sys.stdout.buffer.write
	for line in sys.stdin.buffer:
		if line.startswith(b'@'):
			write(line)
			continue

		# Only column 10 is needed, so everything after it stays in one
		# unsplit chunk (fields[10]).
		fields = line.split(b'\t', 10)
		if len(fields) < 10:
			# No sequence column present; nothing to fix.
			write(line)
			continue

		seq = fields[9]
		eol = b''
		if len(fields) == 10:
			# The sequence is the final column, so it still carries the
			# line terminator; split that off so it is not translated.
			stripped = seq.rstrip(b'\r\n')
			eol = seq[len(stripped):]
			seq = stripped

		if seq != b'*':
			seq = seq.translate(trans)
		fields[9] = seq + eol
		write(b'\t'.join(fields))


# Run the filter only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
	main()