File: qualityguess.py

package info (click to toggle)
python-sqt 0.8.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 824 kB
  • sloc: python: 5,964; sh: 38; makefile: 10
file content (55 lines) | stat: -rw-r--r-- 1,516 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/env python3
"""
Guess quality encoding of one or more FASTQ files.
"""
import sys
import os
import subprocess
from collections import Counter
from sqt.io.fasta import FastqReader, guess_quality_base
from sqt import HelpfulArgumentParser

__author__ = "Marcel Martin"


def get_argument_parser():
	"""
	Build and return the command-line parser for this script.

	The parser uses the module docstring as its description and defines:

	* ``--verbose`` / ``-v``: boolean flag, off by default.
	* ``--limit`` / ``-n``: integer, 10000 by default.
	* ``fastq``: one or more positional input paths.
	"""
	arg_parser = HelpfulArgumentParser(description=__doc__)

	arg_parser.add_argument(
		'--verbose', '-v',
		default=False,
		action='store_true',
		help='Print histogram of found characters',
	)
	arg_parser.add_argument(
		'--limit', '-n',
		default=10000,
		type=int,
		help='Inspect the first LIMIT records in the FASTQ file (default: %(default)s)',
	)
	arg_parser.add_argument(
		'fastq',
		nargs='+',
		metavar='FASTQ',
		help='Input FASTQ files (may be gzipped).',
	)

	return arg_parser


def main():
	"""
	Command-line entry point: print the guessed quality encoding of each file.

	Parses the command line, then calls guess_quality_base() once per input
	path. That call yields a pair ``(freqs, guess)``; ``guess`` is expected to
	be 33, 64 or None and is translated to 'phred33', 'phred64' or 'unknown'.

	Default output is one line per file: ``<path> is <encoding>``.
	With --verbose, a histogram of the observed quality characters and the
	quality value ranges implied by each encoding are printed as well.

	Raises:
		KeyError: if guess_quality_base() returns a guess other than
			33, 64 or None.
	"""
	parser = get_argument_parser()
	args = parser.parse_args()

	for path in args.fastq:
		# The file name is emitted before the file is inspected.
		if args.verbose:
			print('## File:', path)
		else:
			print(path, end='')
		# NOTE(review): args.limit is parsed but not forwarded here. Whether
		# guess_quality_base() accepts a record limit is not visible in this
		# file -- confirm, and pass it along if it does.
		freqs, guess = guess_quality_base(path)
		if args.verbose:
			print()
			print('character ASCII frequency')
			# freqs is iterated and indexed by integer character code.
			for c in sorted(freqs):
				print("{} {:3} {:7}".format(chr(c), c, freqs[c]))
			print()
		else:
			print(' is ', end='')
		guess = {33: 'phred33', 64: 'phred64', None: 'unknown'}[guess]
		if args.verbose:
			if freqs:
				lowest, highest = min(freqs), max(freqs)
				print("Quality value range assuming phred33: {}..{}".format(lowest - 33, highest - 33))
				print("Quality value range assuming phred64: {}..{}".format(lowest - 64, highest - 64))
			else:
				# min()/max() raise ValueError on an empty collection, so a
				# file without any quality characters gets a message instead
				# of a traceback.
				print("No quality values found")
			print("This is probably", guess)
		else:
			print(guess)


# Run the command-line interface only when executed as a script, not on import.
if __name__ == '__main__':
	main()