File: checkvcfref.py

package info (click to toggle)
python-sqt 0.8.0-9
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 824 kB
  • sloc: python: 5,964; sh: 38; makefile: 10
file content (45 lines) | stat: -rw-r--r-- 1,261 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#!/usr/bin/env python3
"""
Read in a file with variants (typically VCF) and check if the given
reference allele matches what is actually in the reference sequence
at the given position.
"""
import sys
import csv

from sqt import HelpfulArgumentParser, IndexedFasta

__author__ = "Marcel Martin"


def main():
	parser = HelpfulArgumentParser(description=__doc__)
	arg = parser.add_argument
	arg("reference", help="Reference file in FASTA format. An associated .fai index file must exist.")
	arg("variantfile", help="Input (VCF) file with variants.")
	args = parser.parse_args()

	reference = IndexedFasta(args.reference)
	with open(args.variantfile) as f:
		reader = csv.reader([row for row in f if row[0] != '#'], delimiter='\t')
		n = 0
		for row in reader:
			n += 1
			chrom = row[0]
			if chrom.startswith('chr'):
				chrom = chrom[3:]
			pos = int(row[1]) - 1
			ref = row[3]
			actual = reference.get(chrom)[pos:pos+len(ref)].decode()
			if not actual == ref:
				print('Problem with record no.', n, 'found:')
				print('CHROM={} POS={} REF={}'.format(chrom, pos+1, ref))
				print('Reference sequence in FASTA file is:', actual)
				sys.exit(1)
		else:
			print((n, 'records checked, everything ok.'))
			sys.exit(0)


if __name__ == '__main__':
	main()