File: vcfRefCorrect.py

package info (click to toggle)
snpeff 5.4.b%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 757,496 kB
  • sloc: java: 62,572; perl: 2,279; sh: 1,185; python: 744; xml: 507; makefile: 50
file content (76 lines) | stat: -rwxr-xr-x 1,895 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
#!/usr/bin/env python3

import sys


#-------------------------------------------------------------------------------
# Read fasta file
#-------------------------------------------------------------------------------

def readFasta( fasta ):
	print(f"Reading FASTA file {fasta}", file=sys.stderr)
	lineNum = 1
	chrname = ''
	chrs = dict()
	seq = []	# Read sequence as a list of strigs

	for line in open(fasta):
		if line.startswith(">"):
			# Add previous sequence, if any
			if chrname != '':	chrs[chrname] = ''.join(seq).upper()
			chrname = line[1:].strip()
			if chrname.startswith('chr'): chrname = chrname[3:]
			print(f"Chromosome '{chrname}'", file=sys.stderr)
			chrs[chrname] = ""
			seq = []
		else:
			seq.append( line.rstrip() )

		# Show something
		if lineNum % 10000 == 0:
			sys.stderr.write('.')
		lineNum += 1

	print("", file=sys.stderr)
	if chrname != '':   chrs[chrname] = ''.join(seq).upper()
	return chrs

#-------------------------------------------------------------------------------
# Main
#-------------------------------------------------------------------------------

# Read fasta file
fasta = sys.argv[1]
chrs = readFasta(fasta)

# Read VCF file
for line in sys.stdin:
	line = line.rstrip()

	if line.startswith('#'):
		# Show header
		print(line)
	else:
		# Extract REF field
		fields = line.split('\t')
		(chrom, pos, ref) = ( fields[0], fields[1], fields[3] )

		if chrom in chrs:
			chrSeq = chrs[chrom]

			# Get coordinates
			posStart = int(pos) - 1
			posEnd = posStart + len(ref)

			# Correct 'REF' sequence
			if posEnd < len(chrSeq):
				refOri = ref
				ref = chrSeq[posStart:posEnd]
				fields[3] = ref
			else:
				print(f"Position {pos} not found in chromosome '{chrom}' (chromosome length {len(chrSeq)})", file=sys.stderr)
		else:
			print(f"Chromosome '{chrom}' not found", file=sys.stderr)

		# Show corrected line
		print('\t'.join(fields))