File: ped2vcf.py

package info (click to toggle)
snpeff 5.4.b%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 757,496 kB
  • sloc: java: 62,572; perl: 2,279; sh: 1,185; python: 744; xml: 507; makefile: 50
file content (93 lines) | stat: -rwxr-xr-x 2,074 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env python3

import sys
import re


def alleles(gt1, gt2):
	"""Get reference and alternative alleles"""

	# Count genotypes 1 and 2
	count = {}
	for g in gt1 :
		count[g] = count.get(g, 0) + 1

	for g in gt2 :
		count[g] = count.get(g, 0) + 1

	# Find mayor allele (we call it 'ref')
	maxCount = 0
	ref = ''
	for g in count:
		if count[g] > maxCount:
			maxCount  = count[g]
			ref = g

	# Find minor allele (we call these one 'alt')
	alt = ''
	for g in count:
		if g != ref and g != '0': alt = g

	# Create a genotype string (VCF style)
	gtstr = ""
	for i in range(len(gt1)):
		gtstr += "\t" + gtVcf(ref, alt, gt1[i]) + "/" + gtVcf(ref, alt, gt2[i])

	return ref, alt, count[alt], gtstr


def gtVcf(ref, alt, gt):
	""" Get genotype in VCF style string"""
	if gt == ref: return "0"
	if gt == alt: return "1"
	return "."

#------------------------------------------------------------------------------
# Main
#------------------------------------------------------------------------------

# Parse comman line arguments
if len(sys.argv) != 3 :
	print(f"Usage: {sys.argv[0]} file.ped file.map", file=sys.stderr)
	sys.exit(1)

pedFile = sys.argv[1]
mapFile = sys.argv[2]

# Prepare to read data
reSplit = re.compile("\\s+")

snps = []
ids = []
gt = []

# Read MAP file
for line in open(mapFile):
	f = reSplit.split(line.rstrip())
	chr, id, cm, pos = f[0], f[1], f[2], f[3]
	snps.append( (id, chr, pos) )

# Create genotype lists
geno1 = [ [] for s in snps]
geno2 = [ [] for s in snps]

# Read PED file
for line in open(pedFile):
	f = reSplit.split(line.rstrip())
	famId, id, moId, faId, sex, pheno = f[0], f[1], f[2], f[3], f[4], f[5]
	ids.append( id )

	# Genotypes
	gt1 = f[6::2]
	gt2 = f[7::2]
	for i in range(len(snps)):
		geno1[i].append( gt1[i] )
		geno2[i].append( gt2[i] )

# Write VCF file
ids_str = '\t'.join(ids)
print(f"#CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tGT\t{ids_str}")
for i in range(len(snps)) :
	id, chr, pos = snps[i]
	ref, alt, count, gtStr = alleles( geno1[i], geno2[i] )
	printf(f"{chr}\t{pos}\t{id}\t{ref}\t{alt}\t{count}\t{gtStr}")