File: vcfFilterSamples.pl

package info (click to toggle)
snpeff 5.2.f%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 701,384 kB
  • sloc: java: 62,547; perl: 2,279; sh: 1,185; python: 744; xml: 507; makefile: 50
file content (95 lines) | stat: -rwxr-xr-x 2,389 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/perl

#-------------------------------------------------------------------------------
#
# Create a VCF subset using some samples.
# Sample IDs are provided in a TXT file (format: one sample ID per line).
#
#
#															Pablo Cingolani
#-------------------------------------------------------------------------------

$debug = 0;			# Debug mode?
$SAMPLE_COLUMN = 9;	# Column where sample information starts in VCF 
%sample2col = {};	# Sample names to column number
@samplesKeep = ();	# Samples to keep

#-------------------------------------------------------------------------------
# Select some samples in a VCF line
#-------------------------------------------------------------------------------
sub selectSamples($) {
	my($i, $l, $out, @fields);
	($l) = @_;
	@fields = split /\t/, $l;

	$out = "";
	for( $i=0 ; $i < $SAMPLE_COLUMN ; $i++ ) {
		$out .= "\t" if( $out ne '' );
		$out .= $fields[$i];
	}

	for( $i=0 ; $i <= $#samplesKeep ; $i++ ) {
		$col = $sample2col{ $samplesKeep[$i] };
		$out .= "\t$fields[$col]";
	}

	print "$out\n";
}

#-------------------------------------------------------------------------------
# Main
#-------------------------------------------------------------------------------

#---
# Parse command line 
#---
$samplesFile = @ARGV[0];
die "Usage: cat file.vcf | ./vcfFilterSamples.pl sample_IDs.txt\n" if $samplesFile eq '';

#---
# Read samples file
#---
open SAMPLES, $samplesFile or die "Cannot open samples file '$samplesFile'\n";
for( $i=0 ; $l = <SAMPLES> ; $i++ ) {
	chomp $l;
	$samplesKeep[$i] = $l;
	print "samplesKeep[$i] = '$samplesKeep[$i]'\n" if $debug;
}
close SAMPLES;

#---
# Read VCF from STDIN
#---
$sampleNamesFound = 0;
while($l = <STDIN>) {
	chomp $l;

	if($l =~ /^#/) {
		# VCF header
		if( $l =~ /#CHROM/ ) {
			@t = split /\t/, $l;

			# Create sample name to sample number mapping
			# VCF sample names start at column 9
			for( $i=$SAMPLE_COLUMN ; $i <= $#t ; $i++ ) {
				$sname = $t[$i];
				print STDERR "Sample[$i] = '$sname'\n" if $debug;
				$sample2col{$sname} = $i if $sname ne '';
			}

			# Check that all samples 'to keep' are found
			foreach $sname ( @samplesKeep ) {
				die "Error: Cannot find sample '$sname' in VCF header\n" if( $sample2col{$sname} eq '' );
			}

			# Print header line
			selectSamples($l);
		} else {
			print "$l\n";
		}
	} else {
		# VCF body
		selectSamples($l);
	}
}