File: filterMaf.pl

package info (click to toggle)
augustus 3.5.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 777,036 kB
  • sloc: cpp: 80,063; perl: 21,491; python: 4,368; ansic: 1,244; makefile: 1,126; sh: 171; javascript: 32
file content (122 lines) | stat: -rwxr-xr-x 2,923 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/perl
# filter blocks from a MAF alignment that intersect with a given genomic interval
# or filter sequences for a given subset of species
# Stefanie Koenig 23.06.2013

use strict;
use warnings;

use Getopt::Long; # for parameter specification on the command line

my $usage = <<'ENDUSAGE';
filterMaf.pl	filter blocks from a MAF alignment that intersect with a given genomic interval
		or filter sequences for a given subset of species
SYNOPSIS

filterMaf.pl < alignment.maf > filter.maf

OPTIONS

    --help             			output this help message
    --species=species1,species2,...	a comma separated list of the species which shall be kept in the filtered alignment (at least 2 species,
                                        by default all species are kept)		
    --min-seq N				only blocks that contain at list N sequences are taken (N=2 by default)
    --interval=seqID:start-end	        a genomic interval (seqID is equal to the first field of an 's' line in the alignment)

DESCRIPTION
      
  Example:

    filterMaf.pl --species=hg19,mm9,rheMac2,bosTau4 < alignment.maf > filter.maf
    filterMaf.pl --interval=hg19.chr21:43490000-43560000 < alignment.maf > filter.maf
    filterMaf.pl --min-seq 5 < alignment.maf > filter.maf
    filterMaf.pl --species=hg19,mm9,rheMac2,bosTau4 --min-seq 4 --interval=mm9.chr17:43490000-43560000 < alignment.maf > filter.maf

ENDUSAGE

my ($specieslist, $interval, $help); # options
my $minseq=2;

GetOptions('interval=s'=>\$interval,
	   'species=s'=>\$specieslist,
	   'min-seq:f' =>\$minseq,
           'help!'=>\$help);

if ($help){
    	print $usage;
    	exit(1);
}

my @slist=();
if (defined($specieslist)){
    @slist=split(/,/,$specieslist);
    if(@slist < 2){
	print "$specieslist has to be a comma separated list of at least two species.\n$usage";
    }
}


my ($s,$b,$e);
if (defined($interval)){
    if($interval=~m/(.+):(\d+)\-(\d+)/){
	$s=$1;
	$b=$2;
	$e=$3;
    }
    else{
	print "$interval has the wrong syntax.\n$usage";
    }
}

my @block=();
my $overlap = 0;

while(<>){
    if(/##maf/){
	print $_;
    }
    elsif(/^a\s/){
	if ((@block >= $minseq+1) && $overlap){
	    for (my $j=0; $j<@block; $j++){
		print $block[$j];
	    }
	    print "\n";
    	}
	@block=();
	push @block, $_;
	$overlap=0;
    }
    elsif (/^s\s/){
	my @f = split(/\s+/,$_);
	my ($seqID, $start, $alen, $strand, $slen) = ($f[1], $f[2], $f[3], $f[4], $f[5]);
	if($strand eq '-'){
	    $start= $slen - $start - $alen;
	}
	my $end=$start + $alen;
	$start++;
	if (!defined($specieslist)){
	    push @block, $_;
	}else{
	    foreach my $species (@slist){
		if($seqID=~m/$species/){
		    push @block, $_;
		    last;
		}
	    }
	}
	if (!defined($interval)){
	    $overlap=1;
	}
	else{
	    if(!($end<$b || $start>$e) && ($s eq $seqID)){
		$overlap=1;
	    }
	}
    }
}

if ((@block >= $minseq+1) && $overlap){
    for (my $j=0; $j<@block; $j++){
      print $block[$j];
  }
}