1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348
|
#!/usr/bin/perl
# this script works on top of the 'halTools' toolbox
# and exports a hal alignment to maf by splitting the alignment into several smaller
# alignment chunks of a certain size. An overlap between two consecutive alignment chunks can
# be specified. Furthermore, a list of 'genic' regions/intervals can be passed.
# In this case, the splitting is restricted to intergenic regions, e.g. position
# outside of the given intervals.
# Stefanie Koenig 2.06.2015
use strict;
use warnings;
use Getopt::Long; # for parameter specification on the command line
my $usage = <<'ENDUSAGE';
hal2maf_split.pl this script works on top of the 'halTools' toolbox
and exports a hal alignment to maf by splitting the alignment into several smaller
alignment chunks of a certain size. An overlap between two consecutive alignment chunks can
be specified. Furthermore, a list of 'genic' regions/intervals can be passed.
In this case, the splitting is restricted to intergenic regions, e.g. position
outside of the given intervals.
SYNOPSIS
hal2maf_split.pl --halfile aln.hal --refGenome genome
--halfile F F is the input hal file
--refGenome S S is the name of the reference genome
OPTIONS
--help output this help message
--keepDupes keep duplicates, i.e. alignments of a sequence with itself (default: off)
--keepAncestors export ancestral sequences (default: off)
--refSequence S S is the name of the reference sequence within the reference genome
(default: all sequences in the reference genome)
--outdir D D is the directory to which the output MAF files are written (default: current directory)
--chunksize N size of the aligment chunk. N is the number of bases in the reference
genome that are covered by the alignment chunks (default: 2500000)
--overlap N overlap between to consecutive alignment chunks. N is the nunber of overlapping
bases in the reference genome (default: 500000)
--cpus N number of cpus (default: 1)
--hal_exec_dir D D is the path to the hal executables. If not specified it must be in \$PATH environment variable.
--no_split_list L list of 'genic' intervals. The splitting of the alignment is not allowed
within these regions. L is a file with the following format:
seqname <tab> start <tab> end <newline>. Example:
chr2 120567671 120601255
chr2 120604238 120609520
chr5 65261850 65335670
chr5 56530780 865308994
...
DESCRIPTION
Example:
hal2maf_split.pl --halfile flies.hal --refGenome dmel --refSequence 3L --cpus 8 --hal_exec_dir /home/stefanie/tools/progressiveCactus/submodules/hal/bin
ENDUSAGE
sub cannot_overlap;
my ($halfile, $refGenome, $keepDupes, $keepAncestors, $refSequence, $no_split_list, $help); # options
my $h2m_param = "";
my $hal_exec_dir;
my $chunksize = 2500000;
my $overlap = 500000;
my $padding = 10000;
my $outdir;
my $min_intergenic = 2000; # when a list of genic intervals is given,
# all intervals that are within this distance are combined to a single interval
my $cpus = 1;
GetOptions('halfile:s'=>\$halfile,
'refGenome:s'=>\$refGenome,
'refSequence:s'=>\$refSequence,
'keepDupes!'=>\$keepDupes,
'keepAncestors!'=>\$keepAncestors,
'chunksize:i' =>\$chunksize,
'overlap:i' =>\$overlap,
'cpus:i' =>\$cpus,
'no_split_list:s' =>\$no_split_list,
'hal_exec_dir:s' =>\$hal_exec_dir,
'outdir:s' =>\$outdir,
'help!'=>\$help);
if ($help){
print $usage;
exit(1);
}
if(!defined($halfile)){
print "hal alignment missing.\n$usage";
exit(1);
}
if(!defined($refGenome)){
print "reference genome missing.\n$usage";
exit(1);
}
if(!defined($refGenome)){
print "reference genome missing.\n$usage";
exit(1);
}
if(!defined($keepDupes)){
$h2m_param.=" --noDupes";
}
if(!defined($keepAncestors)){
$h2m_param.=" --noAncestors";
}
if(defined($outdir)){
if($outdir =~/[^\/]$/){
$outdir .= '/';
}
unless (-e $outdir) {
mkdir $outdir;
}
}
else{
$outdir = "";
}
# check whether this perl module for paralell execution is installed
my $got_ForkManager = 0;
eval { require Parallel::ForkManager };
unless ($@) {
$got_ForkManager = 1;
Parallel::ForkManager->import();
}
if (!$got_ForkManager && $cpus > 1){
print STDERR "The perl module Parallel::ForkManager is required to run hal2maf_split.pl in parallel.\n";
print STDERR "Install this module first. On Ubuntu linux install with\n";
print STDERR "sudo apt-get install libparallel-forkmanager-perl\n";
print STDERR "Will now run sequentially (--cpus=1)...\n";
$cpus = 1;
}
if(defined($hal_exec_dir)){
if($hal_exec_dir =~/[^\/]$/){
$hal_exec_dir .= '/';
}
}
else{
$hal_exec_dir = "";
}
my $halStats = $hal_exec_dir . "halStats";
my $hal2maf = $hal_exec_dir . "hal2maf";
# check whether halStats and hal2maf are properly installed
if (qx(which "$halStats") !~ /halStats$/){
die ("$halStats is not executable. Please add the directory which contains the executable halStats to the PATH environment variable or specify the path with --hal_exec_dir.");
}
if (qx(which "$hal2maf") !~ /hal2maf$/){
die ("$hal2maf is not executable. Please add the directory which contains the executable hal2maf to the PATH environment variable or specify the path with --hal_exec_dir.");
}
# reading in the name and length of each sequence in the reference genome ...
my @seqlist = qx($halStats --chromSizes $refGenome $halfile);
my $r_c = $?; # return code
if($r_c != 0){
print "terminated after an error in halStats.\n";
exit(1);
}
# ... and store it in a hash
my %seqHash = ();
foreach(@seqlist){
chomp;
my($seqname,$len)=split("\t",$_);
next if(defined($refSequence) && $refSequence ne $seqname); # if a reference sequence is specified, skip all others
$seqHash{$seqname}=$len;
}
if(defined($refSequence) && !exists($seqHash{$refSequence})){
print "Reference sequence $refSequence not found in reference genome $refGenome\n";
exit(1);
}
# reading in no_split_list
my @intervals = ();
if(defined($no_split_list)){
open(IN, "<$no_split_list") or die ("Could not open $no_split_list");
while(<IN>){
chomp;
my($chr,$start,$end)=split("\t",$_);
next if(defined($refSequence) && $refSequence ne $chr); # if a reference sequence is specified, skip all others
push @intervals,[$chr, $start, $end];
}
close(IN);
}
# join neighboring genic intervals within a distance of atmost $min_intergenic bases
# sort intervals by 1. chromosome, 2. start
@intervals = sort {$a->[0] cmp $b->[0] || $a->[1] <=> $b->[1]} @intervals;
my @joined=(); # array of sorted, joined genic intervals: Elements: <seq> <start> <end> <end of preceeding interval> <start of succeeding interval>
my ($chr, $start, $end);
my $pred_end = 0;
foreach (@intervals){
if(defined($chr) && $_->[0] eq $chr && $_->[1] - $end <= $min_intergenic ) {
if($end < $_->[2]){
$end = $_->[2];
}
} else {
if (defined($chr)){
my $succ_start = $seqHash{$chr} - 1; # start position of succeeding interval on this sequence
if($_->[0] eq $chr){
$succ_start = $_->[1];
}
push @joined,[$chr, $start, $end, $pred_end, $succ_start];
$pred_end = $end; # end position of preceeding interval on this sequence
if($_->[0] ne $chr){
$pred_end = 0;
}
}
($chr, $start, $end) = ($_->[0], $_->[1], $_->[2]);
}
}
if (defined($chr)){
my $succ_start = $seqHash{$chr} - 1;
push @joined,[$chr, $start, $end, $pred_end, $succ_start];
}
# calculate start and end positions of alignment chunks
#
# if a list of genic intervals is specified, the start/end of an alignment chunk is shifted to
# an intergenic region if it is contained in a genic interval:
#
# case 1)
# the start position s of an alignment chunk A is contained in a genic interval
#
# s
# |----------------------| alignment chunk A
# |---------| |---------| genic intervals I1,...,Im (all disjunct)
# y x
#
# => s is shifted to the upstream intergenic region: s=max{(x+y)/2, x-10kb}
#
# case 2)
# the end position e of an alignment chunk A is contained in a genic interval
#
# e
# |----------------------| alignment chunk A
# |---------| |---------| genic intervals I1,...,Im (all disjunct)
# x y
#
# => e is shifted to the downstream intergenic region: e=min{(x+y)/2, x+10kb}
my @aln_chunks = (); # each element is one alignment chunk, e.g. sequence segment (seqname:start-end) in the reference
foreach my $seq (sort {$a cmp $b} keys %seqHash ){
my $seqlen = $seqHash{$seq};
my $start = 0;
while($start + $chunksize <= $seqlen){
my $end = $start + $chunksize - 1;
# go to first genic interval that may contain the start position of the alignment chunk
while (@joined && cannot_overlap($joined[0], [$seq, $start, $end])){
shift @joined;
}
foreach(@joined) {
last if($_->[0] gt $seq || ($_->[0] eq $seq && $_->[1] > $start));
if($_->[0] eq $seq && $_->[1] <= $start && $_->[2] >= $start){ # start position of alignment chunk within a genic interval
$start = int(($_->[1]+$_->[3])/2); #shift start position fo upstream intergenic region
if($_->[1]-$padding > $start){
$start=$_->[1]-$padding;
}
last;
}
}
foreach(@joined) {
last if($_->[0] gt $seq || ($_->[0] eq $seq && $_->[1] > $end));
if($_->[0] eq $seq && $_->[1] <= $end && $_->[2] >= $end){ # end position of alignment chunk within a genic interval
$end = int(($_->[2]+$_->[4])/2); # shift end position to downstream intergenic region
if($_->[2]+$padding<$end){
$end=$_->[2]+$padding;
}
last;
}
}
push @aln_chunks,[$seq, $start, $end]; # add alignment chunk
$start = $end + 1 - $overlap; # start position of next alignment chunk
}
# last alignment chunk
my $end = $seqlen - 1;
foreach(@joined) {
last if($_->[0] gt $seq || ($_->[0] eq $seq && $_->[1] > $start));
if($_->[0] eq $seq && $_->[1] <= $start && $_->[2] >= $start){
$start = int(($_->[1]+$_->[3])/2);
if($_->[1]-$padding > $start){
$start=$_->[1]-$padding;
}
last;
}
}
push @aln_chunks,[$seq, $start, $end]; # add alignment chunk
}
# export alignment chunks in parallel to maf format
if($cpus > 1){
my $pm = new Parallel::ForkManager($cpus);
foreach(@aln_chunks){
# this part is done in parallel by the child process
my $pid = $pm->start and next;
my ($seq, $start, $end) = ($_->[0], $_->[1], $_->[2]);
my $chunksize = $end - $start + 1;
my $cmd = "$hal2maf --refGenome $refGenome $h2m_param --refSequence $seq --start $start --length $chunksize $halfile $outdir$seq.$start-$end.maf";
print "$cmd\n";
qx($cmd);
my $r_c = $?; # return code
if($r_c != 0){
print "terminated after an error in hal2maf.\n";
exit(1);
}
$pm->finish; # terminate the child process
}
$pm->wait_all_children;
}
else{ # export alignment chunks sequentially to maf format
foreach(@aln_chunks){
my ($seq, $start, $end) = ($_->[0], $_->[1], $_->[2]);
my $chunksize = $end - $start + 1;
my $cmd = "$hal2maf --refGenome $refGenome $h2m_param --refSequence $seq --start $start --length $chunksize $halfile $outdir$seq.$start-$end.maf";
print "$cmd\n";
qx($cmd);
my $r_c = $?;
if($r_c != 0){
print "terminated after an error in hal2maf.\n";
exit(1);
}
}
}
# returns true if interval a
# - comes before interval b in the sorting order and
# - does not overlap with b
sub cannot_overlap {
my $a = shift;
my $b = shift;
if ($a->[0] lt $b->[0] or ( $a->[0] eq $b->[0] and $a->[2] < $b->[1])){
return 1;
}
return 0;
}
|