File: sga-bam2de.pl

package info (click to toggle)
sga 0.10.15-4
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 62,932 kB
  • sloc: cpp: 48,538; ansic: 2,035; python: 1,037; perl: 744; makefile: 349; sh: 185
file content (113 lines) | stat: -rwxr-xr-x 3,290 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#! /usr/bin/perl

use strict;
use Getopt::Long;

my $n = 5;
my $k = 99;
my $minLength = 200;
my $prefix = "";
my $numThreads = 1;
my $mind = -99; # minimum gap size to pass to abyss
my $mina = 100; # minimum alignment length to pass to abyss
my $help = ''; # (false)

# Filter the abyss distance est histogram to remove insert sizes
# with fewer than hist_min data points
my $hist_min = 3;

GetOptions("prefix=s" => \$prefix,
           "k=i"      => \$k,
           "n=i"      => \$n,
           "m=i"      => \$minLength,
           "mind=i"   => \$mind,
           "mina=i"   => \$mina,
           "t=i"      => \$numThreads,
           "help"     => \$help);
if($help)
{
   usage();
   exit(0);
}

checkDependency("abyss-fixmate");
checkDependency("DistanceEst");

my $bFail = 0;
if($prefix eq "")
{
    print("Error: A prefix must be specified\n");
    $bFail = 1;
}
my $bamFile = $ARGV[0];
if($bamFile eq "")
{
    print "Error: A bam file must be provided\n";
    $bFail = 1;
}

if($minLength <= 0)
{
    print "Error: minimum contig length (-m) must be greater than 0\n";
    $bFail = 1;
}

if($bFail)
{
    usage();
    exit(1);
}

my $cmd;

# fix-mate info
$cmd = "abyss-fixmate -h $prefix.tmp.hist $bamFile | samtools view -Sb - > $prefix.diffcontigs.bam";
runCmd($cmd);

# DistanceEst is very slow when the learned insert size distribution is very wide
# To work around this, remove singleton data points from the histogram
runCmd("awk \'\$2 >= $hist_min\' $prefix.tmp.hist > $prefix.hist");

# sort 
$cmd = "samtools sort -\@$numThreads $prefix.diffcontigs.bam -o $prefix.diffcontigs.sorted.bam";
runCmd($cmd);

# distance est
my $mind_opt = "--mind $mind";
$cmd = "DistanceEst -s $minLength $mind_opt -n $n -k $k -j $numThreads -o $prefix.de $prefix.hist -l $mina $prefix.diffcontigs.sorted.bam";
runCmd($cmd);

sub usage
{
    print "sga-bam2de.pl - Make a distance estimate file from a bam file of reads aligned to contigs\n";
    print "sga-bam2de.pl -n N --prefix lib300 lib300.bam\n";
    print "Options:\n";
    print "                -n N             Minimum number of pairs required to consider two contigs linked\n";
    print "                -m LEN           Only find links between contigs with length at least LEN bp (default: 200)\n";
    print "                -t NUM           Use NUM threads for computing the distance estimates\n";
    print "                --prefix NAME    Use NAME as the prefix for the outfiles\n";
    print "                --mind D         Set the minimum distance estimate to test to be D. This should be a negative\n";
    print "                                 number if contigs are expected to overlap. Defaults to -99bp.\n";
    print "                --mina N         Set the minimum alignment length to be N.\n";
    print "                --help           This help output.\n";
}

sub runCmd
{
    my($c) = @_;
    print "$c\n";
    system($c);
}

# Check whether the programs are found and executable
sub checkDependency
{
    while(my $program = shift) {
        my $ret = system("/bin/bash -c \"hash $program\"");
        if($ret != 0) {
            print STDERR "Could not find program $program. Please install it or update your PATH.\n";
            print STDERR "Return: $ret\n";
            exit(1);
        }
    }
}