File: bam_coverage_windows.pl

package info (click to toggle)
libbio-graphics-perl 2.40-3
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 2,884 kB
  • sloc: perl: 21,801; makefile: 14
file content (131 lines) | stat: -rwxr-xr-x 2,941 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/perl -w
use strict;
use List::Util 'sum';
use Getopt::Long;

use vars qw/$start $end $current_chr @scores %seen %highest $win $normal $bam/;

use constant WIN => 25;

# A script to make bam coverage windows in WIG/BED 4 format
# requires that samtools be installed
# This script operates on one bam file at a time.  If you are comparing
# across bam files of diffenrent sizes (read numbers), take note
# of the normalization option.
# Sheldon McKay (sheldon.mckay@gmail.com)


BEGIN {
    die "samtools must be installed!" unless `which samtools`;
}


GetOptions ("normalize=i" => \$normal,
            "bam=s"       => \$bam,
            "window=i"    => \$win);

$bam or usage();

open BAM, "samtools depth $bam |";

$win ||= WIN;
$start = 1;
$end   = $win;

my $factor = normalization_factor($normal); 

chomp(my $name = `basename $bam .bam`);
print qq(track type=wiggle_0 name="$name" description="read coverage for $bam (window size $win)"\n);

while (<BAM>) {
    chomp;
    my ($chr,$coord,$score) = split;
    $current_chr ||= $chr;

    check_sorted($chr,$coord);

    if ( $chr ne $current_chr ||
	 $coord > $end ) {
	open_window($chr,$coord);
    }

    push @scores, $score;
    $current_chr = $chr;
}


sub open_window {
    my ($chr,$coord) = @_;

    if ($chr ne $current_chr) {
	$seen{$current_chr}++;
    }

    # close the last window, if needed
    if (@scores > 0) {
	close_window();
    }

    $start = nearest_start($coord);
    $end   = $start + $win;
}

sub close_window {
    my $sum = sum(@scores) or return 0;
    my $score = $sum/$win;
    $score *= $factor;
    print join("\t",$current_chr,$start,$end,$score), "\n";
    @scores = ();
    exit 0 unless $score;
}

sub nearest_start {
    my $start = shift;

    return 1 if $start < $win;

    while  ($start % $win) {
	$start--;
    }

    return $start;
}

sub normalization_factor {
    return 1 unless my $nahm = shift;
    print STDERR "Calculating total number of reads in $bam\n";
    chomp(my $total = `samtools view -c $bam`);
    print STDERR "$bam has $total reads\n";
    return $total/$nahm;
}

# sanity check for unsorted bam
sub check_sorted {
    my ($chr,$coord) = @_;

    return 1 if $coord > ($highest{$chr} || 0);
    $highest{$chr} = $coord;

    return 1 unless $seen{$chr};

    die_unsorted($chr,$coord);
}

sub die_unsorted {
    my ($chr,$coord) = @_;
    die "$chr $coord: $bam does not appear to be sorted\n",
    "Please try 'samtools sort' first";
}

sub usage {
    die '
Usage: perl bam_coverage_windows.pl -b bamfile -n 10_000_000 -w 25
    -b name of bam file to read REQUIRED
    -w window size (default 25)
    -n normalized read number -- if you will be comparing multiple bam files 
                                 select the read number to normalize against.
                                 all counts will be adjusted by a factor of:
                                 actual readnum/normalized read num

'
}