File: plot-bamstats

package info (click to toggle)
samtools 1.22.1-1
links: PTS, VCS
area: main
in suites: forky, sid
size: 22,092 kB
sloc: ansic: 36,084; perl: 7,882; sh: 475; makefile: 302; java: 158
file content (1708 lines) | stat: -rwxr-xr-x 59,243 bytes
#!/usr/bin/env perl
#
#    Copyright (C) 2012-2014,2018, 2024 Genome Research Ltd.
#
#    Author: Petr Danecek <pd3@sanger.ac.uk>
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
# DEALINGS IN THE SOFTWARE.

use strict;
use warnings;
use Carp;
use POSIX qw/floor ceil/;
use URI::Escape qw(uri_escape);
use List::Util qw(max);

my $opts = parse_params();
parse_bamcheck($opts);
if ( @{$$opts{bamcheck}} > 1 ) { merge_bamcheck($opts); exit; }

plot_qualities($opts);
plot_acgt_cycles($opts);
plot_gc($opts);
plot_gc_depth($opts);
plot_isize($opts);
plot_coverage($opts);
plot_mismatches_per_cycle($opts);
plot_indel_dist($opts);
plot_indel_cycles($opts);
plot_read_len_hist($opts);
create_html($opts);

exit;

#--------------------------------

sub error
{
    my (@msg) = @_;
    if ( scalar @msg ) { confess @msg; }
    die
        "About: Parses output of samtools stats (former bamcheck) and calls gnuplot to create graphs.\n",
        "Usage: plot-bamstats [OPTIONS] file.bam.bc\n",
        "       plot-bamstats -p outdir/ file.bam.bc\n",
        "Options:\n",
        "   -k, --keep-files                    Do not remove temporary files.\n",
        "   -l, --log-y                         Set the Y axis scale of the Insert Size plot to log 10.\n",
        "   -m, --merge                         Merge multiple bamstats files and output to STDOUT.\n",
        "   -p, --prefix <path>                 The output files prefix, add a slash to create new directory.\n",
        "   -r, --ref-stats <file.fa.gc>        Optional reference stats file with expected GC content (created with -s).\n",
        "   -s, --do-ref-stats <file.fa>        Calculate reference sequence GC for later use with -r\n",
        "   -t, --targets <file.tab>            Restrict -s to the listed regions (tab-delimited chr,from,to. 1-based, inclusive)\n",
        "   -q, --alt-qual-avg-method           Calculate average base quality using average base error probabilities rather than average quality scores.\n",
        "   -h, -?, --help                      This help message.\n",
        "\n";
}


sub parse_params
{
    $0 =~ s{^.+/}{};
    my $opts =
    {
        args => join(' ',$0,@ARGV),

        # for merging
        sections =>
        [
            {
                id=>'SN',
                header=>
                    "# Summary Numbers. Use `grep ^SN | cut -f 2-` to extract this part.\n",
            },
            {
                id=>'FFQ',
                header=>
                    "# First Fragment Qualities. Use `grep ^FFQ | cut -f 2-` to extract this part.\n" .
                    "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n",
            },
            {
                id=>'LFQ',
                header=>
                    "# Last Fragment Qualities. Use `grep ^LFQ | cut -f 2-` to extract this part.\n" .
                    "# Columns correspond to qualities and rows to cycles. First column is the cycle number.\n",
            },
            {
                id=>'MPC',
                header=>
                    "# Mismatches per cycle and quality. Use `grep ^MPC | cut -f 2-` to extract this part.\n" .
                    "# Columns correspond to qualities, rows to cycles. First column is the cycle number, second\n" .
                    "# is the number of N's and the rest is the number of mismatches\n",
            },
            {
                id=>'GCF',
                header=>"# GC Content of first fragments. Use `grep ^GCF | cut -f 2-` to extract this part.\n",
            },
            {
                id=>'GCL',
                header=>"# GC Content of last fragments. Use `grep ^GCL | cut -f 2-` to extract this part.\n",
            },
            {
                id=>'GCC',
                header=>"# ACGT content per cycle. Use `grep ^GCC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [\%]; and N and O counts as a percentage of all A/C/G/T bases [\%]\n",
            },
	    {
		id => 'FBC',
		header=>"# ACGT content per cycle for first fragments. Use `grep ^FBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [\%]; and N and O counts as a percentage of all A/C/G/T bases [\%]\n",
	    },
	    {
		id => 'LBC',
		header=>"# ACGT content per cycle for last fragments. Use `grep ^LBC | cut -f 2-` to extract this part. The columns are: cycle; A,C,G,T base counts as a percentage of all A/C/G/T bases [\%]; and N and O counts as a percentage of all A/C/G/T bases [\%]\n",
	    },
            {
                id=>'IS',
                header=>"# Insert sizes. Use `grep ^IS | cut -f 2-` to extract this part. The columns are: insert size, pairs total, inward oriented pairs, outward oriented pairs, other pairs\n",
            },
            {
                id=>'RL',
                header=>"# Read lengths. Use `grep ^RL | cut -f 2-` to extract this part. The columns are: read length, count\n",
            },
            {
                id=>'FRL',
                header=>"# Read lengths - first fragments. Use `grep ^FRL | cut -f 2-` to extract this part. The columns are: read length, count\n",
            },
            {
                id=>'LRL',
                header=>"# Read lengths - last fragments. Use `grep ^LRL | cut -f 2-` to extract this part. The columns are: read length, count\n",
            },
            {
                id=>'ID',
                header=>"# Indel distribution. Use `grep ^ID | cut -f 2-` to extract this part. The columns are: length, number of insertions, number of deletions\n",
            },
            {
                id=>'IC',
                header=>"# Indels per cycle. Use `grep ^IC | cut -f 2-` to extract this part. The columns are: cycle, number of insertions (fwd), .. (rev) , number of deletions (fwd), .. (rev)\n",
            },
            {
                id=>'COV',
                header=>"# Coverage distribution. Use `grep ^COV | cut -f 2-` to extract this part.\n",
            },
            # {
            #     id=>'GCD',
            #     header=>"# GC-depth. Use `grep ^GCD | cut -f 2-` to extract this part. The columns are: GC%, unique sequence percentiles, 10th, 25th, 50th, 75th and 90th depth percentile\n",
            # },
        ],

        # for merging
        merge_keys =>
        {
            'sum' =>
            [
                'raw total sequences:',
                'filtered sequences:',
                'sequences:',
                '1st fragments:',
                'last fragments:',
                'reads mapped:',
                'reads mapped and paired:',
                'reads unmapped:',
                'reads properly paired:',
                'reads paired:',
                'reads duplicated:',
                'reads MQ0:',
                'reads QC failed:',
                'non-primary alignments:',
                'total length:',
                'total first fragment length:',
                'total last fragment length:',
                'bases mapped:',
                'bases mapped (cigar):',
                'bases trimmed:',
                'bases duplicated:',
                'mismatches:',
                'inward oriented pairs:',
                'outward oriented pairs:',
                'pairs with other orientation:',
                'pairs on different chromosomes:',
            ],
            'min' =>
            [
                'is sorted:',
            ],
            'max' =>
            [
                'maximum length:',
            ],
        },
    };
    for my $key (keys %{$$opts{merge_keys}})
    {
        for my $val (@{$$opts{merge_keys}{$key}}) { $$opts{hmerge}{$val} = $key; }
    }
    while (defined(my $arg=shift(@ARGV)))
    {
        if ( $arg eq '-k' || $arg eq '--keep-files' ) { $$opts{keep_files}=1; next; }
        if ( $arg eq '-l' || $arg eq '--log-y' ) { $$opts{y_axis_log10}=1; next; }
        if ( $arg eq '-m' || $arg eq '--merge' ) { $$opts{merge}=1; next; }
        if ( $arg eq '-r' || $arg eq '--ref-stats' ) { $$opts{ref_stats}=shift(@ARGV); next; }
        if ( $arg eq '-s' || $arg eq '--do-ref-stats' ) { $$opts{do_ref_stats}=shift(@ARGV); next; }
        if ( $arg eq '-t' || $arg eq '--targets' ) { $$opts{targets}=shift(@ARGV); next; }
        if ( $arg eq '-p' || $arg eq '--prefix' ) { $$opts{prefix}=shift(@ARGV); next; }
	if ( $arg eq '-q' || $arg eq '--alt-qual-avg-method' ) { $$opts{alt_qual_avg_method}=1; next; }
        if ( $arg eq '-?' || $arg eq '-h' || $arg eq '--help' ) { error(); }
        if ( $arg eq '-'  || -e $arg ) { push @{$$opts{bamcheck}},$arg; next; }
        error("Unknown parameter or non-existent file \"$arg\". Run -h for help.\n");
    }
    if ( exists($$opts{do_ref_stats }) ) { do_ref_stats($opts); exit; }
    if ( !exists($$opts{bamcheck}) ) { error("No samtools stats file?\n") }

    if ( !exists($$opts{prefix}) )
    {
        if ( !$$opts{merge} ) { error("Expected -p parameter.\n") }
        $$opts{prefix} = './';
    }
    elsif ( $$opts{merge} ) { error("Only one of -p or -m should be given.\n"); }
    if ( $$opts{merge} )
    {
        if ( @{$$opts{bamcheck}} < 2 ) { error("Nothing to merge\n") }
    }
    else
    {
        if ( !exists($$opts{prefix}) ) { error("Expected -p parameter.\n") }
        if ( $$opts{prefix}=~m{/$} ) { `mkdir -p $$opts{prefix}`; }
        elsif ( !($$opts{prefix}=~/-$/) ) { $$opts{prefix} .= '-'; }
    }
    return $opts;
}


# Creates GC stats for either the whole reference or only on target regions for exome QC
sub do_ref_stats
{
    my ($opts) = @_;


    my %targets = ();
    if ( exists($$opts{targets}) )
    {
        my ($prev_chr,$prev_pos);
        open(my $fh,'<',$$opts{targets}) or error("$$opts{targets}: $!");
        while (my $line=<$fh>)
        {
            if ( $line=~/^#/ ) { next; }
            my ($chr,$from,$to) = split(/\s+/,$line);
            chomp($to);
            push @{$targets{$chr}}, $from,$to;
            if ( !defined $prev_chr or $chr ne $prev_chr ) { $prev_chr=$chr; $prev_pos=$from }
            if ( $prev_pos > $from ) { error("The file must be sorted: $$opts{targets}\n"); }
            $prev_pos = $from;
        }
        close($fh);
    }

    my $nlen = 0;
    my %lens = ();
    my %gc_counts = ();
    my ($skip_chr,$pos,$ireg,$regions);
    open(my $fh,'<',$$opts{do_ref_stats}) or error("$$opts{do_ref_stats}: $!");
    while (my $line=<$fh>)
    {
        if ( $line=~/^>/ )
        {
            if ( !scalar %targets ) { next; }

            if ( !($line=~/>(\S+)/) ) { error("FIXME: could not determine chromosome name: $line"); }
            if ( !exists($targets{$1}) ) { $skip_chr=$1; next; }
            undef $skip_chr;
            $pos = 0;
            $ireg = 0;
            $regions = $targets{$1};
            next;
        }
        if ( defined $skip_chr ) { next; }

        # Only $_len sized lines are considered and no chopping for target regions.
        chomp($line);
        my $len = length($line);
        $lens{$len}++;
        $nlen++;

        if ( scalar %targets )
        {
            while ( $ireg<@$regions && $$regions[$ireg+1]<=$pos ) { $ireg += 2; }
            $pos += $len;
            if ( $ireg==@$regions ) { next; }
            if ( $pos < $$regions[$ireg] ) { next; }
        }

        my $gc_count = 0;
        for (my $i=0; $i<$len; $i++)
        {
            my $base = substr($line,$i,1);
            if ( $base eq 'g' || $base eq 'G' || $base eq 'c' || $base eq 'C' ) { $gc_count++; }
        }
        $gc_counts{$gc_count}++;
    }

    # Calculate median length
    my $n = 0;
    my $median_len = 0;
    for my $len (sort { $a<=>$b } keys %lens)
    {
        $n += $lens{$len};
        if ( $n >= $nlen ) { $median_len = $len; last; }
    }
    if ( !$median_len ) { error("FIXME: median_len=$median_len??\n"); }

    print "# Generated by $$opts{args}\n";
    print "# The columns are: GC content bin, normalized frequency\n";
    my $max;
    for my $count (values %gc_counts)
    {
        if ( !defined $max or $count>$max ) { $max=$count; }
    }
    for my $gc (sort {$a<=>$b} keys %gc_counts)
    {
        if ( $gc==0 ) { next; }
        printf "%f\t%f\n", $gc*100./$median_len, $gc_counts{$gc}/$max;
    }
}

sub plot
{
    my ($cmdfile) = @_;
    my $cmd = "gnuplot $cmdfile";
    system($cmd);
    if ( $? ) { error("The command exited with non-zero status $?:\n\t$cmd\n\n"); }
}

sub open_file
{
    my ($file) = @_;
    my $fh;
    if ( $file eq '-' ) { $fh = *STDIN; }
    elsif ( $file=~/\.gz$/i ) { open($fh, "gunzip -c $file |") or error("gunzip -c $file: $!"); }
    else { open($fh,'<',$file) or error("$file: $!"); }
    return $fh;
}

sub parse_bamcheck1
{
    my ($opts, $i) = @_;
    my $file = $$opts{bamcheck}[$i];
    print STDERR "Parsing samtools stats output: $file\n" unless !$$opts{verbose};
    my $fh = open_file($file);
    my $line = <$fh>;
    if ( !($line=~/^# This file was produced by (\S+)/) ) { error("Sanity check failed: was this file generated by samtools stats or plot-bamstats?"); }
    if ( $1 ne 'plot-bamstats' && $1 ne 'samtools' ) { error("Sanity check failed: was this file generated by samtools stats or plot-bamstats?"); }
    my %dat;
    my $sn_order = $$opts{sn_order} || [];
    my %sn_seen = map { $_ => 1 } @$sn_order;
    while ($line=<$fh>)
    {
        if ( $line=~/^#/ ) { next; }
        my @items = split(/\t/,$line);
        chomp($items[-1]);
        if ( $items[0] eq 'SN' )
        {
            $dat{SN}{$items[1]} = $items[2];
	    if (!exists($sn_seen{$items[1]})) {
		push(@$sn_order, $items[1]);
		$sn_seen{$items[1]} = 1;
	    }
            next;
        }
        push @{$dat{$items[0]}}, [splice(@items,1)];
    }
    close($fh) unless $file eq '-';

    $$opts{sn_order} = $sn_order;

    # Check sanity
    if ( !exists($dat{'SN'}{'sequences:'}) or !$dat{'SN'}{'sequences:'} )
    {
        error("Sanity check failed: no sequences found by samtools stats??\n");
    }
    my $nseq_new = $dat{'SN'}{'sequences:'};
    my $nseq_ori = exists($$opts{dat}{'SN'}) ? $$opts{dat}{'SN'}{'sequences:'} : 0;

    my %add = map { $_ => 1 } (qw(FFQ LFQ MPC GCF GCL IS ID IC RL FRL LRL));
    for my $a (keys %dat)
    {
        if ( !exists($$opts{dat}{$a}) ) { $$opts{dat}{$a} = $dat{$a}; next; } # first bamcheck file
        if ( $a eq 'SN' )
        {
            for my $b (keys %{$dat{$a}})
            {
                if ( exists($$opts{hmerge}{$b}) )
                {
                    if ( $$opts{hmerge}{$b} eq 'sum' ) { $$opts{dat}{$a}{$b} += $dat{$a}{$b}; next; }
                    if ( $$opts{hmerge}{$b} eq 'min' ) { $$opts{dat}{$a}{$b} = $$opts{dat}{$a}{$b} < $dat{$a}{$b} ? $$opts{dat}{$a}{$b} : $dat{$a}{$b}; }
                    if ( $$opts{hmerge}{$b} eq 'max' ) { $$opts{dat}{$a}{$b} = $$opts{dat}{$a}{$b} > $dat{$a}{$b} ? $$opts{dat}{$a}{$b} : $dat{$a}{$b}; }
                }
            }
            next;
        }
        if ( $add{$a} ) { add_to_matrix($$opts{dat}{$a}, $dat{$a}, 0); next; }
        if ( $a eq 'COV' ) { merge_coverage($$opts{dat}{$a}, $dat{$a}); next; }
        if ( $a eq 'GCC' || $a eq 'FBC' || $a eq 'LBC' ) {
	    merge_gcc($nseq_ori, $$opts{dat}{$a}, $nseq_new, $dat{$a});
	    next;
	}
        warn("Not processed: $a\n");
    }
    if ( $i==0 ) { return; }

    my $bases_mapped_cigar = get_value($opts,'SN','bases mapped (cigar):');
    $$opts{dat}{SN}{'error rate:'} = $bases_mapped_cigar ? sprintf "%e", get_value($opts,'SN','mismatches:') / $bases_mapped_cigar : 0;
    update_average_length($opts);
    update_average_quality($opts);
    update_average_isize($opts);
}

sub parse_bamcheck
{
    my ($opts) = @_;
    for (my $i=0; $i<@{$$opts{bamcheck}}; $i++) { parse_bamcheck1($opts,$i); }

    # Determine the default title, for now use the first BAM name:
    #       5446_6/5446_6.bam.bc.gp -> 5446_6
    #       test.aaa.png -> test.aaa
    if ( !($$opts{bamcheck}[0]=~m{([^/]+?)(?:\.bam)?(?:\.bc)?$}i) ) { error("FIXME: Could not determine the title from [$$opts{bamcheck}[0]]\n"); }
    $$opts{title} = $1;
}

sub add_to_matrix
{
    my ($dst,$src,$key) = @_;
    my $id = 0;
    my $is = 0;
    while ($is<@$src)
    {
        while ( $id<@$dst && $$src[$is][$key] > $$dst[$id][$key] ) { $id++; }
        if ( $id<@$dst && $$src[$is][$key] == $$dst[$id][$key] )
        {
            for (my $j=0; $j<@{$$src[$is]}; $j++)
            {
                if ( $j==$key ) { next; }
                $$dst[$id][$j] += $$src[$is][$j];
            }
        }
        else { splice(@$dst,$id,0,$$src[$is]); }
        $is++;
    }
}

sub rebin_values
{
    my ($vals,$bin_size,$col) = @_;
    error("pd3\@sanger todo: rebin_values\n");
}


sub merge_coverage
{
    my ($dst,$src) = @_;
    if ( !($$dst[0][0]=~/^\[(\d+)-(\d+)\]$/) ) { error("Could not determine bin size in COV: $$dst[0][0]\n"); }
    my $dst_bin = $2 - $1 + 1;
    if ( !($$src[0][0]=~/^\[(\d+)-(\d+)\]$/) ) { error("Could not determine bin size in COV: $$src[0][0]\n"); }
    my $src_bin = $2 - $1 + 1;
    my (@dst, @src);
    for my $row (@$dst) { splice(@$row,0,1); push @dst,$row; }
    for my $row (@$src) { splice(@$row,0,1); push @src,$row; }
    my $dst_out  = pop(@dst);
    my $src_out  = pop(@src);
    my $bin_size = $dst_bin;
    if ( $src_bin > $dst_bin ) { my $vals = rebin_values(\@dst,$src_bin,0); @dst = @$vals; $bin_size = $src_bin; }
    elsif ( $src_bin < $dst_bin ) { my $vals = rebin_values(\@src, $dst_bin, 0); @src = @$vals; }
    add_to_matrix(\@dst, \@src, 0);
    for my $row (@dst) { unshift(@$row, sprintf("[%d-%d]", $$row[0],$$row[0]+$bin_size-1)); }
    push @dst, [sprintf("[%d<]", $dst[-1][1]), $dst[-1][1], $$dst_out[1]+$$src_out[1] ];
    @$dst = @dst;
}

sub merge_gcc
{
    my ($ndst,$dst,$nsrc,$src) = @_;
    if ( @$dst != @$src ) { error("todo: improve me\n"); }
    for (my $i=0; $i<@$dst; $i++)
    {
        if ( $$dst[$i][0] ne $$src[$i][0] ) { error("todo: improve me\n"); }
        for (my $j=1; $j<@{$$dst[$i]}; $j++)
        {
            $$dst[$i][$j] = sprintf("%.2f", ($$dst[$i][$j]*$ndst + $$src[$i][$j]*$nsrc) / ($ndst + $nsrc));
        }
    }
}

sub merge_gcd
{
    # todo
}

sub get_value
{
    my ($opts, $id, $key) = @_;
    if ( !exists($$opts{dat}{$id}) ) { return undef; }
    if ( !exists($$opts{dat}{$id}{$key}) ) { return undef}
    return $$opts{dat}{$id}{$key};
}

sub get_values
{
    my ($opts, $id) = @_;
    if ( !exists($$opts{dat}{$id}) ) { return (); }
    # todo: add version sanity check here
    return (@{$$opts{dat}{$id}});
}

sub update_average_length
{
    my ($opts) = @_;
    my @vals = get_values($opts,'RL');
    my $sum = 0;
    my $avg = 0;
    for my $val (@vals) { $sum += $$val[1]; }
    for my $val (@vals) { $avg += $$val[0]*$$val[1] / $sum; }
    $$opts{dat}{SN}{'average length:'} = sprintf "%.1f", $avg;
}

sub update_average_quality
{
    my ($opts) = @_;
    my @vals;
    push @vals, get_values($opts, 'FFQ'), get_values($opts,'LFQ');

    my $qsum = 0;
    for my $row (@vals)
    {
        for (my $i=1; $i<@$row; $i++) { $qsum += $$row[$i] }
    }
    my $qavg = 0;
    for my $row (@vals)
    {
        for (my $i=1; $i<@$row; $i++) { $qavg += ($i-1)*$$row[$i] / $qsum; }
    }
    $$opts{dat}{SN}{'average quality:'} = sprintf "%.1f", $qavg;
}

sub update_average_isize
{
    my ($opts) = @_;
    my @vals = get_values($opts, 'IS');
    my $sum = 0;
    for my $row (@vals) { $sum += $$row[1]; }
    my $avg = 0;
    for my $row (@vals) { $avg += $$row[0]*$$row[1] / $sum; }
    my $dev = 0;
    for my $row (@vals) { $dev += ($avg - $$row[0])*($avg - $$row[0]) * $$row[1] / $sum; }
    $$opts{dat}{SN}{'insert size average:'} = sprintf "%.1f", $avg;
    $$opts{dat}{SN}{'insert size standard deviation:'} = sprintf "%.1f", sqrt($dev);
}

sub get_defaults
{
    my ($opts,$img_fname,%args) = @_;

    if ( !($img_fname=~/\.png$/i) ) { error("FIXME: currently only PNG supported. (Easy to extend.)\n"); }

    # Determine the gnuplot script file name
    my $gp_file = $img_fname;
    $gp_file =~ s{\.[^.]+$}{.gp};
    if ( !($gp_file=~/.gp$/) ) { $gp_file .= '.gp'; }

    my $title = $$opts{title};

    my $dir = $gp_file;
    $dir =~ s{/[^/]+$}{};
    if ( $dir && $dir ne $gp_file ) { `mkdir -p $dir`; }

    my $wh = exists($args{wh}) ? $args{wh} : '600,400';

    open(my $fh,'>',$gp_file) or error("$gp_file: $!");
    return {
        title => $title,
        gp    => $gp_file,
        img   => $img_fname,
        fh    => $fh,
        terminal => qq[set terminal png size $wh truecolor],
        grid  => 'set grid xtics ytics y2tics back lc rgb "#cccccc"',
    };
}

sub percentile
{
    my ($p,@vals) = @_;
    my $N = 0;
    for my $val (@vals) { $N += $val; }
    my $n = $p*($N+1)/100.;
    my $k = int($n);
    my $d = $n-$k;
    if ( $k<=0 ) { return 0; }
    if ( $k>=$N ) { return scalar @vals-1; }
    my $cnt;
    for (my $i=0; $i<@vals; $i++)
    {
        $cnt += $vals[$i];
        if ( $cnt>=$k ) { return $i; }
    }
    error("FIXME: this should not happen [percentile]\n");
}

sub plot_qualities
{
    my ($opts) = @_;

    my @ffq = get_values($opts, 'FFQ');
    if ( !@ffq ) { return; }

    my $yrange = @{$ffq[0]} > 50 ? @{$ffq[0]} : 50;
    my $is_paired = get_value($opts,'SN','reads paired:');

    # Average quality per cycle, forward and reverse reads in one plot
    my $args = get_defaults($opts,"$$opts{prefix}quals.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set ylabel "Average Quality"
            set xlabel "Cycle"
            set yrange [0:$yrange]
            set title "$$args{title}" noenhanced
            plot '-' using 1:2 with lines title 'Forward reads' ] . ($is_paired ? q[, '-' using 1:2 with lines title 'Reverse reads'] : '') . q[
        ];
    my (@fp75,@fp50,@fmean);
    my (@lp75,@lp50,@lmean);
    my ($fmax,$fmax_qual,$fmax_cycle);
    my ($lmax,$lmax_qual,$lmax_cycle);
    for my $cycle (@ffq)
    {
        my $sum=0; my $n=0;
        for (my $iqual=1; $iqual<@$cycle; $iqual++)
        {
            if ( exists($$opts{alt_qual_avg_method }) ) {
                my $prob_base_error = 10 ** (-$iqual/10);
                $sum += $$cycle[$iqual]*$prob_base_error;
            } else {
                $sum += $$cycle[$iqual]*$iqual;
            }
            $n += $$cycle[$iqual];
            if ( !defined $fmax or $fmax<$$cycle[$iqual] ) { $fmax=$$cycle[$iqual]; $fmax_qual=$iqual; $fmax_cycle=$$cycle[0]; }
        }
        my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
        my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
        my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
        if ( !$n ) { next; }
        push @fp75, "$$cycle[0]\t$p25\t$p75\n";
        push @fp50, "$$cycle[0]\t$p50\n";
        if ( exists($$opts{alt_qual_avg_method }) ) {
            my $mean_prob_base_error = $sum/$n;
            my $mean_error_quality_score = -10 * ( log($mean_prob_base_error) / log(10) );
            push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$mean_error_quality_score;
        } else {
            push @fmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
        }
        printf $fh $fmean[-1];
    }
    print $fh "end\n";
    my @lfq = ();
    if ( $is_paired )
    {
        @lfq = get_values($opts, 'LFQ');
        for my $cycle (@lfq)
        {
            my $sum=0; my $n=0;
            for (my $iqual=1; $iqual<@$cycle; $iqual++)
            {
		if ( exists($$opts{alt_qual_avg_method }) ) {
		    my $prob_base_error = 10 ** (-$iqual/10);
		    $sum += $$cycle[$iqual]*$prob_base_error;
		} else {
		    $sum += $$cycle[$iqual]*$iqual;
		}
                $n += $$cycle[$iqual];
                if ( !defined $lmax or $lmax<$$cycle[$iqual] ) { $lmax=$$cycle[$iqual]; $lmax_qual=$iqual; $lmax_cycle=$$cycle[0]; }
            }
            my $p25 = percentile(25,(@$cycle)[1..$#$cycle]);
            my $p50 = percentile(50,(@$cycle)[1..$#$cycle]);
            my $p75 = percentile(75,(@$cycle)[1..$#$cycle]);
            if ( !$n ) { next; }
            push @lp75, "$$cycle[0]\t$p25\t$p75\n";
            push @lp50, "$$cycle[0]\t$p50\n";
	    if ( exists($$opts{alt_qual_avg_method }) ) {
		my $mean_prob_base_error = $sum/$n;
		my $mean_error_quality_score = -10 * ( log($mean_prob_base_error) / log(10) );
		push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$mean_error_quality_score;
	    } else {
		push @lmean, sprintf "%d\t%.2f\n", $$cycle[0],$sum/$n;
	    }
            printf $fh $lmean[-1];
        }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});



    # Average, mean and quality percentiles per cycle, forward and reverse reads in separate plots
    $args = get_defaults($opts,"$$opts{prefix}quals2.png",wh=>$is_paired ? '700,500' : '600,400');
    $fh = $$args{fh};
    my $pos_size = " set rmargin 0; set lmargin 0; set tmargin 0; set bmargin 0; set origin 0.1,0.1; set size 0.4,0.8";
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set multiplot
            $pos_size
            set yrange [0:$yrange]
            set ylabel "Quality"
            set xlabel "Cycle (fwd reads)"
            plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 1 t 'Mean'
        ];
    print $fh join('',@fp75),"end\n";
    print $fh join('',@fp50),"end\n";
    print $fh join('',@fmean),"end\n";
    if ( $is_paired )
    {
        print $fh qq[
                set origin 0.55,0.1
                set size 0.4,0.8
                unset ytics
                set y2tics mirror
                set y2range [0:$yrange]
                unset ylabel
                set xlabel "Cycle (rev reads)"
                set label "$$args{title}" at screen 0.5,0.95 center noenhanced
                plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#cccccc" t '25-75th percentile' , '-' using 1:2 with lines lc rgb "#000000" t 'Median', '-' using 1:2 with lines lt 2 t 'Mean'
            ];
        print $fh join('',@lp75),"end\n";
        print $fh join('',@lp50),"end\n";
        print $fh join('',@lmean),"end\n";
    }
    close($fh);
    plot($$args{gp});



    # Quality distribution per cycle, the distribution is for each cycle plotted as a separate curve
    $args = get_defaults($opts,"$$opts{prefix}quals3.png",wh=>$is_paired ? '600,600' : '600,400');
    $fh = $$args{fh};
    my $nquals = @{$ffq[0]}-1;
    my $ncycles = @ffq;
    $pos_size = $is_paired ? " set rmargin 0; set lmargin 0; set tmargin 0; set bmargin 0; set origin 0.15,0.52; set size 0.8,0.4" : '';
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set multiplot
            $pos_size
            set title "$$args{title}" noenhanced
            set ylabel "Frequency (fwd reads)"
            set label "Cycle $fmax_cycle" at $fmax_qual+1,$fmax
            unset xlabel
            set xrange [0:$nquals]
            set format x ""
            plot '-' using 1:2:3 with lines linecolor variable title ''
        ];
    for my $cycle (0..$#ffq)
    {
        for (my $iqual=1; $iqual<$nquals; $iqual++) {
	    printf $fh "%d\t%d\t%d\n", $iqual, $ffq[$cycle]->[$iqual], $cycle + 1;
	}
	print $fh "\n";
    }
    print $fh "end\n";
    if ( $is_paired )
    {
        print $fh qq[
                set origin 0.15,0.1
                set size 0.8,0.4
                unset title
                unset format
                set xtics
                set xlabel "Quality"
                unset label
                set label "Cycle $lmax_cycle" at $lmax_qual+1,$lmax
                set ylabel "Frequency (rev reads)"
                plot '-' using 1:2:3 with lines linecolor variable title ''
            ];
        for my $cycle (0..$#lfq)
        {
            for (my $iqual=1; $iqual<$nquals; $iqual++)
            {
		printf $fh "%d\t%d\t%d\n", $iqual, $lfq[$cycle]->[$iqual], $cycle + 1;
            }
            print $fh "\n";
        }
	print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});

    # Heatmap qualitites
    $args = get_defaults($opts,"$$opts{prefix}quals-hm.png", wh=>'600,500');
    $fh = $$args{fh};
    my $max = defined $lmax && $lmax > $fmax ? $lmax : $fmax;
    my $ytics = generate_ticks([ map { $_->[0] } @ffq ], $is_paired);
    my $colorbox = "set colorbox vertical user origin first ($nquals+1),0 size screen 0.025,0.812";
    $pos_size = $is_paired
       ? " set origin 0.05,0.5\n set size 0.85,0.5" :
         " set origin 0.05,0.05\n set size 0.85,1.0\n$colorbox" ;
    my $set_xtics = $is_paired ? "unset" : "set";

    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            unset key
            unset colorbox

            # Perceptually uniform heatmap "plasma" by Nathaniel J. Smith & Stefan van der Walt
            # used in Python's matplotlib:  https://github.com/BIDS/colormap/blob/master/colormaps.py
            # Values from:  https://github.com/Gnuplotting/gnuplot-palettes/blob/master/plasma.pal
            # Dark end squashed to make low values more distinguishable.
            set palette model RGB
            set palette defined ( \\
                 0 '#000000' \\
              , .7 '#0c0847' \\
              ,  4 '#4b0381' \\
              , 10 '#7d03a8' \\
              , 20 '#a82296' \\
              , 30 '#cb4679' \\
              , 40 '#e56b5d' \\
              , 50 '#f89441' \\
              , 60 '#fdc328' \\
              , 70 '#f0f921' )

            set cbrange [0:$max]
            set yrange  [0:$ncycles]
            set xrange  [0:$nquals]
            set view map
            set multiplot
            set rmargin 0
            set lmargin 0
            set tmargin 0
            set bmargin 0
            $pos_size
            set obj 1 rectangle behind from first 0,0 to first $nquals,$ncycles
            set obj 1 fillstyle solid 1.0 fillcolor rgbcolor "black"
            set ylabel "Cycle (fwd reads)" offset character -1,0
            unset ytics
            set ytics ($ytics)
            $set_xtics xtics
            set title "$$args{title}" noenhanced
            splot '-' matrix with image
        ];
    for my $cycle (@ffq)
    {
        for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
        print $fh "\n";
    }
    print $fh "\nend\n";
    if ( $is_paired )
    {
        my $ytics = generate_ticks([ map { $_->[0] } @lfq ], $is_paired);
        print $fh qq[
                set origin 0.05,0.05
                set size 0.85,0.5
                set ylabel "Cycle (rev reads)" offset character -1,0
                set xlabel "Base Quality"
                unset title
                unset ytics
                set ytics ($ytics)
                set xrange  [0:$nquals]
                set xtics
                $colorbox
                set cblabel "Number of bases"
                splot '-' matrix with image
            ];
        for my $cycle (@lfq)
        {
            for (my $iqual=1; $iqual<@$cycle; $iqual++) { print $fh "\t$$cycle[$iqual]"; }
            print $fh "\n";
        }
        print $fh "\nend\n";
    }
    close($fh);
    plot($$args{gp});
}

sub generate_ticks {
    my ($cycles, $skip_even) = @_;

    my @marks;
    my $div = 10;
    while (scalar(@$cycles)/$div > 20) {
       $div *= 2;
    }
    $div = nice_num($div);
    foreach my $v (@$cycles) {
        if ($v % $div == 0) {
            if ($skip_even) {
                my $label = $v % 20 ? $v : '';
                push @marks, qq{"$label" $v};
            }
            else {
                push @marks, qq{"$v" $v};
            }
        }
    }
    return join(', ', @marks);
}

sub plot_acgt_cycles
{
    my ($opts) = @_;

    my @gcc = get_values($opts, 'GCC');
    if ( !@gcc ) { return; }

    my $args = get_defaults($opts,"$$opts{prefix}acgt-cycles.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set style line 1 linecolor rgb "green"
            set style line 2 linecolor rgb "red"
            set style line 3 linecolor rgb "black"
            set style line 4 linecolor rgb "blue"
            set style increment user
            set ylabel "Base content [%]"
            set xlabel "Read Cycle"
            set yrange [0:100]
            set title "$$args{title}" noenhanced
            plot '-' w l ti 'A', '-' w l ti 'C', '-' w l ti 'G', '-' w l ti 'T'
        ];
    for my $base (1..4)
    {
        for my $cycle (@gcc)
        {
            print $fh $$cycle[0]+1,"\t",$$cycle[$base],"\n";
        }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}


sub plot_gc
{
    my ($opts) = @_;

    my $is_paired = get_value($opts,'SN','reads paired:');
    my $args = get_defaults($opts,"$$opts{prefix}gc-content.png");
    my $fh = $$args{fh};
    my ($gcl_max,$gcf_max,$lmax,$fmax);
    my @gcf = get_values($opts, 'GCF');
    my @gcl = get_values($opts, 'GCL');
    for my $gc (@gcf) { if ( !defined $gcf_max or $gcf_max<$$gc[1] ) { $gcf_max=$$gc[1]; $fmax=$$gc[0]; } }
    for my $gc (@gcl) { if ( !defined $gcl_max or $gcl_max<$$gc[1] ) { $gcl_max=$$gc[1]; $lmax=$$gc[0]; } }
    my $gcmax = $is_paired && $gcl_max > $gcf_max ? $lmax : $fmax;
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set title "$$args{title}" noenhanced
            set ylabel "Normalized Frequency"
            set xlabel "GC Content [%]"
            set yrange [0:1.1]
            set label sprintf("%.1f",$gcmax) at $gcmax,1 front offset 1,0
            plot ]
                 . (exists($$opts{ref_stats}) ? q['-' smooth csplines with lines lt 0 title 'Reference', ] : '')
                 . q['-' smooth csplines with lines lc 1 title 'First fragments' ]
                 . ($is_paired ? q[, '-' smooth csplines with lines lc 2 title 'Last fragments'] : '')
                 . q[
        ];
    if ( exists($$opts{ref_stats}) )
    {
        open(my $ref,'<',$$opts{ref_stats}) or error("$$opts{ref_stats}: $!");
        while (my $line=<$ref>) { print $fh $line }
        close($ref);
        print $fh "end\n";
    }
    for my $cycle (@gcf) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcf_max; }
    print $fh "end\n";
    if ( $is_paired )
    {
        for my $cycle (@gcl) { printf $fh "%d\t%f\n", $$cycle[0],$$cycle[1]/$gcl_max; }
        print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}


sub plot_gc_depth
{
    my ($opts) = @_;

    my @gcd = get_values($opts,'GCD');
    if ( @gcd <= 1 ) { return; }

    # Find unique sequence percentiles for 30,40, and 50% GC content, just to draw x2tics.
    my @tics = ( {gc=>30},{gc=>40},{gc=>50} );
    for my $gc (@gcd)
    {
        for my $tic (@tics)
        {
            my $diff = abs($$gc[0]-$$tic{gc});
            if ( !exists($$tic{pr}) or $diff<$$tic{diff} ) { $$tic{pr}=$$gc[1]; $$tic{diff}=$diff; }
        }
    }

    my @x2tics;
    for my $tic (@tics) { push @x2tics, qq["$$tic{gc}" $$tic{pr}]; }
    my $x2tics = join(',',@x2tics);

    my $args = get_defaults($opts,"$$opts{prefix}gc-depth.png", wh=>'600,500');
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set ylabel "Mapped depth"
            set xlabel "Percentile of mapped sequence ordered by GC content"
            set x2label "GC Content [%]"
            set title "$$args{title}" noenhanced
            set x2tics ($x2tics)
            set xtics nomirror
            set xrange [0.1:99.9]

            plot '-' using 1:2:3 with filledcurve lt 1 lc rgb "#dedede" t '10-90th percentile' , \\
                 '-' using 1:2:3 with filledcurve lt 1 lc rgb "#bbdeff" t '25-75th percentile' , \\
                 '-' using 1:2 with lines lc rgb "#0084ff" t 'Median'
        ];
    for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[2]\t$$gc[6]\n"; } print $fh "end\n";
    for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[3]\t$$gc[5]\n"; } print $fh "end\n";
    for my $gc (@gcd) { print $fh "$$gc[1]\t$$gc[4]\n"; } print $fh "end\n";
    close($fh);
    plot($$args{gp});
}


sub plot_isize
{
    my ($opts) = @_;

    my $is_paired = get_value($opts,'SN','reads paired:');
    my @isizes = get_values($opts, 'IS');
    if ( !$is_paired or !@isizes ) { return; }

    my ($isize_max,$isize_cnt);
    for my $isize (@isizes)
    {
        if ( !defined $isize_max or $isize_cnt<$$isize[1] ) { $isize_cnt=$$isize[1]; $isize_max=$$isize[0]; }
    }

    my $args = get_defaults($opts,"$$opts{prefix}insert-size.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set rmargin 5
            set label sprintf("%d",$isize_max) at $isize_max+10,$isize_cnt
            set ylabel  "Number of pairs"
            set xlabel  "Insert Size"
            set title "$$args{title}" noenhanced];
    print $fh qq[
            set logscale y 10] if exists $$opts{y_axis_log10};
    print $fh qq[
            plot \\
                '-' with lines lc rgb 'black' title 'All pairs', \\
                '-' with lines title 'Inward', \\
                '-' with lines title 'Outward', \\
                '-' with lines title 'Other'
        ];
    for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[1]\n"; } print $fh "end\n";
    for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[2]\n"; } print $fh "end\n";
    for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[3]\n"; } print $fh "end\n";
    for my $isize (@isizes) { print $fh "$$isize[0]\t$$isize[4]\n"; } print $fh "end\n";
    close($fh);
    plot($$args{gp});
}


sub plot_coverage
{
    my ($opts) = @_;

    my @cov = get_values($opts, 'COV');
    if ( !@cov ) { return; }

    my @vals;
    for my $cov (@cov) { push @vals,$$cov[2]; }
    my $i = percentile(99.8,@vals);
    my $p99 = $cov[$i][1];

    my $args = get_defaults($opts,"$$opts{prefix}coverage.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set ylabel "Number of mapped bases"
            set xlabel "Coverage"
            set log y
            set style fill solid border -1
            set title "$$args{title}" noenhanced
            set xrange [:$p99]
            plot '-' with lines notitle
        ];
    for my $cov (@cov)
    {
        if ( $$cov[2]==0 ) { next; }
        print $fh "$$cov[1]\t$$cov[2]\n";
    }
    print $fh "end\n";
    close($fh);
    plot($$args{gp});
}


sub plot_mismatches_per_cycle
{
    my ($opts) = @_;

    my @mpc = get_values($opts, 'MPC');
    if ( !@mpc ) { return; }

    my $nquals = @{$mpc[0]} - 2;
    my $ncycles = @mpc;
    my ($style,$with);
    if ( $ncycles>100 ) { $style = ''; $with = 'w l'; }
    else { $style = 'set style data histogram; set style histogram rowstacked'; $with = ''; }

    my $args = get_defaults($opts,"$$opts{prefix}mism-per-cycle.png");
    my $fh = $$args{fh};
    print $fh qq[
            $$args{terminal}
            set output "$$args{img}"
            $$args{grid}
            set style line 1 linecolor rgb "#e40000"
            set style line 2 linecolor rgb "#ff9f00"
            set style line 3 linecolor rgb "#bbbb00"
            set style line 4 linecolor rgb "#4ebd68"
            set style line 5 linecolor rgb "#0061ff"
            set style increment user
            set key left top
            $style
            set ylabel "Number of mismatches"
            set xlabel "Read Cycle"
            set style fill solid border -1
            set title "$$args{title}" noenhanced
            set xrange [-1:$ncycles]
            plot '-' $with ti 'Base Quality>30', \\
                 '-' $with ti '30>=Q>20', \\
                 '-' $with ti '20>=Q>10', \\
                 '-' $with ti '10>=Q', \\
                 '-' $with ti "N's"
        ];
    for my $cycle (@mpc)
    {
        my $sum; for my $idx (31..$#$cycle) { $sum += $$cycle[$idx]; }
        print $fh "$sum\n";
    }
    print $fh "end\n";
    for my $cycle (@mpc)
    {
        my $sum; for my $idx (22..31) { $sum += $$cycle[$idx]; }
        print $fh "$sum\n";
    }
    print $fh "end\n";
    for my $cycle (@mpc)
    {
        my $sum; for my $idx (12..21) { $sum += $$cycle[$idx]; }
        print $fh "$sum\n";
    }
    print $fh "end\n";
    for my $cycle (@mpc)
    {
        my $sum; for my $idx (2..11) { $sum += $$cycle[$idx]; }
        print $fh "$sum\n";
    }
    print $fh "end\n";
    for my $cycle (@mpc) { print $fh "$$cycle[1]\n"; }
    print $fh "end\n";
    close($fh);
    plot($$args{gp});
}

sub plot_indel_dist
{
    my ($opts) = @_;

    my @indels = get_values($opts, 'ID');
    if ( !@indels ) { return; }

    my $args = get_defaults($opts,"$$opts{prefix}indel-dist.png");
    my $fh = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set style line 1 linetype 1  linecolor rgb "red"
        set style line 2 linetype 2  linecolor rgb "black"
        set style line 3 linetype 3  linecolor rgb "green"
        set style increment user
        set ylabel "Indel count [log]"
        set xlabel "Indel length"
        set y2label "Insertions/Deletions ratio"
        set log y
        set y2tics nomirror
        set ytics nomirror
        set title "$$args{title}" noenhanced
        plot '-' w l ti 'Insertions', '-' w l ti 'Deletions', '-' axes x1y2 w l ti "Ins/Dels ratio"
    ];
    for my $len (@indels) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
    for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
    for my $len (@indels) { printf $fh "%d\t%f\n", $$len[0],$$len[2]?$$len[1]/$$len[2]:0; } print $fh "end\n";
    close($fh);
    plot($$args{gp});
}

sub plot_indel_cycles
{
    my ($opts) = @_;

    my @indels = get_values($opts, 'IC');
    if ( !@indels ) { return; }

    my $is_paired = get_value($opts,'SN','reads paired:');

    my $args = get_defaults($opts,"$$opts{prefix}indel-cycles.png");
    my $fh = $$args{fh};
    print $fh qq[
        $$args{terminal}
        set output "$$args{img}"
        $$args{grid}
        set style line 1 linetype 1  linecolor rgb "red"
        set style line 2 linetype 2  linecolor rgb "black"
        set style line 3 linetype 3  linecolor rgb "green"
        set style line 4 linetype 4  linecolor rgb "blue"
        set style increment user
        set ylabel "Indel count"
        set xlabel "Read Cycle"
        set title "$$args{title}" noenhanced
    ];
    if ( $is_paired )
    {
        print $fh qq[plot '-' w l ti 'Insertions (fwd)', '' w l ti 'Insertions (rev)', '' w l ti 'Deletions (fwd)', '' w l ti 'Deletions (rev)'\n];
        for my $len (@indels) { print $fh "$$len[0]\t$$len[1]\n"; } print $fh "end\n";
        for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
        for my $len (@indels) { print $fh "$$len[0]\t$$len[3]\n"; } print $fh "end\n";
        for my $len (@indels) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n";
    }
    else
    {
        print $fh qq[plot '-' w l ti 'Insertions', '' w l ti 'Deletions'\n];
        for my $len (@indels) { print $fh "$$len[0]\t$$len[2]\n"; } print $fh "end\n";
        for my $len (@indels) { print $fh "$$len[0]\t$$len[4]\n"; } print $fh "end\n";
    }
    close($fh);
    plot($$args{gp});
}


sub nice_num
{
    my ($num) = @_;
    my $ret;

    if ($num < 1) {
        $ret = 1;
    } else {
        my $expon_num = floor(log($num)/log(10));
        my $fract_num = ($num / (10.0 ** $expon_num));

        my $nf = ceil($fract_num);

        $ret = ($nf * (10.0 ** $expon_num));
    }

    return $ret;
}


sub plot_read_len_hist
{
    my ($opts) = @_;

    my @rl = get_values($opts, 'RL');
    if ( !@rl ) { return; }

    # TODO: Add modified version for paired end reads.
    # my $is_paired = get_value($opts,'SN','reads paired:');

    my $args = get_defaults($opts,"$$opts{prefix}read-length.png");
    my $fh = $$args{fh};

    # anonymous subroutines for code readibility
    my $log_bin = sub {
        my $x = $_[0];
        return floor((log($x) / log(10)) * 10);
    };
    my $inv_log_bin = sub {
        my $x = $_[0];
        return floor(10**($x/10));
    };

    # Open files to store data: this is more elegant than copying four sets of
    # data into a single GNUplot file.
    open(my $rd_fh, '>', "$$opts{prefix}read_len_rawdata.txt");
    open(my $hd_fh, '>', "$$opts{prefix}read_len_histdata.txt");

    # Create gnuplot command file options
    my $min_x = $rl[0][0];
    my $max_x = $rl[-1][0];
    my $min_y = $rl[0][1];
    my $max_y = $min_y;


    my @logscale_data = (0)x&$log_bin($max_x);
    my $log_x = 0;
    my $start_i = 1;
    for my $line (@rl)
    {
        my @deref_line = @{$line};
        my $curr_x = $deref_line[0];
        my $curr_y = $deref_line[1];

        if ($min_y > $curr_y) {
            $min_y = $curr_y;
        }

        if ($max_y < $curr_y) {
            $max_y = $curr_y;
        }

        print $rd_fh "$curr_x $curr_y\n";
        $log_x = max(&$log_bin($curr_x), 5);
        @logscale_data[floor($log_x)] += $curr_y;
        $start_i = $curr_x + 1;
    }

    # Copy binned data into its own file

    # This is a special case to deal with the problem that reads of len < 5 do
    # not cooperate with the log10(x)*10 transformation and cause small
    # fragments to graph badly.
    my $start = 1;
    my $end = 3;
    my $div_binw_val = $logscale_data[5]/($end - $start);
    my $start_idx = -1;
    for my $i (0..@logscale_data-1) {
        if($logscale_data[$i] != 0)
        {
            $start_idx = $i;
            if ($i == 5)
            {
                print $hd_fh "5 $logscale_data[5] $div_binw_val $start $end\n";
                $start_idx += 1;
            }
            last;
        }
    }

    my $bin_min_x     = &$inv_log_bin($start_idx);
    my $bin_max_x     = &$inv_log_bin($start_idx + 1);
    my $log_scale_max = $logscale_data[$start_idx];
    my $log_bin_max   = $logscale_data[$start_idx] / max(($bin_max_x - $bin_min_x), 1);

    for my $i ($start_idx..@logscale_data-1) {
        $start = &$inv_log_bin($i);
        $end = &$inv_log_bin($i+1);

        if ($bin_max_x < $end) {
            $bin_max_x = $end;
        }

        if ($log_scale_max < $logscale_data[$i]) {
            $log_scale_max = $logscale_data[$i];
        }

        $div_binw_val = $logscale_data[$i]/max(($end - $start), 1);

        if ($log_bin_max < $div_binw_val) {
            $log_bin_max = $div_binw_val;
        }

        print $hd_fh "$i $logscale_data[$i] $div_binw_val $start $end\n";
    }

    if ($max_x < $bin_max_x) {
        $max_x = $bin_max_x;
    }

    if ($min_x > $bin_min_x) {
        $min_x = $bin_min_x;
    }

    $max_y = nice_num($max_y);
    $log_scale_max = nice_num($log_scale_max);
    $log_bin_max = nice_num($log_bin_max);

    my $logscale = "";

    if (($max_x - $min_x) > 100) {
        $logscale = "set logscale x\n";
    }

    # Gnuplot commands for normalized binned data
    # The normalized plot bins reads by read length with bin widths on a
    # logarithmic scale (i.e. bins 10-11 bp, 12-14 bp, 15-18 bp, etc.), then
    # bin height is normalized by the width of the bins.
    # The end goal is to smooth the distribution of reads compared to plotting
    # each length without binning.  Visualization of unbinned data is included
    # in order to make single value spikes more visible, for example in the
    # case that spike ins are used as a control.
    print $fh qq[
        set ytics nomirror
        set y2tic nomirror
        set terminal png size 800,1000 truecolor
        set output "$$args{img}"
        $$args{grid}
        set grid noy2tics
        set multiplot layout 2,1 columnsfirst scale 1,0.9
        set yrange[0: $max_y]
        set ylabel "Read Count"
        $logscale
        set xrange[$min_x: $max_x]
        set xtics rotate by -45 out scale 1 nomirror
        set xlabel "Read Length"
        set boxwidth .8 relative
        set title "$$args{title}" noenhanced
        set title offset -15,-.5
        set key at graph 1, 1.15
        set y2range[0:$log_bin_max]
        plot "$$opts{prefix}read_len_rawdata.txt" using 1:2 lt rgb "orange" with boxes fs solid .7 title 'unbinned', "$$opts{prefix}read_len_histdata.txt" using 4:3 lt rgb "blue" with boxes title 'normalized bins(count/bin width)' axis x1y2
    ];
    print $fh qq[];

    # Gnuplot commands for non-normalized binned data
    # The un-normalized plot has reads assigned to bins identically to the
    # normalized plot, however, the bin height is not divided by bin width. The
    # goal of this visualization is to more accurately reflect the distribution
    # of reads by volume, and will generally collect around the mean read
    # length.
    # Inclusion of unbinned data is to give point of visual comparison to
    # normalized histogram.
    print $fh qq[
        set boxwidth 1 relative
        set y2range[0:$log_scale_max]
        plot "$$opts{prefix}read_len_rawdata.txt" using 1:2 lt rgb "orange" with boxes fs solid .7 title 'unbinned', "$$opts{prefix}read_len_histdata.txt" using 4:2 lt rgb "blue" with boxes title 'unnormalized bins' axis x1y2
    ];

    # Close file buffers and clean up workspace
    close($fh);
    close($hd_fh);
    close($rd_fh);
    plot($$args{gp});
    unlink("$$opts{prefix}read_len_rawdata.txt");
    unlink("$$opts{prefix}read_len_histdata.txt");
}

sub merge_bamcheck
{
    my ($opts) = @_;
    my $fh = *STDOUT;
    if ( !$$opts{merge} )
    {
        open($fh,'>',"$$opts{prefix}merge.bchk") or error("$$opts{prefix}merge.bchk: $!\n");
    }

    print $fh "# This file was produced by plot-bamstats and can be plotted using plot-bamstats\n# The command line was $$opts{args}\n";

    for my $sec (@{$$opts{sections}})
    {
        my $sid = $$sec{id};
        if ( !exists($$opts{dat}{$sid}) ) { next; }

        print $fh $$sec{header};
	if ($sid eq 'SN') {
	    for my $key (@{$$opts{sn_order}}) {
		if (exists($$opts{dat}{$sid}{$key})) {
		    print "$sid\t$key\t$$opts{dat}{$sid}{$key}\n";
		}
	    }
	    next;
	}
        elsif ( ref($$opts{dat}{$sid}) eq 'HASH' )
        {
            for my $key (keys %{$$opts{dat}{$sid}})
            {
                print "$sid\t$key\t$$opts{dat}{$sid}{$key}\n";
            }
            next;
        }
        elsif ( ref($$opts{dat}{$sid}) eq 'ARRAY' )
        {
            for my $rec (@{$$opts{dat}{$sid}})
            {
                print $fh "$sid\t", join("\t",@$rec), "\n";
            }
        }

        #
        #   if ( $sid eq 'ID' )
        #   {
        #       print $fh "# $$opts{id2sec}{SN}{header}\n$$opts{id2sec}{SN}{exp}";
        #       # output summary numbers here
        #       for my $id (keys %{$$opts{dat}})
        #       {
        #           if ( exists($$opts{exp}{$id}) ) { next; }
        #           for my $key (keys %{$$opts{dat}{$id}})
        #           {
        #               print $fh "SN\t$id\t$key\t$$opts{dat}{$id}{$key}\n";
        #           }
        #       }
        #   }
    }

}

sub bignum
{
    my ($num) = @_;
    if ( !defined $num ) { return 0; }
    my $out = '';
    my $slen = length($num);
    for (my $i=0; $i<$slen; $i++)
    {
        $out .= substr($num,$i,1);
        if ( $i+1<$slen && ($slen-$i-1)%3==0 ) { $out .= ','; }
    }
    return $out;
}

sub create_html
{
    my ($opts) = @_;
    my ($fname,$prefix);
    if ( $$opts{prefix}=~m{/$} )
    {
        $fname  = "$$opts{prefix}index.html";
        $prefix = '';
    }
    else
    {
        $prefix = $$opts{prefix};
        $prefix =~ s{^.*/}{};
        $fname = $$opts{prefix};
        $fname =~ s/-$/.html/;
    }
    open(my $fh,'>',$fname) or error("$fname: $!");
    print $fh q[
        <html>
        <head>
            <style>
                .thumbnail
                {
                    text-decoration:none;
                    color:black;
                    font-weight:bold;
                }
                .thumbnail span  /*CSS for enlarged image*/
                {
                    visibility: hidden;
                    position: absolute;
                    padding: 5px;
                    border: 1px solid #000;
                    background-color: #e5e5e5;
                }
                .thumbnail:hover span  /*CSS for enlarged image on hover*/
                {
                    visibility: visible;
                    left: 550px;
                    top: 10px;
                }
                .imgs td
                {
                    vertical-align:middle;
                    padding: 0.5em;
                    border: 1px solid black;
                }
                table.imgs
                {
                    border-collapse:collapse;
                    margin-left:20px;
                }
                .nums th
                {
                    text-align: left;
                }
                table.nums
                {
                    margin-top: 1em;
                    margin-left:20px;
                    border: 1px dotted #83A4C3;
                    background-color: #F5F5F5;
                    padding: 0.5em;
                }
                .pad { padding-left:1em; vertical-align:top; }
                .right { text-align:right; padding-left:1em; }
            </style>
        </head>
        <body>
            <table class="imgs">
    ];

    my %imgs =
    (
        'insert-size'    => 'Insert size',
        'gc-content'     => 'GC content',
        'acgt-cycles'    => 'Per-base sequence content',
        'mism-per-cycle' => 'Mismatches per cycle',
        'quals'          => 'Quality per cycle',
        'quals2'         => 'Quality per cycle',
        'quals3'         => 'Quality per cycle',
        'quals-hm'       => 'Quality per cycle',
        'indel-cycles'   => 'Indels per cycle',
        'indel-dist'     => 'Indel lengths',
        'gc-depth'       => 'Mapped depth vs GC',
        'read-length'    => 'Read lengths',
# I think this may be broken: 'coverage'       => '',
    );
    my @imgs = (qw(insert-size gc-content acgt-cycles mism-per-cycle quals quals2 quals3 quals-hm indel-cycles indel-dist gc-depth read-length));
    for (my $i=0; $i<@imgs; $i++)
    {
        if ( $i % 3 == 0 )
        {
            # new row
            if ( $i>0 ) { print $fh "</tr>\n"; }
            print $fh "<tr>\n";
        }
        if ( -e "$$opts{prefix}$imgs[$i].png" )
        {
            my $desc = $imgs{$imgs[$i]};
            my $link = uri_escape("$prefix$imgs[$i].png");
            print $fh qq[  <td><a class="thumbnail" href="$link"><img src="$link" width="150px"><span>$desc<br><img src="$link"></span></a>\n];
        }
        else { print $fh "  <td />\n"; }
    }
    print $fh "</tr></table>\n";

    my $reads_total      = get_value($opts,'SN','raw total sequences:');
    my $reads_filtered   = get_value($opts,'SN','filtered sequences:');
    my $percent_filtered = sprintf "(%.1f%%)", $reads_filtered * 100. / $reads_total;
    my $reads_mapped     = get_value($opts,'SN','reads mapped:');
    my $percent_mapped   = sprintf "(%.1f%%)", $reads_mapped * 100. / ($reads_total - $reads_filtered);
    my $reads_dup        = get_value($opts,'SN','reads duplicated:');
    my $percent_dup      = sprintf "(%.1f%%)", $reads_dup * 100. / ($reads_total - $reads_filtered);
    my $reads_mq0        = get_value($opts,'SN','reads MQ0:');
    my $percent_mq0      = sprintf "(%.1f%%)", ($reads_mapped ? $reads_mq0 * 100. / $reads_mapped : 0);
    my $reads_nonprim    = get_value($opts,'SN','non-primary alignments:');
    my $read_avglen      = get_value($opts,'SN','average length:');
    my $bases_total      = get_value($opts,'SN','total length:');
    my $bases_mapped     = get_value($opts,'SN','bases mapped (cigar):');
    my $bpercent_mapped  = sprintf "(%.1f%%)", $bases_mapped * 100. / $bases_total;
    my $error_rate       = sprintf "%.2f%%", 100.*get_value($opts,'SN','error rate:');

    $reads_total    = bignum($reads_total);
    $reads_filtered = bignum($reads_filtered);
    $reads_mapped   = bignum($reads_mapped);
    $reads_dup      = bignum($reads_dup);
    $reads_mq0      = bignum($reads_mq0);
    $reads_nonprim  = bignum($reads_nonprim);
    $bases_total    = bignum($bases_total);
    $bases_mapped   = bignum($bases_mapped);

    print $fh qq[

        <table class="nums">
            <tr><th>Reads</tr>
            <tr>
                <td class="pad"><table>
                <tr><td>total:       <td class="right"> $reads_total    <td class="right"></tr>
                <tr><td>filtered:    <td class="right"> $reads_filtered <td class="right"> $percent_filtered</tr>
                <tr><td>non-primary: <td class="right"> $reads_nonprim  <td class="right"> </tr>
                <tr><td>duplicated:  <td class="right"> $reads_dup      <td class="right"> $percent_dup</tr>
                <tr><td>mapped:      <td class="right"> $reads_mapped   <td class="right"> $percent_mapped</tr>
                <tr><td>zero MQ:     <td class="right"> $reads_mq0      <td class="right"> $percent_mq0</tr>
                <tr><td>avg read length: <td class="right"> $read_avglen    <td class="right"></tr>
                </table></tr>

            <tr><th>Bases</tr>
            <tr>
                <td class="pad"><table>
                <tr><td>total:       <td class="right"> $bases_total    <td class="right"></tr>
                <tr><td>mapped:      <td class="right"> $bases_mapped   <td class="right"> $bpercent_mapped</tr>
                <tr><td>error rate:      <td class="right"> $error_rate  <td class="right"></tr>
                </table></tr>
        </table>

        </body>
        </html>
    ];

    close($fh);
}