#!/usr/bin/env perl

use strict;
use warnings;

use Cwd;
use File::Basename;
use Carp;
use Data::Dumper;

use Getopt::Long qw(:config no_ignore_case bundling);

$ENV{LC_ALL} = 'C';  # critical for proper sorting using [system "sort -k1,1 ..."] within the perl script


my $output_prefix = "bowtie2";


my $usage = <<_EOUSAGE_;

################################################################################################################
#
#  --left and --right <string>   reads
#
#        or
#
#  --single <string>             reads
#
#  Required inputs:
#   
#  --target <string>            multi-fasta file containing the target sequences (should be named {refName}.fa )
#
#  --seqType <string>           fa | fq    (fastA or fastQ format)
#
#
# Optional:
#
#  --SS_lib_type <string>       strand-specific library type:  single: F or R  paired: FR or RF
#                                   examples:  single RNA-Ligation method:  F
#                                              single dUTP method: R
#                                              paired dUTP method: RF
#
#  --num_top_hits <int>         (default: 20) 
#
#  --retain_intermediate_files     retain all the intermediate sam files produced (they take up lots of space! and there's lots of them)
#
#  --max_dist_between_pairs             default (2000) 
#
#  --just_prep_build               just prepare the bowtie-build and stop.
#
#  --output_prefix|o <string>      prefix for output filename (default: $output_prefix)
#
#  --CPU <int>                     number of threads
#
#  ## General options
#
#  Any options after '--' are passed onward to the alignments programs (except BLAT -which has certain options exposed above).
#  For example, to set the number of processors used by Bowtie to 16 then use:
#     -- -p 16
#
####################################################################################################################
#
#  Example commands:
#
#     $0 --seqType fq --left left.fq --right right.fq --target Trinity.fasta 
#
#
####################################################################################################################



_EOUSAGE_

    ;


my $help_flag;
my $target_db; 
my $left_file;
my $right_file;
my $single_file;

my $num_top_hits = 20;
my $max_dist_between_pairs = 2000;
my $seqType;
my $SS_lib_type;
my $retain_intermediate_files_flag = 0;

my $trinity_mode;
my $CPU = 1;

my $JUST_PREP_BUILD = 0;

unless (@ARGV) {
    die $usage;
}


&GetOptions ( 'h' => \$help_flag,
              
              ## required inputs
              'left=s' => \$left_file,
              'right=s' => \$right_file,
              'single=s' => \$single_file,
              
              "target=s" => \$target_db,
              "seqType=s" => \$seqType,
              
              "CPU=i" => \$CPU,
              

              ## Optional:
              "SS_lib_type=s" => \$SS_lib_type,
              
              'output_prefix|o=s' => \$output_prefix,
              
              'num_top_hits=i' => \$num_top_hits,
                            
              'max_dist_between_pairs=i' => \$max_dist_between_pairs,
              
              'retain_intermediate_files' => \$retain_intermediate_files_flag,
              
              'just_prep_build' => \$JUST_PREP_BUILD,
              
              );


if ($help_flag) { die $usage; }


unless ($target_db && -s $target_db) { 
    die $usage . "Must specify target_db and it must exist at that location";
}

unless ($JUST_PREP_BUILD || ($seqType && $seqType =~ /^(fq|fa)$/)) {
    die $usage . ", sorry do not understand seqType $seqType";
}


unless ($JUST_PREP_BUILD 
        || 
        ($left_file && $right_file)
        ||
        ($single_file)

    )  {
    die $usage . "sorry, cannot find files $left_file $right_file $single_file";
}


if ($SS_lib_type && $SS_lib_type !~ /^(F|R|FR|RF)$/) {
    die "Error, SS_lib_type must be one of the following: (F, R, FR, RF)  ";
}



## check for required programs
{
    
    my @required_progs = qw(samtools bowtie2-build bowtie2);
    
    foreach my $prog (@required_progs) {
        my $path = `which $prog`;
        unless ($path =~ /^\//) {
            die "Error, path to required $prog cannot be found";
        }
    }
}


my $util_dir = "/usr/lib/trinityrnaseq/util/support_scripts";

my ($start_dir, $work_dir, $num_hits);


main: {
    $start_dir = cwd();

    $left_file = &build_full_paths($left_file, $start_dir) if $left_file;
    $right_file = &build_full_paths($right_file, $start_dir) if $right_file;
    $target_db = &build_full_paths($target_db, $start_dir);
    
    $single_file = &build_full_paths($single_file, $start_dir) if $single_file;
    
    unless (-s "$target_db.fai") {
        &process_cmd("samtools faidx $target_db");
    }
    
    
    ######################################
    ## Prep the bowtie index of the target
    ######################################
    
    my $index_ext = "bt2";
    
    my @bowtie_build_files = <$target_db.*.$index_ext>;
    print STDERR "bt2 index files: " . Dumper(\@bowtie_build_files);
    my $index_file_checkpoint = "$target_db._${index_ext}_idx_.ok";
    unless (@bowtie_build_files && -e $index_file_checkpoint) {
        
        print STDERR "Note - bowtie-build indices do not yet exist. Indexing genome now.\n";
        ## run bowtie-build:
        my $builder = "bowtie2-build";
        
        my $cmd = "$builder -q $target_db $target_db";
        &process_cmd($cmd);
        
        &process_cmd("touch $index_file_checkpoint");
        
    }
    
    if ($JUST_PREP_BUILD) {
        print STDERR "Just preparing build, stopping now.\n";
        exit(0);
    }
    
    my @entries;
    
    my $format = ($seqType eq "fq") ? "-q" : "-f";
    
    my $output_file = "$output_prefix.coordSorted.bam";

    my $cmd;
    if ($left_file && $right_file) {

        $cmd = "bash -c \"set -o pipefail; bowtie2 --local -k $num_top_hits --threads $CPU --no-unal $format -X $max_dist_between_pairs -x $target_db -1 $left_file -2 $right_file | samtools sort -@ $CPU -o - - > $output_file\" ";
        

    }
    else {
        
        $cmd = "bash -c \"set -o pipefail; bowtie2 --local -k $num_top_hits --threads $CPU --no-unal $format -x $target_db -U $single_file | samtools sort -o - - > $output_file\" ";
        
    }
    
    &process_cmd($cmd) unless (-e "$output_file.ok");

    &process_cmd("touch $output_file.ok") unless (-e "$output_file.ok");
    
    
    my @to_delete;
    
    if ($SS_lib_type) {
        
        ## strand-specific
        ## separate the sam based on strand, and create separate bam files.  (for convenience sake)
        
        $cmd = "$util_dir/SAM_strand_separator.pl $output_file $SS_lib_type";
        &process_cmd($cmd);
        
                
        foreach my $sam_file ("$output_file.+.sam", "$output_file.-.sam") {
            
            my $bam_file = $sam_file;
            $bam_file =~ s/\.sam$/\.bam/; # add suffix below

            unless (-s $sam_file) { next; } # empty file
                
            push (@to_delete, $sam_file);
        
            $cmd = "samtools view -bt $target_db.fai $sam_file | samtools sort -o - - > $bam_file"; # .bam ext added auto
                        
            &process_cmd($cmd);
            
            $cmd = "samtools index $bam_file";
            &process_cmd($cmd);
            
        }
    }
    
        
    ## do final cleanpup of intermediate files
    unless ($retain_intermediate_files_flag) {
        &purge_files(@to_delete);
    }    
    @to_delete = ();        
    
    
    
    exit(0);
}


####
sub process_cmd {
    my ($cmd) = @_;

    print STDERR "CMD: $cmd\n";

    my $ret = system($cmd);

    if ($ret) {
        confess "Error, cmd: $cmd died with ret $ret";
    }

    return;
}


    
####
sub build_full_paths {
    my ($path, $start_dir) = @_;
    
    my @paths;
    
    foreach my $p (split(/,/, $path)) {

        if ($p !~ /^\//) {
            $p = "$start_dir/$p";
        }
        push (@paths, $p);
    }
    
    $path = join(",", @paths);
    
    return($path);
}

####
sub purge_files {
    my @to_delete = @_;
    
    foreach my $file (@to_delete) {
        if (-e $file || -l $file) {
            print STDERR "-cleaning up and removing intermediate file: $file\n";
            unlink($file);
        }
        else {
            #print STDERR "-warning: cannot locate file $file targeted for deletion.\n";
        }
    }

    return;
}
