#!/usr/bin/perl

##
# Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
#
# This file is part of Bowtie 2.
#
# Bowtie 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Bowtie 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
#

#
# The paired-end data is made by (a) changing to the reads subdirectory and (b)
# running 'perl simulate.pl --ref=../reference/lambda_virus.fa'.
#

#
# The long-read data is made by (a) changing to the reads subdirectory and (b)
# running 'perl simulate.pl --ref=../reference/lambda_virus.fa --long
# --unpaired --prefix=longreads'.
#

use strict;
use warnings;
use Carp;
use Getopt::Long;
use List::Util qw(max min);

my @fa_fn = ();        # files with reference FASTA
my $rf = "";           # reference sequence
my $long = 0;          # 1 -> generate long reads
my $paired = 1;        # 1 -> generate paired-end reads
my $prefix = "reads";  # output files start with this string
my $nreads    = undef; # # reads
my $rdlen_av  = undef; # average to use when drawing from exponential
my $rdlen_exact = undef; # exact length for all reads, overrides randomness
my $rdlen_min = undef; # minimum read length (added to exponential draw)
my $frag_av   = undef; # mean fragment len
my $frag_sd   = undef; # s.d. to use when drawing frag len from normal dist
my $verbose   = 0;     # be talkative

GetOptions (
	"fasta|reference=s" => \@fa_fn,
	"long"              => \$long,
	"verbose"           => \$verbose,
	"nreads=i"          => \$nreads,
	"read-avg=i"        => \$rdlen_av,
	"read-len=i"        => \$rdlen_exact,
	"read-min=i"        => \$rdlen_min,
	"frag-avg=i"        => \$frag_av,
	"frag-sd=i"         => \$frag_sd,
	"unpaired"          => sub { $paired = 0; },
	"prefix=s"          => \$prefix
) || die "Bad option";

scalar(@fa_fn) > 0 || die "Must specify at least one reference FASTA file with --fasta";

print STDERR "Loading reference files...\n";
for my $fn (@fa_fn) {
	open(FN, $fn) || confess;
	my $name = "";
	while(<FN>) {
		chomp;
		$rf .= $_ if substr($_, 0, 1) ne ">";
	}
	close(FN);
}

my %revcompMap = (
	"A" => "T", "T" => "A", "a" => "t", "t" => "a",
	"C" => "G", "G" => "C", "c" => "g", "g" => "c",
	"R" => "Y", "Y" => "R", "r" => "y", "y" => "r",
	"M" => "K", "K" => "M", "m" => "k", "k" => "m",
	"S" => "S", "W" => "W", "s" => "s", "w" => "w",
	"B" => "V", "V" => "B", "b" => "v", "v" => "b",
	"H" => "D", "D" => "H", "h" => "d", "d" => "h",
	"N" => "N", "." => ".", "n" => "n" );

sub comp($) {
	my $ret = $revcompMap{$_[0]} || confess "Can't reverse-complement '$_[0]'";
	return $ret;
}

sub revcomp {
	my ($ret) = @_;
	$ret = reverse $ret;
	for(0..length($ret)-1) { substr($ret, $_, 1) = comp(substr($ret, $_, 1)); }
	return $ret;
}

$nreads    = $nreads    || 10000; # number of reads/end to generate
$rdlen_av  = $rdlen_av  || 75;    # average when drawing from exponential
$rdlen_min = $rdlen_min || 40;    # min read length (added to exponential draw)
$frag_av   = $frag_av   || 250;   # mean fragment len
$frag_sd   = $frag_sd   || 45;    # s.d. when drawing frag len from normal dist
my @fraglens  = ();     # fragment lengths (for paired)

if($long) {
	$nreads = 6000;
	$rdlen_av = 300;
	$rdlen_min = 40;
}

sub rand_dna($) {
	my $ret = "";
	for(1..$_[0]) { $ret .= substr("ACGT", int(rand(4)), 1); }
	return $ret;
}

#
# Mutate the reference
#

print STDERR "Adding single-base substitutions...\n";
my $nsnp = 0;
for(0..length($rf)-1) {
	if(rand() < 0.0012) {
		my $oldc = substr($rf, $_, 1);
		substr($rf, $_, 1) = substr("ACGT", int(rand(4)), 1);
		$nsnp++ if substr($rf, $_, 1) ne $oldc;
	}
}

print STDERR "Adding microindels...\n";

print STDERR "Adding larger rearrangements...\n";

print STDERR "Added $nsnp SNPs\n";

#
# Simulate reads
#

sub rand_quals($) {
	my $ret = "";
	my $upper = (rand() < 0.2 ? 11 : 40);
	$upper = 4 if rand() < 0.02;
	for(1..$_[0]) {
		$ret .= chr(33+int(rand($upper)));
	}
	return $ret;
}

sub add_seq_errs($$) {
	my($rd, $qu) = @_;
	my $origLen = length($rd);
	for(0..length($rd)-1) {
		my $c = substr($rd, $_, 1);
		my $q = substr($qu, $_, 1);
		$q = ord($q)-33;
		my $p = 10 ** (-0.1 * $q);
		if(rand() < $p) {
			$c = substr("ACGTNNNNNN", int(rand(10)), 1);
		}
		substr($rd, $_, 1) = $c;
		substr($qu, $_, 1) = $q;
	}
	length($rd) == $origLen || die;
	return $rd;
}

# Now simulate 
print STDERR "Simulating reads...\n";
my $rflen = length($rf);
if($paired) {
	open(RD1, ">${prefix}_1.fq") || die;
	open(RD2, ">${prefix}_2.fq") || die;
	for(my $i = 0; $i < scalar(@fraglens); $i++) {
		# Extract fragment
		my $flen = $fraglens[$i];
		my $off = int(rand($rflen - ($flen-1)));
		my $fstr = substr($rf, $off, $flen);
		# Check if it has too many Ns
		my %ccnt = ();
		for my $j (1..$flen) {
			my $c = uc substr($fstr, $j, 1);
			$ccnt{tot}++;
			$ccnt{non_acgt}++ if ($c ne "A" && $c ne "C" && $c ne "G" && $c ne "T");
			$ccnt{$c}++;
		}
		# Skip if it has >10% Ns
		if(1.0 * $ccnt{non_acgt} / $ccnt{tot} > 0.10) {
			$i--;
			next;
		}
		# Possibly reverse complement
		$fstr = revcomp($fstr) if (int(rand(2)) == 0);
	}
	close(RD1);
	close(RD2);
	print STDERR "Made pairs: reads_1.fq/reads_2.fq\n";
}

print STDERR "DONE\n";
