File: Sim.pm

package info (click to toggle)
bowtie2 2.5.4-1
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 27,492 kB
sloc: cpp: 63,838; perl: 7,232; sh: 1,131; python: 987; makefile: 541; ansic: 122
file content (557 lines) | stat: -rw-r--r-- 17,510 bytes
#!/usr/bin/perl -w

#
# Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
#
# This file is part of Bowtie 2.
#
# Bowtie 2 is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Bowtie 2 is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
#

package Sim;
use strict;
use Carp;
use FindBin qw($Bin); 
use lib $Bin;
use DNA;
use Test;
use RandDNA;
use SampleRead;
#use Mutate;
use AlignmentCheck;
#use Math::Random;
use List::Util qw(max min);
use POSIX;
use Sys::Info;
use Sys::Info::Constants qw( :device_cpu );

##
# Replacement for "die" that additionally writes error message to file so that
# run.pl can read it later.
#
sub mydie($) {
	my $fn = ".run.pl.child.$$";
	open(EO, ">$fn") || die "Could not open $fn for writing";
	print EO "$_[0]\n";
	close(EO);
	confess $_[0];
}

# Generates random printable strings of a given length
sub randStr($) {
	my $len = shift;
	my @chars = ('a'..'z', 'A'..'Z', '0'..'9', '_');
	my $str = "";
	foreach (1..$len) {
		$str .= $chars[int(rand(scalar(@chars)))];
	}
	return $str;
}

##
# Default function for generating random sequence to insert into a gap.
#
sub defaultSeqGen($) {
	my $len = shift;
	($len == int($len) && $len > 0) ||
		mydie("Bad length for sequence generator: $len");
	my $ret = "";
	for (1..$len) {
		$ret .= substr("ACGT", int(rand(4)), 1);
	}
	return $ret;
}

sub rfnumgen   { return $_[0]->{_rfnumgen}   }
sub rflengen   { return $_[0]->{_rflengen}   }
sub rdnumgen   { return $_[0]->{_rdnumgen}   }
sub rdlengen   { return $_[0]->{_rdlengen}   }
sub fraglengen { return $_[0]->{_fraglengen} }
sub ngen       { return $_[0]->{_ngen}       }
sub iupacgen   { return $_[0]->{_iupacgen}   }
sub atgen      { return $_[0]->{_atgen}      }
sub agen       { return $_[0]->{_agen}       }
sub cgen       { return $_[0]->{_cgen}       }
sub snpgen     { return $_[0]->{_snpgen}     }
sub rdgapgen   { return $_[0]->{_rdgapgen}   }
sub rfgapgen   { return $_[0]->{_rfgapgen}   }
sub rearrgen   { return $_[0]->{_rearrgen}   }
sub gaplengen  { return $_[0]->{_gaplengen}  }
sub seqgen     { return $_[0]->{_seqgen}     }
sub seqmm      { return $_[0]->{_seqmm}      }

##
# Generate DNA generator.
#
sub genDNAgen {
	my $self = shift;
	my $nfrac     = $self->ngen->();
	my $iupacfrac = $self->iupacgen->();
	my $atfrac    = $self->atgen->();
	my $afrac     = $self->agen->();
	my $cfrac     = $self->cgen->();
	my $refdnagen = RandDNA->new(
		"Sim.pm gen",
		$nfrac,
		$iupacfrac,
		$atfrac,
		$afrac,
		$cfrac);
	printf STDERR "Created DNA generator\n";
	printf STDERR "  N frac: %0.3f\n", $nfrac;
	printf STDERR "  IUPAC frac: %0.3f\n", $iupacfrac;
	printf STDERR "  AT/ACGT frac: %0.3f\n", $atfrac;
	printf STDERR "  A/AT frac: %0.3f\n", $afrac;
	printf STDERR "  C/CG frac: %0.3f\n", $cfrac;
	return $refdnagen;
}


##
# Given a fasta filename, an index basename, and a path to the
# bowtie2-build executable, build nucleotide-space and colorpace
# indexes for the sequences in the fasta file.
#
sub build {
	my ($self, $fa, $idx, $conf, $args) = @_;
	my $argstr = "";
	for (keys %$args) {
		$argstr .= " $_";
		if($args->{$_} ne "") {
			$argstr .= " ".$args->{$_};
		}
	}
	# Build nucleotide index
	my $cmd = "$conf->{bowtie2_build} $argstr $fa $idx";
	print STDERR "Executing: $cmd\n";
	system($cmd);
	$? == 0 || mydie("Error running '$cmd'; exitlevel=$?");
	print STDERR "Built nucleotide index '$idx'\n";
	# Build colorspace index
	unless($conf->{no_color}) {
		$cmd = "$conf->{bowtie2_build} $argstr -C $fa ${idx}.c";
		print STDERR "$cmd\n";
		system($cmd);
		$? == 0 || mydie("Error running '$cmd'; exitlevel=$?");
		print STDERR "Built colorspace index '$idx'\n";
	}
}

##
# Given a hash of sequences, flatten all IUPAC codes into unambiguous
# nucleotides.
#
sub flattenIUPAC() {
	my ($self, $h) = @_;
	for my $c (keys %$h) {
		my $len = length($h->{$c});
		for my $i (0..$len-1) {
			my $ch = uc substr($h->{$c}, $i, 1);
			my $nc = $ch;
			if(DNA::isIUPAC($ch) || $ch eq "N") {
				if(rand() < $self->snpgen->()) {
					$nc = DNA::pickIncompat($ch);
					defined($nc) || mydie("Couldn't find incompatible base for $ch");
				} else {
					$nc = DNA::pickCompat($ch);
					defined($nc) || mydie("Couldn't find compatible base for $ch");
				}
			}
			if($ch ne $nc) {
				substr($h->{$c}, $i, 1) = $nc;
			}
		}
	}
}

##
# Mutate reference genome into a subject genome.
#
sub mutate() {
	my ($self, $refs) = @_;
	my %subj = %$refs;
	$self->flattenIUPAC(\%subj);
	print STDERR "Flattened IUPAC characters\n";
	my $mutator = Mutate->new(
		"Sim.pm mutator",
		$self->snpgen,
		$self->rdgapgen,
		$self->rfgapgen,
		$self->rearrgen,
		$self->gaplengen,
		$self->seqgen);
	my ($nsnp, $nrfgap, $nrdgap, $nrearr) = (0, 0, 0, 0);
	for(keys %subj) {
		print STDERR "  Mutating sequence $_\n";
		my ($nsnp_, $nrfgap_, $nrdgap_, $nrearr_) = $mutator->mutateSeq($_, \%subj);
		$nsnp   += $nsnp_;
		$nrfgap += $nrfgap_;
		$nrdgap += $nrdgap_;
		$nrearr += $nrearr_;
	}
	print STDERR "Mutated reference genome to subject genome\n";
	print STDERR "  SNPs introduced: $nsnp\n";
	print STDERR "  Reference gaps introduced: $nrfgap\n";
	print STDERR "  Read gaps introduced: $nrdgap\n";
	print STDERR "  Rearrangements introduced: $nrearr\n";
	return \%subj;
}

sub dumpFastq {
	my ($self, $input, $fh1, $fh2) = @_;
	for (1..scalar(@{$input->{seq1s}})) {
		my $seq1 = $input->{seq1s}->[$_-1];
		my $qual1 = $input->{qual1s}->[$_-1];
		print {$fh1} "\@$_\n";
		print {$fh1} "$seq1\n";
		print {$fh1} "+$_\n";
		print {$fh1} "$qual1\n";
		if($input->{paired}) {
			my $seq2 = $input->{seq2s}->[$_-1];
			my $qual2 = $input->{qual2s}->[$_-1];
			print {$fh2} "\@$_\n";
			print {$fh2} "$seq2\n";
			print {$fh2} "+$_\n";
			print {$fh2} "$qual2\n";
		}
	}
}

sub dumpQseq {
	my ($self, $input, $fh1, $fh2) = @_;
	for (1..scalar(@{$input->{seq1s}})) {
		my $seq1 = $input->{seq1s}->[$_-1];
		my $qual1 = $input->{qual1s}->[$_-1];
		print {$fh1} "R\t1\t1\t1\t$_\t$_\t1\t1\t$seq1\t$qual1\t1\n";
		if($input->{paired}) {
			my $seq2 = $input->{seq2s}->[$_-1];
			my $qual2 = $input->{qual2s}->[$_-1];
			print {$fh2} "R\t1\t1\t1\t$_\t$_\t1\t1\t$seq2\t$qual2\t1\n";
		}
	}
}

sub dumpFasta {
	my ($self, $input, $fh1, $fh2) = @_;
	for (1..scalar(@{$input->{seq1s}})) {
		my $seq1 = $input->{seq1s}->[$_-1];
		print {$fh1} ">$_\n";
		print {$fh1} "$seq1\n";
		if($input->{paired}) {
			my $seq2 = $input->{seq2s}->[$_-1];
			print {$fh2} ">$_\n";
			print {$fh2} "$seq2\n";
		}
	}
}

sub dumpRaw {
	my ($self, $input, $fh1, $fh2) = @_;
	for (1..scalar(@{$input->{seq1s}})) {
		my $seq1 = $input->{seq1s}->[$_-1];
		print {$fh1} "$seq1\n";
		if($input->{paired}) {
			my $seq2 = $input->{seq2s}->[$_-1];
			print {$fh2} "$seq2\n";
		}
	}
}

##
# Generate the input (reads plus paired/fragment information)
#
sub genInput {
	my ($self, $refs, $conf) = @_;
	# Select whether we're doing colorspace
	my $color = int(rand(2));
	$color = 0 if $conf->{no_color};
	# Select whether we're doing unpaired or paired-end.
	my $paired = int(rand(2));
	$paired = 0 if $conf->{no_paired};
	# Select format for read file
	my @formats    = ("fastq",   "qseq", "fasta", "raw");
	my @format_arg = (   "-q", "--qseq",    "-f",  "-r");
	my $formati = int(rand(scalar(@formats)));
	my $format     = $formats[$formati];
	my $format_arg = $format_arg[$formati];
	my $tmprdfn1 = "$conf->{tempdir}/Sim.pm.$conf->{randstr}_1.$format";
	my $tmprdfn2 = "$conf->{tempdir}/Sim.pm.$conf->{randstr}_2.$format";
	# Generate reads from the subject genome; no sequencing error yet
	my %input = (
		seq1s      => [],
		seq2s      => [],
		qual1s     => [],
		qual2s     => [],
		mate1fw    => 1,
		mate2fw    => 0,
		paired     => $paired,
		color      => $color,
		format     => $format,
		format_arg => $format_arg,
		file1      => $tmprdfn1,
		file2      => $tmprdfn2 );
	my $read_sampler = SampleRead->new(
		"Sim.pm read sampler",
		$self->fraglengen,
		$self->rdlengen,
		$self->rdlengen);
	print STDERR "Created read sampler\n";
	my $numreads = $self->rdnumgen->();
	$numreads = ceil(sqrt($numreads)) if $conf->{small};
	$numreads == int($numreads) || mydie("numreads $numreads not a number");
	my $tmp = int(rand(3));
	if($tmp == 0) {
		$input{mate2fw} = 1;
	} elsif($tmp == 1) {
		$input{mate1fw} = 0;
		$input{mate2fw} = 1;
	}
	print STDERR "Sampling $numreads reads\n";
	ref($refs) eq "HASH" || mydie("Reference input must be hash ref");
	if($paired) { 
		$read_sampler->genReadPairs(
			$numreads,       # number of reads/fragments to generate
			$input{color},   # colorize?
			$refs,           # hash ref holding reference sequences
			$input{mate1fw}, # orientation of mate 1 when fragment comes from Watson strand
			$input{mate2fw}, # orientation of mate 2 when fragment comes from Watson strand
			$input{seq1s},   # put generated mate1 sequences here
			$input{seq2s},   # put generated mate2 sequences here
			$input{qual1s},  # put generated mate1 quality sequences here
			$input{qual2s}); # put generated mate2 quality sequences here
	} else {
		$read_sampler->genReads(
			$numreads,       # number of reads/fragments to generate
			$input{color},   # colorize?
			$refs,           # hash ref holding reference sequences
			$input{seq1s},   # put generated sequences here
			$input{qual1s}); # put generated quality sequences here
	}
	# TODO: with some probability, sort the reads
	print STDERR "Dumping reads to temporary files $tmprdfn1 & $tmprdfn2\n";
	# Dump reads to output file
	my ($fh1, $fh2);
	open($fh1, ">$tmprdfn1") || mydie("Could not open '$tmprdfn1' for writing");
	open($fh2, ">$tmprdfn2") || mydie("Could not open '$tmprdfn2' for writing");
	if($format eq "fastq") {
		$self->dumpFastq(\%input, $fh1, $fh2);
	} elsif($format eq "qseq") {
		$self->dumpQseq(\%input, $fh1, $fh2);
	} elsif($format eq "fasta") {
		$self->dumpFasta(\%input, $fh1, $fh2);
	} elsif($format eq "raw") {
		$self->dumpRaw(\%input, $fh1, $fh2);
	}
	close($fh1);
	close($fh2);
	return \%input;
}

##
# Mutate reads according to sequencing error model.
#
sub mutateSeq {
	my ($self, $input) = @_;
	return $input;
}

##
# Align the given input set against the given index using the given
# bowtie2 binary and arguments.  Sanity-check the SAM output.
#
sub align {
	my ($self, $fa, $idx, $input, $conf, $args) = @_;
	my $argstr = "";
	for (keys %$args) {
		if(defined($args->{$_})) {
			$argstr .= " $_";
			if($args->{$_} ne "") {
				$argstr .= " ".$args->{$_};
			}
		}
	}
	$argstr .= " -C" if $input->{color};
	$argstr .= " ".$input->{format_arg};
	$idx .= ".c" if $input->{color};
	my $inputfn;
	if($input->{paired}) {
		$inputfn = "-1 $input->{file1} -2 $input->{file2}";
	} else {
		$inputfn = $input->{file1};
	}
	# Create object that will help us sanity-check alignments
	my $ac = AlignmentCheck->new(
		"Sim.pm alignment checker", # name
		[ $fa ],                    # fasta
		"sam",                      # SAM-formatted alignments
		0,                          # no bis-C
		0                           # no bis-CpG
	);
	$ac->nrefs() > 0 || mydie("No references");
	# Run normal (non-debug) Bowtie
	defined($conf->{tempdir}) || mydie("No tmp dir");
	my $als       = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.als";
	my $als_debug = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.debug.als";
	my $als_px    = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.px.als";
	my $als_px_reord = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.px.reord.als";
	my $cmd = "$conf->{bowtie2} $argstr -x $idx $inputfn";
	print "$cmd\n";
	open(ALSDEB, ">$als_debug") || mydie("Could not open '$als_debug' for writing");
	open(ALSDEBCMD, "$cmd |") || mydie("Could not open pipe '$cmd |'");
	my $ival = 50;
	my $nals = 0;
	my @lines = ();
	while(<ALSDEBCMD>) {
		# Remove @PG line because CL: tag can legitimately differ
		print ALSDEB $_ unless /^\@PG/;
		push @lines, $_;
		$nals++;
		print STDERR "  Read $nals alignments...\n" if ($nals % $ival) == 0;
	}
	close(ALSDEBCMD);
	$ac->checkAlignments(\@lines, 0);
	$? == 0 || mydie("bowtie2-align exited with exitlevel $?:\n$cmd\n");
	close(ALSDEB);
	$ac->printSummary();
	# With some probability, also run debug Bowtie and check that
	# results are identical
	if(int(rand(3)) == 0) {
		print STDERR "ALSO checking that bowtie2 and bowtie2-align match up\n";
		# Remove @PG line because CL: tag can legitimately differ
		$cmd = "$conf->{bowtie2} $argstr -x $idx $inputfn | grep -v '^\@PG' > $als";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("Command '$cmd' failed with exitlevel $?");
		$cmd = "diff -uw $als $als_debug";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("diff found a difference between bowtie2 and bowtie2-align ".
			      "output for same input (above)\n");
	}
	# With some probability, also run debug Bowtie in -p X mode with  X > 1 and
	# without the --reorder argument and check that results are identical
	if(int(rand(3)) == 0) {
		print STDERR "ALSO checking that bowtie2 and bowtie2 -p X w/ X > 1 match up\n";
		my $p = int(rand(3))+2;
		$cmd = "$conf->{bowtie2} $argstr -p $p -x $idx $inputfn | grep -v '^\@PG' > $als_px";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("Command '$cmd' failed with exitlevel $?");
		# Sort the $als_px and $als_debug files to guarantee that reads and
		# alignments for a given read appear in the same order in both
		$cmd = "sort -k 1,1 -n -k 2,2 -k 3,3 -k 4,4 < $als_px | grep -v '^\@PG' > $als_px.sorted";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("Failed to sort alignment file $als_px\n");
		# Sort the $als_px and $als_debug files to guarantee that reads and
		# alignments for a given read appear in the same order in both
		$cmd = "sort -k 1,1 -n -k 2,2 -k 3,3 -k 4,4 < $als_debug | grep -v '^\@PG' > $als_debug.sorted";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("Failed to sort alignment file $als_debug\n");
		$cmd = "diff -uw $als_debug.sorted $als_px.sorted";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("diff found a difference between bowtie2-align and bowtie2 ".
			      "-p output for same input (above)\n");
	}

	# With some probability, also run debug Bowtie in -p X mode with X > 1 and
	# with the --reorder argument and check that results are identical
	if(int(rand(3)) == 0) {
		print STDERR "ALSO checking that bowtie2 and bowtie2 -p X --reorder w/ X > 1 match up\n";
		my $p = int(rand(3))+2;
		$cmd = "$conf->{bowtie2} $argstr -p $p -x $idx --reorder $inputfn | grep -v '^\@PG' > $als_px_reord";
		print "$cmd\n";
		system($cmd);
		$? == 0 || mydie("Command '$cmd' failed with exitlevel $?");
		$cmd = "diff -uw $als_debug $als_px_reord";
		print "$cmd\n";
		system($cmd);
		$? == 0 ||
			mydie("diff found a difference between bowtie2-align and bowtie2 ".
			      "-p --reorder output for same input (above)\n");
	}
}

##
# Generate a new test case
#
# Possible key/value pairs in $conf hash:
#
# 1. bowtie2_build:       path to bowtie2-build binary
# 2. bowtie2:             path to bowtie2 binary
# 5. tempdir:             temporary directory for reference/reads/index
# 6. no_paired:           defined & non-0 -> don't generate paired-end datasets
# 7. no_color:            defined & non-0 -> don't generate colorspace datasets
# 8. single_thread:       defined & non-0 -> don't use -p X where X > 1
#
sub nextCase {
	my ($self, $conf) = @_;

	$conf->{bowtie2_build}       = "bowtie2-build"  unless defined($conf->{bowtie2_build});
	$conf->{bowtie2}             = "bowtie2-align"  unless defined($conf->{bowtie2});
	$conf->{tempdir}             = "/tmp"           unless defined($conf->{tempdir});
	srand(time ^ $$);
	$conf->{randstr} = randStr(8);

	print "*** TEST CASE ***\n";
	
	# Build a large index?
	my $large_index = int(rand(2)) == 0;
	
	# Generate the references
	my $refdnagen = $self->genDNAgen();
	# Generate references and write them to a temporary fasta file
	my $tmpfn = "$conf->{tempdir}/Sim.pm.$conf->{randstr}.fa";
	my %refs = ();
	$self->genRef(\%refs, $refdnagen, $conf, $tmpfn);
	# Run bowtie2-build
	my $tmpidxfn = "$conf->{tempdir}/Sim.pm.$conf->{randstr}";
	my $buildArgs = $self->genBuildArgs($large_index);
	$self->build($tmpfn, $tmpidxfn, $conf, $buildArgs);
	my $numruns = 10;
	$numruns *= 10 if $conf->{small}; # Lots of short runs
	# For each batch of reads / bowtie options
	for(1..$numruns) {
		print "*** Run $_ of $numruns\n";
		# Generate mutated version of the reference as our subject genome
		my $subj = $self->mutate(\%refs);
		# Generate all the input, including reads, pairedness,
		# fragment information, whether it's colorspace, etc
		my $input = $self->genInput($subj, $conf);
		# Mutate the input
		my $mutinput = $self->mutateSeq($input);
		# Select Bowtie arguments
		my $args = $self->genAlignArgs($mutinput, $input->{color}, $large_index, $conf);
		$self->align($tmpfn, $tmpidxfn, $mutinput, $conf, $args);
		# Sanity check output.  Possible sanity checks are:
		# 1. Check alignments & edits against reference
		# 2. Compare bowtie2 and bowtie2-debug
		# 3. Compare -p X>1 and -p 1
	}
}

if($0 =~ /Sim\.pm$/) {
	print "Running unit tests\n";
	# Run unit tests
}

1;