File: simplegraph.pl

package info (click to toggle)
murasaki 1.68.6-17
links: PTS, VCS
area: main
in suites: forky, sid
size: 15,780 kB
sloc: cpp: 16,010; perl: 8,365; makefile: 187; sh: 31
file content (506 lines) | stat: -rwxr-xr-x 13,783 bytes
parent folder | download | duplicates (7)
#!/usr/bin/perl -w

#Copyright (C) 2006-2008 Keio University
#(Kris Popendorf) <comp@bio.keio.ac.jp> (2006)
#
#This file is part of Murasaki.
#
#Murasaki is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#Murasaki is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with Murasaki.  If not, see <http://www.gnu.org/licenses/>.

##################
## dot plotting mojo (originally built for Mauve, but that might not work anymore) -- krisp
##################

use File::Basename;
use Getopt::Long;
use Pod::Usage;

#use Data::Dump qw{dump};

BEGIN {
  unshift(@INC,(fileparse($0))[1].'perlmodules');
}
use Murasaki qw{getProg writeOut $root max};

use strict;

# Path of the external 'geneparse' helper (used to look up sequence lengths).
our $geneparser=getProg('geneparse');
unless($geneparser){
  warn "Couldn't find geneparse. Finding lengths will fail";
}

# Command-line option storage, filled in by GetOptions below.
my $help;        # --help
my $man;         # --man
my $opt_prefix;  # --output
my $align_type;  # --type
my $keepGnuplot; # --keepplot
my $interactive; # --interactive
my $seqOrder;    # --order
my $fullRange;   # --fullrange

# Shared state that the subroutines further down read or set.
our $flexible=0; # set by loadMauveAlignment for unexpected format versions
our $signed=0;   # --signed: leave negative coordinates alone
our $no_stitch;  # --nostitch: skip the stitch-break plots
our %useFormats; # output formats requested on the command line
# Getopt::Long callback shared by the png/pdf/ps switches: records its first
# argument (the option that fired) as a requested output format.
sub addFormat {
  my ($format)=@_;
  $useFormats{$format}=1;
}
# ---- main script ----------------------------------------------------------
# 1. parse options and work out which alignment file to read
# 2. load the alignment and dump LCB start/stop coordinates to a data file
# 3. for every plotted subset of sequences ("m-class") write a gnuplot
#    command file and either run gnuplot interactively or render each
#    requested output format
GetOptions('help|?' => \$help, man => \$man, 'output=s' => \$opt_prefix, 'type=s' => \$align_type,
           'signed' => \$signed, 'nostitch' => \$no_stitch,
           png=>\&addFormat,
           pdf=>\&addFormat,
           ps=>\&addFormat,
           'keepplot!'=>\$keepGnuplot,
           'interactive:s'=>sub{$interactive=($_[1] ? $_[1]:'all')},
           'order=s'=>\$seqOrder,
           'fullrange'=>\$fullRange,
          );
pod2usage(1) if $help or $#ARGV<0;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;

$useFormats{png}=1 unless keys(%useFormats); #png is the default format
our @formats=keys(%useFormats);
#pdf is rendered through postscript, and that intermediate .ps is kept when
#ps was requested as well, so a separate ps pass would be redundant
@formats=grep {$_ ne 'ps'} @formats if $useFormats{ps} and $useFormats{pdf};
our $out_prefix=$opt_prefix;
my $alignment_src=shift(@ARGV);
if(-d $alignment_src){ #output from quickrun
  #split "parent/leaf/" so that the file "parent/leaf/leaf" can be probed
  my ($parent,$leaf)=($alignment_src=~m!^(.*?/?)([^/]+)/?$!) or die "Alignment not found";
  $alignment_src="$parent$leaf/$leaf";
  $align_type='mauve' if !$align_type and -e $alignment_src;
  if(!-e $alignment_src){
    $alignment_src="$parent$leaf/$leaf.anchors";
    $align_type='murasaki' if !$align_type and -e $alignment_src;
  }
  die "Alignment not found" unless -e $alignment_src;
  print "Derived $alignment_src as source\n";
}
$out_prefix="$alignment_src.graph" unless $opt_prefix;
$align_type="murasaki" unless $align_type;
#NOTE(review): the Murasaki loader is used whatever $align_type says;
#loadMauveAlignment is never called from here -- confirm this is intended.
our %mauve=%{loadMurasakiAlignment($alignment_src)};
our @seqs=@{$mauve{seqs}};
our @LCBs=@{$mauve{LCBs}};
#--order is either a run of single digits ("201") or, when it contains any
#non-digit, a list of numbers separated by non-digits ("2,0,1")
our @seqOrder=$seqOrder ? ( ($seqOrder=~m/\D/) ? (split(/\D+/,$seqOrder))
                                                  :split(//,$seqOrder)
                          ):(0..$#seqs);

print "Sequence order: @seqOrder\n" if $seqOrder;

die "Need at least 2 sequences to plot a graph" unless @seqOrder>1;
#reject empty/non-numeric entries (e.g. from a leading separator) as well as
#indices past the last loaded sequence
die "Invalid order specification" if grep {!m/^\d+$/ or $_>$#seqs} @seqOrder;

#because of laziness
our (%allGenes,%quickOrder);

print "$align_type file describes ".($#seqs+1)." sequences and ".($#LCBs+1)." LCBs\n";

die "No LCBs?" unless $#LCBs>=0;

#data file layout: one record per LCB (records separated by a blank line),
#each record being a row of start coordinates followed by a row of stop
#coordinates, one tab-separated column per sequence
our $datafile="$out_prefix.data.LCB";
writeOut($datafile,join("\n\n",map { #all LCBs
  join("\n",
    join("\t", map {#all sequences inside LCBs
      join("\t",$$_{start});
    } @{$_}),
    join("\t", map {#all sequences inside LCBs
      join("\t",$$_{stop});
    } @{$_})
      );
} @LCBs));

#gnuplot terminal used for each output format (pdf goes through postscript)
our %formats=(png => 'png transparent size 800,800',
              ps => 'postscript',
              pdf =>'postscript');

foreach my $classIdx ($interactive ? mclassToI($interactive):(1..mclassToI('all'))){
  my @m=mclass($classIdx);
  #batch mode only plots pairs and the all-sequences class
  next unless magnitude(@m)==2 or $interactive or magnitude(@m)==scalar(@seqOrder);
  my $m=mclassToStr(@m);
  my @slice=map {$seqOrder[$_]} mclassToSlice(@m);
  my @col=map {$_+1} @slice; #gnuplot starts at 1
  my @names=map {${$seqs[$_]}{seqName}} @slice;

  our $divfile="$out_prefix.data.divs";
  #declared unconditionally: a "my ... unless" declaration has undefined behaviour
  my $divPlots=$no_stitch ? undef : addDivPlots($divfile,@slice);
  my $plotfile="$out_prefix.$m";
  my $xlab=$names[0];
  my $ylab=join(", ",@names[1..$#names]);
  my @ranges;
  my @lengths=map {$_->{length}} @seqs[@slice];
  if($fullRange and !grep {!defined $_} @lengths){
    @ranges=map {"[1:$_]"} ($lengths[0],max(@lengths[1..$#lengths]));
  }
  my @series=map {"'$datafile' using $col[0]:$col[$_] with lp lw 1 pointtype 6 pointsize 1 title '$names[0]-$names[$_]'"}
    (1..$#slice);
  my $plotline=join(", \\\n",@series,($divPlots ? ($divPlots):()));
  writeOut($plotfile,<<ENDTEXT);
set xlabel "$xlab"
set ylabel "$ylab"
plot @ranges $plotline
ENDTEXT


  if($interactive){
    print "**********************************************\n";
    print "***** Type quit and press enter to quit! *****\n";
    print "**********************************************\n";
    #list form: no shell, so prefixes containing spaces survive
    system('gnuplot',$plotfile,'-');
  }else{
    #prepare formatting files

    foreach my $ext (@formats) {
      my $ffile="$out_prefix.to_$ext";
      my $finalf="$out_prefix.$m.$ext";
      #pdf: gnuplot writes postscript to a .ps file which ps2pdf then converts
      #(writing the postscript straight into the .pdf name made ps2pdf read
      #and write the same file, and the cleanup then deleted the result)
      my $outf=($ext eq 'pdf') ? "$out_prefix.$m.ps" : $finalf;
      writeOut($ffile,<<ENDTEXT);
set terminal $formats{$ext}
set output '$outf'
ENDTEXT
      system('gnuplot',$ffile,$plotfile);
      if ($ext eq 'pdf') {
        system('ps2pdf',$outf,$finalf);
        #keep the intermediate postscript only when ps was requested too
        unlink($outf) if -e $finalf and !$useFormats{ps};
      }
      unlink($ffile);
    }
    #remove the command file only after every format has been rendered
    unlink($plotfile) unless $keepGnuplot;
  }
}


exit 0;

# Number of true entries in the given list of flags (an m-class).
sub magnitude {
  my $count=0;
  foreach my $flag (@_){
    $count++ if $flag;
  }
  return $count;
}

# Positions (0-based) of the true entries in the given list of flags.
sub mclassToSlice {
  my @flags=@_;
  return grep {$flags[$_]} 0..$#flags;
}

# Returns 1 when, for the given gene, every sequence selected by the m-class
# flags lists the same LCB ids in the same order as the first selected
# sequence, and 0 otherwise. Reads the per-gene lists from %allGenes.
sub aligned {
  my $gene=shift;
  my @members=mclassToSlice(@_);
  my ($first,@others)=@{$allGenes{$gene}}[@members];
  my @expected=map {$_->{id}} @{$first};
  foreach my $hits (@others){
    # differing list lengths can never line up
    return 0 unless $#{$hits}==$#expected;
    for my $pos (0..$#expected){
      return 0 unless $expected[$pos]==$hits->[$pos]{id};
    }
  }
  return 1;
}

# Builds one text line per gene passed in: the gene itself followed by one
# tab-separated column per loaded sequence, each column being the
# comma-separated "id.partial" pairs stored for that gene/sequence in
# %allGenes. The lines are joined with newlines and returned as one string.
sub prettyGenePrint {
  my @rows;
  foreach my $gene (@_){
    my @cells=($gene);
    foreach my $seqId (0..$#seqs){
      my @hits=map {"$$_{id}.$$_{partial}"} @{${$allGenes{$gene}}[$seqId]};
      push(@cells,join(",",@hits));
    }
    push(@rows,join("\t",@cells));
  }
  return join("\n",@rows);
}

# Stub: the only statement (a comparison through %quickOrder) is commented
# out, so the body is empty.
sub geneSort {
#  return $quickOrder{$a}<=>$quickOrder{$b};
}

# Integer power by repeated multiplication.
#   pow($base,$exp) returns $base multiplied by itself $exp times.
# An exponent of 0 (or less) yields 1; the previous implementation started
# from $base and therefore returned $base for an exponent of 0.
sub pow {
  my ($base,$exp)=@_;
  my $result=1;
  $result*=$base for (1..$exp);
  return $result;
}

# Digits of a non-negative integer in the given base, most significant
# first. A value of 0 produces an empty list.
sub digits {
  my ($value,$base)=@_;
  my @out;
  until($value<=0){
    my $remainder=$value % $base;
    unshift(@out,$remainder);
    $value=int($value/$base);
  }
  return @out;
}

# pad($fill,$target,@items): prepend $fill until the list holds at least
# $target entries; longer lists are returned unchanged.
sub pad {
  my ($fill,$target,@items)=@_;
  unshift(@items,$fill) until scalar(@items)>=$target;
  return @items;
}

# Concatenates the given flags into a single string, e.g. (1,0,1) -> "101".
sub mclassToStr {
  my $text='';
  $text.=$_ foreach @_;
  return $text;
}

# Expands a class number into a list of binary flags, one per entry of
# @seqOrder (left-padded with zeros). Anything matching /all/i selects the
# class with every flag set.
sub mclass {
  my ($class)=@_;
  my $width=$#seqOrder+1;
  if($class=~m/all/i){
    $class=pow(2,$width)-1;
  }
  return pad(0,$width,digits($class,2));
}

# Folds a list of values into one integer. A first argument matching /all/i
# yields the number with one set bit per entry of @seqOrder. Otherwise the
# arguments are consumed from last to first, each one added before shifting
# left by one, so the LAST argument ends up most significant. A single
# numeric argument comes back unchanged.
# NOTE(review): mclass puts the most significant digit FIRST, so this is not
# its inverse for asymmetric lists -- confirm which order is intended.
sub mclassToI {
  if($_[0]=~m/all/i){
    return pow(2,$#seqOrder+1)-1;
  }
  my $acc=0;
  foreach my $bit (reverse @_){
    $acc=($acc+$bit) << 1;
  }
  return $acc>>1;
}

# mclassMember($usedr,@flags): true when every position whose flag is set
# has a non-empty array behind the matching entry of @$usedr. Positions
# with a clear flag are not constrained.
sub mclassMember {
  my ($usedr,@flags)=@_;
  my $misses=0;
  foreach my $idx (0..$#flags){
    my $hasHits=($#{${$usedr}[$idx]}>=0);
    $misses++ if $flags[$idx] and !$hasHits;
  }
  return !$misses;
}

# Returns the two-element list (start, end) obtained by calling the start
# and end methods of the object passed as first argument.
sub toMauveCoords {
  my ($feature)=@_;
  my @span;
  push(@span,$feature->start);
  push(@span,$feature->end);
  return @span;
}

# findLCBs($genome,$feature): checks the feature's start/end span against
# the entry for sequence index $genome in every LCB of @LCBs and returns one
# hash ref {partial=>..., id=>...} per LCB for which coversLCB gave a true
# value. 'partial' is that value, 'id' the LCBId of the matching entry.
sub findLCBs {
  my ($genome,$feature)=@_;
  my ($start,$stop)=toMauveCoords($feature);
  my @hits;
  foreach my $LCB (@LCBs){
    my $region=$$LCB[$genome];
    my $partial=coversLCB($start,$stop,$region);
    next unless $partial;
    push(@hits,+{partial=>$partial, id=>$$region{LCBId}});
  }
  return @hits;
}

#returns 0 or which side of pair 1 (gene coords) overhangs pair 2 (LCB)
# coversLCB($start,$stop,$LCBr): compares the span $start..$stop with the
# start/stop stored in the hash ref $LCBr and returns whatever covers()
# reports for the two ranges.
sub coversLCB {
  my ($start,$stop,$LCBr)=@_;
  my $lcbStart=$$LCBr{start};
  my $lcbStop=$$LCBr{stop};
  return covers($start,$stop,$lcbStart,$lcbStop);
}

# covers($gs,$ge,$ls,$le): relates range 1 ($gs..$ge) to range 2 ($ls..$le).
# Dies when either range has start > stop. Returns
#   0        the ranges do not overlap
#   "none"   range 1 lies entirely inside range 2
#   "left"   range 1 starts inside range 2 and reaches its end or beyond
#   "right"  range 1 starts at or before range 2 and ends inside it
#   "both"   range 1 spans all of range 2
# The tests are tried in that order, which decides ties on equal endpoints.
sub covers {
  my ($gs,$ge,$ls,$le)=@_;
  die "start<=stop assertion failed" unless $gs<=$ge;
  die "LCB start<=stop assertion failed" unless $ls<=$le;
  return 0 if $gs>$le or $ls>$ge; # total mismatch
  my $startInside=($ls<=$gs);
  my $endInside=($ge<=$le);
  if($startInside){
    return "none" if $endInside;
    return "left" if $le<=$ge;
  }
  return "right" if $gs<=$ls and $endInside;
  return "both" if $gs<=$ls and $le<=$ge;
}

# Parses a Mauve alignment file.
#   $alignment  path of the file
# Returns a hash ref {seqs => [...], LCBs => [...]}:
#   seqs  indexed by sequence id; each entry holds seqId, seqFile, seqLength
#         and seqName
#   LCBs  one array ref per interval, holding one {start, stop, LCBId} hash
#         per sequence; intervals with a 0 start in any sequence are skipped
# Dies when the file cannot be opened or the header lines are not
# recognised. Any FormatVersion other than 4 prints a notice and sets the
# global $flexible, which relaxes the later format checks.
# The handle is lexical, opened with three-arg open and closed before
# returning. The LCB list is collected in a lexical array rather than the
# file-level @LCBs.
sub loadMauveAlignment {
  my $alignment=shift;
  local $_; # the read loops below assign to $_
  open(my $mauve,'<',$alignment) or die "Could not open mauve file $alignment: $!";
  my $header=<$mauve>;
  (defined $header and $header=~m/FormatVersion\s+(\d+)/) or die "Not a mauve file: $alignment";
  my $version=$1;
  my @seqs=();
  if($version!=4){
    print "This program is written for Mauve Format Version 4.\n
This file is version $version. Weird stuff may happen.\n";
    $flexible=1;
  }
  $header=<$mauve>;
  (defined $header and $header=~m/SequenceCount\s+(\d+)/) or die "Unknown sequence count\n";
  my $seqCount=$1;
  while(<$mauve>){
    next unless m/Sequence(\d+)File\s+(\S.*)/;
    my ($seqId,$seqFile)=($1,$2);
    $_=<$mauve>;
    # take the length only from a successful match; a failed match would
    # otherwise leave a stale $1 behind
    my $seqLength;
    if(defined $_ and m/Sequence${seqId}Length\s+(\d+)/){
      $seqLength=$1;
    }elsif(!$flexible){
      my $shown=defined $_ ? $_ : '';
      die "Input file is weird: $shown";
    }
    $seqs[$seqId]={'seqId' => $seqId,'seqFile' => $seqFile,'seqLength'=>$seqLength, 'seqName' => getName($seqFile) };
    last if $seqId==$seqCount-1;
  }

  my @LCBs;
  $_=<$mauve>;
  unless(defined $_ and m/IntervalCount\s(\d+)/){
    my $shown=defined $_ ? $_ : '';
    $flexible or die "Interval Count line weird: $shown";
  }
  while(<$mauve>){
    m/Interval\s(\d+)/ or next;
    my $LCBId=$1;
    # first line of an interval: length followed by one start per sequence
    $_=<$mauve>;
    chomp;
    my ($length,@start)=split(/\s+/);
    @start=map(abs,@start);
    my @stop=map {$_+$length} @start;
    my @segs;
    # remaining lines up to the blank line: the last segment line decides
    # the stop coordinates
    while(<$mauve>){
      chomp;
      last if $_ eq '';
      next if $_ eq 'GappedAlignment' or m/^[A-Z-]+$/; #skip gapped lines
      ($length,@segs)=split(/\s+/);
      @stop=map {$_+$length} @segs;
    }
    next if (grep {$_==0} @start)>0;
    my @LCB=map {
      ($start[$_],$stop[$_])=map(abs,($start[$_],$stop[$_])) if $start[$_]>$stop[$_]; #is rev strand?
      +{ start => $start[$_], stop => $stop[$_], LCBId => $LCBId }} 0..$#stop;

    push(@LCBs,\@LCB);
  }
  close($mauve);
  return {'seqs' => \@seqs, 'LCBs' => \@LCBs};
}

# Loads a Murasaki anchors file (last argument) plus its sequence list.
# Returns a hash ref {seqs => [...], LCBs => [...]}.
#   seqs  When "<path><basename>.seqs" exists, one entry per line of that
#         file with seqId, seqFile, seqName, divs (from getBreaks) and
#         length (from getLength). Otherwise the anchors basename is split
#         on '-' and "<part>.gbk" must exist for every part; those entries
#         carry seqId, seqFile and seqName.
#   LCBs  one array ref per line of the anchors file, holding one
#         {start, stop, LCBId, back} hash for every "start stop sign"
#         triple found on the line. LCBIds count lines from 1.
# Coordinates: unless $signed is set, a negative start marks the reverse
# strand and both coordinates are made absolute; reverse-strand pairs are
# then ordered start > stop.
# Dies when a needed file is missing or cannot be opened.
sub loadMurasakiAlignment {
  my $filename=pop;
  my $basename=getName($filename);
  my $path=getPath($filename);
  my @seqs;
  my $seqsfile="$path$basename.seqs";
  if(-e $seqsfile){
    my $seqId=0;
    open(my $seqsfh,'<',$seqsfile) or die "Could not open seqs file";
    while(my $seqFile=<$seqsfh>){
      chomp $seqFile;
      push(@seqs,{seqId => $seqId++, seqFile => $seqFile,
                  seqName => getName($seqFile),
                  divs=>getBreaks($seqFile,$path),
                  length=>getLength($seqFile,$path)});
    }
    close($seqsfh);
  }else{
    my @seqnames=split(/-/,getName($filename));
    my $seqId=0;
    foreach(@seqnames){
      my $name="$_.gbk";
      -e $name or die "Sequence $name not found.\nEither use correct naming style, or create $path$basename.seqs file specifying sequence locations.";
      # seqName is needed for the plot labels, as in the .seqs branch
      push(@seqs,{seqId => $seqId++, seqFile => $name, seqName => getName($name)});
    }
  }

  my @LCBs;
  my $LCBId=0;
  open(my $blocks,'<',$filename) or die "Blocks file not found...??";
  while(my $row=<$blocks>){
    chomp $row;
    $LCBId++;
    my @LCB=();
    while($row=~m/(-?\d+)\s+(-?\d+)\s+([+-])/g){
      my ($start,$stop,$back)=($1,$2,($3 eq '-'));
      ($start,$stop,$back)=(map(abs,($start,$stop)),1) if $start<0 and !$signed;
      ($start,$stop)=($stop,$start) if $back and $start<$stop;
      push(@LCB,{start => $start, stop => $stop, LCBId => $LCBId, back => $back});
    }
    push(@LCBs,\@LCB);
  }
  close($blocks);
  return {'seqs' => \@seqs, 'LCBs' => \@LCBs};
}

# File name of each argument with the directory and the last ".suffix"
# removed. A single argument gives back a plain scalar, anything else the
# whole list.
sub getName {
  my @names;
  foreach my $file (@_){
    my ($stem)=fileparse($file, qr{\.[^.]*});
    push(@names,$stem);
  }
  return $names[0] if scalar(@_)==1;
  return @names;
}

# Directory part (as reported by fileparse, with trailing slash) of each
# argument. A single argument gives back a plain scalar, anything else the
# whole list.
sub getPath {
  my @dirs;
  foreach my $file (@_){
    my (undef,$dir)=fileparse($file, qr{\.[^.]*});
    push(@dirs,$dir);
  }
  return $dirs[0] if scalar(@_)==1;
  return @dirs;
}

# getLength($file,$path): length of a sequence file as printed by
# "$geneparser -l <file>".
#   $file  file name, tried as given first
#   $path  prefix tried second ("$path$file")
# Returns the command output without its trailing newline, or undef when the
# file is found in neither place, when no geneparse program is known, or
# when the command produced no result.
# "return undef" (not a bare return) is deliberate: callers use the result
# inside a hash constructor, where an empty list would shift the key/value
# pairs.
sub getLength {
  my ($file,$path)=@_;
  # Declared unconditionally: the former "my $real=$file if -f $file" let a
  # path found by an earlier call leak into later calls, so a missing file
  # could report the length of a different file.
  my $real;
  if(-f $file){
    $real=$file;
  }elsif(-f "$path$file"){
    $real="$path$file";
  }
  return undef unless defined $real;
  return undef unless $geneparser;
  # NOTE(review): the command line goes through the shell; file names with
  # spaces or shell metacharacters will not survive.
  my $length=`$geneparser -l $real`;
  return undef unless defined $length;
  chomp $length;
  return $length;
}

# getBreaks($file,$path): reads the break positions of a ".stitch" file.
#   $file  file name; anything not ending in ".stitch" gives undef at once
#   $path  prefix tried when $file cannot be opened as given
# Each line is split on tabs and its 3rd and 4th fields are kept as a
# [start, stop] pair. Returns an array ref of those pairs in file order.
# When the file cannot be opened a warning is issued and undef is returned
# (previously the return value of warn, i.e. 1, leaked out).
# "return undef" (not a bare return) is deliberate: callers use the result
# inside a hash constructor, where an empty list would shift the key/value
# pairs.
sub getBreaks {
  my ($file,$path)=@_;
  return undef unless $file=~m/\.stitch$/;
  my $fh;
  # three-arg open: the name is always treated as a plain file to read
  unless(open($fh,'<',$file) or open($fh,'<',"$path$file")){
    warn "Couldn't find file: $file";
    return undef;
  }
  my @ret;
  while(my $line=<$fh>){
    chomp $line;
    my (undef,undef,$start,$stop)=split(/\t/,$line);
    push(@ret,[$start,$stop]);
  }
  close($fh);
  return \@ret;
}

# addDivPlots($divfile,$x,$y): writes the stitch-break lines for the
# sequences with indices $x (plotted on the x axis) and $y (y axis) and
# returns the gnuplot plot elements that draw them.
# For each of the two sequences that has a non-empty 'divs' list, and whose
# opposite sequence has a known length, a data file
# "$divfile.<seq>-<opposite>" is written. Every break contributes two line
# segments, one at its start and one at its stop coordinate, running from 1
# to the opposite sequence's length along the other axis.
# Returns the plot elements joined with ", \<newline>" (gnuplot needs a
# comma between elements; they used to be concatenated without one, which
# broke the plot command when both sequences had breaks), or an empty
# string when there is nothing to draw. Arguments after $y are ignored.
sub addDivPlots {
  my ($divfile,$x,$y)=@_;
  my @cmds;
  foreach my $s ($x,$y){
    my $op=($s == $y ? $x:$y);
    my $name=$seqs[$s]->{seqName};
    my $sdivfile="$divfile.$s-$op";
    my $oplen=$seqs[$op]->{length};
    next unless $oplen;
    my $divsr=$seqs[$s]->{divs};
    next unless ref $divsr and scalar(@$divsr);
    my $onX=($s==$x);
    my @records;
    foreach my $div (@$divsr){ #for each div...
      my @lines;
      foreach my $pos ($$div[0],$$div[1]){
        # segment endpoints; the blank entry between the two segments keeps
        # gnuplot from joining them
        push(@lines,"") if @lines;
        push(@lines,
             join("\t", ($onX ? ($pos,1):(1,$pos))),
             join("\t", ($onX ? ($pos,$oplen):($oplen,$pos))));
      }
      push(@records,join("\n",@lines));
    }
    writeOut($sdivfile,join("\n\n",@records));
    push(@cmds,qq!'$sdivfile' with l lw 1 title '$name'!);
  }
  return join(", \\\n",@cmds);
}

__END__

=head1 NAME

simplegraph.pl - Provides dotplot/chaos style plot of murasaki alignments

=head1 SYNOPSIS

simplegraph.pl <murasaki alignment file> [-output=<output prefix>]

=head1 OPTIONS

=over 8

=item B<murasaki alignment file>

The main file output by murasaki.

=item --output

Prefix for output files.

=item --nostitch

Don't draw lines for stitch file breaks.

=item --signed

Leave signedness alone (otherwise absolute coordinates are drawn)

=item --keepplot

Keep the file containing the gnuplot commands (helpful if you want to run gnuplot interactively).

=item --interactive=<S>

Run gnuplot interactively for some set of sequences <S> (default all)

=item --order=<S>

Reorder input sequences on the fly (or consider only a subset).

<S> can be specified as a simple permutation like 201 (which would
mean sequence 2, then 0, then 1), or if you have more than 10 sequences you can separate
digits with any non-digit character (eg: "2,0,1").

=back

=head1 DESCRIPTION

Draws graphs of murasaki alignments.


=cut