File: chromostitch.pl

package info (click to toggle)
murasaki 1.68.6-6
links: PTS, VCS
area: main
in suites: stretch
size: 1,928 kB
ctags: 3,100
sloc: cpp: 16,010; perl: 8,365; makefile: 186
file content (164 lines) | stat: -rwxr-xr-x 4,401 bytes
parent folder | download | duplicates (5)
#!/usr/bin/perl

#Copyright (C) 2006-2008 Keio University
#(Kris Popendorf) <comp@bio.keio.ac.jp> (2006)
#
#This file is part of Murasaki.
#
#Murasaki is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#Murasaki is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with Murasaki.  If not, see <http://www.gnu.org/licenses/>.

use File::Basename;
use Getopt::Long;
use Pod::Usage;
use Roman;

BEGIN {
  unshift(@INC,(fileparse($0))[1].'perlmodules');
}
use Murasaki;

my $forceRoman;
my $noRoman;
GetOptions('help|?' => \$help, man => \$man, 'output=s' => \$outfile, 'sortall' => \$sortall,
	  'noroman'=>\$noRoman,'roman'=>\$forceRoman);
pod2usage(1) if $help or $#ARGV<1;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;

my $geneparse="$root/geneparse";
$geneparse="$root/geneparse.pl" unless -x $geneparse;
$geneparse=`geneparse.pl` unless -x $geneparse;

if(!$outfile){ #not yet specified by getopt
  $outfile=shift(@ARGV); #snag first arg
}

foreach $arg (@ARGV){
  die "File not found: $arg" if !-e $arg;
  die "File not readable: $arg" if !-R $arg;
  if(-d $arg) { #directory parsing mojo
    print "Reading directory: $arg\n";
    chop($arg) if $arg=~m!/$!; #chomp final / if there is one
    opendir(DIR,$arg);
    @files=grep {!/^\.|~$|^#.*#$|\.sml$|^README|\.stitch$|\.bin|\.p?hmask$/ and -T "$arg/$_"} readdir(DIR);
    @files=sort chrNameCmp @files;
    push(@srcfiles,map {"$arg/$_"} @files);
    closedir(DIR);
  }if(-r $arg) {
    push(@srcfiles,$arg);
  }else{
    die "Can't read file: $arg";
  }
}

$noRoman=grep {numbered($_)=~m/\d+/} @srcfiles unless $forceRoman;
@srcfiles=sort chrNameCmp @srcfiles if $sortall;

open(OUTF,">$outfile");
$pos=1;
foreach $file (@srcfiles){
  $length=genomeLength($file);
  print OUTF join("\t",$file,$length,$pos,$pos+$length)."\n";
  $pos+=$length+10;
}

sub chrNameCmp {
  my ($ca,$cb)=(map {numbered($_)} $a,$b);
  my ($na,$nb)=(chrNum($ca),chrNum($cb));
  return -1 if $na and !$nb;
  return 1 if $nb and !$na;
  return $na<=>$nb unless !($ca<=>$b);
  return $ca cmp $cb; #last resort is lexical compare
}

sub parseEnsemblName {
  my ($file)=@_;
  my ($filename,$dir)=fileparse($file);
  my ($species,$assembly,$release,$type,$chrom,$gz)=($filename=~m/([^.]+)\.(.+)\.(\d+)\.(dna(?:_rm)?)\.chromosome\.([^.]+)\.fa(\.gz)?/) or return undef;
  return {file=>$filename,
	  species=>$species,
	  assembly=>$assembly,
	  release=>$release,
	  type=>$type,
	  chrom=>$chrom,
	  compressed=>$gz};
}

sub chrNum {
  my ($chrom)=@_;
  return arabic($chrom) if !$noRoman and isroman($chrom);
  return $1 if $chrom=~m/^(\d+)/;
}

sub numbered {
  local $_=pop;
  my $ensembl=parseEnsemblName($_);
  return $ensembl->{chrom} if parseEnsemblName($_);
  return $1 if m/(\d+)(?:\.[a-zA-Z]+)+?$/;
  return $1 if m/chr(?:omosome)(?:\W)*([ivxIVX0-9]+)/;
  return undef;
}

sub genomeLength {
  $file=pop;
  print $indent."Finding length of $file ...\n";
#  print $indent."Command: $geneparse -c -l $file\n";
  open(PARSE,"-|","$geneparse -c -l $file");
  $seq=<PARSE>;
  close(PARSE);
  chomp($seq); #thou beist an int!
  return $seq; #omph
}

__END__

=head1 NAME

chromostitch.pl - Makes stitch files for feeding to geneparse.pl to allow feeding of multiple chromosomes to programs like murasaki

=head1 SYNOPSIS

chromostitch.pl [options] <stitch file> <chromosome1 chromosome2 ... >


=item Options:

=item  --sort           sorts everything

=item  --output=<file>  altnerate way of specifying an output file


=head1 PARAMETERS

=over 8

=item B<stitch file>
output file (overwrites)

=item B<chromosome1 chromosome2 ...>
List of chromosome input files

=item B--output=<file>
Alternate way of specifying an output file

=item B--sort
Always sort EVERYTHING. (Allows for nicely sorted lists from sloppy input
of filenames from like multiple directories of *.gbk or something)

=back

=head1 DESCRIPTION

Makes stitch files for feeding to geneparse.pl to allow feeding of multiple chromosomes to programs like murasaki

=cut