1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164
|
#!/usr/bin/perl
#Copyright (C) 2006-2008 Keio University
#(Kris Popendorf) <comp@bio.keio.ac.jp> (2006)
#
#This file is part of Murasaki.
#
#Murasaki is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#Murasaki is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with Murasaki. If not, see <http://www.gnu.org/licenses/>.
use File::Basename;
use Getopt::Long;
use Pod::Usage;
use Roman;
BEGIN {
unshift(@INC,(fileparse($0))[1].'perlmodules');
}
use Murasaki;
my $forceRoman;
my $noRoman;
GetOptions('help|?' => \$help, man => \$man, 'output=s' => \$outfile, 'sortall' => \$sortall,
'noroman'=>\$noRoman,'roman'=>\$forceRoman);
pod2usage(1) if $help or $#ARGV<1;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;
my $geneparse="$root/geneparse";
$geneparse="$root/geneparse.pl" unless -x $geneparse;
$geneparse=`geneparse.pl` unless -x $geneparse;
if(!$outfile){ #not yet specified by getopt
$outfile=shift(@ARGV); #snag first arg
}
foreach $arg (@ARGV){
die "File not found: $arg" if !-e $arg;
die "File not readable: $arg" if !-R $arg;
if(-d $arg) { #directory parsing mojo
print "Reading directory: $arg\n";
chop($arg) if $arg=~m!/$!; #chomp final / if there is one
opendir(DIR,$arg);
@files=grep {!/^\.|~$|^#.*#$|\.sml$|^README|\.stitch$|\.bin|\.p?hmask$/ and -T "$arg/$_"} readdir(DIR);
@files=sort chrNameCmp @files;
push(@srcfiles,map {"$arg/$_"} @files);
closedir(DIR);
}if(-r $arg) {
push(@srcfiles,$arg);
}else{
die "Can't read file: $arg";
}
}
$noRoman=grep {numbered($_)=~m/\d+/} @srcfiles unless $forceRoman;
@srcfiles=sort chrNameCmp @srcfiles if $sortall;
open(OUTF,">$outfile");
$pos=1;
foreach $file (@srcfiles){
$length=genomeLength($file);
print OUTF join("\t",$file,$length,$pos,$pos+$length)."\n";
$pos+=$length+10;
}
sub chrNameCmp {
my ($ca,$cb)=(map {numbered($_)} $a,$b);
my ($na,$nb)=(chrNum($ca),chrNum($cb));
return -1 if $na and !$nb;
return 1 if $nb and !$na;
return $na<=>$nb unless !($ca<=>$b);
return $ca cmp $cb; #last resort is lexical compare
}
sub parseEnsemblName {
my ($file)=@_;
my ($filename,$dir)=fileparse($file);
my ($species,$assembly,$release,$type,$chrom,$gz)=($filename=~m/([^.]+)\.(.+)\.(\d+)\.(dna(?:_rm)?)\.chromosome\.([^.]+)\.fa(\.gz)?/) or return undef;
return {file=>$filename,
species=>$species,
assembly=>$assembly,
release=>$release,
type=>$type,
chrom=>$chrom,
compressed=>$gz};
}
sub chrNum {
my ($chrom)=@_;
return arabic($chrom) if !$noRoman and isroman($chrom);
return $1 if $chrom=~m/^(\d+)/;
}
sub numbered {
local $_=pop;
my $ensembl=parseEnsemblName($_);
return $ensembl->{chrom} if parseEnsemblName($_);
return $1 if m/(\d+)(?:\.[a-zA-Z]+)+?$/;
return $1 if m/chr(?:omosome)(?:\W)*([ivxIVX0-9]+)/;
return undef;
}
sub genomeLength {
$file=pop;
print $indent."Finding length of $file ...\n";
# print $indent."Command: $geneparse -c -l $file\n";
open(PARSE,"-|","$geneparse -c -l $file");
$seq=<PARSE>;
close(PARSE);
chomp($seq); #thou beist an int!
return $seq; #omph
}
__END__
=head1 NAME
chromostitch.pl - Makes stitch files for feeding to geneparse.pl to allow feeding of multiple chromosomes to programs like murasaki
=head1 SYNOPSIS
chromostitch.pl [options] <stitch file> <chromosome1 chromosome2 ... >
=item Options:
=item --sort sorts everything
=item --output=<file> altnerate way of specifying an output file
=head1 PARAMETERS
=over 8
=item B<stitch file>
output file (overwrites)
=item B<chromosome1 chromosome2 ...>
List of chromosome input files
=item B--output=<file>
Alternate way of specifying an output file
=item B--sort
Always sort EVERYTHING. (Allows for nicely sorted lists from sloppy input
of filenames from like multiple directories of *.gbk or something)
=back
=head1 DESCRIPTION
Makes stitch files for feeding to geneparse.pl to allow feeding of multiple chromosomes to programs like murasaki
=cut
|