File: chromostitch.pl

package info (click to toggle)
murasaki 1.68.6-6
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 1,928 kB
  • ctags: 3,100
  • sloc: cpp: 16,010; perl: 8,365; makefile: 186
file content (164 lines) | stat: -rwxr-xr-x 4,401 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
#!/usr/bin/perl

#Copyright (C) 2006-2008 Keio University
#(Kris Popendorf) <comp@bio.keio.ac.jp> (2006)
#
#This file is part of Murasaki.
#
#Murasaki is free software: you can redistribute it and/or modify
#it under the terms of the GNU General Public License as published by
#the Free Software Foundation, either version 3 of the License, or
#(at your option) any later version.
#
#Murasaki is distributed in the hope that it will be useful,
#but WITHOUT ANY WARRANTY; without even the implied warranty of
#MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#GNU General Public License for more details.
#
#You should have received a copy of the GNU General Public License
#along with Murasaki.  If not, see <http://www.gnu.org/licenses/>.

use File::Basename;
use Getopt::Long;
use Pod::Usage;
use Roman;

BEGIN {
  unshift(@INC,(fileparse($0))[1].'perlmodules');
}
use Murasaki;

my $forceRoman;
my $noRoman;
GetOptions('help|?' => \$help, man => \$man, 'output=s' => \$outfile, 'sortall' => \$sortall,
	  'noroman'=>\$noRoman,'roman'=>\$forceRoman);
pod2usage(1) if $help or $#ARGV<1;
pod2usage(-exitstatus => 0, -verbose => 2) if $man;

my $geneparse="$root/geneparse";
$geneparse="$root/geneparse.pl" unless -x $geneparse;
$geneparse=`geneparse.pl` unless -x $geneparse;

if(!$outfile){ #not yet specified by getopt
  $outfile=shift(@ARGV); #snag first arg
}

foreach $arg (@ARGV){
  die "File not found: $arg" if !-e $arg;
  die "File not readable: $arg" if !-R $arg;
  if(-d $arg) { #directory parsing mojo
    print "Reading directory: $arg\n";
    chop($arg) if $arg=~m!/$!; #chomp final / if there is one
    opendir(DIR,$arg);
    @files=grep {!/^\.|~$|^#.*#$|\.sml$|^README|\.stitch$|\.bin|\.p?hmask$/ and -T "$arg/$_"} readdir(DIR);
    @files=sort chrNameCmp @files;
    push(@srcfiles,map {"$arg/$_"} @files);
    closedir(DIR);
  }if(-r $arg) {
    push(@srcfiles,$arg);
  }else{
    die "Can't read file: $arg";
  }
}

$noRoman=grep {numbered($_)=~m/\d+/} @srcfiles unless $forceRoman;
@srcfiles=sort chrNameCmp @srcfiles if $sortall;

open(OUTF,">$outfile");
$pos=1;
foreach $file (@srcfiles){
  $length=genomeLength($file);
  print OUTF join("\t",$file,$length,$pos,$pos+$length)."\n";
  $pos+=$length+10;
}

sub chrNameCmp {
  my ($ca,$cb)=(map {numbered($_)} $a,$b);
  my ($na,$nb)=(chrNum($ca),chrNum($cb));
  return -1 if $na and !$nb;
  return 1 if $nb and !$na;
  return $na<=>$nb unless !($ca<=>$b);
  return $ca cmp $cb; #last resort is lexical compare
}

sub parseEnsemblName {
  my ($file)=@_;
  my ($filename,$dir)=fileparse($file);
  my ($species,$assembly,$release,$type,$chrom,$gz)=($filename=~m/([^.]+)\.(.+)\.(\d+)\.(dna(?:_rm)?)\.chromosome\.([^.]+)\.fa(\.gz)?/) or return undef;
  return {file=>$filename,
	  species=>$species,
	  assembly=>$assembly,
	  release=>$release,
	  type=>$type,
	  chrom=>$chrom,
	  compressed=>$gz};
}

sub chrNum {
  my ($chrom)=@_;
  return arabic($chrom) if !$noRoman and isroman($chrom);
  return $1 if $chrom=~m/^(\d+)/;
}

sub numbered {
  local $_=pop;
  my $ensembl=parseEnsemblName($_);
  return $ensembl->{chrom} if parseEnsemblName($_);
  return $1 if m/(\d+)(?:\.[a-zA-Z]+)+?$/;
  return $1 if m/chr(?:omosome)(?:\W)*([ivxIVX0-9]+)/;
  return undef;
}

sub genomeLength {
  $file=pop;
  print $indent."Finding length of $file ...\n";
#  print $indent."Command: $geneparse -c -l $file\n";
  open(PARSE,"-|","$geneparse -c -l $file");
  $seq=<PARSE>;
  close(PARSE);
  chomp($seq); #thou beist an int!
  return $seq; #omph
}

__END__

=head1 NAME

chromostitch.pl - Makes stitch files for feeding to geneparse.pl to allow feeding of multiple chromosomes to programs like murasaki

=head1 SYNOPSIS

chromostitch.pl [options] <stitch file> <chromosome1 chromosome2 ... >


=item Options:

=item  --sort           sorts everything

=item  --output=<file>  altnerate way of specifying an output file


=head1 PARAMETERS

=over 8

=item B<stitch file>
output file (overwrites)

=item B<chromosome1 chromosome2 ...>
List of chromosome input files

=item B--output=<file>
Alternate way of specifying an output file

=item B--sort
Always sort EVERYTHING. (Allows for nicely sorted lists from sloppy input
of filenames from like multiple directories of *.gbk or something)

=back

=head1 DESCRIPTION

Makes stitch files for feeding to geneparse.pl to allow feeding of multiple chromosomes to programs like murasaki

=cut