1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296
|
#!/usr/bin/env perl
# $Id: extract_mRNA,v 1.4 2007/01/25 14:25:13 c4chris Exp $
################################################################################
#
# extract_mRNA
# ------------
#
# Claudio Lottaz, SIB-ISREC, Claudio.Lottaz@isb-sib.ch
# Christian Iseli, LICR ITO, Christian.Iseli@licr.org
#
# Copyright (c) 2006 Swiss Institute of Bioinformatics. All rights reserved.
#
################################################################################
use strict;
use EMBLFile;
use GBFile;
use Symbol;
# global variables
my $verbose = 1;
my $datadir = '.';
my $filestem = '';
require "build_model_utils.pl";
################################################################################
#
# Check command-line for switches
#
my $usage = "Usage: extract_mRNA [options] <conf-files>\n" .
" where options are:\n" .
" -q don't log on terminal\n" .
"More information is obtained using 'perldoc extract_mRNA'\n";
while ($ARGV[0] =~ m/^-/) {
if ($ARGV[0] eq '-q') { shift; $verbose = 0; next; }
die "Unrecognized switch: $ARGV[0]\n$usage";
}
if ($#ARGV < 0) { die "No configuration file specified\n$usage"; }
################################################################################
#
# Main-loop through all specified config-files
#
my $parFile;
while($parFile = shift) {
my($organism, $hightaxo, $dbfiles, $ugdata, $estdata, $datadir2, $filestem2,
$rnafile, $estfile, $estcdsfile, $estutrfile, $trainingfile, $testfile,
$utrfile, $cdsfile, $tuplesize, $minmask, $pseudocounts, $minscore,
$startlength, $startpreroll, $stoplength, $stoppreroll, $smatfile,
$estscanparams, $nb_isochores, $isochore_borders) =
readConfig($parFile, undef, undef, undef,
undef, undef, undef,
undef, undef, $verbose);
log_open("readconfig.log");
$datadir = $datadir2; $filestem = $filestem2;
showConfig($parFile, $organism, $hightaxo, $dbfiles, $ugdata, $estdata, $datadir,
$rnafile, $estfile, $estcdsfile, $estutrfile, $trainingfile,
$testfile, $utrfile, $cdsfile, $tuplesize, $minmask,
$pseudocounts, $minscore, $startlength, $startpreroll,
$stoplength, $stoppreroll, $smatfile, $estscanparams,
$nb_isochores, $isochore_borders);
log_close();
# extract data
log_open("extract_data.log");
extract_mRNAs($dbfiles, $organism, $hightaxo, $rnafile);
log_close();
print "$parFile done.\n";
}
exit(0);
################################################################################
#
# Extract mRNA entries
#
sub extract_mRNAs {
# Find all full-length messenger RNA entries of the given species
# ($organism) in the given databse ($dbfiles) and writes them in
# FASTA format with CDS annotation in the header ($rnafile). Tries
# to guess whether RefSeq or EMBL data is provided.
my($dbfiles, $organism, $hightaxo, $rnafile) = @_;
log_print("\nExtracting mRNA entries...");
if (-s $rnafile) {
log_print(" - $rnafile already exists, skipped");
return;
}
my($file, $fileSeqs, $fileLen);
my $totSeqs = 0;
my $totLen = 0;
my $outfh = gensym;
open($outfh, ">$rnafile");
my(@infiles) = glob($dbfiles);
foreach $file (@infiles) {
$fileSeqs = 0;
$fileLen = 0;
$_ = `head -1 $file`;
if (m/^LOCUS/) { # we read a file in genbank format
my $src = GBFile->new("$file");
$src->openStream;
log_print(" - processing RefSeq-file $file...");
my $e;
while(defined ($e = $src->getNext)) {
my $ac = $ {$e->{_GBac}}[0];
next if $e->{_GBtype} ne "mRNA";
next unless $e->{_GBcdsst} =~ /^\d+$/;
if ($hightaxo eq "") {
if (($e->{_GBscn} eq $organism)) {
$fileSeqs++;
$fileLen += $e->{_GBcdsen} - $e->{_GBcdsst} + 1;
print $outfh &genRNAEntry($ac, $e->{_GBdesc},
$e->{_GBcdsst}, $e->{_GBcdsen},
$e->{_seq});
}
} else {
my $o = index($e->{_GBtaxo}, $hightaxo);
if ($o >= 0) {
$fileSeqs++;
$fileLen += $e->{_GBcdsen} - $e->{_GBcdsst} + 1;
print $outfh &genRNAEntry($ac, $e->{_GBdesc},
$e->{_GBcdsst}, $e->{_GBcdsen},
$e->{_seq});
}
}
}
close($src->{_BTFfile});
} else { # we read a file in EMBL format
my $src = EMBLFile->new("$file");
$src->openStream;
log_print(" - processing EMBL-file $file...");
my $e;
while(defined ($e = $src->getNext)) {
if ($hightaxo ne "") {
my $o = index($e->{_EMBLtaxo}, $hightaxo);
if (($o < 0)
|| ($e->{_EMBLtype} ne "mRNA")) {
next; # no mRNA entry
}
} elsif (($e->{_EMBLscn}[0] ne $organism)
|| ($e->{_EMBLtype} ne "mRNA")) {
next; # no mRNA entry
}
my($begin, $end);
foreach(@{$e->{_EMBLft}}) {
if (($begin, $end) = $_ =~ /^CDS\s+(<?\d+)\.\.>?(\d+)$/) {
last;
}
}
if (!defined($begin)) {
next; # no CDS found
}
if ((m/>/) && (m/</)) {
next; # frame is undefined
}
# all checks passed
$fileSeqs++;
$fileLen += $end - $begin + 1;
print $outfh &genRNAEntry($ {$e->{_EMBLac}}[0],
$e->{_EMBLdesc},
$begin, $end, $e->{_seq});
}
close($src->{_BTFfile});
}
$totSeqs += $fileSeqs;
$totLen += $fileLen;
log_print(" found $fileSeqs sequences, $fileLen coding nucleotides");
}
close($outfh);
log_print(" - overall found $totSeqs sequences, "
. "$totLen coding nucleotides");
}
sub genRNAEntry {
my($id, $desc, $cdsBegin, $cdsEnd, $seq) = @_;
$seq =~ s/(.{80})/$1\n/g;
return ">tem|$id CDS: $cdsBegin $cdsEnd $desc\n$seq\n";
}
################################################################################
#
# Documentation
#
=head1 NAME
extract_mRNA - extract mRNA data to build models for ESTScan
=head1 SYNOPSIS
extract_mRNA [options] <config-files...>
=head1 DESCRIPTION
mRNA data is extracted from files in EMBL or RefSeq format. The script reads
configuration files on the command line and performs the extraction.
=head1 DIRECTORY STRUCTURE
build_model uses the directory structure which is given in the configuration
file.
mRNA data, test and training data files, is deposited in the data-root
directory if not otherwise specified in the configuration file.
=head1 OPTIONS
-q quiet
Do not log on terminal.
=head1 CONFIGURATION FILE
The parameters defined in the configuration file have the
following meaning:
* $organism (mandatory)
The desired organism as it is given in EMBL "OS" or RefSeq
"ORGANISM" lines.
* $dbfiles
Files from where full-length mRNA sequences are to be extracted,
tries to guess whether the files come from EMBL or RefSeq. If
this is not specified, expects a collection of mRNA in $rnafile.
* $datadir (mandatory)
Base directory where all of the above files are located
and the temporary result files are stored
* $rnafile (default is "$datadir/mrna.seq")
Name of the file with the extracted mRNA entries.
* $smatfile (default is "$datadir/Matrices/$filestem.smat")
Name of the file where the HMM-model is to be written
* $nb_isochores (default is 0)
Number of isochores, when isochores are to be determined
automatically from the GC-content distributaion as equal-sized
groups. 0 means no automatic detection.
* @isochore_borders (default is (0, 43, 47, 51, 100)
Array of GC percentages where isochores are split, first entry is
usually 0 and last 100. This is overwritten when $nb_isochores is
not zero.
* $tuplesize (default is 6)
Size of tuples counted for codon statistics. This is overwritten by
the -t switch.
* $minmask (default is 30)
Minimal run of consecutive nucleotides masked as redundant. This is
overwritten by the -m switch.
* $pseudocounts (default is 1)
pseudocount to be added when generating the codon usage tables,
overwritten by -m
* $minscore (default is -100)
minimum score attributed to log-odds and log-probabilities,
overwritten by -s.
* $startlength, $startpreroll (default 2+ceil(tuplesize/3) and 2)
number of nucleotide triplets contained in the start profile and
how many of these are contained in the 5' untranslated region,
overwritten by -l (length) and -r (preroll)
* $stoplength, $stoppreroll (default 2+ceil(tuplesize/3) and 2)
number of nucleotide triplets contained in the stop profile and
how many of these are contained in the coding sequence,
overwritten by -L (length) and -R (preroll)
* $estscanparams (default is "-m -50 -d -50 -i -50 -N 0")
parameters passed to ESTScan during evaluation
$filestem is used to generate many filenames. It is generated
automatically according to the tuplesize, the minmask and the
pseudocounts applied to generate them.
=head1 AUTHOR
Claudio Lottaz, SIB-ISREC, Claudio.Lottaz@isb-sib.ch
=cut
#
# End of file
#
################################################################################
|