1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336
|
#!/usr/bin/env perl
# $Id: build_model,v 1.7 2006/12/19 13:15:06 c4chris Exp $
################################################################################
#
# build_model
# -----------
#
# Claudio Lottaz, SIB-ISREC, Claudio.Lottaz@isb-sib.ch
# Christian Iseli, LICR ITO, Christian.Iseli@licr.org
#
# Copyright (c) 1999, 2006 Swiss Institute of Bioinformatics.
# All rights reserved.
#
################################################################################
use strict;
use Symbol;
# global variables
my $verbose = 1;
my $forcedtuplesize = undef;
my $forcedminmask = undef;
my $forcedpseudocounts = undef;
my $forcedminscore = undef;
my $forcedstartlength = undef;
my $forcedstartpreroll = undef;
my $forcedstoplength = undef;
my $forcedstoppreroll = undef;
my $datadir = '.';
my $filestem = '';
require "build_model_utils.pl";
################################################################################
#
# Check command-line for switches
#
my $usage = "Usage: build_model [options] <conf-files>\n" .
" where options are:\n" .
" -q don't log on terminal\n" .
" -t <int> force tuple size, overwrites entry in config-files\n" .
" -m <int> force minimal mask, overwrites entry in config-files\n" .
" -p <int> force pseudocounts, overwrites entry in config-files\n" .
" -s <int> force minimal score, overwrites entry in config-files\n" .
" -l <int> force length of start profile (in codons/triplets)\n" .
" -r <int> force start profile's preroll in 5'UTR (in codons/triplets)\n" .
" -L <int> force length of stop profile (in codons/triplets)\n" .
" -R <int> force stop profile's preroll in 5'UTR (in codons/triplets)\n" .
"More information is obtained using 'perldoc build_model'\n";
while ($ARGV[0] =~ m/^-/) {
if ($ARGV[0] eq '-q') { shift; $verbose = 0; next; }
if ($ARGV[0] eq '-t') { shift; $forcedtuplesize = shift; next; }
if ($ARGV[0] eq '-m') { shift; $forcedminmask = shift; next; }
if ($ARGV[0] eq '-p') { shift; $forcedpseudocounts = shift; next; }
if ($ARGV[0] eq '-s') { shift; $forcedminscore = shift; next; }
if ($ARGV[0] eq '-l') { shift; $forcedstartlength = shift; next; }
if ($ARGV[0] eq '-r') { shift; $forcedstartpreroll = shift; next; }
if ($ARGV[0] eq '-L') { shift; $forcedstoplength = shift; next; }
if ($ARGV[0] eq '-R') { shift; $forcedstoppreroll = shift; next; }
die "Unrecognized switch: $ARGV[0]\n$usage";
}
if ($#ARGV < 0) { die "No configuration file specified\n$usage"; }
################################################################################
#
# Main-loop through all specified config-files
#
my $parFile;
while($parFile = shift) {
my($organism, $hightaxo, $dbfiles, $ugdata, $estdata, $datadir2, $filestem2,
$rnafile, $estfile, $estcdsfile, $estutrfile, $trainingfile, $testfile,
$utrfile, $cdsfile, $tuplesize, $minmask, $pseudocounts, $minscore,
$startlength, $startpreroll, $stoplength, $stoppreroll, $smatfile,
$estscanparams, $nb_isochores, $isochore_borders) =
readConfig($parFile, $forcedtuplesize, $forcedminmask,
$forcedpseudocounts, $forcedminscore,
$forcedstartlength, $forcedstartpreroll,
$forcedstoplength, $forcedstoppreroll, $verbose);
log_open("readconfig.log");
$datadir = $datadir2; $filestem = $filestem2;
showConfig($parFile, $organism, $hightaxo, $dbfiles, $ugdata, $estdata, $datadir,
$rnafile, $estfile, $estcdsfile, $estutrfile, $trainingfile,
$testfile, $utrfile, $cdsfile, $tuplesize, $minmask,
$pseudocounts, $minscore, $startlength, $startpreroll,
$stoplength, $stoppreroll, $smatfile,
$estscanparams, $nb_isochores, $isochore_borders);
log_close();
my $isochores;
if ($nb_isochores == 0) {
my $i;
my @IC;
for ($i = 0; $i < $#$isochore_borders; $i++) {
$IC[$i] = $$isochore_borders[$i] . "-" . $$isochore_borders[$i + 1];
}
$isochores = \@IC;
} else {
local *FH;
open FH, "$datadir/Report/${filestem}_prepare_data.log"
or die "prepare_data log file is missing : $!";
while ( <FH> ) {
next unless s/^\s+isochores used:\s+//;
s/\s//g;
my @F = split /,/;
$isochores = \@F;
}
close FH;
}
if ($#$isochores < 0) {
die "There must be at least one isochore...";
}
# generate codon usage and transition probability tables
log_open("generate_tables.log");
if (-s "$smatfile") { log_print(" - $smatfile already exists, skipped"); }
else {
generateEmissionTables($tuplesize, $pseudocounts, $minscore,
$startlength, $startpreroll,
$stoplength, $stoppreroll,
$isochores, $smatfile, $minmask, $parFile);
}
log_close();
print "$parFile done.\n";
}
exit(0);
################################################################################
#
# Generate codon usage tables for HMM-model
#
sub generateEmissionTables {
# for earch isochore launches maketable, modifying its output in
# order to mark the isochores and appends the result to the
# smatfile.
my($tuplesize, $pseudocounts, $minscore,
$startlength, $startpreroll, $stoplength, $stoppreroll,
$isochores, $smatfile, $minmask, $parFile) = @_;
log_print("\nWriting codon usage tables...");
# makesmat counts in frames, not in codons/triplets, therefore:
my $startframes = 3 * $startlength;
my $startoffset = 3 * $startpreroll + 1;
my $stopframes = 3 * $stoplength;
my $stopoffset = 3 * $stoppreroll;
my $isodir = "$datadir/Isochores";
my $smatfh = gensym; open($smatfh, ">>$smatfile");
foreach my $iso (sort(@$isochores)) {
log_print(" - computing for isochore $iso...");
my($low, $high) = $iso =~ m/^([^\-]+)\-(.*)$/;
$iso .= "_mr$minmask.seq";
my $cmd = "makesmat -t $tuplesize -p $pseudocounts -m $minscore " .
"-f $startframes -o $startoffset -F $stopframes -O $stopoffset" .
" < $isodir/mrna$iso";
my $out = `$cmd`;
$out =~ s/<NAME>/$parFile/g;
$out =~ s/<CG>/$low $high/g;
print($smatfh $out);
}
close($smatfh);
}
################################################################################
#
# Documentation
#
=head1 NAME
build_model - create a model for ESTScan
=head1 SYNOPSIS
build_model [options] <config-files...>
=head1 DESCRIPTION
build_model generates codon usage tables for ESTScan. Codon usage is analyzed
in mRNAs containing whole coding sequences. The script reads configuration
files on the comannd line and computes codon usage tables for each of them.
Files which already exist are reused. If an existing file is to be
recomputed, it must be deleted before the script is run again. The mRNA files
can be prepared with the extract_mRNA and prepare_data scripts, or simply
provided in FASTA format in the Isochores subdirectory. In mRNA data,
build_model expects annotations of coding sequence start and stop in the
header as two integer values following the tag 'CDS:'. The first integer
points to the first nucleotide of the CDS, the second to the last. Thus the
length of the CDS is <stop> - <start> + 1. The first nucleotide in the
sequence has index 1.
=head1 DIRECTORY STRUCTURE
build_model uses the following directory structure, the root of which
is given in the configuration file. From this root it contains the
following subdirectories:
- Isochores: data split in isochores, some with redundancy masked
- Matrices: contains the generated tables
- Report: contains all log files
mRNA data, test and training data files, is deposited in the data-root
directory if not otherwise specified in the configuration file.
=head1 OPTIONS
-q quiet
Do not log on terminal.
-t <tuple-size>
Size of tuples to be considered to generate the codon usage
tables. <tuple-size> - 1 is the order of the corresponding Markov
model used by ESTScan. This switch overwrites the variable
$tuplesize from paramter files.
-m <minmask>
Minimum consecutive runs of nucleotides masked from the original
sequences in order to limit data redundancy. Pieces of sequence
are only masked, if all of their nucleotides are part of
reoccuring 12-tuples which overlap by at least 4 nucleotides. This
switch overwrites the variable $minmask from parameter files.
-p pseudocount
Pseudocounts to be added when generating codon usage tables. This
switch overwrites the variable $pseudocounts from paramter files.
-s <minscore>
Minimal score in tables and transitions, lower scores are
overwritten with this value. This switch overwrites the variable
$minscore from paramter files.
-l <start-profile-length>
The number of nucleotide triplets taken into account for the
start-profile. This switch overwrites the variable $startlength
from paramter files.
-r <start-profile-preroll>
The number of nucleotide triplets of the 5' untranslated region
contained in the start profile. This switch overwrites the
variable $startpreroll from paramter files.
-L <stop-profile-length>
The number of nucleotide triplets taken into account for the
stop-profile. This switch overwrites the variable $stoplength from
paramter files.
-R <stop-profile-preroll>
The number of nucleotide triplets of the coding sequence contained in the
stop profile. This switch overwrites the variable $stoppreroll
from paramter files.
=head1 CONFIGURATION FILE
The parameters defined in the configuration file have the
following meaning:
* $organism (mandatory)
The desired organism as it is given in EMBL "OS" or RefSeq
"ORGANISM" lines.
* $dbfiles
Files from where full-length mRNA sequences are to be extracted,
tries to guess whether the files come from EMBL or RefSeq. If
this is not specified, expects a collection of mRNA in $rnafile.
* $datadir (mandatory)
Base directory where all of the above files are located
and the temporary result files are stored
* $rnafile (default is "$datadir/mrna.seq")
Name of the file with the extracted mRNA entries.
* $smatfile (default is "$datadir/Matrices/$filestem.smat")
Name of the file where the HMM-model is to be written
* $nb_isochores (default is 0)
Number of isochores, when isochores are to be determined
automatically from the GC-content distributaion as equal-sized
groups. 0 means no automatic detection.
* @isochore_borders (default is (0, 43, 47, 51, 100)
Array of GC percentages where isochores are split, first entry is
usually 0 and last 100. This is overwritten when $nb_isochores is
not zero.
* $tuplesize (default is 6)
Size of tuples counted for codon statistics. This is overwritten by
the -t switch.
* $minmask (default is 30)
Minimal run of consecutive nucleotides masked as redundant. This is
overwritten by the -m switch.
* $pseudocounts (default is 1)
pseudocount to be added when generating the codon usage tables,
overwritten by -m
* $minscore (default is -100)
minimum score attributed to log-odds and log-probabilities,
overwritten by -s.
* $startlength, $startpreroll (default 2+ceil(tuplesize/3) and 2)
number of nucleotide triplets contained in the start profile and
how many of these are contained in the 5' untranslated region,
overwritten by -l (length) and -r (preroll)
* $stoplength, $stoppreroll (default 2+ceil(tuplesize/3) and 2)
number of nucleotide triplets contained in the stop profile and
how many of these are contained in the coding sequence,
overwritten by -L (length) and -R (preroll)
* $estscanparams (default is "-m -50 -d -50 -i -50 -N 0")
parameters passed to ESTScan during evaluation
$filestem is used to generate many filenames. It is generated
automatically according to the tuplesize, the minmask and the
pseudocounts applied to generate them.
=head1 AUTHOR
Claudio Lottaz, SIB-ISREC, Claudio.Lottaz@isb-sib.ch
=cut
#
# End of file
#
################################################################################
|