File: autoAug.pl

package info (click to toggle)
augustus 3.5.0%2Bdfsg-5
links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 777,052 kB
sloc: cpp: 80,066; perl: 21,491; python: 4,368; ansic: 1,244; makefile: 1,141; sh: 171; javascript: 32
file content (1191 lines) | stat: -rwxr-xr-x 54,522 bytes
parent folder | download | duplicates (3)
#!/usr/bin/perl
#

##########################################################################################################################
#                                                                                                                        #
# autoAug.pl                                                                                                             #
# train and run AUGUSTUS automatically                                                                                   #
#                                                                                                                        #
# usage:                                                                                                                 #
# autoAug.pl [OPTIONS] -g=genome.fa -t=trainingfile -s=speciesname -c=cdnafile                                           #
#                                                                                                                        #
##########################################################################################################################

use warnings;
use Getopt::Long;
use Cwd;

use File::Spec::Functions qw(rel2abs);
use File::Basename qw(dirname basename);

BEGIN {
    $0=rel2abs($0);
    our $directory = dirname($0);
} 
use lib $directory;
use helpMod qw(find checkFile check_fasta_headers relToAbs uptodate);
use Term::ANSIColor qw(:constants);
use DBI;
use strict;

my $scriptPath=dirname($0);           # path of the directory in which this script is located

my $genome;                           # name of sequence file
my $genome_clean;		      # genome file that has been cleaned of DOS whitespaces and linebreaks
my $trainingset;                      # name of training set file
my $species;                          # species name
my $hints;                            # hints file name
my $havehints=0;                      # have hints at all or not?
my $estali;                           # cDNA alignment file (PSL) used for UTR training
my $positionWD=cwd();                 # working superdirectory, defaults to where the program is called from
my $pasa='';                          # switch it on to create training set, est set, hints file with PASA
my $pasapolyAhints;                   # use PASA Poly A hints as hints for the prediction
my $fasta_cdna;                       # fasta file for PASA
my $verbose=2;                        # verbose level
my $webaugustus=0;                    # run in WebAUGUSTUS - adapt error messages to webservice or standalone
my $singleCPU=0;                      # run everything sequentially without interruption
my $cpus=1;                           # number of CPUs to use (default: 1)
my $maxIntronLen = 100000;            # maximal length of an intron, used by PASA and BLAT
my $noninteractive;                   # parameter for autoAugPred.pl
my $cname="fe";                       # parameter for autoAugPred.pl:cluster name
my $optrounds=1;                      # optimization rounds
my $useGMAPforPASA=0;                 # use GMAP instead of BLAT (only for PASA)
my $useexisting=0;                    # start with and change existing config, parameter and result files
my $utr=1;                            # default value: with "utr" if cDNA exists
my $flanking_DNA='';                  # length of flanking DNA, default value is min{ave. gene length, 10000}
my $help=0;                           # print usage
my $useexistingopt = "";              # set to "--useexisting" when option --useexisting is given
my $autoAugDir_abinitio;              # set to $rootDir/autoAugPred_abinitio below
my $autoAugDir_hints;                 # set to $rootDir/autoAugPred_hints below
my $autoAugDir_hints_utr;             # set to $rootDir/autoAugPred_hints_utr below
my $autoAugDir_utr;                   # set to $rootDir/autoAugPred_utr below
my $autoAugDir;
my $shells_path;
my $trainDir;                          # directory for creating the training set
my $index=0;                          # step index (option --index), default 0
my $shellDir;
my $aug;
my $perlCmdString;                    # to store perl commands
my $cmdString;                        # to store shell commands
my $string;          		      # temp string for perl scripts that are called from this script



# usage message: printed when the script is called without arguments or with --help,
# and appended to some error messages

my $usage =  <<_EOH_;

Function: train AUGUSTUS and run AUGUSTUS completely and automatically

Usage:

autoAug.pl [OPTIONS] --species=sname --genome=genome.fa --cdna=cdna.fa --trainingset=genesfile
autoAug.pl [OPTIONS] --species=sname --genome=genome.fa --cdna=cdna.fa --pasa --useGMAPforPASA
autoAug.pl [OPTIONS] --species=sname --genome=genome.fa --trainingset=genesfile [--estali=cdna.psl] [--hints=hints.gff]

--genome=fasta                      fasta file with DNA sequences for training
--trainingset=genesfile             genesfile contains training genes in Genbank, GFF or protein FASTA format
--species=sname                     species name as used by AUGUSTUS
--hints=hints.gff                   hints for gene predictions with AUGUSTUS
--estali=cdna.psl                   cDNA alignments in PSL format (as generated by BLAT and GMAP) are used to construct UTRs
--pasa                              use PASA to construct a training set
--cdna=cdna.fa                      a fasta file with cDNA sequences (ESTs, mRNA)
--pasapolyAhints                    use PASA polyA hints as hints for the prediction

options:
--useexisting                       use and change the present config and parameter files if they exist for 'species'
--verbose                           print more status info. Cumulative option, e.g. use -v -v -v to make this script very verbose
--webaugustus                       run in WebAUGUSTUS - adapt error messages to this webservice
--noutr                             do not train and predict UTRs.
--workingdir=/path/to/wd/           In the working directory results and temporary files are stored.
                                    Default: current working directory
--singleCPU                         run the complete program sequentially instead of parallel execution of jobs on a cluster
--cpus=n                            n is the number of CPUs to use (default: 1), if cpus > 1 install pblat (parallelized blat) for better performance
--noninteractive                    bypass all manual interaction when using a SGE cluster
--cname=yourClusterName             cluster name, only use it when "noninteractive" default:fe
--index=i                           step index, default:0 
--optrounds=n                       optimization rounds - each meta parameter is optimized this often (default 1)
--maxIntronLen=n                    maximal length of an intron as used by PASA and BLAT, not by AUGUSTUS (default 100000)
--useGMAPforPASA                    use GMAP instead of BLAT in the PASA run
--help                              print this usage information
_EOH_
    
    ;

# Called without any argument: show the usage message and exit successfully.
if (@ARGV==0) {print "$usage\n"; exit(0);}

# Parse the command line into the file-level option variables.
# GetOptions returns false when it met an unknown or malformed option (it has already
# printed a warning for each of them). Stop in that case instead of starting a long
# pipeline run with settings the user did not intend.
GetOptions( 'genome=s'        => \$genome,
            'trainingset=s'   => \$trainingset,
            'species=s'       => \$species,
            'hints=s'         => \$hints,
            'estali=s'        => \$estali,
            'workingdir=s'    => \$positionWD,
            'pasa!'           => \$pasa,
            'singleCPU!'      => \$singleCPU,
            'cpus=i'          => \$cpus,
            'cdna=s'          => \$fasta_cdna,
            'verbose+'        => \$verbose,
            'webaugustus!'    => \$webaugustus,
            'noninteractive'  => \$noninteractive,
            'cname=s'         => \$cname,
            'index=i'         => \$index,
            'optrounds=i'     => \$optrounds,
            'maxIntronLen=i'  => \$maxIntronLen,
            'useGMAPforPASA!' => \$useGMAPforPASA,
            'useexisting!'    => \$useexisting,
            'utr!'            => \$utr,
            'help!'           => \$help,
            'pasapolyAhints!' => \$pasapolyAhints
        ) or die("Error: Invalid command line arguments!\n$usage");

# --help: show the usage message and exit successfully.
if ($help) {print "$usage\n"; exit(0);}

             ############ make some regular checks ##############


# Check upfront whether any common problems will occur later, so the user doesn't have to wait a long time to
# find out that some programs are not installed.
check_upfront();


# directory structure:
# $positionWD = cwd  -  $rootDir = autoAug  -   = trainingSet
#                                           -   = autoAugTrain
#                                           -   = autoAugPred_abinitio
#                                           -   = autoAugPred_hints
#                                           -   = autoAugPred_hints_utr
#                                           -   = results

# Overwrite $positionWD with its absolute path BEFORE any other path is derived from it.
# $rootDir and the autoAugPred_* directories are built from $positionWD and are used after
# the working directory has been changed, so they must not be relative paths.
$positionWD=relToAbs($positionWD);

# show error information and stop the program if the specified $positionWD couldn't be found
die("Error: Did not find the directory $positionWD! Please specify a valid one for \"workingdir\"! \n") unless (-d $positionWD);

# check the write permission of $positionWD before building of the work directory
die("Do not have write permission for $positionWD.\nPlease use command 'chmod' to reset permission " . 
    "or specify another working directory\n") if (! -w $positionWD);

# An existing autoAug directory is only reused on explicit request.
my $rootDir="$positionWD/autoAug";
die ("$rootDir already exists. Reuse with --useexisting or use another directory with --workingdir=dir")
    if (!$useexisting && -d $rootDir);

# Make sure the augustus binary can be executed (its version output is shown to the user).
$cmdString = "augustus --version 2>&1";
system("$cmdString")==0 or die("Augustus is not installed - failed to execute: $cmdString!\n");
print "\n";

if (! -d $rootDir) {
    mkdir "$rootDir" or die ("Could not create directory $rootDir\n");
}
$autoAugDir_abinitio = "$rootDir/autoAugPred_abinitio";
$autoAugDir_hints = "$rootDir/autoAugPred_hints";
$autoAugDir_hints_utr = "$rootDir/autoAugPred_hints_utr";
$autoAugDir_utr = "$rootDir/autoAugPred_utr";

my $AUGUSTUS_CONFIG_PATH=$ENV{'AUGUSTUS_CONFIG_PATH'};

# show error information and stop the program if $species not specified
die("Error: Need to specify the species!\n$usage") unless($species);

# check species directory: refuse to overwrite existing parameters unless --useexisting is given
# (only checked in the first step or in a non-interactive run)
die("$AUGUSTUS_CONFIG_PATH/species/$species already exists. Choose another species name or delete this directory to start from scratch.\n") 
    if (-d "$AUGUSTUS_CONFIG_PATH/species/$species" && !$useexisting && ($noninteractive || $index==0));

# check genome file
$genome = checkFile($genome, "fasta", $usage);
check_fasta_headers($genome);

$useexistingopt = "--useexisting" if ($useexisting);


# Build the string of "-v" switches that corresponds to the verbosity level:
# 0 -> none, 1 -> one, 2 -> two, more than 2 -> three switches.
my $verboseString;
if ($verbose>2) {
    $verboseString='-v -v -v';
} elsif ($verbose==2) {
    $verboseString='-v -v';
} elsif ($verbose==1) {
    $verboseString="-v";
} elsif ($verbose==0) {
    $verboseString='';
}
#print "First column: verbosity level x, only print this line if $verbose >= x \n\n\n";

# Hints are available as soon as a hints file, a cDNA file or a cDNA alignment was given.
$havehints = !(!defined($hints) && !defined($fasta_cdna) && !defined($estali));

# Check the cDNA file and its fasta headers.
if (defined($fasta_cdna)) {
    $fasta_cdna = checkFile($fasta_cdna, "fasta", $usage);
    if (defined($fasta_cdna)) {
        print "1 Checking fasta headers in file $fasta_cdna...\n" if ($verbose>=1);
        check_fasta_headers($fasta_cdna);
    }
}

# The training set is only checked in the first step.
if ($index==0 && defined($trainingset)) {
    $trainingset = checkFile($trainingset,"training genes", $usage);
}
if (defined($hints)) {
    $hints = checkFile($hints,"hints", $usage);
}
if (defined($estali)) {
    $estali = checkFile($estali,"EST alignment", $usage);
}

# Create the directory layout in the first step.
if ($index==0) {
    training_set_dirs();
}

# Clean genome file from DOS whitespaces/linebreaks.
# The name of the cleaned file is assigned BEFORE the up-to-date test, so that the test
# looks at the real output file and $genome_clean is also defined when cleaning is skipped.
$genome_clean = "$rootDir/seq/genome_clean.fa";
if (!uptodate([$genome], [$genome_clean])){
    print "3 Cleaning genome file from DOS whitespaces/linebreaks...\n" if ($verbose>2);
    $string = find("cleanDOSfasta.pl");
    $perlCmdString = "perl $string $genome > $genome_clean";
    print "3 $perlCmdString\n" if ($verbose>2);
    system("$perlCmdString")==0 or die ("failed to execute: $perlCmdString!\n");
    unless(-e $genome_clean){die("Clean genome file $genome_clean does not exist!\n");}
}

# With --pasa the training set is constructed from the cDNA in the first step,
# unless an up-to-date training.gb already exists.
if($pasa && $index==0){
    $trainingset = "$trainDir/training/training.gb";
    if (!uptodate([$genome_clean,$fasta_cdna], [$trainingset])){
        construct_training_set();
    } else {
        print ("1 Skipping training set construction with PASA. Using existing file $trainingset.\n") if ($verbose>=1);
    }
}

# In the first step, when hints or cDNA alignments were not both given, prepare the genome
# and create alignments and hints from the cDNA file.
# NOTE(review): $hints and $estali are overwritten here even if one of them was given on the
# command line or no cDNA file exists - confirm that this is intended.
if($index==0 && (!defined($hints) || !defined($estali))){
    if (!uptodate(["$rootDir/seq/genome_clean.fa"],["$rootDir/seq/genome.summary", "$rootDir/seq/contigs.gff"])){
        prepare_genome();
    }
    
    if (defined($fasta_cdna) &&
        !uptodate([$fasta_cdna, "$rootDir/seq/genome_clean.fa"],
                  ["$rootDir/cdna/cdna.psl", "$rootDir/cdna/cdna.f.psl", "$rootDir/hints/hints.E.gff"])){
        alignments_and_hints();
    } else {
        print "1 Using existing cDNA alignments and hints.\n" if ($verbose>=1);
    }
    $hints = "$rootDir/hints/hints.E.gff";
    $estali = "$rootDir/cdna/cdna.f.psl";
}

# Training without UTR: in the first step, or always in a non-interactive run.
if ($noninteractive || $index==0) {
    autoTrain_no_utr();
}

if ($noninteractive) {
    # Non-interactive run: all predictions are made in one go.
    autoAug_noninteractive("","");              # without hints
    if ($havehints) {
        autoAug_noninteractive("1","");         # with hints
    }
    if ($utr && defined($hints)) {
        autoTrain_with_utr();
        autoAug_noninteractive("1","1");
    }
}
else {
    # Stepwise run: $index selects the step; with --singleCPU the index is
    # advanced after each step so that all steps are run in this call.
    if ($index==0) {
        autoAug_prepareScripts("","");
    }
    if ($singleCPU) {
        $index++;
    }

    if ($index==1) {
        autoAug_continue("","");                # without hints
        if ($havehints) {
            autoAug_prepareScripts("1","");     # with hints
        }
        if ($singleCPU) {
            $index++;
        }
    }

    if ($index==2) {
        if ($havehints) {
            autoAug_continue("1","");           # with hints
        }
        if ($utr && ($havehints)) {
            autoTrain_with_utr();
            autoAug_prepareScripts("1","1");
        }
        if ($singleCPU) {
            $index++;
        }
    }

    if ($index==3 && $utr && ($havehints)) {
        autoAug_continue("1","1");
    }
}

# Collect the results at the end of a non-interactive run or in step 3.
if ($noninteractive || $index==3) {
    collect();
}

                           ############### sub functions ##############


        ##################### construct training set with pasa ######################

sub training_set_dirs {
    # Build the directory layout for training set construction (e.g. PASA) and link the genome.
    #
    # Takes no arguments. Reads the file-level variables $rootDir, $genome and $verbose and
    # sets $trainDir to "$rootDir/trainingSet". Below $rootDir the directories trainingSet,
    # seq, hints and cdna are created, below $trainDir the directories gbrowse, pasa and
    # training. Finally "$rootDir/seq/genome.fa" is made a symbolic link to $genome.
    # On return the current working directory is "$rootDir/seq".
    # Dies if a directory cannot be created or entered or if the link cannot be created.
   
    chdir $rootDir or die ("Could not change to directory $rootDir.\n");
    $trainDir="$rootDir/trainingSet";
    if (! -d $trainDir){
        print "3 mkdir $trainDir\n" if ($verbose>=3);
        mkdir "$trainDir" or die("\nError: Could not create directory $trainDir under " 
                                 . cwd() .".\n");
    }
    for my $dir ("seq", "hints", "cdna"){
        next if (-d $dir);
        print "3 mkdir $rootDir/$dir\n" if ($verbose>=3);
        mkdir $dir or die("\nError: Could not create directory $dir.\n");
    }

    # build subdirectory structure

    chdir "$trainDir" or die("\nError: Could not cd to directory $trainDir.\n");

    for my $dir ("gbrowse", "pasa", "training"){
        next if (-d $dir);       # only report and create directories that are really missing
        print "3 mkdir $trainDir/$dir\n" if ($verbose>=3);
        mkdir $dir or die("\nError: Could not create directory $trainDir/$dir.\n");
    }
 
    print "2 All necessary directories have been created under $trainDir.\n" if ($verbose>=2);

    # build symbolic link for $genome

    print "3 cd $rootDir/seq\n" if ($verbose>=3);
    chdir "$rootDir/seq" or die("\nError: Could not cd to directory $rootDir/seq.\n");
    if (!uptodate([$genome], ["genome.fa"])){
        print "3 ln -fs $genome genome.fa\n" if($verbose>=3);
        # -f replaces an outdated link from an earlier run (as for the other links made by this
        # script); the list form of system passes $genome to ln without shell interpretation.
        system("ln", "-fs", $genome, "genome.fa")==0 or die("\nfailed to execute ln -fs $genome genome.fa\n");
    }
}

sub DropDataBase {
  # Drop the MySQL database $database on host $hostname if it exists there.
  #
  # Arguments (positional):
  #   hostname - MySQL server to connect to
  #   database - name of the database to drop
  #   user     - MySQL user name
  #   pass     - password of that user
  # Callers pass a reference to a handle variable as fifth argument. It is accepted for
  # compatibility but not used: the connection is opened and closed inside this function.
  # Reads the file-level variable $verbose. Dies on connection or SQL errors (RaiseError).
  my ($hostname, $database, $user, $pass) = @_;

  my $dsn  = "DBI:mysql:database=mysql;host=$hostname";

  # Constructor and Connection
  my $dbh = DBI->connect( $dsn, $user, $pass, { 'RaiseError' => 1, 'AutoCommit' => 1 } ) or die $DBI::errstr;

  # the first (and only) column of the result holds the database names
  my $databases = $dbh->selectcol_arrayref("SHOW DATABASES");
  my $dbexists = grep { $_ eq $database } @{$databases};
  
  if ($dbexists) {
      print "1 Dropping MySQL database $database on host $hostname.\n" if ($verbose>=1);
      # An identifier cannot be bound as placeholder, so it is quoted as identifier instead
      # of being pasted into the statement as is.
      $dbh->do("DROP DATABASE " . $dbh->quote_identifier($database));
  } else {
      # message of level 3 - only shown at verbosity level 3 like all other "3 ..." messages
      print "3 MySQL database $database does not exist on $hostname.\n" if ($verbose>=3);
  }
  $dbh->disconnect();
}

sub construct_training_set{

    print "\n\n1 ####### Step 0 at ".(scalar localtime()).": Creating training set with genes using PASA #######\n\n" if ($verbose>=1);

    my $PASAHOME=$ENV{'PASAHOME'};
    die("Error: The environment variable PASAHOME is undefined.\n") unless $PASAHOME;
    
    # run seqclean 

    print "3 cd $trainDir/pasa\n" if ($verbose>=3);
    chdir "$trainDir/pasa" or die ("Cannot change directory to $trainDir/pasa\n");

    if (!uptodate([$fasta_cdna], ["transcripts.fasta"])){
	print "3 ln -fs $fasta_cdna transcripts.fasta\n" if ($verbose>=3);
	system("ln -fs $fasta_cdna transcripts.fasta")==0 or die ("failed to execute: ln -fs $fasta_cdna transcripts.fasta\n");
    }

    if (!uptodate(["transcripts.fasta"], ["transcripts.fasta.clean"])){
	count_fasta_entries("$trainDir/pasa/transcripts.fasta");
	$perlCmdString="seqclean transcripts.fasta 1>seqclean.stdout 2>seqclean.stderr";
	print "2 Running \"$perlCmdString\" ".(scalar localtime())." ..." if ($verbose>=2);
	system("$perlCmdString")==0 or die ("failed to execute: $perlCmdString\n");
	print " Finished! ".(scalar localtime())."\n" if ($verbose>=2);
    } else {
	print ("2 Skipping seqclean. Using existing transcripts.fasta.clean.\n") if ($verbose>=2);
    }

    # set appropriate values in file "alignAssembly.config"
    my $pasaDBname = "PASA$species";
    $pasaDBname =~ s/\./_/g; # replace "." by "_" in species name for MySQL database because it is not allowed there

    if (!uptodate(["$PASAHOME/pasa_conf/pasa.alignAssembly.Template.txt"], ["alignAssembly.config"])){
	$cmdString = "cp $PASAHOME/pasa_conf/pasa.alignAssembly.Template.txt alignAssembly.config";
	print "3 $cmdString\n" if ($verbose>=3);
	system("$cmdString")==0 or die ("failed to execute: $cmdString\n");
	
	print "3 Setting appropriate values in alignAssembly.config\n" if ($verbose>=3);
	open(CONFIG, "alignAssembly.config") or die ("Cannot open file alignAssembly.config!\n");
	open(TEMP, ">temp") or die("\nCannot open file temp\n");
	while(<CONFIG>){
	    s/MYSQLDB=<__MYSQLDB__>/MYSQLDB=$pasaDBname/;    # the database name used in old PASA versions
	    s/DATABASE=<__DATABASE__>/DATABASE=$pasaDBname/; # the database name used in newer PASA versions
	    s/^DATABASE=(.*)$/DATABASE=$1\nMYSQLDB=$1/;      # the renaming wasn't carried out in all PASA scripts, so provide both versions
	    s/<__MAX_INTRON_LENGTH__>/$maxIntronLen/;
	    s/<__MIN_PERCENT_ALIGNED__>/0.8/;
	    s/<__MIN_AVG_PER_ID__>/0.9/;
	    print TEMP;
	}
	close(CONFIG);
	close(TEMP);
  
	$cmdString="rm alignAssembly.config; mv temp alignAssembly.config; chmod a+x alignAssembly.config";
	print "3 $cmdString\n" if ($verbose>=3);
	system("$cmdString")==0 or die ("failed to execute: $cmdString\n");
	print "3 Adjusted alignAssembly.config\n" if ($verbose>=3); 
    } else {
	print ("2 Using existing alignAssembly.config.\n") if ($verbose>=3);
    }

    # executing the Alignment Assembly

    if (!uptodate([$genome_clean, "alignAssembly.config", "transcripts.fasta", "transcripts.fasta.clean"],
		  ["$pasaDBname.assemblies.fasta.transdecoder.genome.gff3", "pasa_asmbls_to_training_set.stdout"])){
	$cmdString="ln -fs $genome_clean genome.fasta";
	print "3 $cmdString\n" if ($verbose>=3);
	system("$cmdString")==0 or die("\nfailed to execute $cmdString\n");
	
	my $gmapoption = "blat";
	$gmapoption = "gmap" if ($useGMAPforPASA);
	
	print "3 Reading MySQL variables from $PASAHOME/pasa_conf/\n" if ($verbose>=3);
	open(my $config_fh, "<", "$PASAHOME/pasa_conf/conf.txt") or die("\nCould not open $PASAHOME/pasa_conf/conf.txt!\n");
	my $MYSQLSERVER;
	my $MYSQL_RO_USER;
	my $MYSQL_RO_PASSWORD;
	my $MYSQL_RW_USER;
	my $MYSQL_RW_PASSWORD;
	while(my $line = <$config_fh>){
	    next if ($line =~ /^\s*#/); # discard comments
	    $MYSQLSERVER=$1       if ($line =~ /MYSQLSERVER=(.*)/);
	    $MYSQL_RO_USER=$1     if ($line =~ /MYSQL_RO_USER=(.*)/);
	    $MYSQL_RO_PASSWORD=$1 if ($line =~ /MYSQL_RO_PASSWORD=(.*)/);
	    $MYSQL_RW_USER=$1     if ($line =~ /MYSQL_RW_USER=(.*)/);
	    $MYSQL_RW_PASSWORD=$1 if ($line =~ /MYSQL_RW_PASSWORD=(.*)/);
	    
	}
	close($config_fh);
	print "0 WARNING: MYSQL_RO_PASSWORD is empty!\n" if (! $MYSQL_RO_PASSWORD);
	
	my $dbh;
	if ($useexisting) {
	    &DropDataBase("$MYSQLSERVER","$pasaDBname","$MYSQL_RW_USER","$MYSQL_RW_PASSWORD",\$dbh);
        }

	if (! -e "$PASAHOME/Launch_PASA_pipeline.pl"){
	    die("Error: Script Launch_PASA_pipeline.pl not found. Ensure that this script exists in PASAHOME folder: $PASAHOME.\n");
	}
	$perlCmdString = "perl $PASAHOME/Launch_PASA_pipeline.pl "
	    ."-c alignAssembly.config -C -R -g $genome_clean "
	    ."-t transcripts.fasta.clean -T -u transcripts.fasta --ALIGNERS $gmapoption --CPU $cpus "
	    ."1>Launch_PASA_pipeline.stdout 2>Launch_PASA_pipeline.stderr";
	
	my $abortString;
	$abortString = "\nFailed to execute, possible reasons could be:\n";
	$abortString.= "1. Fasta headers in cDNA or genome file were not unique";
	$abortString.= " (the sequence name up to the first space).\n";
	$abortString.= "2. Fasta headers in cDNA file were too long";
	$abortString.= " (max 90 characters)(the sequence name up to the first space).\n";
	$abortString.= "3. Fasta headers in cDNA file contains square brackets, commas or other non-letter or non-number characters.";
	$abortString.= " (in sequence name up to the first space).\n";
	if (!$webaugustus) {
		$abortString.= "4. There is already a database named \"$pasaDBname\" on your mysql host.\n";
		$abortString.= "5. The software \"slclust\" is not installed correctly, try to install it";
		$abortString.= " again (see the details in the PASA documentation).\n";
		$abortString.= "Inspect $trainDir/pasa/Launch_PASA_pipeline.stderr for PASA error messages.\n";	
	}
	
	print "2 Executing the Alignment Assembly: \"$perlCmdString\" ".(scalar localtime())." ..." if ($verbose>=2);
	system("$perlCmdString")==0 or die ("$abortString");
	print " Finished ".(scalar localtime())."\n" if ($verbose>=2);
  	

    $perlCmdString="perl $PASAHOME/scripts/pasa_asmbls_to_training_set.dbi "
        ."--pasa_transcripts_fasta $pasaDBname.assemblies.fasta "
        ."--pasa_transcripts_gff3 $pasaDBname.pasa_assemblies.gff3 "
        ."1>pasa_asmbls_to_training_set.stdout 2>pasa_asmbls_to_training_set.stderr";
    
    print "2 Running \"$perlCmdString\" ".(scalar localtime())." ..." if ($verbose>=2);
    
    if (system("$perlCmdString") != 0) {
        # check if it is an error like here: https://github.com/TransDecoder/TransDecoder/issues/71 and try to circumvent it
        if (! -e "pasa_asmbls_to_training_set.stderr") { # check if error file exists
            print "\n2 file pasa_asmbls_to_training_set.stderr doesn't exists.\n" if ($verbose>=2);
            die (" failed to execute: $perlCmdString\n");
        }
        open CHK_ARRAY, "pasa_asmbls_to_training_set.stderr"; # check if a TransDecoder.Predict error occured
        my @chk_array = <CHK_ARRAY>;
        close CHK_ARRAY;
        if (grep(/^Error.*TransDecoder\.Predict.*/,@chk_array) eq 0) {
            print "\n2 This is not a TransDecoder.Predict Error\n" if ($verbose>=2);
            die (" failed to execute: $perlCmdString\n");
        }
        print "\n2 failed to execute: $perlCmdString\n" if ($verbose>=2);
        print "2 Try pasa asmbls to training set without refinement - see https://github.com/TransDecoder/TransDecoder/issues/71\n" if ($verbose>=2);
        if (! -e "$PASAHOME/scripts/pasa_asmbls_to_training_set_no_refine_starts.dbi") {
            my $sedCmdString = "sed 's#\\(.*TransDecoder\\.Predict.*\\)#    \$transdecoder_params \\.= \" --no_refine_starts \";\\n\\1#g' $PASAHOME/scripts/pasa_asmbls_to_training_set.dbi > $PASAHOME/scripts/pasa_asmbls_to_training_set_no_refine_starts.dbi";
            print "2 Create asmbl script without refinement: $sedCmdString\n" if ($verbose>=2);
            system($sedCmdString);
            if (! -e "$PASAHOME/scripts/pasa_asmbls_to_training_set_no_refine_starts.dbi") {
                print "2 Could not create script \"$PASAHOME/scripts/pasa_asmbls_to_training_set_no_refine_starts.dbi\"\n" if ($verbose>=2);
                die (" failed to execute: $perlCmdString\n");
            }
        }
        
        $perlCmdString="perl $PASAHOME/scripts/pasa_asmbls_to_training_set_no_refine_starts.dbi "
            ."--pasa_transcripts_fasta $pasaDBname.assemblies.fasta "
            ."--pasa_transcripts_gff3 $pasaDBname.pasa_assemblies.gff3 "
            ."1>pasa_asmbls_to_training_set_no_refine_starts.stdout 2>pasa_asmbls_to_training_set_no_refine_starts.stderr";
        
        print "2 Running \"$perlCmdString\" ".(scalar localtime())." ..." if ($verbose>=2);
        system("$perlCmdString")==0 or die ("failed to execute: $perlCmdString\n");
    }
    print " Finished ".(scalar localtime())."\n" if ($verbose>=2);
	
	print ("2 Cleaning up after PASA ...\n") if ($verbose>=2);
	my @filesToDelete=("output.assembly_building.out" ,
			   "output.tophits" ,                        
			   "output.tophits.btab" ,
			   "blat_validations" ,                        
			   "BLAT_DIR" ,
			   "output.alignment_assemblies.out" ,                        
			   "output.subclusters.out");
	foreach my $file (@filesToDelete) {         
	    $perlCmdString="rm -rf $file";
	    print "3 Deleting $file\n" if ($verbose>=3);
	    system("$perlCmdString");
	}
    #dropping pasa database 
    &DropDataBase("$MYSQLSERVER","$pasaDBname","$MYSQL_RW_USER","$MYSQL_RW_PASSWORD",\$dbh);
    } else {
 	print ("2 Skipping PASA training set creation. Using existing $pasaDBname.assemblies.fasta.transdecoder.genome.gff3.\n") if ($verbose>=2);
    }
    
    # find complete genes in candidate training file
    if (!uptodate(["$pasaDBname.assemblies.fasta.transdecoder.genome.gff3"], ["trainingSetComplete.gff"])){
	print "3 cd ../training\n" if ($verbose>=3);
	chdir "../training" or die ("Could not change directory to training!\n");
	$cmdString = "grep complete ../pasa/$pasaDBname.assemblies.fasta.transdecoder.cds | perl -pe ".'\'s/>(\S+).*/$1\$/\' | perl -pe \'s#\\.#\\\\.#g\' 1> pasa.complete.lst';
	# lines in file pasa.complete.lst are later used as regex in grep - so all metacharachters have to be escaped (currently only for done for dots as PASA uses no other metacharachters)
	print "3 $cmdString\n" if ($verbose>=3);
	system("$cmdString")==0 or die("\nfailed to execute $cmdString\n");
	if (! -e "pasa.complete.lst" || -z "pasa.complete.lst"){
            die ("PASA has not constructed any complete training gene. Training aborted because of insufficient data.\n");
        }

	# $cmdString="grep -f pasa.complete.lst ../pasa/$pasaDBname.assemblies.fasta.transdecoder.genome.gff3 >trainingSetComplete.temp.gff";
	# replaced by this much faster code:
	$cmdString="split -l 100 pasa.complete.lst pasa.complete.lst.split. ;"
            ."for FILE in pasa.complete.lst.split.* ; do grep -f \"\$FILE\" ../pasa/$pasaDBname.assemblies.fasta.transdecoder.genome.gff3 >> trainingSetComplete.temp.gff; done ; "
            ."rm -f pasa.complete.lst.split.*";
	print "2 Running \"$cmdString\" ".(scalar localtime())." ..." if ($verbose>=2);
	system("$cmdString")==0 or die("\nfailed to execute $cmdString\n");
	print " Finished! ".(scalar localtime())."\n" if ($verbose>=2);
	
	# sort trainingSetComplete.temp.gff for gff2gbSmallDNA.pl later
	
	$cmdString='cat trainingSetComplete.temp.gff | perl -pe \'s/\t\S*(asmbl_\d+).*/\t$1/\' | sort '
	    .'-n -k 4 | sort -s -k 9 | sort -s -k 1,1 > trainingSetComplete.gff';
    
	print "2 Running \"$cmdString\" ".(scalar localtime())." ..." if ($verbose >=2);
	system("$cmdString")==0 or die("\nfailed to execute $cmdString\n");
	print " Finished! ".(scalar localtime())."\n" if ($verbose >=2);
    }

    # calculate the average gene length
    my $file_fh;
    open($file_fh, "<", "../pasa/$pasaDBname.assemblies.fasta.transdecoder.genome.gff3") or die("\nCould not open ../pasa/$pasaDBname.assemblies.fasta.transdecoder.genome.gff3\n");
    my $sum=0;
    my $n=0;
    while(my $line = <$file_fh>){
        if($line =~ /\tgene\t/){
            my @fields = split(/\t/, $line);
            my $len=$fields[4]-$fields[3]+1;
            $sum+=$len;
            $n++;
        }
    }
    close($file_fh);
    print "1 Average gene length in the training set is " . sprintf ("%.2f", ($sum/$n)) . "\n" if ($verbose >=1);
    
    # set flanking DNA

    $flanking_DNA = int($sum/$n);
    $flanking_DNA = 10000 if ($flanking_DNA > 10000);
    $flanking_DNA = 1000 if ($flanking_DNA < 1000);
    print "2 The length of flanking DNA is set as $flanking_DNA accordingly.\n" if ($verbose>=2);

    # convert file format from gff to gb

    $string=find("gff2gbSmallDNA.pl");
    print "3 Found script $string.\n" if ($verbose>=3);
    
    $perlCmdString="perl $string trainingSetComplete.gff $genome_clean $flanking_DNA "
    ."trainingSetComplete.gb 1>gff2gbSmallDNA.stdout 2>gff2gbSmallDNA.stderr";
    
    print "3 $perlCmdString\n" if ($verbose>=3);
    system("$perlCmdString")==0 or die ("failed to execute: $perlCmdString\n");
    
    # let etraining find prolematic genbank entries
    
    # count the number of entries in trainingSetComplete.gb
   
    my $num_TSC=`grep -c ^LOCUS trainingSetComplete.gb`;
    $num_TSC*=1;
    print "1 The training set trainingSetComplete.gb contains $num_TSC entries\n" if ($verbose>=1);
    
    # set "stopCodonExcludedFromCDS" to true
  
    print "2 Now trying to find out whether the CDS in the training set contain or exclude the stop codon.\n" if ($verbose >=2);
    my $genericPath="$AUGUSTUS_CONFIG_PATH/species/generic";
    my $genericPathTrain="$AUGUSTUS_CONFIG_PATH/species/${species}_generic";
    $cmdString = "cp -r $genericPath $genericPathTrain";
    print "3 $cmdString\n" if ($verbose>=3);
    system("$cmdString")==0 or die ("failed to execute: $cmdString\n");

    chdir "$genericPathTrain" or die ("Could not change directory to $genericPathTrain\n");
    print "3 cd $genericPathTrain\n" if ($verbose>=3);

    $cmdString='cat generic_parameters.cfg | perl -pe \'s/(stopCodonExcludedFromCDS ).*/$1true /\' > '."${species}_generic_parameters.cfg";
    print "3 $cmdString\n" if ($verbose>=3);
    system("$cmdString")==0 or die ("failed to execute: $cmdString\n");
    
    print "3 Set value of \"stopCodonExcludedFromCDS\" in ${species}_generic_parameters.cfg to \"true\"\n" if ($verbose>=3);
    
    # first try with etraining
  #  print "3 mv $trainDir/pasa/trainingSetComplete.gb $trainDir/training/trainingSetComplete.gb\n";   
  #  $cmdString="mv $trainDir/pasa/trainingSetComplete.gb $trainDir/training/trainingSetComplete.gb";
   # system("$cmdString")==0 or die("\nfailed to move trainingSetComplete.gb to $trainDir/training\n");
    print "3 cd $trainDir/training\n" if ($verbose>=3);
    chdir "$trainDir/training" or die ("Could not change directory to $trainDir/training\n");
    $cmdString="etraining --species=${species}_generic trainingSetComplete.gb 1>train.out 2>train.err";
    print "3 Running \"$cmdString\" ".(scalar localtime())." ... " if ($verbose>=3);
    system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
    print " Finished! ".(scalar localtime())."\n" if ($verbose>=3); 
    print "3 train.out and train.err have been made under $trainDir/training.\n" if ($verbose>=3);
    
    # set "stopCodonExcludedFromCDS" to false and run etraining again if necessary
    my $err_stopCodonExcludedFromCDS=`grep -c "exon doesn't end in stop codon" train.err`;
    my $err_rate=$err_stopCodonExcludedFromCDS/$num_TSC;
    print "3 Error rate caused by \"exon doesn't end in stop codon\" is $err_rate\n" if ($verbose>=3);
    if($err_rate>=0.5){
	print "3 The appropriate value for \"stopCodonExcludedFromCDS\" seems to be \"false\".\n" if ($verbose>=3);
        chdir "$genericPathTrain" or die ("Can not chdir to $genericPathTrain.\n");
        system('cat generic_parameters.cfg | perl -pe \'s/(stopCodonExcludedFromCDS ).*/$1false /\' > '."${species}_generic_parameters.cfg")==0 or die ("failed to execute: $!\n");
        print "3 Set value of \"stopCodonExcludedFromCDS\" in ${species}_generic_parameters.cfg to \"false\"\n" if ($verbose>=3);
        print "3 Try etraining again: \"etraining --species=${species}_generic training.gb.train >train.out \" ..." if ($verbose>=3);
        chdir "$trainDir/training/" or die ("Can not change directory to $trainDir/training.");
        $cmdString="etraining --species=${species}_generic trainingSetComplete.gb 1>train.out 2>train.err";
        print "3 Running \"$cmdString\" ".(scalar localtime())."... " if ($verbose>=3);
        system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
        print " Finished! ".(scalar localtime())."\n" if ($verbose>=3);
        print "3 train.out and train.err have been made again under $trainDir/training.\n" if ($verbose>=3);
	print "2 Stop codons seem to be contained by CDS. Setting stopCodonExcludedFromCDS to false\n" if ($verbose>=2);
    }
    else{
	print "2 Stop codons seem to be exluded from CDS. Setting stopCodonExcludedFromCDS to true\n" if ($verbose>=2); 
    }
		
    $cmdString = "rm -rf $genericPathTrain";
    print "3 $cmdString\n" if ($verbose>=3);
    system("$cmdString")==0 or die ("failed to execute: $cmdString\n");

    print "1 Now filtering problematic genes from training set...\n" if ($verbose>=1);

    # extract badlist
    $perlCmdString='cat train.err | perl -ne \'print "$1\n" if /in sequence (\S+):/\' > badlist';
    print "3 Running \"$perlCmdString\" ...\n" if ($verbose>=3);
    system("$perlCmdString")==0 or die ("failed to execute: $perlCmdString\n");

    # check whether only a small fraction of all entries created a problem, if >10%, output a warning
    my $bad_num=`wc -l < badlist`;
    $bad_num*=1;
    print "3 The number of all entries that created a problem is $bad_num\n" if ($verbose>=3);
    my $frac=$bad_num/$num_TSC;
    if($frac>=0.5){
	print "3 WARNING: The fraction of all entries that created a problem is ".($bad_num/$num_TSC)."\n" if ($verbose>=3);
    }

    # create file training.gb without erroneous genes
    $string=find("filterGenes.pl");
    print "3 Found script $string.\n" if ($verbose>=3);
    $perlCmdString="perl $string badlist trainingSetComplete.gb > training.gb";
    print "3 Running \"$perlCmdString\" ".(scalar localtime())." ..." if ($verbose>=3);
    system("$perlCmdString")==0 or die("\nfailed to execute: $perlCmdString!\n");
    print " Finished! ".(scalar localtime())."\n" if ($verbose>=3);

    print "\n1 ####### Finished step 0 at ".(scalar localtime()).": All files are stored in $trainDir #######\n\n" if ($verbose>=1);
}


sub prepare_genome{
    # Summarizes the cleaned genome (seq/genome.summary) and derives a contig
    # track for gbrowse (seq/contigs.gff) from that summary.
    # Reads the file-level variables $rootDir and $verbose and overwrites
    # $perlCmdString and $cmdString. Dies if a step fails.
    my $seqDir = "$rootDir/seq";
    print "3 cd $seqDir\n" if ($verbose>=3);
    unless (chdir $seqDir) {
        die ("Could not change directory to ../seq\n");
    }

    # step 1: base composition summary of the genome
    my $summaryScript = find("summarizeACGTcontent.pl");
    $perlCmdString = "perl $summaryScript $seqDir/genome_clean.fa > genome.summary";
    if ($verbose>=3) {
        print "3 Running \"$perlCmdString\" ".(scalar localtime())." ...";
    }
    if (system("$perlCmdString") != 0) {
        die("\nfailed to execute: $perlCmdString!\n");
    }
    if ($verbose>=3) {
        print " Finished! ".(scalar localtime())."\n";
    }

    # step 2: turn every "<n> bases. <name> BASE..." summary line into one GFF contig line
    $cmdString = 'cat genome.summary | grep "bases." | perl -pe \'s/(\d+)\sbases.\s+(\S*) BASE.*/$2\tassembly\tcontig\t1\t$1\t.\t.\t.\tContig $2/\' > contigs.gff';
    if ($verbose>=3) {
        print "3 Running \"$cmdString\" ".(scalar localtime())." ...";
    }
    if (system("$cmdString") != 0) {
        die("\nfailed to execute: $cmdString!\n");
    }
    if ($verbose>=3) {
        print " Finished! ".(scalar localtime())."\n";
    }
}

sub alignments_and_hints{
    # Aligns the cDNA to the genome and converts the alignments into hints.
    #
    # Steps:
    #   1. in $rootDir/cdna: link $fasta_cdna as cdna.fa and align it to
    #      ../seq/genome_clean.fa with pblat (if $cpus > 1 and pblat is available)
    #      or blat -> cdna.psl
    #   2. filter the alignments with pslCDnaFilter -> cdna.f.psl; if the filter
    #      cannot be run, the unfiltered alignments are linked instead
    #   3. create cdna.gbrowse with blat2gbrowse.pl
    #   4. in ../hints: create hints.E.gff with blat2hints.pl
    #   5. if $pasapolyAhints is set: convert pasa/output.polyAsites.fasta into
    #      hints and append them to hints.E.gff
    #
    # Finally changes back to $positionWD and sets $estali to the filtered
    # alignment file. Overwrites the file-level $cmdString, $perlCmdString and
    # $string. Dies if a required step fails.
    # NOTE(review): uptodate() comes from helpMod; presumably it tells whether the
    # output files are newer than the input files - confirm there.

    # BLAT cdna files. find blat, pslCDnaFilter minId(???)
    print "3 cd $rootDir/cdna\n" if ($verbose>=3);
    chdir "$rootDir/cdna" or die ("Could not change directory to $rootDir/cdna\n");
    if (!uptodate([$fasta_cdna], ["cdna.fa"])){
        system("ln -fs $fasta_cdna cdna.fa")==0 or die ("failed to execute: ln -fs $fasta_cdna cdna.fa");
    }
    # blat
    # maxIntron=5000 to be determined
    if (!uptodate(["../seq/genome_clean.fa", "cdna.fa"], ["cdna.psl"])){
        print "1 Aligning cDNA to genome with BLAT...\n" if ($verbose>=1);
        if ($cpus > 1 && check_command_exists("pblat")) {
            $cmdString="pblat -threads=$cpus";
        }
        else {
            $cmdString="blat";
        }
        $cmdString.=" -noHead  -minIdentity=80 -maxIntron=$maxIntronLen ../seq/genome_clean.fa cdna.fa cdna.psl 1>blat.stdout 2>blat.stderr";
        print "3 Running \"$cmdString\" ".(scalar localtime())." ..." if ($verbose>=3);

        my $abortString = "\nProgram aborted. BLAT threw an error message.\nPossibly \"BLAT\" is not installed or not in your PATH or your genome or cDNA file contained non-unique fasta headers.\n";

        system("$cmdString")==0 or die("$abortString");
        print "Finished! ".(scalar localtime())."\n" if ($verbose>=3);

        # echo the BLAT output with the verbosity prefix
        if($verbose>=2){
            open(my $blatOut, "<", "blat.stdout") or die ("Cannot open blat.stdout!\n");
            while(defined (my $line=<$blatOut>)){
                print '2'." $line";
            }
            close($blatOut);
        }
    } else {
        print "1 Reusing existing BLAT alignment.\n" if ($verbose>=1);
    }

    # pslCDnaFilter
    $cmdString="pslCDnaFilter -minId=0.9 -localNearBest=0.005 -ignoreNs -bestOverlap "
        ."cdna.psl cdna.f.psl 1>pslCDnaFilter.stdout 2>pslCDnaFilter.stderr";
    print "3 $cmdString\n" if ($verbose>=3);

    if (system("$cmdString") != 0) {
        print "WARNING: Could not successfully find and run pslCDnaFilter. Please install this program.\n";
        print "Will continue anyways with unfiltered alignments. Expect worse results.\n";
        # -f replaces a stale link or a partially written cdna.f.psl; without the
        # link the following steps cannot work, so a failure here is fatal
        $cmdString="ln -sf cdna.psl cdna.f.psl";
        print "3 $cmdString\n" if ($verbose>=3);
        system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
    }
    # create gbrowse files
    $string=find("blat2gbrowse.pl");
    print "3 Found script $string.\n" if ($verbose>=3);
    $perlCmdString="perl $string --source=CDNA cdna.f.psl cdna.gbrowse";
    # level-3 message: use >= like all other level-3 messages
    print "3 Running \"$perlCmdString\" ".(scalar localtime())." ..." if ($verbose>=3);
    system("$perlCmdString")==0 or die("\nFailed to execute: $perlCmdString!\n");
    print " Finished! ".(scalar localtime())."\n" if ($verbose>=3);

    # create hints
    print "1 Creating hints from cDNA alignments ...\n" if ($verbose>=1);
    chdir "../hints" or die("\nCould not change directory to ../hints\n");
    $string=find("blat2hints.pl");
    $perlCmdString="perl $string --in=../cdna/cdna.f.psl --out=hints.E.gff --minintronlen=35 --trunkSS 1>blat2hints.stdout 2>blat2hints.stderr";
    print "2 Running \"$perlCmdString\" ".(scalar localtime())." ..." if ($verbose>=2);
    system("$perlCmdString")==0 or die("\nfailed to execute: $perlCmdString!\n");
    print " Finished! ".(scalar localtime())."\n" if ($verbose>=2);

    if ($pasapolyAhints) {
        chdir "../trainingSet" or die ("\nCould not change directory to ../trainingSet\n");
        my $pasapolyAfile=checkFile("pasa/output.polyAsites.fasta");
        if (defined $pasapolyAfile) {
            print "2 Converting $pasapolyAfile into a hintfile\n" if ($verbose>=2);
            $string=find("pasapolyA2hints.pl");
            $perlCmdString="perl $string $pasapolyAfile > pasa/output.polyAsites.gff";
            print "2 Running \"$perlCmdString\" ".(scalar localtime())." ..." if ($verbose>=2);
            system("$perlCmdString")==0 or die("\nfailed to execute: $perlCmdString!\n");
            print " Finished! ".(scalar localtime())."\n" if ($verbose>=2);
            my $pasapolyAhintfile=checkFile("pasa/output.polyAsites.gff");
            if (defined $pasapolyAhintfile) {
                print "2 Appending PASA-polyA-hint file to the cDNA hint file\n" if ($verbose>=2);
                # write to a temporary file first, cat cannot append a file to itself
                $perlCmdString="cat ../hints/hints.E.gff $pasapolyAhintfile > ../hints/hints.E.gff.temp";
                print "3 Running $perlCmdString.\n" if ($verbose>=3);
                system("$perlCmdString")==0 or die("\nfailed to execute: $perlCmdString!\n");
                rename("../hints/hints.E.gff.temp","../hints/hints.E.gff")
                    or die("\nCould not rename ../hints/hints.E.gff.temp to ../hints/hints.E.gff: $!\n");
            }
        }
    }

    chdir $positionWD or die("\nCould not change directory to $positionWD\n");
    $estali="$rootDir/cdna/cdna.f.psl";
}


         ####################### train AUGUSTUS without UTR #########################


sub autoTrain_no_utr{
    # Step 1 of the pipeline: runs autoAugTrain.pl (without UTR models) on the
    # training set. Replaces the file-level $trainingset by the result of
    # checkFile() and overwrites $perlCmdString. Dies if autoAugTrain.pl fails.

    if ($verbose>=1) {
        print "\n1 ####### Step 1 at ".(scalar localtime()).": Training AUGUSTUS (no UTR models) #######\n";
    }

    $trainingset = checkFile($trainingset, "training", $usage);

    # assemble the autoAugTrain.pl call from its parts
    $perlCmdString = join(" ",
                          "perl $scriptPath/autoAugTrain.pl",
                          "--cpus=$cpus",
                          "-t=$trainingset",
                          "-s=$species",
                          $useexistingopt,
                          "-g=$genome_clean",
                          "-w=$rootDir",
                          $verboseString,
                          "--opt=$optrounds");
    if ($verbose>=2) {
        print "\n2 $perlCmdString\n";
    }
    if (system("$perlCmdString") != 0) {
        die ("failed to execute: $perlCmdString\n");
    }

    if ($verbose>=1) {
        print "\n1 ####### Finished step 1 at ".(scalar localtime()).": All files are stored in $rootDir/autoAugTrain #######\n";
    }
}


         ###################### prepare scripts for AUGUSTUS ######################


sub autoAug_prepareScripts{
    # Calls autoAugPred.pl to prepare the AUGUSTUS job scripts for one prediction
    # mode and, unless $singleCPU is set, prints the autoAug.pl command with which
    # the user continues after the jobs have finished.
    #
    # Parameters:
    #   $hints_switch - true: prediction with hints
    #   $utr_switch   - true: prediction with UTR models
    #
    # Sets the file-level $autoAugDir to the directory of the chosen mode and
    # overwrites $perlCmdString. Dies if autoAugPred.pl fails.
    # Note: step numbers exist only for the combinations (no hints, no UTR),
    # (hints, no UTR) and (hints, UTR).

    my $hints_switch=shift;         # for AUGUSTUS with hints
    my $utr_switch=shift;           # for AUGUSTUS with UTR

    if($verbose>=1){
        my $string="Preparing scripts for AUGUSTUS";
        print "\n\n1 ";
        print "####### Step 2 at ".(scalar localtime()).": $string without hints and UTR #######"      if (!$hints_switch && !$utr_switch);
        print "####### Step 4 at ".(scalar localtime()).": $string with hints, without UTR #######"    if ( $hints_switch && !$utr_switch);
        print "####### Step 7 at ".(scalar localtime()).": $string with hints and UTR #######"         if ( $hints_switch &&  $utr_switch);
        print "\n";
    }

    $autoAugDir = $autoAugDir_abinitio  if (!$hints_switch && !$utr_switch);
    $autoAugDir = $autoAugDir_hints     if ($hints_switch && !$utr_switch);
    $autoAugDir = $autoAugDir_hints_utr if ($hints_switch && $utr_switch);
    $autoAugDir = $autoAugDir_utr       if (!$hints_switch && $utr_switch);

    my $hintsString = "";
    my $utrString  = "";
    $hintsString = "--hints=$hints" if ($hints_switch);
    $utrString   = "--utr"          if ($utr_switch);


    $perlCmdString = "perl $scriptPath/autoAugPred.pl -g=$genome_clean --species=$species -w=$rootDir $utrString " .
        "$verboseString $hintsString $useexistingopt";
    $perlCmdString .= " --singleCPU" if ($singleCPU);
    $perlCmdString .= " --cpus=$cpus";
    print "2 $perlCmdString\n" if ($verbose>=2);
    system("$perlCmdString")==0 or die("\nfailed to execute $perlCmdString\n");

    my $stepNum;
    $stepNum=2 if (!$hints_switch && !$utr_switch);
    $stepNum=4 if ( $hints_switch && !$utr_switch);
    $stepNum=7 if ( $hints_switch &&  $utr_switch);
    print "\n1 ####### Finished step $stepNum at ".(scalar localtime()).": The scripts are stored in $autoAugDir/shells #######\n" if ($verbose>=1);

    # Both option strings start out empty, so the prompt below never interpolates
    # an undefined value.
    my $estString = "";
    $estString = "--estali=your.cdna.psl" if ($index==1 && !defined($estali));
    $estString = "--estali=$estali"           if ($index==1 && $pasa);

    # "my $x = ... if (...)" has undefined behaviour in Perl, hence the ternary
    my $pasaString = $pasa ? "--pasa" : "";

    # show prompt

    my $sum=$index+1;
    if (!$singleCPU) {
        print "\n\nWhen above jobs are finished, continue by running the command\n";
        print "autoAug.pl --species=$species --genome=$genome_clean --useexisting "
            . "--hints=$hints $estString $verboseString $pasaString --index=$sum\n\n";
    }
}


       ########################### deal with results of AUGUSTUS ############################


sub autoAug_continue{
    # Continues a prediction run after the AUGUSTUS jobs have finished by calling
    # autoAugPred.pl with --continue.
    #
    # Parameters:
    #   $hints_switch - true: prediction with hints
    #   $utr_switch   - true: prediction with UTR models
    #
    # Sets the file-level $shellDir (and $estali if $pasa is set, $aug if
    # $index==2) and overwrites $perlCmdString. Dies if autoAugPred.pl fails.
    my ($hints_switch, $utr_switch) = @_;

    if ($verbose>=1) {
        my $string = "Continue to predict genome structure with AUGUSTUS";
        print "\n1 ";
        if (!$utr_switch) {
            if ($hints_switch) {
                print "####### Step 5 at ".(scalar localtime()).": $string with hints, no UTR #######";
            }
            else {
                print "####### Step 3 at ".(scalar localtime()).": $string without hints, no UTR #######";
            }
        }
        elsif ($hints_switch) {
            print "####### Step 8 at ".(scalar localtime()).": $string with hints, containing UTR #######";
        }
        print "\n";
    }

    my $hintsString = $hints_switch ? "--hints=$hints" : "";
    my $utrString   = $utr_switch   ? " --utr"         : "";

    if ($pasa) {
        $estali = "$rootDir/cdna/cdna.f.psl";
    }

    # the directory of the current run depends on the pipeline index
    my $mainDir;
    if ($index==1) {
        $mainDir = "$autoAugDir_abinitio";
    }
    elsif ($index==2) {
        $mainDir = "$autoAugDir_hints";
    }
    elsif ($index==3) {
        $mainDir = "$autoAugDir_utr";
    }

    $shellDir = "$mainDir/shells";

    $perlCmdString = "perl $scriptPath/autoAugPred.pl --species=$species --genome=$rootDir/seq/genome_clean.fa --continue --workingdir=$rootDir $verboseString $hintsString $utrString $useexistingopt";
    if ($singleCPU) {
        $perlCmdString .= " --singleCPU";
    }
    $perlCmdString .= " --cpus=$cpus";
    my $abortString = "\nError executing\n$perlCmdString\n";
    if ($verbose >= 3) {
        print "3 $perlCmdString\n";
    }
    chdir $positionWD;
    if (system("$perlCmdString") != 0) {
        die ("$abortString");
    }

    if ($index==2) {
        $aug = "$shellDir/../predictions/augustus.gff";
    }

    # step numbers exist for three of the four switch combinations only
    my $stepNum;
    if ($hints_switch) {
        $stepNum = $utr_switch ? 8 : 5;
    }
    elsif (!$utr_switch) {
        $stepNum = 3;
    }

    if ($verbose>=1) {
        print "\n1 ####### Finished step $stepNum at ".(scalar localtime()).": All files are stored in $mainDir #######\n";
    }

}






       ################## run AUGUSTUS completely automatically ##################


sub autoAug_noninteractive{
    # Runs a complete prediction with autoAugPred.pl --noninteractive.
    #
    # Parameters:
    #   $hints_switch - true: prediction with hints
    #   $utr_switch   - true: prediction with UTR models
    #
    # Overwrites the file-level $perlCmdString. Dies if autoAugPred.pl fails.
    my ($hints_switch, $utr_switch) = @_;

    my $hintsString = $hints_switch ? "--hints=$hints" : "";
    my $utrString   = $utr_switch   ? "--utr"          : "";

    # description of the prediction mode used in the messages below
    my $string;
    if (!$utr_switch) {
        $string = $hints_switch ? "with hints" : "ab initio (without hints and utr)";
    }
    elsif ($hints_switch) {
        $string = "with hints and utr";
    }

    if ($verbose>=1) {
        print "\n\n1 ####### Now predicting genes $string in the whole sequence...#######\n";
    }
    $perlCmdString = "perl $scriptPath/autoAugPred.pl -g=$genome_clean --species=$species $hintsString $utrString --noninteractive --cname=$cname -w=$rootDir $verboseString $useexistingopt"
        . " --cpus=$cpus";
    if ($verbose>1) {
        print "2 \"$perlCmdString\" ...\n";
    }
    if (system("$perlCmdString") != 0) {
        die ("failed to execute: $perlCmdString!\n");
    }

    print "\n####### Finished predicting genes $string #######\n";
}






      ################# training AUGUSTUS with UTR ####################


sub autoTrain_with_utr{
    # Trains the UTR models by calling autoAugTrain.pl --utr with the cDNA
    # alignments ($estali) and the hint-based predictions as input.
    # Overwrites the file-level $perlCmdString. Dies if autoAugTrain.pl fails.

    # the step number differs between the interactive and the noninteractive pipeline
    my $stepNum = $noninteractive ? 8 : 6;

    if ($verbose>=1) {
        print "\n1 ####### Step $stepNum at ".(scalar localtime()).": Training AUGUSTUS with UTR #######\n";
    }

    my $augString = "--aug=$autoAugDir_hints/predictions/augustus.gff";

    # an already existing working directory is always reused
    my $existingOpt = (-d $rootDir) ? "--useexisting" : $useexistingopt;

    $perlCmdString = "perl $scriptPath/autoAugTrain.pl --cpus=$cpus -g=$genome_clean -s=$species --utr -e=$estali $augString -w=$rootDir $verboseString --opt=$optrounds $existingOpt";
    if ($verbose>=2) {
        print "\n2 $perlCmdString\n";
    }
    if (system("$perlCmdString") != 0) {
        die ("failed to execute: $perlCmdString\n");
    }

    if ($verbose>=1) {
        print "\n1 ####### Finished step $stepNum at ".(scalar localtime()).": All files are stored in $rootDir/training/utr #######\n";
    }

}


      ########################### collect all important files in one directory #######################


sub collect{
    # Last step of the pipeline: collects the important result files in
    # $rootDir/results with the subdirectories
    #   gbrowse      genome link, contigs.gff, cDNA and prediction tracks
    #   hints        link to the hints file
    #   predictions  augustus gff/aa files of all runs that exist
    #   seq          link to the genome
    #   config       copy of the species parameter files
    #   genes        links to all GenBank files found below $rootDir/autoAugTrain
    #
    # Exits with status 1 if the results directory exists and $useexisting is
    # not set. Dies if a required command fails. Overwrites the file-level
    # $cmdString.

    my $stepNum;

    $stepNum=7 if (!$noninteractive);
    $stepNum=9 if ( $noninteractive);

    print "\n1 ####### Step $stepNum at ".(scalar localtime()).": Collecting important files #######\n" if ($verbose>=1);

    my $summary_dir = "$rootDir/results";
    if (!$useexisting && -d $summary_dir){
        print STDERR "Directory $summary_dir already exists. Use --useexisting or move it.\n";
        exit(1);
    }
    system("mkdir -p $summary_dir")==0 or die("\nCould not create directory $summary_dir.\n");

    # build subdir structure

    chdir "$summary_dir" or die("\nError: cannot change directory to $summary_dir!\n");
    for my $subdir ("gbrowse", "hints", "predictions", "seq", "genes", "config"){
        mkdir $subdir if (! -d $subdir);
    }
    print "3 All necessary directories have been created under $summary_dir.\n" if ($verbose>=3);

    # collect gbrowse files
    print "3 cd gbrowse\n" if ($verbose>=3);
    chdir "gbrowse" or die("\nError: cannot change directory to $summary_dir/gbrowse!\n");
    system ("ln -sf $genome genome.fa") if (!uptodate([$genome], ["genome.fa"]));


    $cmdString = "cp $rootDir/seq/contigs.gff contigs.gff";
    system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");

    if (-f "$rootDir/cdna/cdna.gbrowse"){
        $cmdString = "ln -sf $rootDir/cdna/cdna.gbrowse cdna.gbrowse";
        system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
    }

    if (defined($fasta_cdna) && -f $fasta_cdna){
        $cmdString = "ln -sf $fasta_cdna cdna.fa";
        system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
    }

    # copy the gbrowse tracks of all runs that exist: [source, name in results]
    foreach my $track (["$autoAugDir_abinitio/gbrowse/augustus.abinitio.gbrowse", "augustus.abinitio.gbrowse"],
                       ["$autoAugDir_hints/gbrowse/augustus.E.gbrowse", "augustus.E.gbrowse"],
                       ["$autoAugDir_utr/gbrowse/augustus.UTR.gbrowse", "augustus.UTR.gbrowse"],
                       ["$rootDir/autoAugTrain/gbrowse/utr.train.gbrowse", "utr.train.gbrowse"]){
        if (-f $track->[0]){
            $cmdString = "cp $track->[0]  $track->[1]";
            print "3 $cmdString\n" if ($verbose>=3);
            system("$cmdString")==0 or die ("Could not execute $cmdString");
        }
    }

    # collect the hints file
    print "3 cd ../hints\n" if ($verbose>=3);
    chdir "../hints" or die("\nError: cannot change directory to $summary_dir/hints!\n");
    if($pasa){
        $cmdString="ln -sf $rootDir/hints/hints.E.gff hints.E.gff";
        system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
        print "3 $cmdString\n" if ($verbose>=3);
    } elsif ($havehints) {
        $cmdString="ln -sf  $hints hints.E.gff";
        system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
        print "3 $cmdString\n" if ($verbose>=3);
    }

    # collect prediction files
    print "3 cd ../predictions\n" if ($verbose >= 3);
    chdir "../predictions" or die("\nError: cannot change directory to $summary_dir/predictions!\n");

    foreach my $prediction (["$autoAugDir_abinitio/predictions/augustus.gff", "augustus.abinitio.gff"],
                            ["$autoAugDir_abinitio/predictions/augustus.aa", "augustus.abinitio.aa"],
                            ["$autoAugDir_hints/predictions/augustus.gff", "augustus.hints.gff"],
                            ["$autoAugDir_hints/predictions/augustus.aa", "augustus.hints.aa"],
                            ["$autoAugDir_utr/predictions/augustus.gff", "augustus.utr.hints.gff"],
                            ["$autoAugDir_utr/predictions/augustus.aa", "augustus.utr.hints.aa"]){
        if (-f $prediction->[0]){
            $cmdString = "cp $prediction->[0]  $prediction->[1]";
            print "3 $cmdString\n" if ($verbose>=3);
            system("$cmdString")==0 or die ("Could not execute $cmdString");
        }
    }

    # make a link for genome.fa
    # The link command is built and run only if the link has to be (re)created.
    # Running $cmdString unconditionally would repeat the previous, unrelated
    # command in this directory. -f replaces an outdated link.
    chdir "../seq" or die("\nError: cannot change directory to $summary_dir/seq!\n");
    if (!uptodate([$genome], ["genome.fa"])){
        $cmdString="ln -sf $genome genome.fa";
        system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
        print "3 $cmdString\n" if ($verbose>=3);
    }

    # collect config files
    my $configDir="$AUGUSTUS_CONFIG_PATH/species/$species";
    print '3 cd ../config'."\n" if ($verbose>=3);
    chdir "../config" or die("\nError: cannot change directory to $summary_dir/config!\n");
    # File tests like -e do not expand wildcards, so the copy must not depend on
    # a test of "*.orig*". Copy the parameter files and then drop the *.orig*
    # backups from the copy. A failed copy only produces a warning because all
    # results exist already at this point.
    if (-d $configDir){
        $cmdString = "cp $configDir/* .";
        print "3 $cmdString\n" if ($verbose>=3);
        system("$cmdString")==0 or print STDERR "WARNING: failed to execute: $cmdString\n";
        my @origFiles = glob("*.orig*");
        unlink(@origFiles) if (@origFiles);
    }
    # collect files with gb format

    print "3 cd ../genes\n" if ($verbose>=3);
    chdir "../genes" or die("\nError: cannot change directory to $summary_dir/genes!\n");

    # write the paths of all GenBank files of the training to the list tempgbn
    foreach my $findCmd ("find $rootDir/autoAugTrain -name \"*.gb\" | grep -v tmp_opt_ > tempgbn",
                         "find $rootDir/autoAugTrain -name \"*.gb.*\" | grep -v .gb.lst >> tempgbn"){
        # $! is not set by a failing child process, so report the command instead
        system("$findCmd")==0 or die("\nfailed to execute: $findCmd\n");
        print "3 $findCmd\n" if ($verbose>=3);
    }

    open(my $gbList, "<", "tempgbn") or die ("Cannot open the file \"tempgbn\"!\n");
    while(defined (my $line=<$gbList>)){
        # split the absolute path into directory and file name; a line that does
        # not match is skipped so that no captures of a previous line are reused
        if ($line !~ /^(\/.*\/)(.*)\n$/){
            print STDERR "WARNING: skipping unexpected line in tempgbn: $line";
            next;
        }
        my ($gbDir, $gbFile) = ($1, $2);
        if(-f "$gbFile"){
            # a file with this name was linked already, link this one under another name
            $cmdString="ln -fs $gbDir$gbFile $gbFile"."_another";
            system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
            print "3 $cmdString\n" if ($verbose>=3);
        }
        else{
            $cmdString="ln -s $gbDir$gbFile $gbFile";
            system("$cmdString")==0 or die("\nfailed to execute: $cmdString\n");
            print "3 $cmdString\n" if ($verbose>=3);
        }
    }
    close($gbList);
    print "3 rm tempgbn\n" if ($verbose>=3);
    system("rm tempgbn")==0 or die("failed to execute: rm tempgbn\n");

    print "\n1 ####### Finished step $stepNum at ".(scalar localtime()).": All files are stored in $summary_dir #######\n" if ($verbose>=1);
    print "\n1 ####### Done autoAug.pl #######\n" if ($verbose>=1);
    print "" . (scalar localtime()) . "\n" if ($verbose>=1);
}


# check upfront whether any common problems will occur later. So the user doesn't have to wait a long time to
# find out that some programs are not installed.
# TODO: put more checks in here
sub check_upfront{
    # Checks the environment before any long running step starts: required
    # environment variables, external programs and helper scripts.
    # Dies if an environment variable is missing and exits with status 1 at
    # the first required program that "which" does not find.
    print "2 checking for installed programs ... " if ($verbose>=2);
    unless ($ENV{'AUGUSTUS_CONFIG_PATH'}) {
        die("Error: The environment variable AUGUSTUS_CONFIG_PATH is not defined.\n");
    }
    if ($pasa && !defined($ENV{'PASAHOME'})) {
        die("Error: The environment variable PASAHOME is undefined.\n");
    }

    # one entry per program: [program is needed, program name, error message]
    my @requirements = (
        [1, "augustus",
         "Error: augustus not installed. Please install first.\n"],
        [defined($fasta_cdna), "blat",
         "Error: blat not installed. Please install first.\n"],
        [($useGMAPforPASA && $pasa), "gmap",
         "Error: 'gmap' not installed. Install GMAP first or use BLAT.\n"],
        [$pasa, "seqclean",
         "Error: seqclean script not installed. Install seqclean first or if it is available in a PASAHOME subdirectory add this to PATH.\n"],
    );
    foreach my $requirement (@requirements) {
        my ($needed, $program, $message) = @{$requirement};
        next unless ($needed);
        next if (system("which $program > /dev/null") == 0);
        print STDERR $message;
        exit(1);
    }

    # the return values are not used here; the calls only make sure that find() is run for both scripts
    find("gff2gbSmallDNA.pl");
    find("summarizeACGTcontent.pl");
    print "ok.\n" if ($verbose>=2);
}

sub count_fasta_entries{
    # Counts the entries (lines starting with ">") of a fasta file and prints a
    # warning to STDERR if the file contains less than 100 of them.
    #
    # Parameter:
    #   $fastaFile - name of the fasta file
    #
    # Dies if the file cannot be opened or closed.
    my $fastaFile=shift;
    my $fc = 0;
    # lexical file handle and loop variable: neither a global handle nor the
    # caller's $_ is touched
    open(my $fastaFh, "<", $fastaFile) or die("Could not open fasta file $fastaFile!\n");
    while(defined(my $line = <$fastaFh>)){
        $fc++ if ($line =~ m/^>/);
    }
    close($fastaFh) or die("Could not close fasta file $fastaFile!\n");
    # warn only below 100 entries, as stated in the message
    if($fc<100){print STDERR "WARNING: Fasta file $fastaFile contained less than 100 entries. At least 100 genes are required for training AUGUSTUS. It is impossible to generate this number of genes with the given data! If PASA will be unable to generate at least one gene structure, the pipeline will die, later!\n";}
}

sub check_command_exists {
    # Returns a true value if "which" finds the given command in the PATH and a
    # false value otherwise.
    my ($program) = @_;
    # system() returns 0 on success, so the negation is the Perl truth value
    return !system("which $program > /dev/null");
}