File: FastAlign.pl

package info (click to toggle)
bioperl 1.7.8-1
links: PTS, VCS
area: main
in suites: bookworm, sid, trixie
size: 35,788 kB
sloc: perl: 94,019; xml: 14,811; makefile: 20
file content (226 lines) | stat: -rw-r--r-- 6,822 bytes
parent folder | download | duplicates (5)
#! /usr/bin/perl
#####################################################
#                 Fasta
#                     |
#                     Align
#
#                     By
#                 Antony Vincent 
#          (a.vincent.0312@gmail.com)
#           	
# FastAlign is a perl script which uses the heuristic method
# of tfasty36 for find similarity between a query sequence
# in amino acids and a sequence in nucleotides. It provides
# a more intuitive output to find exon-intron junctions. 
# The query string is in amino acids and the hit string is
# in nucleotides. There are extra nucleotides at the end of
# the hit string (option -diff and by default = 10), that 
# allow to verify if the intron start with common rules 
# (5'-GTGCGA-... for group II intron and after an exonic T 
# for group I intron).
# 
# The FASTA version can be changed by the user by changing
# the line with tfasty36 for tfastyXX.
#
# If you have Emboss, you can genarate a graphic with option
# -graph 1.
#
# For complete help: type perl fastalign.pl -help
#              Last Update: 01/06/13
#######################################################


=head1

Copyright (C) 2013  Antony Vincent

Licence:

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.

=cut

use strict; 
use Bio::SearchIO;
use Bio::SeqIO;
use Getopt::Long;
use Bio::SeqUtils;


## Set the default variables
	my $identity     = 75;
	my $length      = 50;
	my $diff      = 10;
	my $out      = 'output';
	my $graphic      = 10;
	my $query;
	my $library;
	my $help;

GetOptions(
    'seq=s'    => \$query,
    'db=s'     => \$library, 
    'graph=s'     => \$graphic,
    'i=i'     => \$identity,
    'l=i'     => \$length,
    'diff=s'    => \$diff,
    'out=s'     => \$out,   
    'help!'     => \$help,
) 
or die "Incorrect usage! Try perl fastalign.pl -help for an exhaustif help.\n";
###
	if( $help ) 
{ # if start
    print "\n";
    print "Two options are required:\n";
    print "	-seq: Your sequence in amino acids\n";
    print "	-db: The sequence in nucleotides (Could be a whole genome or a partial sequence...)\n";
    print "\n";
    print "There are few miscellaneous options:\n";
    print "	-i: Minimum identity percentage (default = 75)\n";
    print "	-l: Minimum match length (default = 50)\n";
    print "	-diff: Difference between the hit string and the alignement (default = 10)\n";
    print "	-out: Name of the output file (default = output.txt)\n";
    print "	-graph: If this option = 1, a graph will be generated (default = no)\n";
} # if help

	else 
{ # else start
my $date = `date`;
open (WRITE, ">>$out.txt"); ## Open the output file
print WRITE "		Fasta\n";
print WRITE "		    |\n";
print WRITE "		    Align\n\n";
print WRITE "Date:", $date, "\n";
print WRITE "PARAMETERS\n";
print WRITE "Minimum identity =", $identity, "\n";
print WRITE "Minimum length =", $length, "\n";
print WRITE "Diff =", $diff, "\n\n";

	if ( $graphic == 1 )
{
open (WRITE, ">>$out.txt"); ## Open the output file
open (WRITE2, ">>lindna.lnp"); ## Open the lindna config file

## start the lindna header
print WRITE2 "start";
print WRITE2 "\t";
print WRITE2 "1";
print WRITE2 "\n";
print WRITE2 "End";
print WRITE2 "\t";
my $seqio_obj = Bio::SeqIO->new(-file => "$library", -format => "fasta" );
my $seq_obj = $seqio_obj->next_seq;
my $count_obj = $seq_obj->length;
print WRITE2 "$count_obj";
print WRITE2 "\n\n";
print WRITE2 "group";
print WRITE2 "\n";
}
	else
{
	print "No graphic generated \n";
}
## run tfasty36
my $fh;                   
my $fasta   = "tfasty36"; # <------ You can change this line for newest fasta algorithm

my $command = "$fasta $query $library";
 
open $fh,"$command |" || die("cannot run fasta cmd of $command: $!\n");
 
my $searchio  = Bio::SearchIO->new(-format => 'fasta', -fh => $fh);
print WRITE "Fasta algorithm:", $fasta, "\n\n";
			## start the parsing part of the script

while( my $result = $searchio->next_result ) {
  ## $result is a Bio::Search::Result::ResultI compliant object
  while( my $hit = $result->next_hit ) {
    ## $hit is a Bio::Search::Hit::HitI compliant object
    while( my $hsp = $hit->next_hsp ) {
      ## $hsp is a Bio::Search::HSP::HSPI compliant object
      if( $hsp->length('total') > $length ) {
        if ( $hsp->percent_identity >= $identity ) {

          		## Generals informations
print WRITE "Rank=", $hsp->rank, "\n",
            "Query=",   $result->query_name, "\n",
            "Hit=",        $hit->name, "\n" ,
            "Length=",     $hsp->length('total'), "\n",
            "Percent_id=", $hsp->percent_identity, "\n",
	    "Strand=", $hsp->strand('hit'), "\n";

	     
			print WRITE "\n";

		
  	 
		
			## Search for nucleotide sequences
			print WRITE "\n";
	my $start_hit = $hsp->start('hit'), "\n";
	my $end_hit = $hsp->end('hit') , "\n";	
   	my $in  = Bio::SeqIO->new(-file => "$library" , '-format' => 'fasta');

    while ( my $seq = $in->next_seq() ) {#1
    	  
		## looking for query position
			my $start_query = $hsp->start('query'), "\n";
			my $end_query = $hsp->end('query') , "\n";
		## aa_to_3aa
        my $seqobj = Bio::PrimarySeq->new ( -seq => $hsp->query_string);
	my $polypeptide_3char = Bio::SeqUtils->seq3($seqobj);

                ## modify the homology string
      my $homo = $hsp->homology_string;
         $homo =~ s/:/|||/g;
         $homo =~ s/\./***/g;
         $homo =~ s/ /XXX/g;
	
	
			## HSP
           
	print WRITE "Query($start_query,$end_query)\n";
	print WRITE "Hit($start_hit,$end_hit)\n\n";
	print WRITE $polypeptide_3char, "\n";
	print WRITE $homo, "\n";
	print WRITE $seq->subseq($start_hit,$end_hit+$diff), "\n";


        if ( $graphic == 1)
{ ## if start
	 ## write in lindna file
	print WRITE2 "label", "\n", "Block", "\t", "$start_hit", "\t",
                      "$end_hit", "\t", "3", "\t", "H", "\n";
        print WRITE2 "Exon", $hsp->rank, "\n";
        print WRITE2 "endlabel";
        print WRITE2 "\n\n";
} ## if end
	else
{print "No lindna file generated\n";}
} #1
print WRITE "\n";

        }
      }
    }  
  }
}
        if ( $graphic == 1)
{ ## if start
print WRITE2 "endgroup";
system ("lindna -infile lindna.lnp -ruler y -blocktype filled -graphout svg");
system ("rm *.lnp");
} ## if end
} # else end