1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
|
#!/usr/bin/perl
#############################################################################
# filterGenesIn_mRNAname.pl
# filter genes from a genbank flat file database
# those genes whose mRNA names are given in a gtf file
# are printed to STDOUT.
#
# This script is used by the braker.pl pipeline.
# Please be extremely careful when changing this script because the braker.pl
# pipeline may fail upon custom modification of this script.
# In case of doubt, contact katharina.hoff@uni-greifswald.de
#
# usage: fileterGenesIn_mRNAname.pl gtffile dbfile
#
#
# Mario Stanke, Simone Lange, Katharina Hoff; 20.02.2018
############################################################################
use strict;
use warnings;
if ( $#ARGV != 1 ) {
print "usage: filterGenesIn_mRNAname.pl gtffile dbfile\n\n";
print "gtffile genes to be kept in genbank file in gtf format\n";
print " (transcript_id "...") is essential.\n";
print "dbfile genbank file\n\n";
print "Only the the first of identically named RNA loci is kept\n";
exit;
}
my $origfilename = $ARGV[1];
my $goodfilename = $ARGV[0];
my %goodids;
open( GOODFILE, "<", "$goodfilename" )
|| die "Couldn't open goodfile $goodfilename\n";
while (<GOODFILE>) {
if ( $_ =~ m/transcript_id \"(.*)\"/ ) {
if (not (defined $goodids{$1})){
$goodids{$1} = 1;
}
}
}
close(GOODFILE) || die ( "Couldn't close goodfile $goodfilename!\n" );
open( my $ORIGFILE, "$origfilename" ) || die ( "Couldn't open dbfile $origfilename!\n" );
my @data = <$ORIGFILE>;
close($ORIGFILE) || die ( "Couldn't close dbfile $origfilename!\n" );
$/ = "\n//\n";
my $head;
my $mRNAflag = 0;
my $cdsFlag = 0;
my $genename;
my $printFlag = 0;
my $firstPrintFlag = 0;
foreach (@data) {
if ( $_ =~ m/^LOCUS/ ) {
$head = "";
$printFlag = 0;
$genename = "";
$head = $head . $_;
}
if ( $_ =~ m/^FEATURES/ ) {
$head = $head . $_;
}
if ( $_ =~ m/\s+source/ ) {
$head = $head . $_;
}
if ( $mRNAflag == 1 and not( $_ =~ m/CDS/ ) ) {
$head = $head . $_;
}
if ( $_ =~ m/\s+mRNA/ ) {
$mRNAflag = 1;
$head = $head . $_;
}
if ( $cdsFlag == 1 ) {
if ( $_ =~ m/\s+\/gene="/ ) {
my @tmp = split(/\"/);
$genename = $tmp[1];
$cdsFlag = 0;
$firstPrintFlag = 1;
}
else {
$head = $head . $_;
}
}
if ( $_ =~ m/\s+CDS/ ) {
$mRNAflag = 0;
$head = $head . $_;
$cdsFlag = 1;
}
if ( $firstPrintFlag == 1 and length($head) >= 2 ) {
if ( $goodids{$genename} ) {
print $head;
$head = "";
$printFlag = 1;
}
$firstPrintFlag = 0;
}
if ( $printFlag == 1 ) {
print $_;
}
}
|