File: filterGenesIn_mRNAname.pl

package info (click to toggle)
augustus 3.3.2%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 486,188 kB
  • sloc: cpp: 51,969; perl: 20,926; ansic: 1,251; makefile: 935; python: 120; sh: 118
file content (107 lines) | stat: -rwxr-xr-x 2,908 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/perl

#############################################################################
# filterGenesIn_mRNAname.pl
# filter genes from a genbank flat file database
# those genes whose mRNA names are given in a gtf file
# are printed to STDOUT.
#
# This script is used by the braker.pl pipeline.
# Please be extremely careful when changing this script because the braker.pl
# pipeline may fail upon custom modification of this script.
# In case of doubt, contact katharina.hoff@uni-greifswald.de 
#
# usage: fileterGenesIn_mRNAname.pl gtffile dbfile
#
#
# Mario Stanke, Simone Lange, Katharina Hoff; 20.02.2018
############################################################################

use strict;
use warnings;

if ( $#ARGV != 1 ) {
    print "usage: filterGenesIn_mRNAname.pl gtffile dbfile\n\n";
    print "gtffile         genes to be kept in genbank file in gtf format\n";
    print "                (transcript_id "...") is essential.\n";
    print "dbfile          genbank file\n\n";
    print "Only the the first of identically named RNA loci is kept\n";
    exit;
}
my $origfilename = $ARGV[1];
my $goodfilename = $ARGV[0];

my %goodids;
open( GOODFILE, "<", "$goodfilename" )
    || die "Couldn't open goodfile $goodfilename\n";
while (<GOODFILE>) {
    if ( $_ =~ m/transcript_id \"(.*)\"/ ) {
        if (not (defined $goodids{$1})){
	    $goodids{$1} = 1;
	}
    }
}
close(GOODFILE) || die ( "Couldn't close goodfile $goodfilename!\n" );

open( my $ORIGFILE, "$origfilename" ) || die ( "Couldn't open dbfile $origfilename!\n" );
my @data = <$ORIGFILE>;
close($ORIGFILE) || die ( "Couldn't close dbfile $origfilename!\n" );

$/ = "\n//\n";

my $head;
my $mRNAflag = 0;
my $cdsFlag  = 0;
my $genename;
my $printFlag      = 0;
my $firstPrintFlag = 0;

foreach (@data) {
    if ( $_ =~ m/^LOCUS/ ) {
        $head      = "";
        $printFlag = 0;
        $genename  = "";
        $head      = $head . $_;
    }
    if ( $_ =~ m/^FEATURES/ ) {
        $head = $head . $_;
    }
    if ( $_ =~ m/\s+source/ ) {
        $head = $head . $_;
    }
    if ( $mRNAflag == 1 and not( $_ =~ m/CDS/ ) ) {
        $head = $head . $_;
    }
    if ( $_ =~ m/\s+mRNA/ ) {
        $mRNAflag = 1;
        $head     = $head . $_;
    }
    if ( $cdsFlag == 1 ) {
        if ( $_ =~ m/\s+\/gene="/ ) {
            my @tmp = split(/\"/);
            $genename       = $tmp[1];
            $cdsFlag        = 0;
            $firstPrintFlag = 1;
        }
        else {
            $head = $head . $_;
        }
    }
    if ( $_ =~ m/\s+CDS/ ) {
        $mRNAflag = 0;
        $head     = $head . $_;
        $cdsFlag  = 1;
    }
    if ( $firstPrintFlag == 1 and length($head) >= 2 ) {
        if ( $goodids{$genename} ) {
            print $head;
            $head      = "";
            $printFlag = 1;
        }
        $firstPrintFlag = 0;
    }
    if ( $printFlag == 1 ) {
        print $_;
    }
}