1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
|
#!/usr/bin/perl
#############################################################
# filterGenes
# filter genes from a genbank flat file database
# usage: filterGenesIn_mRNAname.pl namefile dbfile
#
#
# Mario Stanke, Simone Lange, Katharina Hoff; 21.12.2015
#############################################################
use strict;
use warnings;
# Exactly two arguments are required: the name file and the database file.
if (@ARGV != 2) {
    print "usage:filterGenes namefile dbfile\n";
    print "names of the loci to be kept come from\n";
    print "the first parameter. Only the the first of identical loci is kept\n";
    exit;
}
# NOTE(review): the usage text promises that only the first of identical
# loci is kept, but nothing below removes an id once its record has been
# printed -- confirm whether de-duplication is expected here.

my $goodfilename = $ARGV[0];    # file with lines containing: transcript_id "<id>"
my $origfilename = $ARGV[1];    # flat file database to be filtered

# ---------------------------------------------------------------------
# Collect the ids to keep: every `transcript_id "<id>"` in the name file.
# ---------------------------------------------------------------------
my %goodids;
open(my $good_fh, '<', $goodfilename)
    or die "Couldn't open goodfile $goodfilename: $!\n";
while (my $line = <$good_fh>) {
    # [^"]* stops at the closing quote of this attribute.  A greedy .*
    # would run to the last quote on the line and capture the following
    # attributes as part of the id.
    if ($line =~ m/transcript_id "([^"]*)"/) {
        $goodids{$1} = 1;
    }
}
close($good_fh) or die "Couldn't close goodfile $goodfilename!\n";

# ---------------------------------------------------------------------
# Stream the database line by line (no need to hold it all in memory).
#
# For each record the header lines (LOCUS, FEATURES, source, the mRNA
# feature, the CDS feature up to its gene="..." qualifier) are buffered
# in $head.  Once the gene name is known, the buffer is printed only if
# the name is one of the wanted ids; from then on every line is echoed
# until the next LOCUS line resets the state.
# ---------------------------------------------------------------------
open(my $orig_fh, '<', $origfilename)
    or die "Couldn't open dbfile $origfilename: $!\n";

my $head           = "";    # buffered header lines of the current record
my $genename       = "";    # name taken from the gene="..." line after CDS
my $mRNAflag       = 0;     # 1 between an mRNA line and the next CDS line
my $cdsFlag        = 0;     # 1 after a CDS line until its gene="..." line
my $printFlag      = 0;     # 1 while the current record is being echoed
my $firstPrintFlag = 0;     # 1 when a gene name was just read and the
                            # keep/drop decision is pending

# The order of the checks below matters: each one looks at flags that may
# have been set by an earlier line, and sets flags for the checks that
# follow on the same and on later lines.
while (my $line = <$orig_fh>) {
    # A LOCUS line starts a new record: discard all per-record state.
    if ($line =~ m/^LOCUS/) {
        $printFlag = 0;
        $genename  = "";
        $head      = $line;
    }
    if ($line =~ m/FEATURES/) {
        $head .= $line;
    }
    if ($line =~ m/source/) {
        $head .= $line;
    }
    # Continuation lines of the mRNA feature (everything before CDS).
    if ($mRNAflag == 1 and not($line =~ m/CDS/)) {
        $head .= $line;
    }
    if ($line =~ m/mRNA/) {
        $mRNAflag = 1;
        $head .= $line;
    }
    # Inside the CDS feature: buffer lines until the gene qualifier shows up.
    if ($cdsFlag == 1) {
        if ($line =~ m/gene="/) {
            # The gene name is the text between the first two double quotes.
            my @fields = split(/"/, $line);
            $genename       = defined $fields[1] ? $fields[1] : "";
            $cdsFlag        = 0;
            $firstPrintFlag = 1;
        }
        else {
            $head .= $line;
        }
    }
    # Checked after the block above so that the CDS line itself is never
    # scanned for gene="...".
    if ($line =~ m/CDS/) {
        $mRNAflag = 0;
        $head .= $line;
        $cdsFlag = 1;
    }
    # Gene name just became known: decide whether this record is kept.
    if ($firstPrintFlag == 1 and length($head) >= 2) {
        if ($goodids{$genename}) {
            print $head;
            $head      = "";
            $printFlag = 1;
        }
        $firstPrintFlag = 0;
    }
    # Echo the current line (including the gene="..." line that triggered
    # the decision) for as long as the record is being kept.
    if ($printFlag == 1) {
        print $line;
    }
}
close($orig_fh);
|