1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
|
#! /usr/bin/perl
# pdbfilter.pl - Read pdb or SCOP sequences from infile and write representative set of sequences to outfile
#
# HHsuite version 3.0.0 (15-03-2015)
#
# Reference:
# Remmert M., Biegert A., Hauser A., and Soding J.
# HHblits: Lightning-fast iterative protein sequence searching by HMM-HMM alignment.
# Nat. Methods, epub Dec 25, doi: 10.1038/NMETH.1818 (2011).
# (C) Johannes Soeding, 2012
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
# We are very grateful for bug reports! Please contact us at soeding@mpibpc.mpg.de
use lib ( $ENV{"HHLIB"} || '/usr/share/hhsuite' )."/scripts";
use HHPaths; # config file with path variables for nr, blast, psipred, pdb, dssp etc.
use strict;
$|= 1; # Activate autoflushing on STDOUT
# Default options
my $idmax=90; # maximum sequence identity threshold
my $Evalmin=0.01; # minimum BLAST E-value (should be <0.01 to ensure that sequences being filtered out are homologous to some representative sequence
my $covmin=90; # minimum coverage threshold (should be large enough to ensure that no structural domain is lost from the filtered set just because it occurs in a sequence with, say, 2 other domains similar to some representative sequence)
my $v=2;
my $blastpgp="$ncbidir/blastpgp";
my $help="
pdbfilter.pl - Read pdb or SCOP sequences from infile and write representative set of
sequences to outfile
Compares each sequence with all others using BLAST (blastpgp). If two sequences A and B are
sufficiently similar, the sequence with lower resolution will be removed.
The exact criterion for removal of sequence B is:
IF more than \$covmin\% residues of sequence B are aligned to sequence A
AND the sequence identity of the alignment is larger than \$idmin
AND the E-value is better than \$Evalmin
AND sequence B has lower resolution than A.
The input file must have been prepared with pdb2fasta.pl or scop2fasta.pl.
Sequences with fewer than 15 residues are suppressed.
Usage: pdbfilter.pl infile filtered_file [-u old_filtered_file] [-id int] [-cov int]
Options
-id <int> maximum sequence identity between representative sequences (default=$idmax)
-e <float> minimum E-value between representative sequences (default=$Evalmin)
-cov <int> minimum coverage of a sequence with a similar one to get thrown out (default=$covmin)
-u <file> update the old filtered file; this saves a lot of execution time
-v <int> verbose mode
Example: pdbfilter.pl pdb.fas pdb70.fas -u pdb70.fas -id 70 -cov 90\n
";
if (@ARGV<2) {print($help); exit;}
my $infile;
my $outfile;
my $oldfile;
my $root=""; # $outfile without extension
my $line;
my $pdbid=""; # e.g. 1ahs_C
my $qpdbid; # pdbid of query sequence
my $seq=""; # sequence record (name+residues) in FASTA
my @seqs; # sequence records as they were read
my $len=0; # length of sequence to be read in
my %len; # $len{$pdbid} is length of sequence
my $resol; # experimental resolution in Angstroem
my %resol; # experimental resolution in Angstroem
my %excluded; # representative sequences are all those not in this hash
my %accepted; # representative sequences accpeted so far
my %similar; # $similar{$pdbid} contains list of all pdbids (SCOPIDs) that are represented by (i.e. similar to) $qpdbid
my %het; # $het{$pdbid} is "*" if sequence $pdbid has a HET: record (i.e. contains HET group with >=10 atoms), "" otherwise
my $id; # sequence identity to query
my $cov; # coverage
my $Evalue; # BLAST E-value
my $nold=0; # number of sequences in oldfile
my $ntot=0; # total number of sequences in oldfile and infile
my $k=0; # counts sequences read in so far
my $sort_by_family=1; # set to 0 if at least one non-SCOP sequence found
# Read command line options
my $options="";
for (my $i=0; $i<=$#ARGV; $i++) {$options.=" $ARGV[$i]";}
if ($options=~s/ -id\s+(\S+)//) {$idmax=$1;}
if ($options=~s/ -cov\s+(\S+)//) {$covmin=$1;}
if ($options=~s/ -e\s+(\S+)//) {$Evalmin=$1;}
if ($options=~s/ -u\s+(\S+)//) {$oldfile=$1;}
if ($options=~s/ -v\s*(\d+)//) {$v=$1;}
if ($options=~s/ -v//) {$v=2;}
if (!$infile && $options=~s/^\s*([^- ]\S+)\s*//) {$infile=$1;}
if (!$outfile && $options=~s/^\s*([^- ]\S+)\s*//) {$outfile=$1;}
# Warn if unknown options found or no infile/outfile
if ($options!~/^\s*$/) {$options=~s/^\s*(.*?)\s*$/$1/g; die("Error: unknown options '$options'\n");}
if (!$infile) {print($help); print("Error: no input file given\n"); exit;}
if (!$outfile) {print($help); print("Error: no output file given\n"); exit;}
$covmin*=0.01;
if ($outfile=~/^(\S*?)\.(.*)$/) {$root=$1;} else {$root=$outfile;}
# Read sequences from oldfile
if ($oldfile) {$nold=&ReadSequences($oldfile,0);}
# Read NEW sequences from infile (the ones that are not yet in oldfile)
$ntot=&ReadSequences($infile,$nold);
# Sort by resolution
@seqs=sort SortByResolution @seqs;
#foreach $seq (@seqs) {print("$seq");}
# Record resolution and length of all sequences in hashes
for ($k=0; $k<@seqs; $k++) {
$seqs[$k]=~s/^(\S+?)!(\S+?)!>(\S+)(.*)/>$3$4/o;
$len{$3}=$1;
$resol{$3}=$2;
my $pdbid=$3;
if ($seqs[$k]=~s/ PDB:\s*(( \d\S{3,5}\*?)+)//) {
$similar{$pdbid}=$1;
} elsif ($seqs[$k]=~s/ SCOPID:\s*(( [a-z]\d\S{5}\*?)+)//) {
$similar{$pdbid}=$1;
} else {
$similar{$pdbid}="";
}
if ($seqs[$k]=~/ HET:/) {
$het{$pdbid}="*";
} else {
$het{$pdbid}="";
}
}
# Format BLAST database, initialize
if ($v>=2) {printf("Doing PSI-BLAST runs\n");}
&PrintSequences("$root.db",$nold);
&System("$ncbidir/formatdb -i $root.db");
my $nacc=0; # number of accepted sequences = number of BLAST searches already performed
my $done=0; # number of sequences already processed
my $nexcl=0; # number of already excluded chains
my $step=($ntot>1000)*0.05+($ntot<=1000)*0.1;
my $reformat=$step; # when X% of the total number of sequences have been excluded, reformat database
####################################################################################################
# For each sequence in infile, do BLAST search and exclude hits that are too similar to query
foreach $seq (@seqs) {
$seq=~/^>(\S+)/o;
$done++;
if($excluded{$1}) {next;}
$qpdbid=$1;
$resol=$resol{$1};
$len=$len{$1};
if ($v>=2 && !($nacc%100)) {
printf("\nSearches:%-4i Done:%3i%% DB-size:%3i%% ",$nacc,100*($nexcl+$nacc)/$ntot+0.5,100*($ntot-$nexcl)/$ntot+0.5);
}
# When a substantial number of sequences have been excluded, reformat database WITHOUT excluded sequences
if (!$oldfile && ($nexcl+$nacc)/$ntot>$reformat) {
if ($v>=2) {printf("\b\b\b%2i%%",100*$reformat+0.5);}
elsif ($v>=3) {
printf("\nReformatting search database containing %i out of %i sequences\n",$ntot-$nexcl,$ntot);}
# Write updated database with all seqs with index >=$done that have not yet been excluded
# (Don't have to compare A->B and B->A, hence exclude seqs with index <$done)
&PrintSequences("$root.db",$done);
&System("$ncbidir/formatdb -i $root.db");
$reformat+=$step;
}
open (TMPFILE,">$root.tmp") || die ("ERROR: cannot open $root.tmp for writing: $!\n");
print(TMPFILE $seq);
close(TMPFILE);
# Find hits that are too similar
if ($v>=3) {print("\n$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot");}
open(BLAST,"$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|");
while ($line=<BLAST>){
if ($line=~/^>(\S+)/o && $1 ne $qpdbid && !$excluded{$1} && !$accepted{$1}) {
$pdbid=$1;
while ($line=<BLAST>){if ($line=~/^ Score = /o) {last;}}
$line=~/ Expect =\s*(\S+)/ or die("Error: format error in '$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|', line $.\n");
$Evalue=$1;
$Evalue=~s/^e/1e/;
$Evalue=~s/,$//;
$line=<BLAST>;
$line=~/^ Identities =\s*\d+\/(\d+)\s+\((\d+)\%\)/o or die("Error: format error in '$blastpgp -i $root.tmp -d $root.db -v 1 -b 1000 -s T -z $ntot|', line $.\n");
$len=$1;
$id=$2;
# Coverage = (length of whole alignment (including gaps) - gaps in query or HSP) / total length of matched sequence
if ($line=~/Gaps =\s*(\d+)\/\d+/) {$cov=($len-$1)/$len{$pdbid};} else {$cov=$len/$len{$pdbid};}
## Main filtering criterion: remove sequence from representative set if...
if ($id>=$idmax && $Evalue<=$Evalmin && $cov>=$covmin && $resol<=$resol{$pdbid}) {
$excluded{$pdbid}=1;
# if ($similar{$qpdbid} ne "") {$similar{$qpdbid}.=",";} # separate pdbids with identical sequences with ','
if (!$similar{$qpdbid}) {$similar{$qpdbid}="";}
$similar{$qpdbid}.=" ".$pdbid.$het{$pdbid}.$similar{$pdbid};
$nexcl++;
if ($v>=4) {
print($line);
printf("Rep: $qpdbid excl: $pdbid (len=%3i cov=%3.0f id=$1)\n",$len{$pdbid},100*$cov);
}
}
}
}
close(BLAST);
$nacc++;
$accepted{$qpdbid}=1;
if ($v>=2) {print(".");}
}
if ($v>=2) {print("\n");}
####################################################################################################
if ($sort_by_family) {@seqs=sort SortByFamily @seqs;}
# Print out all representative sequences to outfile
&PrintSequences($outfile,0);
if ($v>=2) {
printf("Written %i out of %i sequences to $outfile\n",$ntot-$nexcl,$ntot);
}
unlink("$root.tmp");
unlink(glob("$root.db*"));
exit;
# Read sequences in infile beginning at index $k
sub ReadSequences()
{
my $infile=$_[0];
my $k=$_[1];
my $k0=$k;
my $nres=0; # skip sequences with fewer than 15 real residues
if ($v>=3) {printf("Reading $infile ... \n");}
open (INFILE,"<$infile") || die ("ERROR: cannot open $infile for reading: $!\n");
while ($line=<INFILE>) {
if ($line=~/^>/) {
if ($pdbid && !$len{$pdbid} && $nres>=15) {
if ($len<26) {chomp $seq; $seq.=('x'x(26-$len))."\n";} # add 'x' to make $seq at least 26 resiudes long
$seqs[$k++]="$len!$resol!$seq";
}
# Read pdbid (or SCOP id) and resolution from name line
if ($line=~/^>(\S{4,6}) .*; (\d+\.?\d*|NMR)A? \{/o) {
# PDB sequence
$pdbid=$1;
if ($2 eq "NMR") {$resol=10;} else {$resol=$2;}
$seq=$line;
$len=$nres=0;
$sort_by_family=0;
} elsif ($line=~/^>([a-z]\S{6}) .* (\d+\.?\d*)\s*$/) {
# SCOP sequence
$pdbid=$1;
if ($2 eq "" or $2 eq "NMR") {$resol=10;} else {$resol=$2;}
$resol=$2;
$line=~s/^(>.*) (\d+\.?\d*)\s*$/$1\n/; # remove resolution at the end of the name line
$seq=$line;
$len=0;
} else {print("WARNING: nameline format not recognized: $line");}
$nres=0;
} else {
$seq.=$line;
$len+=length($line)-1;
$nres+=($line=~tr/a-wyA-WY/a-wyA-WY/);
}
}
if ($pdbid && !$len{$pdbid}) {
if ($len<26) {$seq.=('X'x(26-$len))."\n";} # add 'X' to make $seq at least 26 resiudes long
$seqs[$k++]="$len!$resol!$seq";
}
close(INFILE);
if ($v>=2) {printf("Read %i sequences from $infile ... \n",$k-$k0);}
return $k;
}
# Print all sequences that have not been excluded so far to file, beginning at index $k
sub PrintSequences()
{
open (OUTFILE,">$_[0]") || die("Error: could not write to $_[0]: $!\n");
for (my $k=$_[1]; $k<@seqs; $k++) {
$seqs[$k]=~/>(\S+)/o;
if (!$excluded{$1}) {
my $seq=$seqs[$k];
if ($similar{$1} ne "") {
my $pdbs=$similar{$1};
# Limit number of represented pdbs to 30
if ($pdbs =~/^(\s+\S+){31,}/) {$pdbs =~/^((?:\s+\S+){30})/; $pdbs=$1." ...";}
if ($pdbs=~/^\s*\d\S{3,5}/) {
$seq=~s/^(.*)/$1 PDB:$pdbs/;
} elsif ($pdbs=~/^\s*[a-z]\d\S{5}/) {
$seq=~s/^(.*)/$1 SCOPIDS:$pdbs/;
}
}
print(OUTFILE $seq);
}
}
close(OUTFILE);
}
sub SortByResolution() {
my $aa;
my $bb;
if ($a!~/^\d+!(\S+)!>/o) {
$a=~/^(\S+)/o;
printf("Error: sequence %s does not contain resolution in right format\n",$1);
return 0;
}
$aa=$1;
if ($b!~/^\d+!(\S+)!>/o) {
$b=~/^(\S+)/o;
printf("Error: sequence %s does not contain resolution in right format\n",$1);
return 0;
}
$bb=$1;
return ($aa<=>$bb);
}
sub SortByFamily() {
my $aa;
my $bb;
if ($a!~/^>\S+ (\S+)/) {
$a=~/^(\S+)/o;
printf("Error: sequence %s does not contain resolution in right format\n",$1);
return 0;
}
$aa=$1;
if ($b!~/^>\S+ (\S+)/) {
$b=~/^(\S+)/o;
printf("Error: sequence %s does not contain resolution in right format\n",$1);
return 0;
}
$bb=$1;
return ($aa cmp $bb);
}
sub System() {
$v--;
&HHPaths::System($_[0]);
$v++;
}
|