1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390
|
#!/usr/bin/perl
use strict;
use warnings;
# tblfix.pl
# NCBI is supplying this Perl script for the convenience of GenBank
# submitters who need to edit 5-column feature tables prior to processing
# with tbl2asn. The tblfix.pl script can perform various data conversions on
# feature table files, with the conversion function specified by a command
# line argument. Additional conversions can be supported by modification of
# the script. Any suggestions for improvements or corrections are welcome.
# However, given the number of possible variations of local data files and
# conditions, NCBI cannot promise to implement them all and cannot be
# expected to troubleshoot local installations. This script is being
# supplied as is, but you are always welcome to make any changes or
# improvements yourself.
#
# For example, the script can create a codon_recognized qualifier from
# either the tRNA product or note qualifier, and it can reverse complement
# the triplet codon for the codon_recognized value. Since the script reads
# from stdin and writes to stdout, individual conversion steps can be piped
# together to accomplish multiple operations on a single command line. This
# avoids requiring submitters to change the storage conventions of their
# underlying data in order to prepare their submission.
#
# Existing conversion arguments are described below:
#
# -product_to_codon_recognized
#
# Parses the product qualifier in a tRNA feature looking for a value that
# fits the pattern "tRNA-Met-CAT". It will create separate product
# "tRNA-Met" and codon_recognized "CAT" qualifiers.
#
# -note_to_codon_recognized
#
# Looks for a tRNA note qualifier that fits the pattern "codon recognized:
# CAT". It will create a codon_recognized "CAT" qualifier and remove the
# note.
#
# -revcomp_codon_recognized
#
# This will adjust the codon recognized if the original data provided the
# reverse complement sequence. For codon_recognized "CAT" it will result
# in a codon_recognized "ATG" qualifier, the correct codon for methionine.
#
# -split_compound_product
#
# Splits a product at " / " delimiters.
#
# -product_to_ec_number
#
# This function parses an enzyme commission number at the end of a
# /product qualifier in the form (EC 1.2.3.4). It will create a product
# qualifier without the EC number and an EC_number qualifier of form
# "1.2.3.4".
#
# -product_to_tc_number
#
# Parses (TC 3.A.5.1.1) to a TCDB (transporter classification database) note.
#
# -remove_ec_number
#
# Remove existing EC_number qualifiers.
#
# -remove_predicted_evidence
#
# If the value of an evidence qualifier is the uninformative value
# "predicted", it will be removed.
#
# -clean_rrna_product
#
# This moves secondary rRNA product names to a note qualifier.
#
# -create_locus_tag
#
# This takes a locus_tag prefix as an additional command-line argument
# and creates gene features with locus_tag of the form PFX_0001, assigning
# sequential numbers, on CDS, tRNA, rRNA, ncRNA, and tmRNA features.
#
# -create_protein_id
#
# This takes a general database identifier as an additional command-line
# argument and creates a protein_id qualifier of the form gnl|DBX|PFX_0001,
# using the existing locus_tag qualifier on a CDS feature.
#
# The regular expressions used to detect these patterns can easily be
# changed to handle different variants.
# Script to edit contents of 5-column feature table files.
# Uses STDIN and STDOUT so multiple instances can be piped.
# define global variables
my $line_number = 0;
my $locus_tag_prefix = "";
my $protein_id_prefix = "";
my $locus_tag_count = 0;
my $flag = shift or die "Must indicate desired command (e.g., -revcomp_codon_recognized)\n";
if ($flag =~ /^-create_locus_tag$/) {
$locus_tag_prefix = shift or die "Must supply locus_tag prefix\n";
}
if ($flag =~ /^-create_protein_id$/) {
$protein_id_prefix = shift or die "Must supply protein_id prefix\n";
}
# state variables for tracking current position in table file
my $thisline;
my $in_key;
my $in_qual;
my $current_key;
my $current_loc;
my $current_qual;
my $current_val;
my @master_loc;
my @tokens;
sub clearflags {
$thisline = "";
$in_key = 0;
$in_qual = 0;
$current_key = "";
$current_loc = "";
$current_qual = "";
$current_val = "";
@master_loc = ();
}
# subroutine for printing key and location
sub printkey {
my $thiskey = shift (@_);
my $is_first = 1;
foreach my $thisloc (@master_loc) {
if ($is_first == 1) {
print STDOUT "$thisloc\t$thiskey\n";
} else {
print STDOUT "$thisloc\n";
}
$is_first = 0;
}
}
# subroutine for filtering by feature key
sub printfeat {
my $thiskey = shift (@_);
if ($flag =~ /^-create_locus_tag$/ and $locus_tag_prefix ne "-") {
if ($thiskey eq "CDS" ||
$thiskey eq "rRNA" || $thiskey eq "tRNA" ||
$thiskey eq "ncRNA" || $thiskey eq "tmRNA") {
printkey ("gene");
my $locus_tag_string = "0000";
$locus_tag_count++;
$locus_tag_string = "0000";
if ($locus_tag_count > 999) {
$locus_tag_string = $locus_tag_prefix . "_" . $locus_tag_count;
} elsif ($locus_tag_count > 99) {
$locus_tag_string = $locus_tag_prefix . "_0" . $locus_tag_count;
} elsif ($locus_tag_count > 9) {
$locus_tag_string = $locus_tag_prefix . "_00" . $locus_tag_count;
} else {
$locus_tag_string = $locus_tag_prefix . "_000" . $locus_tag_count;
}
print STDOUT "\t\t\tlocus_tag\t$locus_tag_string\n";
printkey ($thiskey);
print STDOUT "\t\t\tlocus_tag\t$locus_tag_string\n";
return;
}
}
printkey ($thiskey);
}
# subroutine for filtering by qualifier and value
sub printqual {
my $thisqual = shift (@_);
my $thisval = shift (@_);
my $thiskey = shift (@_);
if ($flag =~ /^-product_to_codon_recognized$/) {
if ($thiskey eq "tRNA" && $thisqual eq "product") {
if ($thisval =~ /^tRNA-([A-Za-z]{3})$/) {
print STDOUT "\t\t\tproduct\ttRNA-$1\n";
return;
} elsif ($thisval =~ /^tRNA-([A-Za-z]+)-([A-Za-z]{3})$/ || $thisval =~ /^([A-Za-z]+)-([A-Za-z]{3})$/) {
print STDOUT "\t\t\tproduct\ttRNA-$1\n";
print STDOUT "\t\t\tcodon_recognized\t$2\n";
return;
} elsif ($thisval =~ /^tRNA-Undet-\?\?\?$/) {
print STDOUT "\t\t\tproduct\ttRNA-OTHER\n";
return;
}
}
}
if ($flag =~ /^-revcomp_codon_recognized$/) {
if ($thiskey eq "tRNA" && $thisqual eq "codon_recognized") {
if ($thisval =~ /^([A-Za-z]{3})$/) {
my $revcomp = reverse $1;
$revcomp =~ tr/ACGTacgt/TGCAtgca/;
print STDOUT "\t\t\tcodon_recognized\t$revcomp\n";
return;
}
}
}
if ($flag =~ /^-note_to_codon_recognized$/) {
if ($thiskey eq "tRNA" && $thisqual eq "note") {
if ($thisval =~ /^codon recognized: ([A-Za-z]{3})$/) {
print STDOUT "\t\t\tcodon_recognized\t$1\n";
return;
}
}
}
if ($flag =~ /^-split_compound_product$/) {
if ($thiskey eq "CDS" && $thisqual eq "product") {
$thisval =~ s/\/ / \/ /g;
$thisval =~ s/ \// \/ /g;
$thisval =~ s/; / \/ /g;
$thisval =~ s/ ;/ \/ /g;
if ($thisval =~ /^(.+)\s+\/\s+(.+)$/) {
@tokens = split (/ \/ /, $thisval);
foreach my $token (@tokens) {
$token =~ s/^\s+//;
$token =~ s/\s+$//;
print STDOUT "\t\t\tproduct\t$token\n";
}
return;
}
}
}
if ($flag =~ /^-product_to_ec_number$/) {
if ($thiskey eq "CDS" && $thisqual eq "product") {
while ($thisval =~ /^(.+)\s*\(\s*EC[\s\:]*([1234567890n\-\.]+)\s*\,\s*EC[\s\:]*([1234567890n\-\.]+)\s*\)(.+)$/) {
$thisval = "$1 (EC $2) (EC $3) $4";
}
if ($thisval =~ /^(.+)\s*\(\s*EC[\s\:]*([1234567890n\-\.]+)\s*\,\s*EC[\s\:]*([1234567890n\-\.]+)\s*\)$/) {
$thisval = "$1 (EC $2) (EC $3)";
}
while ($thisval =~ /^(.+)\s*\(\s*EC[\s\:]*([1234567890n\-\.]+)\s*\)(.+)$/) {
print STDOUT "\t\t\tEC_number\t$2\n";
$thisval = "$1 $3";
$thisval =~ s/\s+\,/,/g;
$thisval =~ s/\s+/ /g;
}
if ($thisval =~ /^(.+)\s*\(\s*EC[\s\:]*([1234567890n\-\.]+)\s*\)$/) {
print STDOUT "\t\t\tproduct\t$1\n";
print STDOUT "\t\t\tEC_number\t$2\n";
return;
}
}
}
if ($flag =~ /^-product_to_tc_number$/) {
if ($thiskey eq "CDS" && $thisqual eq "product") {
if ($thisval =~ /^(.+)\s*\(TC\s*(.+)\)$/) {
print STDOUT "\t\t\tproduct\t$1\n";
print STDOUT "\t\t\tnote\tTCDB $2\n";
return;
}
}
}
if ($flag =~ /^-clean_rrna_product$/) {
if ($thiskey eq "rRNA" && $thisqual eq "product") {
if ($thisval =~ /^([^;]+)\s*;\s*(.+)$/) {
print STDOUT "\t\t\tproduct\t$1\n";
print STDOUT "\t\t\tnote\t$2\n";
return;
}
}
}
if ($flag =~ /^-remove_ec_number$/) {
if ($thisqual eq "EC_number") {
return;
}
}
if ($flag =~ /^-remove_predicted_evidence$/) {
if ($thisqual eq "evidence" && $thisval eq "predicted") {
return;
}
}
if ($flag =~ /^-create_protein_id$/ and $protein_id_prefix ne "-") {
if ($thiskey eq "CDS" && $thisqual eq "locus_tag") {
print STDOUT "\t\t\tprotein_id\tgnl|$protein_id_prefix|$thisval\n";
print STDOUT "\t\t\t$thisqual\t$thisval\n";
return;
}
}
print STDOUT "\t\t\t$thisqual\t$thisval\n";
}
#subroutine to print next feature key / location / qualifier line
sub flushline {
if ($in_key == 1) {
# call filter subroutine for feature keys
printfeat ($current_key);
} elsif ($in_qual == 1) {
if ($current_val eq "") {
print STDOUT "\t\t\t$current_qual\n";
} else {
# call filter subroutine for qualifiers
printqual ($current_qual, $current_val, $current_key);
}
}
}
# initialize flags at start of program
clearflags ();
# main loop reads one line at a time
while ($thisline = <STDIN>) {
$thisline =~ s/\r//;
$thisline =~ s/\n//;
$line_number++;
if ($thisline =~ /^>Feature.+$/) {
# new table
flushline ();
$in_key = 0;
$in_qual = 0;
$current_key = "";
$current_loc = "";
$current_qual = "";
$current_val = "";
@master_loc = ();
print STDOUT "$thisline\n";
} elsif ($thisline =~ /^(<?\d+)\t(>?\d+)\t(\S+)$/) {
# new feature
flushline ();
$in_key = 1;
$in_qual = 0;
$current_key = $3;
$current_loc = "$1\t$2";
@master_loc = ();
push (@master_loc, $current_loc);
} elsif ($thisline =~ /^(<?\d+)\t(>?\d+)$/) {
# multiple intervals
$in_key = 1;
$in_qual = 0;
$current_loc = "$1\t$2";
push (@master_loc, $current_loc);
} elsif ($thisline =~ /^\t\t\t(\S+)\s*$/) {
# singleton qualifier
flushline ();
$in_key = 0;
$in_qual = 1;
$current_qual = $1;
$current_val = "";
} elsif ($thisline =~ /^\t\t\t(\S+)\t(.+)$/) {
flushline ();
# qualifier
$in_key = 0;
$in_qual = 1;
$current_qual = $1;
$current_val = $2;
}
}
flushline ();
# close input and output files
close (STDIN);
close (STDOUT);
|