1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52
|
#!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood, Jennifer Lu <jlu26@jhmi.edu>
#
# This file is part of the Kraken taxonomic sequence classification system.
# Reads multi-FASTA input and examines each sequence header. Headers are
# OK if a taxonomy ID is found (as either the entire sequence ID or as part
# of a # "kraken:taxid" token), or if something looking like an accession
# number is found. Not "OK" headers will are fatal errors unless "--lenient"
# is used.
#
# Each sequence header results in a line with three tab-separated values;
# the first indicating whether third column is the taxonomy ID ("TAXID") or
# an accession number ("ACCNUM") for the sequence ID listed in the second
# column.
use strict;
use warnings;
use File::Basename;
use Getopt::Long;
require "$ENV{KRAKEN_DIR}/krakenlib.pm";
my $PROG = basename $0;
my $lenient = 0;
GetOptions("lenient" => \$lenient)
or die "Usage: $PROG [--lenient] <fasta filename(s)>\n";
while (<>) {
next unless /^>/;
# while (/.../g) needed because non-redundant DBs sometimes have multiple
# sequence IDs in the header; extra sequence IDs are prefixed by
# '\x01' characters (if downloaded in FASTA format from NCBI FTP directly).
s/^>gi\|/>/;
s/\| .*//;
while (/(?:^>|\x01)(\S+)/g) {
my $seqid = $1;
my $taxid = krakenlib::check_seqid($seqid);
if (! defined $taxid) {
next if $lenient;
die "$PROG: unable to determine taxonomy ID for sequence $seqid\n";
}
# "$taxid" may actually be an accession number
if ($taxid =~ /^\d+$/) {
print "TAXID\t$seqid\t$taxid\n";
}
else {
print "ACCNUM\t$seqid\t$taxid\n";
}
}
}
|