1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65
|
#!/usr/bin/env perl
# Copyright 2013-2019, Derrick Wood, Jennifer Lu <jlu26@jhmi.edu>
#
# This file is part of the Kraken taxonomic sequence classification system.
# Looks up accession numbers and reports associated taxonomy IDs
#
# Input is (a) 1 2-column TSV file w/ sequence IDs and accession numbers,
# and (b) a list of accession2taxid files from NCBI.
# Output is tab-delimited lines, with sequence IDs in first
# column and taxonomy IDs in second.
use strict;
use warnings;
use File::Basename;
use Getopt::Std;
my $PROG = basename $0;
my $lookup_list_file = shift @ARGV;
my %target_lists;
open FILE, "<", $lookup_list_file
or die "$PROG: can't open $lookup_list_file: $!\n";
while (<FILE>) {
chomp;
my ($seqid, $accnum) = split /\t/;
$target_lists{$accnum} ||= [];
push @{ $target_lists{$accnum} }, $seqid;
}
close FILE;
my $initial_target_count = scalar keys %target_lists;
my @accession_map_files = @ARGV;
for my $file (@accession_map_files) {
open FILE, "<", $file
or die "$PROG: can't open $file: $!\n";
scalar(<FILE>); # discard header line
while (<FILE>) {
chomp;
my ($accession, $with_version, $taxid, $gi) = split /\t/;
if ($target_lists{$accession}) {
my @list = @{ $target_lists{$accession} };
delete $target_lists{$accession};
for my $seqid (@list) {
print "$seqid\t$taxid\n";
}
last if ! %target_lists;
}
}
close FILE;
last if ! %target_lists;
}
if (%target_lists) {
warn "$PROG: @{[scalar keys %target_lists]}/$initial_target_count accession numbers remain unmapped, see unmapped.txt in DB directory\n";
open LIST, ">", "unmapped.txt"
or die "$PROG: can't write unmapped.txt: $!\n";
for my $target (keys %target_lists) {
print LIST "$target\n";
}
close LIST;
}
|