File: scan_fasta_file.pl

package info (click to toggle)
kraken 1.1.1-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, forky, sid, trixie
  • size: 4,636 kB
  • sloc: cpp: 1,924; perl: 1,563; sh: 470; makefile: 51
file content (52 lines) | stat: -rwxr-xr-x 1,665 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
#!/usr/bin/env perl

# Copyright 2013-2019, Derrick Wood, Jennifer Lu <jlu26@jhmi.edu>
#
# This file is part of the Kraken taxonomic sequence classification system.

# Reads multi-FASTA input and examines each sequence header.  Headers are
# OK if a taxonomy ID is found (as either the entire sequence ID or as part
# of a # "kraken:taxid" token), or if something looking like an accession
# number is found.  Not "OK" headers will are fatal errors unless "--lenient"
# is used.
#
# Each sequence header results in a line with three tab-separated values;
# the first indicating whether third column is the taxonomy ID ("TAXID") or
# an accession number ("ACCNUM") for the sequence ID listed in the second
# column.

use strict;
use warnings;
use File::Basename;
use Getopt::Long;
require "$ENV{KRAKEN_DIR}/krakenlib.pm";

my $PROG = basename $0;

my $lenient = 0;
GetOptions("lenient" => \$lenient)
  or die "Usage: $PROG [--lenient] <fasta filename(s)>\n";

while (<>) {
  next unless /^>/;
  # while (/.../g) needed because non-redundant DBs sometimes have multiple
  #   sequence IDs in the header; extra sequence IDs are prefixed by
  #   '\x01' characters (if downloaded in FASTA format from NCBI FTP directly).
  s/^>gi\|/>/;
  s/\| .*//;
  while (/(?:^>|\x01)(\S+)/g) {
    my $seqid = $1;
    my $taxid = krakenlib::check_seqid($seqid);
    if (! defined $taxid) {
      next if $lenient;
      die "$PROG: unable to determine taxonomy ID for sequence $seqid\n";
    }
    # "$taxid" may actually be an accession number
    if ($taxid =~ /^\d+$/) {
      print "TAXID\t$seqid\t$taxid\n";
    }
    else {
      print "ACCNUM\t$seqid\t$taxid\n";
    }
  }
}