File: extractTaxonomy.pl

package info (click to toggle)
radiant 2.7+dfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 1,048 kB
  • sloc: perl: 5,393; sh: 323; makefile: 35
file content (95 lines) | stat: -rwxr-xr-x 1,463 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
#!/usr/bin/env perl

# Copyright © 2011, Battelle National Biodefense Institute (BNBI);
# all rights reserved. Authored by: Brian Ondov, Nicholas Bergman, and
# Adam Phillippy
#
# See the LICENSE.txt file included with this software for license information.


use strict;


# The single command-line argument is the directory that contains names.dmp
# and nodes.dmp; taxonomy.tab is written into that same directory.
my ($taxonomy) = @ARGV;

# Fail early with a usage hint instead of trying to open "/names.dmp".
defined $taxonomy && length $taxonomy
	or die "Usage: $0 <taxonomy directory>\n";

# tax ID => { name => ..., parent => ..., rank => ... }
my %data;

# load scientific names for each tax ID

my $namesFile = "$taxonomy/names.dmp";

# Three-argument open keeps the mode separate from the user-supplied path,
# and the lexical handle cannot clash with other handles in the file.
open my $names, '<', $namesFile
	or die "Couldn't open $namesFile: $!";

while ( my $line = <$names> )
{
	# Fields are separated by <tab>|<tab>; the third field is not needed.
	my ($id, $name, undef, $class) = split /\t\|\t/, $line;
	
	# Only the "scientific name" entry of each tax ID is kept. The hash
	# entry for $id is created automatically by the assignment.
	if ( defined $class && $class =~ /scientific name/ )
	{
		$data{$id}->{'name'} = $name;
	}
}

close $names;

# load parents and ranks for each tax ID

my $nodesFile = "$taxonomy/nodes.dmp";

# Three-argument open keeps the mode separate from the user-supplied path.
open my $nodes, '<', $nodesFile
	or die "Couldn't open $nodesFile: $!";

while ( my $line = <$nodes> )
{
	# The first three <tab>|<tab>-separated fields are taken as tax ID,
	# parent tax ID and rank. Matching in list context yields the captures
	# directly (or an empty list on failure), so a malformed line can never
	# pick up stale $1..$3 values left over from the previous line.
	my ($id, $parent, $rank) = $line =~ /(\d+)\t\|\t(\d+)\t\|\t([^\t]+)/;
	
	next unless defined $id;
	
	# The hash entry for $id is created automatically if the names file
	# did not already provide one.
	$data{$id}->{'parent'} = $parent;
	$data{$id}->{'rank'} = $rank;
}

close $nodes;

# write one tab-separated row per tax ID, in ascending numeric order:
# ID, depth, parent ID, rank, name

my $outFile = "$taxonomy/taxonomy.tab";

# Three-argument open keeps the mode separate from the user-supplied path.
open my $out, '>', $outFile
	or die "Couldn't write to $outFile: $!";

foreach my $id ( sort {$a <=> $b} keys %data )
{
	print {$out} join "\t",
	(
		$id,
		depth($id),
		getParent($id),
		$data{$id}->{'rank'},
		$data{$id}->{'name'}
	);
	print {$out} "\n";
}

# Output is buffered, so a failed write (e.g. a full disk) may only be
# reported when the handle is closed.
close $out
	or die "Couldn't finish writing $outFile: $!";


# Returns the number of parent links between tax ID $id and the root
# (tax ID 1); the root itself has depth 0.
#
# Dies if the chain of parents reaches a tax ID with no recorded parent, or
# loops back on itself without reaching the root. A plain recursive walk
# would never return for such input.
sub depth
{
	my ($id) = @_;
	
	my $start = $id;
	my $steps = 0;
	
	# Trails $id at half speed; the two can only meet again if the parent
	# links form a cycle.
	my $slow = $id;
	
	while ( defined $id && $id != 1 )
	{
		$id = getParent($id);
		$steps++;
		
		if ( defined $id && $steps % 2 == 0 )
		{
			$slow = getParent($slow);
			
			if ( $slow == $id )
			{
				die "Parent links of tax ID '$start' form a cycle";
			}
		}
	}
	
	defined $id
		or die "Tax ID '$start' has no path to the root (missing parent)";
	
	return $steps;
}

# Returns the 'parent' value stored in %data for the given tax ID
# (undefined when none has been recorded).
sub getParent
{
	my $taxId = shift;
	
	return $data{$taxId}{'parent'};
}