File: centrifuge-promote

package info (click to toggle)
centrifuge 1.0.3-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye, buster, sid
  • size: 11,864 kB
  • sloc: cpp: 51,936; perl: 1,919; python: 1,538; makefile: 618; sh: 352
file content (122 lines) | stat: -rwxr-xr-x 2,471 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#!/usr/bin/env perl

use strict ;
use warnings ;

use File::Basename;
use Cwd;
use Cwd 'cwd' ;
use Cwd 'abs_path' ;


die "Usage: centrifuge-promote.pl centrifuge_index centrifuge_output level > output\n\n".
	"Promote the taxonomy id to specified level in Centrifuge output.\n" if ( @ARGV == 0 ) ;

my $CWD = dirname( abs_path( $0 ) ) ;
# Go through the index to obtain the taxonomy tree
my %taxParent ; 
my %taxIdToSeqId ;
my %taxLevel ;

my $centrifuge_index = $ARGV[0] ;
open FP1, "-|", "$CWD/centrifuge-inspect --taxonomy-tree $centrifuge_index" or die "can't open $!\n" ;
while ( <FP1> )
{
	chomp ;
	my @cols = split /\t\|\t/;
	$taxParent{ $cols[0] } = $cols[1] ;
	$taxLevel{ $cols[0] } = $cols[2] ;
}
close FP1 ;
open FP1, "-|", "$CWD/centrifuge-inspect --conversion-table $centrifuge_index" or die "can't open $!\n" ;
while ( <FP1> )
{
	chomp ;
	my @cols = split /\t/ ;
	$taxIdToSeqId{ $cols[1] } = $cols[0] ;
}
close FP1 ;

# Go through the output of centrifuge
my $level = $ARGV[2] ;
sub PromoteTaxId
{
	my $tid = $_[0] ;
	return 0 if ( $tid <= 0 || !defined( $taxLevel{ $tid } ) ) ;

	if ( $taxLevel{ $tid } eq $level )
	{
		return $tid ;
	}
	else
	{
		return 0 if ( $tid <= 1 ) ;
		return PromoteTaxId( $taxParent{ $tid } ) ;
	}
}

sub OutputPromotedLines
{
	my @lines = @{ $_[0] } ;
	return if ( scalar( @lines ) <= 0 ) ;

	my @newLines ;
	my $i ;
	my $numMatches = 0 ;
	my %showedUpTaxId ;
	my $tab = sprintf( "\t" ) ;
	for ( $i = 0 ; $i < scalar( @lines ) ; ++$i )
	{
		my @cols = split /\t+/, $lines[ $i ] ;
		my $newTid = PromoteTaxId( $cols[2] ) ;
		if ( $newTid <= 1 )
		{
			$newTid = $cols[2] ;
		}
		my $newLevel = $cols[1] ;
		$newLevel = $taxLevel{ $newTid } if ( $newTid >= 1 && defined $taxLevel{ $newTid } ) ;
			
		next if ( defined $showedUpTaxId{ $newTid } ) ;
	
		$showedUpTaxId{ $newTid } = 1 ;	
		++$numMatches ;

		$cols[2] = $newTid ;
		$cols[1] = $newLevel ;
		push @newLines, join( $tab, @cols ) ;
	}

	for ( $i = 0 ; $i < scalar( @newLines ) ; ++$i )
	{
		my @cols = split /\t+/, $newLines[$i] ;
		$cols[-1] = $numMatches ;
		print join( $tab, @cols ), "\n" ;
	}
}

open FP1, $ARGV[1] ;
my $header = <FP1> ;
my $prevReadId = "" ;
my @lines ;

print $header ;
while ( <FP1> )
{
	chomp ;
	my @cols = split /\t/ ;
	if ( $cols[0] eq $prevReadId )
	{
		push @lines, $_ ;
	}
	else
	{
		$prevReadId = $cols[0] ;
				
		OutputPromotedLines( \@lines ) ;

		undef @lines ;
		push @lines, $_ ;
	}
}
OutputPromotedLines( \@lines )  ;
close FP1 ;