File: CS_add_taxonomy.pl

package info (click to toggle)
microbiomeutil 20101212%2Bdfsg1-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 49,276 kB
  • sloc: perl: 4,878; ansic: 419; makefile: 98; sh: 27
file content (129 lines) | stat: -rwxr-xr-x 2,805 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env perl

use strict;
use warnings;

use FindBin;
use lib ("$FindBin::Bin/../PerlLib");

use Fasta_reader;



my %acc_to_taxonomy;

{
	my $db_FASTA = "/usr/share/microbiomeutil-data/RESOURCES/rRNA16S.gold.fasta";
	if ( ! -e $db_FASTA ) {
		if ( -e "../../RESOURCES/rRNA16S.gold.fasta" ) {
			$db_FASTA = "../../RESOURCES/rRNA16S.gold.fasta" ;
		} else {
			die("File $db_FASTA does not exist.");
		}
	}
	my $fasta_reader = new Fasta_reader($db_FASTA);
	while (my $seq_obj = $fasta_reader->next()) {
		my $acc = $seq_obj->get_accession;
		
		my $header = $seq_obj->get_header();
		
		my @header_comp = split(/\t/, $header);                                                                                                                       
		
		my $taxonomy = pop @header_comp;                                                                                                                              
		$header =~ /^\S+\s+([^\t]+)/;                                                                                                                                
		my $species = $1 or die "Error, cannot parse species from header: $header ";                                                                                  
		
		my @taxons = split (/;\s+/, $taxonomy);           
		
		push (@taxons, $species);
		
		$acc_to_taxonomy{$acc} = \@taxons;
	}
}



while (<STDIN>) {
	
	unless (/^ChimeraSlayer/) { 
		print;
		next; 
	}
	chomp;
	my $line = $_;
	my @x = split(/\t/);
	
	my $accA = $x[2];
	my $accB = $x[3];
	
	unless ($x[10] eq 'YES') { next; } # not a chimera, not interested.
	
	if ($accA =~ /\|(\w+)/) {
		$accA = $1;
	}

	if ($accB =~ /\|(\w+)/) {
		$accB = $1;
	}
	

	eval {

		my @taxonsA = &get_taxonomy($accA);
		my @taxonsB = &get_taxonomy($accB);
		
		my ($domainA, $phylumA, $classA, $orderA, $familyA, $genusA, $speciesA) = @taxonsA;
		my ($domainB, $phylumB, $classB, $orderB, $familyB, $genusB, $speciesB)  = @taxonsB;
		
		
		my $level_same = "";
		
		if ($speciesA eq $speciesB) {
			$level_same = "SPECIES";
		}
		elsif ($genusA eq $genusB) {
			$level_same = "GENUS";
		}
		elsif ($familyA eq $familyB) {
			$level_same = "FAMILY";
		}
		elsif ($orderA eq $orderB) {
			$level_same = "ORDER";
		}
		elsif ($classA eq $classB) {
			$level_same = "CLASS";
		}
		elsif ($phylumA eq $phylumB) {
			$level_same = "PHYLUM";
		}
		elsif ($domainA eq $domainB) {
			$level_same = "DOMAIN";
		}
		else {
			$level_same = "UNKNOWN";
		}
		
		print "$line\t$genusA\t$speciesA\t$genusB\t$speciesB\tINTRA-$level_same\n";
	};

	if ($@) {
		print STDERR "Error, cannot find taxonomy info for $accA or $accB\n";
	}


}

exit(0);

####
sub get_taxonomy {
	my $acc = shift;
		
	my @taxons = @{$acc_to_taxonomy{$acc}} or die "Error, no taxonomy for $acc";
	
	return(@taxons);
	
}