File: CPS_to_RENAST.pl

package info (click to toggle)
microbiomeutil 20101212%2Bdfsg1-4
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 49,276 kB
  • sloc: perl: 4,878; ansic: 419; makefile: 98; sh: 27
file content (124 lines) | stat: -rwxr-xr-x 2,624 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env perl

use strict;
use warnings;

use Getopt::Long qw(:config no_ignore_case bundling);
use FindBin;

use lib ("$FindBin::Bin/../PerlLib");
use Fasta_reader;
use CdbTools;


my $usage = <<_EOUSAGE_;

##########################################################################################
$0
##########################################################################################
#
#  Required:
#
#    --query_NAST      multi-fasta file containing query sequences in alignment format
#
#    --db_NAST        db in NAST format
#
#    --CPS_output      ChimeraParentSelector output (file.CPS)
#
#########################################################################################


_EOUSAGE_
	
	;


## required options
my ($query_NAST, $db_NAST, $CPS_file);


&GetOptions (
	
	"query_NAST=s" => \$query_NAST,
	"db_NAST=s" => \$db_NAST,
	"CPS_output=s" => \$CPS_file,
	);

unless ($query_NAST && $db_NAST && $CPS_file) { die $usage; }

my $TMP = $ENV{TMPDIR} || "/tmp";

main: {
	
	open (my $fh, $CPS_file) or die "Error, cannot open file $CPS_file";
	while (<$fh>) {
		print STDERR "processing: $_";
		chomp;
		my @toks = split(/\t/);
		
		my $query_acc = $toks[1];
		
		if ($toks[2] ne 'YES') {
			## use original sequence alignment
			my $query_nast = &cdbyank($query_acc, $query_NAST);
			print $query_nast;
			next;
		}
		
		## RENAST using CM-chosen alignments:
		
		my $query_seq = &cdbyank_linear($query_acc, $query_NAST);
		$query_seq =~ s/[\.\-]//g; # remove any gap characters.
		
		my $query_file = "$TMP/tmp.$$.query";
		my $db_file = "$TMP/tmp.$$.db";
		open (my $ofh, ">$query_file") or die "Error, cannot write to $query_file ";
		print $ofh ">$query_acc\n$query_seq\n";
		close $ofh;
		
		my @accs_for_renast;
		
		while ($toks[3] =~ /\((\S+), NAST:(\d+)-(\d+), ECO:\d+-\d+, RawLen:(\d+), G:([\d\.]+), L:([\d\.]+), ([\d
\.]+)\)/g) {
			
			my $acc = $1;
			
			my $range_lend = $2;
			my $range_rend = $3;
			
			my $segmentLength = $4;
			
			my $global_per_id = $5;
			my $local_per_id = $6;
			my $divR = $7;
			
			push (@accs_for_renast, $acc);
		}
		
		## RENAST it!
				
		open ($ofh, ">$query_file.dbrenast.tmp") or die $!;
		foreach my $acc (@accs_for_renast) {
			my $fasta_seq = &cdbyank($acc, $db_NAST);
			print $ofh $fasta_seq . "\n";
		}
		close $ofh;

		## RENAST it
		my $cmd = "$FindBin::Bin/../../NAST-iEr/NAST-iEr $query_file.dbrenast.tmp $query_file";
		my $renast = `$cmd`;
		my $ret = $?;
		print $renast;
		
		if ($ret) { 
			die "Error, cmd $cmd died with ret $ret";
		}

		unlink("$query_file", "$query_file.dbrenast.tmp"); # awful names
		
	}

	
	exit(0);
}