File: join_aug_pred.pl

package info (click to toggle)
augustus 3.4.0%2Bdfsg2-2
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 758,480 kB
  • sloc: cpp: 65,451; perl: 21,436; python: 3,927; ansic: 1,240; makefile: 1,032; sh: 189; javascript: 32
file content (278 lines) | stat: -rwxr-xr-x 9,060 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
#!/usr/bin/perl
#
# join_aug_pred.pl
# Join AUGUSTUS predictions from separate runs of possibly overlapping sequence segments.
#
# This script is used by the braker.pl pipeline.
# Please be extremely careful when changing this script because the braker.pl
# pipeline may fail upon custom modification of this script.
# In case of doubt, contact katharina.hoff@uni-greifswald.de
#
# Mario Stanke, 25.10.2006, last modification by Katharina J. Hoff on Feb 21st 2018
#
use strict;
use Getopt::Long;

my $usage = "$0 -- join AUGUSTUS predictions from separate runs of possibly overlapping sequence segments.\n\n";
$usage .= "Usage: $0 < augustus.concat > augustus.joined\n\n";
$usage .= "Reads from standard input a file with AUGUSTUS outputs. This file will typically be the concatenation\n";
$usage .= "of the outputs of several simultaneous runs of AUGUSTUS of different sequence segments on a computer cluster.\n";
$usage .= "The sequence segments can be overlapping, in which case this script intelligently omits one version of\n";
$usage .= "overlapping genes from different runs. The outputs from the runs must have been concatenated IN ORDER.\n";
$usage .= "The lines '# This output was generated with AUGUSTUS' must still be present in the input file so the\n";
$usage .= "boundaries between runs can be determined.\n";
$usage .= "The output file contains a nonredundant set of renumbered genes.\n";
$usage .= "Options:\n";
$usage .= "   droplist=s   file with list of gene identifiers that should not be included in output.\n";

my $geneNumber = 0;
my $droplist;
my %drop = ();

GetOptions('droplist:s'=>\$droplist);

if ($#ARGV != -1) {
    die "Unknown option\n\n$usage";
}

if (defined($droplist)){
    open (DROP, "<$droplist") or die ("Could not open file $droplist.");
    while (<DROP>){
	chomp;
	s/\.t\d+//;
	$drop{$_}=1;
    }
    print STDERR "Excluding " . scalar(keys %drop) . " gene ids from joining.\n";
}


# Run: the results for one segment
# I use a hash with two elements:
# header: string with the head part preceeding the first gene
# genes: reference to an array of Genes
# seqname: sequence name

# Gene:
# Hash with the following elements
# begin: begin position of the transcribed part
# end: end position of the transcribed part
# gff: string with the complete AUGUSTUS output about this gene

my $printAllHeaders = 0; # if 0, only the first header is printed
my $lastRun=0;
my $currentRun=0;
my $line;
my $runSeparator = "\# This output was generated with AUGUSTUS";
my $geneid = 1;
my $headerPrinted = 0;
my $gff3flag = 0;

# start main loop that gets run by run
do {
    if (($currentRun = getNextRun())) {
	if ($lastRun) {
	    cleanRedundant($lastRun, $currentRun);
	    if ($gff3flag && $geneid == 1) {
		print "##gff-version 3\n";
	    }
	    printRun($lastRun);
	}
	$lastRun = $currentRun;
    }
} until ($currentRun == 0);

if ($lastRun) {
    printRun($lastRun);
}

sub getNextRun {
    my %run;
    my $numSeps = 0;
    my $geneNr = 0;
    my $inGene = 0;
    my $endHeader = 0;
    do {
	if (defined $line) {
	    if ($line =~ /\#\#gff-version 3/){
		$gff3flag = 1;
	    }
	    if ($line =~ /^$runSeparator/) {
		$numSeps++;
	    }
	    if ($numSeps == 1) {
		if ($line =~ /^\#\#\# gene g/ || $line =~ /^\# start gene g/ ) { # the first option for back compatibility with a previous AUGUSTUS version
		    if ($line =~ /gene (g\d+)/ && $drop{$1}){
			print STDERR "dropping $1\n";
		    }  else {
			$geneNr++;
			$inGene=1;
		    }
		}
		if ($geneNr == 0) {
		    if ( $line=~ /prediction on sequence number/ || 
			 $line =~ /Looks like .* is in .* format/) { # this excludes the hints and hint messages for the first run from the header
			$endHeader = 1;
		    }
		    if (!$endHeader){
			$run{"header"}.= $line;
		    }
		}
		if ($geneNr > 0 && $inGene) {
		    if ($line =~ /^(\S+)\t.*\tgene\t(\d+)\t(\d+)\t/) {
			my $seqname = $1;
			my $begin = $2;
			my $end = $3;
			if (!defined $run{"seqname"}){
			    $run{"seqname"} = $seqname;
			}
			$run{"genes"}->[$geneNr-1]->{"begin"} = $begin;
			$run{"genes"}->[$geneNr-1]->{"end"} = $end;
		    }
		    $run{"genes"}->[$geneNr-1]->{"gff"} .= $line;
		}
		if ($line =~ /^\#\#\# end gene g/ || $line =~ /^\# end gene g/) { # the first option for back compatibility with a previous AUGUSTUS version
		    $inGene = 0;
		}
	    }
	}
	if ($numSeps<2) {
	    $line = <>;
	}
    } until (!(defined $line) || $numSeps >1);
    if (defined $line || $geneNr>0){
	return \%run;
    } else {
	return 0;
    }
};


sub printRun {
    my $run = shift @_;
    if (!$headerPrinted || $printAllHeaders) {
#	print "================ header ==================\n";
	print $run->{"header"};
	$headerPrinted = 1;
    }
#    print "================ genes ==================\n";
    if (defined $run->{"genes"}) {
	foreach my $gene (@{$run->{"genes"}}) {
#	    print "============= gene ======= begin = " .$gene->{"begin"} . " end = " . $gene->{"end"} . " =======\n";
	    foreach my $gffline (split /\n/, $gene->{"gff"}){
		$gffline =~ s/\bg(\d+)\b/g$geneid/g;
		print "$gffline\n";
	    }
	    $geneid++;
	}
    }
}

#
# Clean redundant genes of two possibly overlapping neighboring runs.
# Run1 starts left of run2.
#
sub cleanRedundant {
    my $run1 = shift @_;
    my $run2 = shift @_;
    if (!(defined $run1->{"seqname"}) || !(defined $run2->{"seqname"}) || $run1->{"seqname"} ne $run2->{"seqname"}){
	return;
    }
    if (!(defined $run1->{"genes"}) || !(defined $run2->{"genes"}) || @{$run1->{"genes"}} == 0 || @{$run2->{"genes"}} == 0) {
	if (!(defined $run1->{"genes"})){
	    return;
	}
	if (!(defined $run2->{"genes"})){
	    return;
	}
	return;
    }
 #   print "Anzahl Gene in Run1: " . (scalar @{$run1->{"genes"}}) . "\n";
 #   print "Anzahl Gene in Run2: " . (scalar @{$run2->{"genes"}}) . "\n";
    my $firstgene2 = $run2->{"genes"}->[0];
    my $firstbegin2 = $firstgene2->{"begin"};
    my $lastend1 = -1;
    my $lastgene1;
    foreach my $gene1 (@{$run1->{"genes"}}) {
	if ($gene1->{"end"} > $lastend1) {
	    $lastend1 = $gene1->{"end"};
	    $lastgene1 = $gene1;
	}
    } 
    my $lastend2 = -1;
    foreach my $gene2 (@{$run2->{"genes"}}) {
	if ($lastend2 == -1 || $gene2->{"end"} > $lastend2) {
	    $lastend2 = $gene2->{"end"};
	}
    }
    if ($lastend2 < $run1->{"genes"}->[0]->{"begin"}){
	print STDERR "Prediction runs are not in the right order in sequence "  . $run1->{"seqname"} . ". Please sort along the chromosome first.\n";
    }

 #   print "rightmost end  of gene in run1 : " . $lastend1 . "\n";
 #   print "leftmost begin of gene in run2 : " . $firstbegin2 . "\n";
    if ($lastend1 < $firstbegin2) {
	return; # no overlap between any genes
    }
    # the two runs really have overlapping genes
    # find a good breakposition and then remove all genes 
    # in run1 ending to the right of the breakposition and all genes
    # in run2 beginning at or to the left of breakposition.
    #                                                     lastgene1
    # run1  ------              -------       ------      ---------
    #          -------                                      ---
    # run2                               -----------      ------------------------        -----------         -----
    #                                    firstgene2                                                        ---------------
    #                                    |-d2-|                   |-     d1     -|
    #                                                    |breakpoint
    my $breakpoint;
    my $d1=-1; 
    my $d2=-1;
    
    foreach my $gene2 (@{$run2->{"genes"}}) {
	if ($gene2->{"begin"} <= $lastend1 && $gene2->{"end"} - $lastend1 > $d1) {
	    $d1 = $gene2->{"end"} - $lastend1;
	}
    }
#    print "d1=$d1\n";
    foreach my $gene1 (@{$run1->{"genes"}}) {
	if ($gene1->{"end"} >= $firstbegin2 && $firstbegin2 - $gene1->{"begin"} > $d2) {
	    $d2 = $firstbegin2 - $gene1->{"begin"};
	}
    }
#    print "d2=$d2\n";
    if ($d1 >= $d2) {
	# set the breakpoint directly left to the leftmost begin of any gene in run2 that
	# ends at or after lastend1.
	$breakpoint = $lastend1;
	foreach my $gene2 (@{$run2->{"genes"}}) {
	    if ($gene2->{"end"} >= $lastend1 && $gene2->{"begin"}-1 < $breakpoint) {
		$breakpoint = $gene2->{"begin"}-1;
	    }
	}
    } else {
	# set the breakpoint to the rightmost end of any gene in run1 that
	# begins at or before firstbegin2.
	$breakpoint = $firstbegin2;
	foreach my $gene1 (@{$run1->{"genes"}}) {
	    if ($gene1->{"begin"} <= $firstbegin2 && $gene1->{"end"} > $breakpoint) {
		$breakpoint = $gene1->{"end"};
	    }
	}
    }
#    print "breakpoint = $breakpoint\n";
    #now delete the redundant genes from both lists
    my @newgenes1 = ();
    my @newgenes2 = ();
    foreach my $gene1 (@{$run1->{"genes"}}) {
	if ($gene1->{"end"} <= $breakpoint) {
	    push @newgenes1, $gene1;
	}
    }
    $run1->{"genes"} = \@newgenes1;
    foreach my $gene2 (@{$run2->{"genes"}}) {
	if ($gene2->{"begin"} > $breakpoint) {
	    push @newgenes2, $gene2;
	}
    }
    $run2->{"genes"} = \@newgenes2;
}