File: gtf2bed.pl

package info (click to toggle)
augustus 3.3.2%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 486,188 kB
  • sloc: cpp: 51,969; perl: 20,926; ansic: 1,251; makefile: 935; python: 120; sh: 118
file content (259 lines) | stat: -rwxr-xr-x 9,915 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
#!/usr/bin/perl
#
# convert gtf/gff/gff3 to BED format
# 23.9.2016 Stefanie Koenig

use strict;
use Getopt::Long;


my $help = 0;
my $verbose = 0;
my $itemRgb="0,0,255";
my $includeStopInCDS = 0;

GetOptions(
    'itemRgb=s'=>\$itemRgb,
    'help!'=>\$help,
    'verbose!'=>\$verbose,
    'includeStopInCDS!'=>\$includeStopInCDS);

exec("perldoc $0") if ($help);

my @txorder = (); # tx ids in the order of the input file
my %geneOf = ();  # keys tx ids, values: gene ids
my %geneLine =  (); # keys gene ids, values: array refs for the gene GTF line (if exists)
# hash of transcripts
#    keys: transcript ids
#    values: hash reference
#            keys: txstart
#                  txend
#                  codingstart
#                  codingend
#                  strand, chr, source
#                  txline  array of columns if a transcript/mRNA line exists
#                  CDS  array of arrays (lines and columns) for coding parts of exons
#                  UTR  array of arrays for UTR exons
#                  exon array of arrays for complete exons
#                  intron array of arrays for introns
#                  rest array of arrays all other features, like tts,tss start_codon, stop_codon

my %txs = ();

parseAndStoreGTF();
convert();
printBed();

sub parseAndStoreGTF{
    my %seen = ();
    my ($txid, $geneid, $chr, $start, $end, $feature, $strand, $source, $stop_codon);
    foreach my $line (<STDIN>){
	my @f = split /\t/, $line;
	next if (@f<8);
	($chr,$source,$feature,$start,$end,$strand) = ($f[0],$f[1],$f[2],$f[3],$f[4],$f[6]);
	# check whether it is a line with 'gene' feature
	if ($f[2] eq "gene" && ($f[8] =~ /ID=([^;]+)/ || $f[8] =~ /gene_id."?([^";]+)"?/ || $f[8] =~ /^(\S+)$/)){
	    $geneid = $1;
	    $geneLine{$geneid} = \@f;
	    next;
	} 
	# check whether it is a line with 'transcript' feature
	if ($f[2] =~ "(transcript|mRNA)" && ($f[8] =~ /ID=([^;]+)/ || $f[8] =~ /transcript_id."?([^";]+)"?/ || $f[8] =~ /^(\S+)$/)){
	    $txid = $1;
	    $txs{$txid} = {"strand"=>$strand, "chr"=>$chr, "source"=>$source, "CDS"=>[], "UTR"=>[], "exon"=>[], "intron"=>[], "rest"=>[]} if (!exists($txs{$txid}));
	    $txs{$txid}{"txline"} = \@f;
	    if($f[8] =~ /Parent=([^;]+)/ || $f[8] =~ /gene_id."?([^";]+)"?/){
		$geneOf{$txid} = $1;
	    }
	    next;
	}
	
	$txs{$txid}{"CDS"} = [] if (!defined($txs{$txid}{"CDS"}));
	$txs{$txid}{"UTR"} = [] if (!defined($txs{$txid}{"UTR"}));
	$txs{$txid}{"exon"} = [] if (!defined($txs{$txid}{"exon"}));
	$txs{$txid}{"rest"} = [] if (!defined($txs{$txid}{"rest"}));

	# all other lines must belong to a transcript and a gene
	if ($f[8] =~ /(transcript_id|Transcript)."?([^";]+)"?/ ){
	    $txid = $2;
	} else {
	    if($f[8] =~ /Parent=([^;]+)/){
		$txid = $1;
	    }else{
		die ("Neither GTF nor GFF format in the following line:\n$line\ntranscript_id not found.\n");
	    }
	}
	if ($f[8] =~ /gene_id."?([^";]+)"?/){
	    $geneid = $1;
	} else {
	    if($f[8] =~ /Parent=([^;]+)/){
		$geneid = $geneOf{$1};
	    }else{
		die ("Neither GTF nor GFF format in the following line:\n$line\ngene_id not found.\n");
	    }
	}
	$txs{$txid} = {"strand"=>$strand, "chr"=>$chr, "source"=>$source, "CDS"=>[], "UTR"=>[], "exon"=>[], "intron"=>[], "rest"=>[]} if (!exists($txs{$txid}));

	if (!$seen{$txid}){
	    push @txorder, $txid; # remember the input order for transcripts for the output
	    $seen{$txid} = 1;
	}
	# assign parental gene id to tx id
	die ("transcript $txid has conflicting gene parents: and $geneid. Remember: In GTF txids need to be overall unique.")
	    if (defined($geneOf{$txid}) && $geneOf{$txid} ne $geneid);
	
	if ($feature eq "CDS" || $feature eq "coding_exon" || $feature eq "exon" || $feature =~ /UTR/i){
	    $txs{$txid} = {"strand"=>$strand, "chr"=>$chr, "source"=>$source, "CDS"=>[], "UTR"=>[], "exon"=>[], "intron"=>[], "rest"=>[]} if (!exists($txs{$txid}));
	    $txs{$txid}{"txstart"} = $start if (!defined($txs{$txid}{"txstart"}) || $txs{$txid}{"txstart"} > $start);
	    $txs{$txid}{"txend"} = $end if (!defined($txs{$txid}{"txend"}) || $txs{$txid}{"txend"} < $end);
	}
	if ($feature eq "CDS" || $feature eq "coding_exon"){
	    $txs{$txid}{"codingstart"} = $start if (!defined($txs{$txid}{"codingstart"}) || $txs{$txid}{"codingstart"} > $start);
	    $txs{$txid}{"codingend"} = $end if (!defined($txs{$txid}{"codingend"}) || $txs{$txid}{"codingend"} < $end);
	    push @{$txs{$txid}{"CDS"}}, \@f;
	} elsif ($feature =~ /UTR/i){
	    push @{$txs{$txid}{"UTR"}}, \@f;
	} elsif ($feature eq "exon"){
	    push @{$txs{$txid}{"exon"}}, \@f;
	} elsif ($feature eq "intron") {
	    $txs{$txid}{"intron"} = [] if (!defined($txs{$txid}{"intron"}));
            push @{$txs{$txid}{"intron"}}, \@f;
	} else {
	    push @{$txs{$txid}{"rest"}}, \@f;
	}
	if ($feature eq "stop_codon"){
	    $txs{$txid}{"stop_codon"} = $start
	}
    }
}


sub convert{
    my @f;
    foreach my $txid (keys %txs){
	
        # optionally, include stop codon in CDS
	if($includeStopInCDS){
	    my @cdslines = sort {$a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]} @{$txs{$txid}{"CDS"}};
	    if( $txs{$txid}{"strand"} eq '-' && $txs{$txid}{"stop_codon"}+3 == $txs{$txid}{"codingstart"}){
		$cdslines[0]->[3]-=3;
	    }
	    if( $txs{$txid}{"strand"} eq '+' && $txs{$txid}{"stop_codon"} == $txs{$txid}{"codingend"}+1){
		$cdslines[$#cdslines]->[4]+=3;
	    }
	    # TODO: if stop_codon is a separate CDS exon, then insert new CDS exon
	}
	
	# remember whether exons were not in the input file
	my $exonArrayWasEmpty;
	if(@{$txs{$txid}{"exon"}} == 0){
	    $exonArrayWasEmpty = 1;
	}else{
	    $exonArrayWasEmpty = 0;
	}
	# add exon lines if not already present and if desired in output
	if (@{$txs{$txid}{"exon"}} == 0){
	    print "Creating exon lines for $txid\n" if ($verbose);
	    # sort UTR and CDS lines by coordinates
	    my @exonpartlines = sort {$a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]} (@{$txs{$txid}{"CDS"}}, @{$txs{$txid}{"UTR"}});
	    next if (@exonpartlines == 0);
	    @f = @{$exonpartlines[0]};
	    shift @exonpartlines;
	    ($f[2], $f[5], $f[7]) = ("exon", '.', '.'); # score and frame are not defined
	    foreach my $g (@exonpartlines){
		if ($f[4] >= $g->[3]){ # check for non-overlappingness
		    die ("In transcript $txid two UTR/CDS features are overlapping. Not allowed by definition.");
		} elsif ($f[4] + 1 == $g->[3]){ # exactly adjacent
		    # join two UTR/CDS features to one
		    $f[4] = $g->[4];
		} else {
		    # push exon
		    my @ff = @f; # deep copy array
		    push @{$txs{$txid}{"exon"}}, \@ff;
		    @f = @$g;
		    ($f[2], $f[5], $f[7]) = ("exon", '.', '.'); # score and frame are not defined
		}
	    }
	    # push remaining, last exon
	    my @ff = @f;
	    push @{$txs{$txid}{"exon"}}, \@ff;
	}

    }
}

sub printBed {

    foreach my $txid (@txorder){

	my @lines = sort {$a->[3] <=> $b->[3] || $a->[4] <=> $b->[4]} @{$txs{$txid}{"exon"}};
	my $blockCount = @lines;
	my @blockSizes = ();
	my @blockStarts = ();

	foreach my $line (@lines){
	    push @blockSizes, $line->[4] - $line->[3] + 1;
	    push @blockStarts,  $line->[3] - $txs{$txid}{"txstart"};
	}
	
	print $txs{$txid}{"chr"} . "\t";
	print $txs{$txid}{"txstart"}-1 . "\t";
	print $txs{$txid}{"txend"} . "\t";
	print $txid . "\t0\t";
	print $txs{$txid}{"strand"} . "\t";
	if(@{$txs{$txid}{"CDS"}} == 0){
	    print $txs{$txid}{"txstart"}-1 . "\t";
	    print $txs{$txid}{"txend"} . "\t";
	}
	else{
	    print $txs{$txid}{"codingstart"}-1 . "\t";
	    print $txs{$txid}{"codingend"} . "\t";
	}
	print "$itemRgb\t";
	print "$blockCount\t";
	print join(',',@blockSizes) ."\t";
	print join(',',@blockStarts) ."\n";
    }
}

__END__

=pod

=head1 NAME

gtf2bed.pl      convert gtf/gff/gff3 to BED format

=head1 SYNOPSIS

gtf2bed.pl <in.gtf >out.bed
    
=head1 OPTIONS

  --itemRgb=s              a string s encoding the RGB value of the form R,G,B (default 0,0,225).
  --includeStopInCDS       include stop codon into the coding sequence (default off)

=head1 DESCRIPTION
    
    example input:
    chr16   AUGUSTUS        transcript      100472  160062  .       -       .       jg7.t1
    chr16   AUGUSTUS        tts     100472  100472  .       -       .       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        UTR     100472  101158  0       -       .       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        stop_codon      101159  101161  .       -       0       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     101159  101285  1       -       1       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     102477  102644  1       -       1       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     104227  104334  0.9     -       1       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     106525  106755  1       -       1       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     110093  110263  1       -       1       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     110759  111288  1       -       0       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     117341  117478  1       -       0       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     123010  123106  1       -       1       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     127580  127720  1       -       1       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        CDS     159185  160062  0       -       0       transcript_id "jg7.t1"; gene_id "jg7";
    chr16   AUGUSTUS        start_codon     160060  160062  .       -       0       transcript_id "jg7.t1"; gene_id "jg7";

    example output :

    chr16   100471  160062  jg7.t1  0       -       101158  160062  0,0,255 10      814,168,108,231,171,530,138,97,141,878  100471,102476,104226,106524,110092,110758,117340,123009,127579,159184

=cut