File: gmap_native_to_format_converter.pl

package info (click to toggle)
trinityrnaseq 2.2.0%2Bdfsg-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 212,452 kB
  • ctags: 5,067
  • sloc: perl: 45,552; cpp: 19,678; java: 11,865; sh: 1,485; makefile: 613; ansic: 427; python: 313; xml: 83
file content (96 lines) | stat: -rwxr-xr-x 1,895 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
#!/usr/bin/env perl

use strict;
use warnings;

use FindBin;
use lib ("$FindBin::RealBin/../../PerlLib");
use Gene_obj;

my $usage = "usage: $0 file.gmap (BED|GTF)\n\n";


my $gmap_file = $ARGV[0] or die $usage;
my $out_format = $ARGV[1] or die $usage;

unless ($out_format =~ /^(BED|GTF)$/) {
    die $usage;
}


my %data;

open (my $fh, $gmap_file) or die $!;
while (<$fh>) {
    if (/^>(\S+)/) {
        if (%data) {
			&format_output(%data);
		}
		
		%data = ();
		
		my $transcript_acc = $1;
    
		%data = (acc => $transcript_acc,
				 segments => {},  # end5 => end3
			);
		
	}
    elsif (/\s*([\+\-])([^\:]+):(\d+)-(\d+)\s+\((\d+)-(\d+)\)\s+(\d[^\%]+\%)/) {
        #print " $transcript_acc\t$_";
        my $orient = $1;
        my $genome_acc = $2;
        my $genome_end5 = $3;
        my $genome_end3 = $4;
        my $transcript_end5 = $5;
        my $transcript_end3 = $6;
        my $percent_identity = $7;
        
		
		$data{segments}->{$genome_end5} = $genome_end3;
		$data{scaff} = $genome_acc;
	}
}

if (%data) {
	&format_output(%data);
}

exit(0);


####
sub format_output {
	my %data = @_;

	my $gene_obj = new Gene_obj();
	
	my $coords_href = $data{segments};
	my $acc = $data{acc};
	my $genome_scaff = $data{scaff};

	if (%$coords_href) {
		
		$gene_obj->populate_gene_object($coords_href, $coords_href);
		
		$gene_obj->{com_name} = $acc;
		$gene_obj->{asmbl_id} = $genome_scaff;
		$gene_obj->{TU_feat_name} = "g|$acc";
        $gene_obj->{Model_feat_name} = "t|$acc";
        
        if ($out_format eq "BED") {
            print $gene_obj->to_BED_format();
        }
        elsif ($out_format eq "GTF") {
            print $gene_obj->to_transcript_GTF_format() . "\n";
        }
        else {
            die "Error, cannot process format $out_format";
            # shouldn't get here if proper option values processed at top.
        }
        

	}
	
	return;
}