File: gmod_sort_gff3.pl

package info (click to toggle)
libchado-perl 1.31-6
  • links: PTS, VCS
  • area: main
  • in suites: bookworm, bullseye, sid
  • size: 44,716 kB
  • sloc: sql: 282,721; xml: 192,553; perl: 25,524; sh: 102; python: 73; makefile: 57
file content (107 lines) | stat: -rwxr-xr-x 2,698 bytes parent folder | download | duplicates (4)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
#!/usr/bin/env perl
use strict;
use warnings;

use Getopt::Long;

# Main script: copy a GFF3 file from --infile to --outfile, delaying every
# feature line that names a Parent until the line(s) carrying that parent's
# ID have been written.
#
# Options:
#   --infile   input GFF3 file (required)
#   --outfile  output file (defaults to sorted.gff)
#
# Lines that do not have exactly nine tab-separated columns (comments,
# directives, FASTA, blank lines) are copied through immediately.
# Features whose parent never appears are NOT written to the output; they
# are reported on STDOUT after the whole input has been read.
my ($INFILE, $OUTFILE);

GetOptions(
    'infile=s'     => \$INFILE,
    'outfile=s'    => \$OUTFILE,
  ) or ( system( 'pod2text', $0 ), exit -1 );

die "You must supply an input file name via --infile\n" unless $INFILE;

$OUTFILE  ||='sorted.gff';

# Opening the output truncates it, so refuse to overwrite the input in place.
die "--infile and --outfile must not name the same file ($INFILE)\n"
    if $INFILE eq $OUTFILE;

# Open the input first: a bad --infile must not clobber an existing output.
open my $in_fh,  '<', $INFILE
    or die "couldn't open $INFILE for reading:$!\n";
open my $out_fh, '>', $OUTFILE
    or die "couldn't open $OUTFILE for writing:$!\n";

my %seen_id;      # ID => 1 once a feature line with that ID has been written
my %pending_for;  # has ID (of a not-yet-written parent) as key
                  # and anon array of held-back feature records as value;
                  # each record is { line, id, parents }

LINE:
while ( my $line = <$in_fh> ) {
    my @columns = split /\t/, $line;

    if ( @columns != 9 ) {    # not a GFF line, so let it through unmolested
        print {$out_fh} $line;
        next LINE;
    }

    # Work on a copy of column 9 without the line terminator; otherwise an
    # ID or Parent that is the last attribute would capture "\n" (or "\r\n")
    # as part of its value and never match the same id seen mid-line.
    ( my $attributes = $columns[8] ) =~ s/\s+\z//;

    # Anchor on the start of the column or a ';' so that tags which merely
    # end in "ID" or "Parent" are not mistaken for the real thing.
    my ($id)          = $attributes =~ /(?:\A|;)\s*ID=([^;]+)/;
    my ($parent_list) = $attributes =~ /(?:\A|;)\s*Parent=([^;]+)/;

    # GFF3 allows several comma-separated parents on one feature.
    my @parents = defined $parent_list
        ? grep { length } split /,/, $parent_list
        : ();

    # Work queue: the new feature, plus any held-back features that become
    # writable as a consequence of writing it.
    my @ready = ( { line => $line, id => $id, parents => \@parents } );

    while ( my $feature = shift @ready ) {

        # Hold the feature back under the first parent not written yet;
        # it is re-examined when that parent is eventually written.
        my ($missing) = grep { !$seen_id{$_} } @{ $feature->{parents} };
        if ( defined $missing ) {
            push @{ $pending_for{$missing} }, $feature;
            next;
        }

        # Features that share an ID (e.g. multi-line CDS) are all written;
        # we can't die on a repeated ID, and must not drop the line either.
        print {$out_fh} $feature->{line};

        my $written_id = $feature->{id};
        next unless defined $written_id;

        # Register the ID even when the feature also had a Parent, so that
        # grandchildren (gene -> mRNA -> exon) can be resolved.
        $seen_id{$written_id} = 1;

        # Release everything that was waiting for this ID, keeping arrival
        # order and placing each released child directly after its parent.
        my $released = delete $pending_for{$written_id};
        unshift @ready, @{$released} if $released;
    }
}

# Whatever is still pending refers to a parent that never showed up.
for my $parent_id ( sort keys %pending_for ) {
    print "Unresolved child relationship for this line:\n";
    for my $feature ( @{ $pending_for{$parent_id} } ) {
        print "$feature->{line}\n";
    }
}

# Buffered write errors only surface at close, so check the output handle.
close $out_fh or die "couldn't close $OUTFILE:$!\n";
close $in_fh;

=pod

=head1 NAME

gmod_sort_gff3.pl - Sorts a GFF3 file to put lines with Parent tags after their parent.

=head1 SYNOPSIS

  % gmod_sort_gff3.pl --infile <gff file name> [--outfile <sorted gff file name>]

=head1 COMMAND-LINE OPTIONS

  --infile		Name of the input gff3 file (required)
  --outfile		Name of the output gff3 file
                           (default: sorted.gff)

=head1 DESCRIPTION

This is a very simple (and only lightly tested) script for sorting
gff3 files so that all lines that have Parent tags come after the
line that contains the parent ID tag.  Files sorted in this way are
required by the GMOD chado bulk loader, L<gmod_bulk_load_gff3.pl>.

=head1 AUTHORS

Scott Cain E<lt>cain@cshl.orgE<gt>

Copyright (c) 2006

This library is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.

=cut