File: GTF.pm

package info (click to toggle)
trinityrnaseq 2.15.2%2Bdfsg-1
links: PTS, VCS
area: main
in suites: trixie
size: 468,004 kB
sloc: perl: 49,905; cpp: 17,993; java: 12,489; python: 3,282; sh: 1,989; ansic: 985; makefile: 717; xml: 62
file content (4280 lines) | stat: -rw-r--r-- 128,211 bytes
parent folder | download | duplicates (6)
# -*- Mode: Perl; tab-width: 8; perl-indent-level: 2; indent-tabs-mode: nil -*- 
use strict;
###############################################################################
# GTF
###############################################################################
# The GTF object parses and stores all data from a gtf file.
package GTF;
use Carp;
# GTF::new(hash)
#    This is the contructor for GTF objects.  It takes one required argument
#    which is a hash containing the following optional fields:
#        gtf_filename: The filename of the gtf file to be loaded.  If no filename 
#                      is given it just creates an empty object.
#        seq_filename: The filename of the sequence corresponding to this gtf file.
#                      If given additional validity checks are made, additional stats
#                      are kept and the transcript can be output.
#        tx_out_fh:    The filehandle to output the genes transcripts to.  Only works
#                      If seq_filename is given.
#        conseq_filename: The filename of the conservation sequence corresponding to 
#                      this gtf file.  If given additional stats are kept.
#        warning_fh:   The filehandle to output gtf format warnings to.  If none 
#                      is given warnings are disregarded.
sub new {
  my ($info) = @_;
  my $gtf = bless {Genes => [], Transcripts => [], Exons => [], CDS   => [], 
                   Inter => [], Inter_CNS => [], Filename => "", Sequence => "",
                   Comments => "", Modified => 0};
  $gtf->{Total_Conseq} = {0 => -1,
                          1 => -1,
                          2 => -1};
  $gtf->{Total_Seq} = {A => -1,
                       C => -1,
                       G => -1,
                       T => -1,
                       N => -1};
  if(defined($info->{seq_filename})){
    if(-e $info->{seq_filename}){
      $gtf->{Sequence} = $info->{seq_filename};
    }
    else{
      $gtf->{Sequence} = 0;
      print STDERR "GTF: $info->{seq_filename} does not exist.\n"; 
    }
  }
  else{
    $gtf->{Sequence} = 0;
  }
  if(defined($info->{strict})){
    $gtf->{Strict} = $info->{strict};
  }
  else{
    $gtf->{Strict} = 0;
  }
  if(defined($info->{tx_out_fh})){
    if(defined($gtf->{Sequence})){
      $gtf->{Tx} = $info->{tx_out_fh};
    }
    else{
      $gtf->{Tx} = 0;
      print STDERR "GTF: Cannot ouput transcripts without the sequence.\n"; 
    }
  }
  else{
    $gtf->{Tx} = 0;
  }
  if(defined($info->{conseq_filename})){
    if(-e $info->{conseq_filename}){
      $gtf->{Conseq} = $info->{conseq_filename};
    }
    else{
      $info->{Conseq} = 0;
      print STDERR "GTF: $info->{conseq_filename} does not exist.\n"; 
    }
  }
  else{
    $info->{Conseq} = 0;
  }
  if((defined($info->{inframe_stops})) && ($info->{inframe_stops})){
    $gtf->{Inframe_Stops} = 1;
  }
  else{
    $gtf->{Inframe_Stops} = 0;
  }
  if((defined($info->{fix_gtf})) &&
     ($info->{fix_gtf} == 1)){
    $gtf->{Fix_GTF} = 1;
  }
  else{
    $gtf->{Fix_GTF} = 0;
  }
  if(defined($info->{warning_skips})){
    $gtf->{Warning_Skips} = $info->{warning_skips};
  }
  else{
    $gtf->{Warning_Skips} = [];
  }
  if(defined($info->{bad_list})){
    $gtf->{Bad_List} = $info->{bad_list};
  }
  else{
    $gtf->{Bad_List} = [];
  }
  if(defined($info->{detailed_error_count})){
    $gtf->{Detailed_Error_Count} = $info->{detailed_error_count};
  }
  if(defined($info->{mark_ase})) {
    $gtf->{Mark_ASE} = $info->{mark_ase};
  }
  else {
    $gtf->{Mark_ASE} = 0;
  }
  if(defined($info->{gtf_filename})){
    if(-e $info->{gtf_filename}){
      $gtf->{Filename} = $info->{gtf_filename};
      if((defined($info->{no_check})) && ($info->{no_check})){
        $gtf->_load_no_check($info->{gtf_filename});
      }
      else{
        $gtf->_parse_file($info->{warning_fh});
      }
      if($gtf->{Mark_ASE}) {
        $gtf->mark_ase;
      }
    }
    else{
      print STDERR "GTF: $info->{gtf_filename} does not exist.\n"; 
    }		    
  }
  else{
    $gtf->{Filename} = "";
  }
  return $gtf;
}

# GTF::_update
#    This function resorts all lists.  It is called when a list is requested.
#    It does nothing unless the $gtf->{Modified} value is 1.  Sets 
#    $gene->{Modified} to 0;
sub _update{
  my ($gtf) = @_;
  unless($gtf->{Modified}){
    return;
  }
  $gtf->{Modified} = 0;
  my $genes = $gtf->{Genes};
  my $txs = $gtf->{Transcripts};
  my $cds = $gtf->{CDS};    
  my $inter = $gtf->{Inter};
  $genes = [sort {$a->start <=> $b->start} @$genes];
  $txs = [sort {$a->start <=> $b->start} @$txs];
  $cds = [sort {$a->start <=> $b->start} @$cds];
  $inter = [sort {$a->start <=> $b->start} @$inter];
  $gtf->{Genes} = $genes;
  $gtf->{Transcripts} = $txs;
  $gtf->{CDS} = $cds;
  $gtf->{Inter} = $inter;
}

# GTF::genes()
#    This function returns an array of refernces to GTF::Gene objects.
#    The array contains one reference for each gene in the gtf file.
#    The array is sorted by the start position of each gene.
sub genes{
  my ($gtf) = @_;
  $gtf->_update;
  return $gtf->{Genes};
}

sub transcripts{
  my ($gtf) = @_;
  $gtf->_update;
  return $gtf->{Transcripts};
}

# GTF::add_gene(gene_ref)
#    This function take a reference to a GTF::Gene object and adds it to the 
#    list of genes stored by this object.
sub add_gene{
  my ($gtf,$gene) = @_;
  my $genes = $gtf->{Genes};
  my $txs = $gtf->{Transcripts};
  my $cds = $gtf->{CDS};    
  push @$genes, $gene;
  push @$txs, @{$gene->transcripts};
  push @$cds, @{$gene->cds};
  $gtf->{Modified} = 1;
}

# GTF::add_feature(feature_ref,seqname,source,strand,id)
#   This function will take a reference to an intergenic feature (such as 
#   inter or inter_CNS) and add it to the appropriate feature list.
#   Pass a sequence name, the strand ("+" is OK for intergenic)
#   and an ID to identify the feature.
#   Note that the feature must be of the types inter or inter_CNS.
sub add_feature{
  my ($gtf,$feature,$seqname,$source,$strand,$id) = @_;
  die "Only inter or inter_CNS features may be directly added to a GTF.\n"
    if($feature->type ne "inter" && $feature->type ne "inter_CNS");
  my $gene = GTF::Gene::new($id,$seqname,$source,$strand);
  my $tx = GTF::Transcript::new($id);
  $gene->add_transcript($tx);
  $tx->add_feature($feature);
  my $features;
  if($feature->type eq "inter")         { $features = $gtf->{Inter}     }
  elsif($feature->type eq "inter_CNS")  { $features = $gtf->{Inter_CNS} }
  push @$features, $feature;
  $gtf->{Modified} = 1;
}

# GTF::set_genes(gene_list)
#    This function take a reference to a list of GTF::Gene objects and adds each to the 
#    list of genes stored by this object.
sub set_genes{
  my ($gtf,$genes) = @_;
  $gtf->{Genes} = $genes;
  my @txs;
  my @cds;
  foreach my $gene (@$genes){
    push @txs, @{$gene->transcripts};
    push @cds, @{$gene->cds};
  }
  $gtf->{Transcripts} = \@txs;
  $gtf->{CDS} = \@cds;
  $gtf->{Modified} = 1;
}

# GTF::remove_gene(gene_id)
#    This function takes a gene_id and removes the gene with that gene_id
#    from the list of genes stored by this object, if a gene with that gene_id
#    can be found.  Otherwise does nothing.  Returns true if a gene was removed
#    and false if no gene with that gene_id was found.
sub remove_gene{
  my ($gtf,$gene_id) = @_;
  my $genes = $gtf->{Genes};
  my $removed = 0;
  my @new_genes;
  my @new_txs;
  my @new_cds;
  foreach my $gene (@$genes){
    if($gene->gene_id eq $gene_id){
      $removed++;
    }
    else{
      push @new_genes, $gene;
      push @new_txs, @{$gene->transcripts};
      push @new_cds, @{$gene->cds};
    }
  }
  if($removed){
    $gtf->{Genes} = \@new_genes;
    $gtf->{Transcripts} = \@new_txs;
    $gtf->{CDS}   = \@new_cds;
  }
  return $removed;
}

# GTF::remove_feature(feature)
#   This function will remove a feature from the list of features.
#   Note the feature must be of type inter or inter_CNS.
sub remove_feature {
    my($gtf,$feature) = @_;
    my $removed = 0;
    my $feature_key;
    if($feature->type eq "inter") {
        $feature_key = "Inter";
    }
    elsif($feature->type eq "inter_CNS") {
        $feature_key = "Inter_CNS";
    }
    my @new_features;
    foreach my $stored_feature (@{$gtf->{$feature_key}}) {
        if($stored_feature == $feature) {
            $removed++;
        }
        else {
            push @new_features, $stored_feature;
        }
    }
    if($removed) {
        $gtf->{$feature_key} = \@new_features;
    }
    return $removed;
}

# GTF::set_filename(filename)
#    Sets the filename to be returned by the filename function 
sub set_filename{
  my ($gtf,$filename) = @_;
  $gtf->{Filename} = $filename;
}

# GTF::offset(ammount)
#    This function will offset all genes in this gene by the given ammount
sub offset{
  my ($gtf,$offset) = @_;
  unless(defined($offset)){
    print STDERR "Undefined value passed to GTF::offset.\n";
    return;
  }
  my $genes = $gtf->{Genes};
  foreach my $gene (@$genes){
    $gene->offset($offset);
  }    
  $gtf->{Modified} = 1;
}

# GTF::reverse_complement(seq_length)
#    This function takes the length of the sequence this gtf file was based on and 
#    and reverse complements everything in the file.  So the positive strand becomes
#    the negative strand and the files starts at seq_length and moves downward to 0.
sub reverse_complement{
  my ($gtf, $seq_length) = @_;
  unless(defined($seq_length)){
    print STDERR 
        "Undefined value for seq_length passed to GTF::reverse_complement.\n";
  }
  my $genes = $gtf->{Genes};
  foreach my $gene(@$genes){
    $gene->reverse_complement($seq_length);
  }
  $gtf->{Modified} = 1;
}

# GTF::cds()
#    This function returns an array of refernces to GTF::Feature objects. 
#    The array contains one refernece to each CDS feature in the gtf file.
#    The array is sorted by the <start> of each CDS.
sub cds{
  my ($gtf) = @_;
  $gtf->_update;
  return $gtf->{CDS};
}

sub inter {
    my ($gtf) = @_;
    $gtf->_update;
    return $gtf->{Inter};
}

sub inter_cns {
    my ($gtf) = @_;
    $gtf->_update;
    return $gtf->{Inter_CNS};
}

# GTF::mark_ase()
# If you are looking for alternative splicing events in your GTF file,
# call this function to marke them.  This function is separate from the 
# _update methods in order to keep the running time low.

sub mark_ase {
  my ($gtf) = @_;

  #set the alternative splicing event flags for each feature
  #currently, only optional coding exons

  # The main idea is to check all pairs of transcripts in a gene,
  # and find any incident where there are two exons that have the same
  # end coord and two exons which have the same start coord
  # and that an exon in only one of the transcripts lies between them.
  # It is accomplished by sorting all the exons in both transcripts
  # and checking for a supercasette, i.e.
  #
  # ===|||||||||===||||||||======||||||||||====
  # ==||||||||||=================||||||||||||==
  #             <--------------->
  #               supercassette
  #
  # Note that it doesn't matter if the opposite ends of the exon are equal.
  
  # rpz

  my @genes = @{$gtf->genes};
  for my $gene(@{$gtf->genes}) {
    my @txs = (@{$gene->transcripts}); 
    if(scalar(@txs) > 1) {
      for (my $i = 0; $i < scalar(@txs)-1; $i++) {
        next if(scalar(@{$txs[$i]->cds}) == 0);
        for (my $j = $i+1; $j < scalar(@txs); $j++) {
          next if(scalar(@{$txs[$j]->cds}) == 0);

          my @ocds = sort {$a->start <=> $b->start || $a->stop <=> $b->stop}
            (@{$txs[$i]->cds}, @{$txs[$j]->cds});

          for(my $k = 4; $k < scalar(@ocds); $k++) {
            if(     $ocds[$k-4]->stop == $ocds[$k-3]->stop &&
                    $ocds[$k-1]->start == $ocds[$k]->start) {
              if($ocds[$k-2]->length%3 == 0) {
                $ocds[$k-2]->set_ase("InframeOptional");
              }
              else {
                $ocds[$k-2]->set_ase("FrameshiftOptional");
              }
            }
          }
        }
      }
    }
  }

}

# GTF::infer_exons()
# Skips infering if Exons already exist
# Output is undefined if features overlap, except for start_codon
# Currently knows about utr5, start,stop,cds, utr3
sub infer_exons {
  my ($gtf) = @_; 
  $gtf->_update;

  for my $tx (@{$gtf->transcripts}) {
    next if (@{$tx->exons});

    if ($tx->strand eq '+') {    
      for  ( @{$tx->utr5}, @{$tx->cds}, @{$tx->stop_codons}, @{$tx->utr3} ) {

        if (!@{$tx->exons} ||  ${$tx->exons}[-1]->stop + 1 < $_->start ) {
          $tx->add_feature( GTF::Feature::new('exon', $_->start, $_->stop, 0, 0) );

        }  elsif ( ${$tx->exons}[-1]->stop + 1 eq $_->start ) {
          ${$tx->exons}[-1]->set_stop($_->stop);            
        }
      }
    } else {
      for  ( @{$tx->utr3}, @{$tx->stop_codons}, @{$tx->cds}, @{$tx->utr5} ) {

        if (!@{$tx->exons} || ${$tx->exons}[-1]->stop + 1 < $_->start  ) {
          $tx->add_feature(GTF::Feature::new('exon', $_->start, $_->stop, 0, 0));
        } elsif ( ${$tx->exons}[-1]->stop + 1 eq $_->start ) {
          ${$tx->exons}[-1]->set_stop($_->stop);            
        }
      }
    }
  }
}

# GTF::remove_exons()
# remove exons from all transcripts
sub remove_exons {
  my ($gtf) = @_; 
  $gtf->_update;

  $_->{Exons} = [] for (@{$gtf->transcripts});
}

# GTF::filename()
#    This function returns the filename of the gtf file.
sub filename           {shift->{Filename}}

# GTF::comments()
#    This function returns all full line commentsfrom the gtf file.
sub comments           {shift->{Comments}};

# GTF::conservation_count()
#    Returns a hash containing counts for each conservation character (0,1,2)
sub conservation_count {shift->{Total_Conseq}};

# GTF::sequence_count()
#    Returns a hash containing counts for each sequence character (A,C,T,G,N,X),
#    Where X is all other characters
sub sequence_count {shift->{Total_Seq}};

# GTF::output_gtf_file([file_handle]);
#    This function moves through each gene in the gtf file and outputs 
#    all data in valid gtf2 format.  If takes and optional file_handle
#    as the only argument to which it outputs the data.  If no argument is 
#    given it outputs the data to stdout.
sub output_gtf_file{
  my ($gtf,$out_handle) = @_;
  $gtf->_update;
  unless(defined $out_handle){
    $out_handle = \*STDOUT;
  }
  if($gtf->comments){
    foreach my $comment (@{$gtf->comments}){
      print $out_handle "$comment\n";
    }
  }

  for my $gene (@{$gtf->genes}) { $gene->_update }
  my @top_level_features = sort {$a->start <=> $b->start} 
    (@{$gtf->genes}, @{$gtf->inter_cns}, @{$gtf->inter});
  for my $feature (@top_level_features) {
    $feature->output_gtf($out_handle);
  }
}

# GTF::output_gff_file([file_handle]);
#    This function moves through each gene in the gtf file and outputs 
#    all data in valid GFF format.  If takes and optional file_handle
#    as the only argument to which it outputs the data.  If no argument is 
#    given it outputs the data to stdout.
sub output_gff_file{
  my ($gtf,$out_handle) = @_;
  $gtf->_update;
  unless(defined $out_handle){
    $out_handle = \*STDOUT;
  }
  if($gtf->genes){
    foreach my $gene (@{$gtf->genes}){
      $gene->output_gff($out_handle);
    }
  }
}

# GTF::_parse_file()
#    This function is used internally by the GTF constructor GTF::new
#    to read, parse, and store the GTF file;
sub _parse_file{
  my ($gtf,$warnings) = @_;
  unless(defined($warnings)){
    if(open(WARN, ">/dev/null")){
      $warnings = \*WARN;
    }
    else{
      die "Could not open /dev/null for write\n";
    }
  }
  my $fix_gtf = $gtf->{Fix_GTF};
  my $strict = $gtf->{Strict};
  #open file 
  open(GTF, "<$gtf->{Filename}") 
      or die "Could not open $gtf->{Filename}.\n";
  my (%GeneObj,%TxObj);
  my (@all_genes,@all_txs,@all_cds,@all_inter,@all_cns);
  my $all_line_comments = [];
  #prepare error counts
  my $max_error_count = 5;
  if(defined($gtf->{Detailed_Error_Count})){
    $max_error_count = $gtf->{Detailed_Error_Count};
  }
  my @error_msgs = $gtf->_get_error_messages();
  my @errors;
  my @bad_list;
  my $no_warn = $gtf->{Warning_Skips};
  for(my $i = 0;$i <= $#error_msgs;$i++){
    if($$no_warn[$i]){
      $errors[$i] = $max_error_count;
    }
    else{
      $errors[$i] = 0;
    }
  }
  #read the file
  my $line_num = 0;
  while(my $in_line = <GTF>){
    $line_num++;
    chomp $in_line;
    if($in_line !~ /\S/){
      next;
    }
    if($in_line =~ /^\s*\#/){
      push @$all_line_comments, $in_line;
      next;
    }
    else{
      #remove leading whitespace and get comments
      my $comments = "";	  
      while($in_line =~ /^(.*)\#(.*)$/){
        $in_line = $1;
        $comments = $2.$comments;
      }
      if($in_line =~ /^\s+(.*)$/){
        $in_line = $1;
      }
      my @data = split /\s+/,$in_line;
      #verify line is correct length
      if($#data < 8){
        if($errors[0] < $max_error_count){
          print $warnings
              "Not enough fields on line $line_num.\n";
        }
        $errors[0]++;
        next;
      }
      #check for correct whitespace
      my $field = 1;
      while(($field < 8)&&($in_line =~ /\S+(\s+)/g)){
        unless($1 =~ /^\t$/){
          if($errors[1] < $max_error_count){
            print $warnings 
                "Incorrect type of whitespace between fields ",
                "$field and ",$field + 1, " on line $line_num.  ",
                "Should be tab.\n";
          }
          $errors[1]++;
        }
        $field++;
      }
      #verify correct <feature> field
      $data[2] = lc($data[2]);
      unless(($data[2] eq "cds") || ($data[2] eq "exon")
             || ($data[2] eq "5utr") || ($data[2] eq "3utr")
             || ($data[2] eq "start_codon") 
             || ($data[2] eq "stop_codon")
             || ($data[2] eq "sec") 
             || ($data[2] eq "intron_cns")
             || ($data[2] eq "inter_cns")
             || ($data[2] eq "inter")) {
        if($errors[2] < $max_error_count){
          print $warnings
              "Incorrect value for <feature> field, \"$data[2]\", ",
              "on line $line_num.\n";
        }
        $errors[2]++;
      }		
      if (($data[2] eq "cds") || ($data[2] eq "5utr")
          || ($data[2] eq "3utr") || ($data[2] eq "sec")) {
        $data[2] = uc($data[2]);
      }
      if ($data[2] eq "inter_cns")  { $data[2] = "inter_CNS"  }
      if ($data[2] eq "intron_cns") { $data[2] = "intron_CNS" }
      #verify correct <start> format
      if($data[3] !~ /^\d*$/){
        if($errors[3] < $max_error_count){
          print $warnings 
              "Non-numerical value for <start> field, \"$data[3]\"",
              ", on line $line_num.\n";
        }
        $errors[3]++;
      }	  
      #verify correct <stop> format
      if($data[4] !~ /^\d*$/){		
        if($errors[4] < $max_error_count){
          print $warnings 
              "Non-numerical value for <stop> field, \"$data[4]\"",
              ", on line $line_num.\n";
        }
        $errors[4]++;
      }
      #verify start is < stop
      if($data[3] > $data[4]){
        if($errors[38] < $max_error_count){
          print $warnings 
              "Start field is greater than stop field on line $line_num.\n";
        }
        $errors[38]++;
        my $start = $data[3];
        $data[3] = $data[4];
        $data[4] = $start;
      }
      #verify correct <score> format
      if($data[5] !~ /^-?\d*\.?\d*$/){ # should check for float
        if($data[5] ne '.'){
          if($errors[5] < $max_error_count){
            print $warnings
                "Value for <score> field, \"$data[5]\", is ",
                "neither numerical nor \".\" on line $line_num\n";
          }
          $errors[5]++;
        }
      }
      if(!$strict && $data[5] eq '.'){
        $data[5] = 0;
      }
      #verify correct <strand> field
      unless(($data[6] eq "+")||($data[6] eq "-")||
             ($data[6] eq ".")){
        if($errors[6] < $max_error_count){
          print $warnings
              "Value of <strand> field is \"$data[6]\" on line ",
              "$line_num.  Should be \"+\", \"-\", or \".\".\n";
        }
        $errors[6]++;
        if(!$strict){
          if($data[6] eq "1"){
            $data[6] = "+";
          }
          elsif($data[6] eq "-1"){
            $data[6] = "-";
          }
          elsif($data[6] eq "0"){
            $data[6] = ".";
          }
        }
      }
      #verify correct <frame> field
      unless(($data[7] eq "0")||($data[7] eq "1")||
             ($data[7] eq "2")||($data[7] eq ".")){
        if($errors[7] < $max_error_count){
          print $warnings
              "Value of <frame> field is \"$data[7]\" on line ",
              "$line_num.  Should be \"0\", \"1\", \"2\", or \".\"",
              ".\n";
        }
        $errors[7]++;
      }	    
      #prepare <attributes> and <comment> fields
      my $attribute_string = "";
      if($in_line =~ /^\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t\S+\t(.*)$/){
        $attribute_string = "$1 ";
      }	       
      #check for equals
      if($attribute_string =~ / = /){
        if($errors[30] < $max_error_count){
          print $warnings
              "Illegal \'=\' character in <attributes> field on ",
              "line $line_num\n";
        }
        $errors[30]++;
        $attribute_string =~ s/=/ /g;
      }
      # check for tab character.
      if($attribute_string =~ /\t/){
        if($errors[9] < $max_error_count){
          print $warnings
              "Illegal tab character in <attributes> field on ",
              "line $line_num.\n";
        }
        $errors[9]++;
        $attribute_string =~ s/\t/ /g;
      }
      # make sure it has semicolon terminator after each attribute
      # otherwise fake it.
      unless($attribute_string =~ /;/){
        if($errors[10] < $max_error_count){
          print $warnings
              "<attributes> field conatins no semicolons",
              " terminating each attribute on line $line_num.\n";
        }
        $errors[10]++;
        my @att = split /\s+/, $attribute_string;
        $attribute_string = "";
        my $att_count = $#att - (($#att+1) % 2);
        for(my $i = 0;$i <= $att_count;$i+=2){
          $attribute_string .= $att[$i]." ".$att[$i+1]."; ";
        }
      }
      # parse <attributes> field
      my $attributes = {};
      while($attribute_string =~ /(\S+)(\s+)(\S+)(\s*)/g){
        my $name = $1;
        my $space1 = $2;
        my $value = $3;
        my $space2 = $4;
        if($value =~ /^\"(\S*)\";$/){
          $value = $1;
        }
        else{
          if($value =~ /^(\S+);$/){
            $value = $1;
          }
          else{
            if($errors[10] < $max_error_count){
              $value =~ /(.)$/;
              print $warnings
                  "Bad terminator, \"$1\", after ",
                  "<attributes> name-value pair, $name $value, on ",
                  "line $line_num. Should be \";\".\n";
            }
            $errors[10]++;
          }
          if($value =~ /^\"(\S+)\"$/){
            $value = $1;
          }
          else{
            if($errors[11] < $max_error_count){
              print $warnings
                  "<attributes> field missing quotes around",
                  " values on line $line_num.\n";
            }
            $errors[11]++;
          }
        }
        $attributes->{$name} = $value;
        unless($space1 =~ /^ $/){
          if($errors[12] < $max_error_count){
            print $warnings
                "Bad separator, $space1, between <attributes> ",
                "name-value pair, $name $value, on line ",
                "$line_num.\n";
          }
          $errors[12]++;
        }
        unless($space2 =~ /^ $/){
          if($errors[12] < $max_error_count){
            print $warnings
                "Bad separator, \"$space2\", between <attributes>",
                " name-value pairs on line $line_num.  Should be",
                " \" \".\n$name $value\n";
          }
          $errors[12]++;
        }		    
      }
      # check for gene_id and transcript_id attributes
      my $gid = "missing";
      if(defined($attributes->{gene_id})){
        $gid = $attributes->{gene_id};
      }
      else{
        if($errors[13] < $max_error_count){
          print $warnings
              "gene_id not set in <attributes> field on line ",
              "$line_num.\n";
        }
        $errors[13]++;
        $attributes->{gene_id} = "missing";		   
      }
      my $tid = "missing";
      if(defined($attributes->{transcript_id})){
        $tid = $attributes->{transcript_id};
      }		
      else{
        if($errors[14] < $max_error_count){
          print $warnings
              "transcript_id not set in <attributes> field on line ",
              "$line_num.\n";
        }
        $errors[14]++;
        $attributes->{transcript_id} = "missing";
      }
      #create objects
      my $feature = 
        GTF::Feature::new($data[2], $data[3], $data[4], $data[5], $data[7]);
      if($feature->type eq "CDS"){
        push @all_cds, $feature;
      }
      elsif($feature->type eq "inter") {
        push @all_inter, $feature;
      }
      elsif($feature->type eq "inter_CNS") {
        push @all_cns, $feature;
      }
      
      my $tx;
      my $gene;
      unless(defined($TxObj{$tid}) && length($tid)){
        $tx = GTF::Transcript::new($tid);
        $TxObj{$tid} = $tx;
        unless($feature->type eq "inter" || $feature->type eq "inter_CNS") {
          push @all_txs, $tx;
        }
        unless(defined($GeneObj{$gid}) && length($gid)){
          $gene = GTF::Gene::new($gid,$data[0],$data[1],$data[6]);
          $GeneObj{$gid} = $gene;
          unless($feature->type eq "inter" || $feature->type eq "inter_CNS") {
            push @all_genes, $gene;
          }
        }
        $gene = $GeneObj{$gid};
        $gene->add_transcript($tx);
      }
      else{
        $gene = $GeneObj{$gid};
      }
      $tx = $TxObj{$tid};
      #check that strand value is consistent with the strand of the tx 
      unless($data[6] eq $tx->strand){
        if($errors[20] < $max_error_count){
          print $warnings
            "Inconsistent <strand> value across gene_id = ".
                $tx->gene_id.".\n";
      }
      $errors[20]++;
      if (_check_errors_against_badlist($gtf,20)) {
        push @bad_list, $tx->id;
      }
      if($strict){
        die;
      }
        #last;???
      }
      $tx->add_feature($feature);
    }
  }
  close(GTF);
  #check each gene object
  foreach my $tx (@all_txs){
    my $exons = $tx->exons;
    my $cds = $tx->cds;
    my $utr3 = $tx->utr3;
    my $utr5 = $tx->utr5;
    my $sec = $tx->sec;
    my $inter = $tx->inter;
    my $inter_cns = $tx->inter_cns;
    if($#$exons >= 0 && $#$utr3 == -1 && $#$utr5 == -1){
      $tx->create_utr_objects_from_exons;
    }
    my $starts = $tx->start_codons;
    my $stops = $tx->stop_codons;
    my $strand = $tx->strand;
    #make sure this tx contains at least one CDS, inter_CNS, or inter feature
    if($#$cds == -1 && $#$inter_cns == -1 && $#$inter == -1){
      if($errors[15] < $max_error_count){
        print $warnings
          "Transcript ".$tx->id." contains no CDS, inter or inter_CNS features.\n";
      }
      $errors[15]++;
      if (_check_errors_against_badlist($gtf,15)) {
        push @bad_list, $tx->id;
      }
      next;
    }
    #verify start/stop codon exists and has length 3
    my $start_codon_start = -1;
    my $start_codon_stop = -1;
    if($#$starts == -1){
      if($errors[17] < $max_error_count){
        print $warnings
            "Missing start_codon for transcript \"".$tx->id."\".\n";
      }
      $errors[17]++;
      if (_check_errors_against_badlist($gtf,17)) {
        push @bad_list, $tx->id;
      }
    }
    else{	    
      my $length = 0;
      foreach my $start (@$starts){
        $length += $start->stop - $start->start + 1;
        if(($start_codon_start < 0) || ($start_codon_start > $start->start)){
          $start_codon_start = $start->start;
        }
        if(($start_codon_stop < 0) || ($start_codon_stop < $start->stop)){
          $start_codon_stop = $start->stop;
        }
      }
      if($length != 3){
        if($errors[16] < $max_error_count){
          print $warnings
              "Start Codon length is not three in transcript \"".$tx->id."\".\n";
        }
        $errors[16]++;
        if (_check_errors_against_badlist($gtf,16)) {
          push @bad_list, $tx->id;
        }
      }
    }
    my $stop_codon_start = -1;
    my $stop_codon_stop = -1;
    if($#$stops == -1){
      if($errors[19] < $max_error_count){
        print $warnings
            "Missing stop_codon for transcript \"".$tx->id."\".\n";
      }
      $errors[19]++;
      if (_check_errors_against_badlist($gtf,19)) {
        push @bad_list, $tx->id;
      }
    }
    else{
      my $length = 0;
      foreach my $stop (@$stops){
        $length += $stop->stop - $stop->start + 1;
        if(($stop_codon_start < 0) || ($stop_codon_start > $stop->stop)){
          $stop_codon_start = $stop->start;
        }
        if(($stop_codon_stop < 0) || ($stop_codon_stop < $stop->stop)){
          $stop_codon_stop = $stop->stop;
        }
      }
      if($length != 3){
        if($errors[18] < $max_error_count){
          print $warnings
              "Stop Codon length is not three in gene \"".$tx->id."\".\n";
        }
        $errors[18]++;
        if (_check_errors_against_badlist($gtf,18)) {
          push @bad_list, $tx->id;
        }
      }
    }

    #check that no cds or utr features overlap each other
    # also counts introns that are too short
    # introns shorter than 4 bp are in all likelihood 
    # biologically impossible (at least 2 bp for each splice signal)
    my $p_exons;
    my $overlap = 0;
    my $zero_intron_count = 0;
    my $short_intron_count = 0;
    foreach $p_exons ($utr5, $starts, $cds, $stops, $utr3)
    {
      my $last_stop;
      if ($#$p_exons > 0)
      {
        $last_stop = $$p_exons[0]->stop;
      }

      for(my $i = 1;$i <= $#$p_exons;$i++){
        my $intron_length = $$p_exons[$i]->start - $last_stop - 1;
        if($intron_length < 0){
          $overlap++;
        }
        elsif($intron_length < 4){
          $zero_intron_count++;
        }
        elsif($intron_length < 20){
          $short_intron_count++;
        }
        $last_stop = $$p_exons[$i]->stop;
      }
    }

    # check CDS - UTR junctions for overlaps
    # here, introns with length 0 are allowed because some
    # exons could be in part UTR and in part CDS
    if ($tx->strand eq "+")
    {
      my $ilen;
      if (($#$utr5 > -1) && ($#$starts > -1)){
        $ilen = $$starts[0]->start - $$utr5[$#$utr5]->stop - 1;
        $overlap++ if ($ilen < 0);
        $zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
      }
      if (($#$cds > -1) && ($#$stops > -1)){
        # assumes stop codons are not included in CDS
        $ilen = $$stops[0]->start - $$cds[$#$cds]->stop - 1;
        $overlap++ if ($ilen < 0);
        $zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
      }
      if (($#$stops > -1) && ($#$utr3 > -1)){
        $ilen = $$utr3[0]->start - $$stops[$#$stops]->stop - 1;
        $overlap++ if ($ilen < 0);
        $zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
      }
    }
    else{
      my $ilen;
      if (($#$utr3 > -1) && ($#$stops > -1)){
        $ilen = $$stops[0]->start - $$utr3[$#$utr3]->stop - 1;
        $overlap++ if ($ilen < 0);
        $zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
      }
      if (($#$stops > -1) && ($#$cds > -1)){
        # assumes stop codon is not included in CDS
        $ilen = $$cds[0]->start - $$stops[$#$stops]->stop - 1;
        $overlap++ if ($ilen < 0);
        $zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
      }
      if (($#$starts > -1) && ($#$utr5 > -1)){
        $ilen = $$utr5[0]->start - $$starts[$#$starts]->stop - 1;
        $overlap++ if ($ilen < 0);
        $zero_intron_count++ if (($ilen > 0) && ($ilen < 4));
      }
    }

    if($overlap > 0){
      if($errors[45] < $max_error_count){
        print $warnings
            "Overlapping CDS or UTR features in transcript \"".$tx->id."\".\n";
      }
      $errors[45]++;
      if (_check_errors_against_badlist($gtf,45)) {
        push @bad_list, $tx->id;
      }
    }
    if($zero_intron_count > 0){
      if($errors[48] < $max_error_count){
        print $warnings "$zero_intron_count intron(s) shorter than 4 bp found on in transcript \"".$tx->id."\".\n" ;
      }
      $errors[48]++;
      if (_check_errors_against_badlist($gtf,48)) {
        push @bad_list, $tx->id;
      }
    }
    if($short_intron_count > 0){
      if($errors[49] < $max_error_count){
        print $warnings "$short_intron_count short (<20bp) intron(s) found on in transcript \"".$tx->id."\".\n" ;
      }
      $errors[49]++;
      if (_check_errors_against_badlist($gtf,49)) {
        push @bad_list, $tx->id;
      }
    }
    #check that each cds is between the start and stop and that the initial and 
    #terminal exons are placed correctly with respect to the start/stop codons
    if($start_codon_start >= 0){
      if($strand eq '-'){
        for(my $i = $#$cds;$i >= 0;$i--){
          if($$cds[$i]->end > $start_codon_stop){
            if($errors[21] < $max_error_count){
              print $warnings
                  "CDS before start codon in transcript_id = \"".
                      "".$tx->id."\".\n";
            }
            $errors[21]++;
            if (_check_errors_against_badlist($gtf,21)) {
              push @bad_list, $tx->id;
            }
          }
        }
        my $cds = $tx->initial_exon;
        if($cds->end != $start_codon_stop){
          if($errors[33] < $max_error_count){
            print $warnings
                "Start codon location not consistent with initial exon ".
                    "locaiton in transcript_id = \"".$tx->id."\".\n";
          }
          $errors[33]++;
          if (_check_errors_against_badlist($gtf,33)) {
            push @bad_list, $tx->id;
          }
        }
        if($cds->length < 3){
          my $cds = $tx->cds;
          my $error = 0;
          for(my $i = 0;$i <= $#$starts;$i++){
            if($i == $#$starts){
              unless($$cds[$#$cds-$i]->start <= $$starts[$#$starts-$i]->start){
                $error = 1;
              }
            }
            else{
              unless($$cds[$#$cds-$i]->start == $$starts[$#$starts-$i]->start){
                $error = 1;
              }
            }
          }
          if($error){
            if($errors[47] < $max_error_count){
              print $warnings "Start codon annotated in intron region on ".
                  "transcript_id = \"".$tx->id."\".\n";
            }
            $errors[47]++;
            if (_check_errors_against_badlist($gtf,47)) {
              push @bad_list, $tx->id;
            }
            my $pos = 0;
            my $start_pos = 0;
            my $cds_pos = $#$cds;
            #fix the start codon by splitting it across the cds
            while($pos < 3){
              my $end;
              if($$cds[$cds_pos]->length < 3-$pos){
                $end = $$cds[$cds_pos]->start;
                $pos += $$cds[$cds_pos]->length;
              }
              else{
                $end = $$cds[$cds_pos]->end-2+$pos;
                $pos = 3;
              }
              if($start_pos <= $#$starts){
                $$starts[$start_pos]->set_start($end);
                $$starts[$start_pos]->set_stop($$cds[$cds_pos]->end);
              }
              else{
                my $sc = GTF::Feature::new("start_codon", $end, $$cds[$cds_pos]->end, ".", ".");
                $tx->add_feature($sc);
              }
              $cds_pos--;
              $start_pos++;
            }
            $starts = [sort {$a->start <=> $b->start} @$starts];
          }
        }
      }
      else{
        for(my $i = 0;$i <= $#$cds;$i++){
          if($$cds[$i]->start < $start_codon_start){
            if($errors[21] < $max_error_count){
              print $warnings
                  "CDS before start codon in transcript_id = \"".
                      "".$tx->id."\".\n";
            }
            $errors[21]++;
            if (_check_errors_against_badlist($gtf,21)) {
              push @bad_list, $tx->id;
            }
          }
        }
        my $cds = $tx->initial_exon;
        if($cds->start != $start_codon_start){
          if($errors[33] < $max_error_count){
            print $warnings
                "Start codon location not consistent with initial exon ".
                    "locaiton in transcript_id = \"".$tx->id."\".\n";
          }
          $errors[33]++;
          if (_check_errors_against_badlist($gtf,33)) {
            push @bad_list, $tx->id;
          }
        }
        if($cds->length < 3){
          my $cds = $tx->cds;
          my $error = 0;
          for(my $i = 0; $i <= $#$starts;$i++){
            if($i == $#$starts){
              unless($$cds[$i]->stop >= $$starts[$i]->stop){
                $error = 1;
              }
            }
            else{
              unless($$cds[$i]->stop == $$starts[$i]->stop){
                $error = 1;
              }
            }
          }
          if($error){
            if($errors[47] < $max_error_count){
              print $warnings "Start codon annotated in intron region on ".
                  "transcript_id = \"".$tx->id."\".\n";
            }
            $errors[47]++;
            if (_check_errors_against_badlist($gtf,47)) {
              push @bad_list, $tx->id;
            }
            my $pos = 0;
            my $start_pos = 0;
            my $cds_pos = 0;
            #fix the start codon by splitting it across the cds
            while($pos < 3){
              my $end;
              if($$cds[$cds_pos]->length < 3-$pos){
                $end = $$cds[$cds_pos]->stop;
                $pos += $$cds[$cds_pos]->length;
              }
              else{
                $end = $$cds[$cds_pos]->start+2-$pos;
                $pos = 3;
              }
              if($start_pos <= $#$starts){
                $$starts[$start_pos]->set_start($$cds[$cds_pos]->start);
                $$starts[$start_pos]->set_stop($end);
              }
              else{
                my $sc = GTF::Feature::new("start_codon", $$cds[$cds_pos]->start, $end, ".", ".");
                $tx->add_feature($sc);
              }
              $cds_pos++;
              $start_pos++;
            }
            $starts = [sort {$a->start <=> $b->start} @$starts];
          }
        }
      }
    }
    if($stop_codon_start >= 0){
      if($strand =~ /\-/){
        for(my $i = 0;$i <= $#$cds;$i++){
          if($$cds[$i]->start < $stop_codon_start){
            if($errors[22] < $max_error_count){
              print $warnings
                  "CDS after stop codon in transcript_id = \"".
                      "".$tx->id."\".\n";
            }
            $errors[22]++;
            if (_check_errors_against_badlist($gtf,22)) {
              push @bad_list, $tx->id;
            }
          }
        }
        my $cds = $tx->terminal_exon;
        if($cds->start == $stop_codon_start){
          unless($cds->stop < $stop_codon_stop + 1){
            $cds->set_start($stop_codon_stop + 1);
          }
          if($errors[46] < $max_error_count){
            print $warnings
                "Stop codon included in terminal CDS transcript_id = ".
                    "\"".$tx->id."\".\n";
          }
          $errors[46]++;
          if (_check_errors_against_badlist($gtf,46)) {
            push @bad_list, $tx->id;
          }
        }
        if($cds->start != $stop_codon_stop + 1){
          if($errors[34] < $max_error_count){
            print $warnings
                "Stop codon location not consistent with terminal exon ".
                    "locaiton in transcript_id = \"".$tx->id."\".\n";
          }
          $errors[34]++;
          if (_check_errors_against_badlist($gtf,34)) {
            push @bad_list, $tx->id;
          }
          
        }
      }
      else{
        for(my $i = $#$cds;$i >= 0;$i--){
          if($$cds[$i]->end > $stop_codon_stop){
            if($errors[22] < $max_error_count){
              print $warnings
                  "CDS after stop codon in transcript_id = \"".$tx->id."\".\n";
            }
            $errors[22]++;
            if (_check_errors_against_badlist($gtf,22)) {
              push @bad_list, $tx->id;
            }
          }		    
        }
        my $cds = $tx->terminal_exon;
        if($cds->stop == $stop_codon_stop){
          unless($cds->start > $stop_codon_start -1){
            $cds->set_stop($stop_codon_start - 1);
          }
          if($errors[46] < $max_error_count){
            print $warnings
                "Stop codon included in terminal CDS transcript_id = ".
                    "\"".$tx->id."\".\n";
          }
          $errors[46]++;
          if (_check_errors_against_badlist($gtf,46)) {
            push @bad_list, $tx->id;
          }
        }
        if($cds->stop != $stop_codon_start - 1){
          if($errors[34] < $max_error_count){
            print $warnings
                "Stop codon location not consistent with terminal exon ".
                    "locaiton in transcript_id = \"".$tx->id."\".\n";
          }
          $errors[34]++;
          if (_check_errors_against_badlist($gtf,34)) {
            push @bad_list, $tx->id;
          }
        }
      }
    }
    #check that complete transcripts have even number of codons
    if(($start_codon_start >= 0) && ($stop_codon_stop >= 0)){
      my $len = 0;
      for(my $i = $#$cds;$i >= 0;$i--){
        $len += $$cds[$i]->length;
      }
      unless(($len % 3) == 0){
        if($errors[41] < $max_error_count){
          print $warnings
              "Complete transcript length not a multiple of three".
                  " on transcript_id = \"".$tx->id."\".\n";
        }
        $errors[41]++;
        if (_check_errors_against_badlist($gtf,41)) {
          push @bad_list, $tx->id;
        }
      }
    }
    #check for consistent phase
    my $phase = -1;
    if($start_codon_start >= 0){
      $phase = 0;
    }
    elsif($start_codon_stop >= 0){
      my $len = 0;
      for(my $i = $#$cds;$i >= 0;$i--){
        $len += $$cds[$i]->length;
      }
      $phase = $len % 3;
    }
    if($strand eq "-"){
      if($phase == -1){
        $phase = $$cds[$#$cds]->frame;
        if($phase eq "."){
          my $pos = -1;
          for(my $i = $#$cds; $i >= 0;$i--){
            unless($$cds[$i]->frame eq "."){
              $pos = $i;
              $phase = $$cds[$i]->frame;
              last;
            }
          }
          if($pos >= 0){
            for(my $i = $pos + 1; $i <= $#$cds;$i++){
              my $len = $$cds[$i]->length + $phase;
              $phase = $len % 3;
            }
          }
        }
        
      }
      unless($phase eq "."){
        my $bad_frame = 0;
        for(my $i = $#$cds;$i >= 0;$i--){
          unless($phase eq $$cds[$i]->frame){
            unless($$cds[$i]->frame eq "."){
              $bad_frame = 1;
            }
            $$cds[$i]->set_frame($phase);
          }
          $phase = ($$cds[$i]->length - $phase) % 3;
          if($phase == 1){
            $phase = 2;
          }
          elsif($phase == 2){
            $phase = 1;
          }
        }
        if($bad_frame){
          if($errors[42] < $max_error_count){
            print $warnings
                "Inconsistent frame accross transcript_id = ".
                    "\"".$tx->id."\".\n";
          }
          $errors[42]++;
          if (_check_errors_against_badlist($gtf,42)) {
            push @bad_list, $tx->id;
          }
        }
      }
    }
    else{
      if($phase == -1){
        $phase = $$cds[0]->frame;
        if($phase eq "."){
          my $pos = -1;
          for(my $i = 0; $i <= $#$cds;$i++){
            unless($$cds[$i]->frame eq "."){
              $pos = $i;
              $phase = $$cds[$i]->frame;
              last;
            }
          }
          if($pos >= 0){
            for(my $i = $pos - 1; $i >= 0;$i--){
              my $len = $$cds[$i]->length + $phase;
              $phase = $len % 3;
            }
          }
        }
      }
      unless($phase eq "."){
        my $bad_frame = 0;
        for(my $i = 0; $i <= $#$cds;$i++){
          unless($phase eq $$cds[$i]->frame){
            unless($$cds[$i]->frame eq "."){
              $bad_frame = 1;
            }
            $$cds[$i]->set_frame($phase);
          }
          $phase = ($$cds[$i]->length - $phase) % 3;
          if($phase == 1){
            $phase = 2;
          }
          elsif($phase == 2){
            $phase = 1;
          }
        }
        if($bad_frame){
          if($errors[42] < $max_error_count){
            print $warnings
                "Inconsistent frame accross transcript_id = ".
                    "\"".$tx->id."\".\n";
          }
          $errors[42]++;
          if (_check_errors_against_badlist($gtf,42)) {
            push @bad_list, $tx->id;
          }
        }
      }
    }
    # check selenocysteine (SEC) features here.  must be inside a 
    # CDS and have frame 0
    if($#$sec >= 0){
      foreach my $s (@$sec){
        if($s->frame ne "0" && $s->frame ne "."){
          #bad SEC frame
          if($errors[51] < $max_error_count){
            print $warnings
                "Bad frame, ".$s->frame."on SEC feature on tx ".$tx->id.".  Must of 0 or \".\"\n";
          }
          $errors[51]++;
          if (_check_errors_against_badlist($gtf,51)) {
            push @bad_list, $tx->id;
          }
        }
        my $good = 0;
        foreach my $c (@$cds){
          if($c->stop < $s->start){
            next;
          }
          if($c->start <= $s->start &&
             $c->stop >= $s->stop){
            $good = 1;
            last;
          }
          if($c->start > $s->stop){
            last;
          }
        }
        if($good == 0){
          # SEC outside CDS
          if($errors[52] < $max_error_count){
            print $warnings
                "SEC feature outside of CDS region on tx ".$tx->id.".\n";
          }
          $errors[52]++;
          if (_check_errors_against_badlist($gtf,52)) {
            push @bad_list, $tx->id;
          }
          
        }
      }
    }
  }

  #if the conservation sequence is available store it's info;
  my $conseqname = $gtf->{Conseq};
  my %total_conserv = (0 => 0,1 => 0,2 => 0);
  if($conseqname){
    open(Conseq, $conseqname)
        or die "Could not open conservation sequence file: \"$conseqname\".\n";
    my @conseq_info = stat(Conseq);
    my @comp;
    foreach my $tx (@all_txs){
      my $cds = $tx->cds;
      my $exons = $tx->exons;
      my $starts = $tx->start_codons;
      my $stops = $tx->stop_codons;	    
      foreach my $start (@$starts){
        push @comp,{feature => $start, "0" => 0, "1" => 0, "2" => 0};
      }
      foreach my $stop (@$stops){
        push @comp,{feature => $stop, "0" => 0, "1" => 0, "2" => 0};
      }
      foreach my $exon (@$cds){
        push @comp,{feature => $exon, "0" => 0, "1" => 0, "2" => 0};
      }
      foreach my $exon (@$exons){
        push @comp,{feature => $exon, "0" => 0, "1" => 0, "2" => 0};
      }
    }
    @comp = sort {$a->{feature}->start <=> $b->{feature}->start} @comp;
    my $base = "X";
    my $nuc_pos = 0;
    my $file_pos = 0;
    my $line = "X";
    my $next_comp = 0;
    my $max_comps = $#comp;
    my %checking;
    while(($next_comp <= $max_comps) and ($file_pos < $conseq_info[7])){
      $base = getc(Conseq);	    
      unless(defined($base)){
        print $warnings "Features beyond end of conservation sequence.\n";
        last;
      }
      $file_pos++;		   
      if($base =~ /\d/){
        $nuc_pos++;
        if(defined($total_conserv{$base})){
          $total_conserv{$base}++;
        }
        else{
          $total_conserv{X}++;
          if($errors[43] < $max_error_count){
            print $warnings "Bad conservation sequence character, $base.  ".
                "Should be \'0\',\'1\', or \'2\'.\n";
          }
          $errors[43]++;
        }
        #compile composition stats
        while(($next_comp <= $max_comps) and
              ($nuc_pos == $comp[$next_comp]->{feature}->start)){
          $checking{$comp[$next_comp]} = $comp[$next_comp];
          $next_comp++;
        }
        foreach my $id (keys %checking){
          $checking{$id}->{$base}++;
          if($checking{$id}->{feature}->stop == $nuc_pos){
            delete $checking{$id};
          }
        }
      }
    }
    while($file_pos < $conseq_info[7]){
      $base = getc(Conseq);	    
      $file_pos++;
      if($base =~ /\d/){
        $nuc_pos++;
        if(defined($total_conserv{$base})){
          $total_conserv{$base}++;
        }
        else{
          $total_conserv{X}++;
          if($errors[43] < $max_error_count){
            print $warnings "Bad conservation sequence character, $base.  ".
                "Should be \'0\',\'1\', or \'2\'.\n";
          }
          $errors[43]++;
        }
      }
    }
    close(Conseq);
    foreach my $info (@comp){
      $info->{feature}->set_conseq($info->{0},$info->{1},$info->{2});
    }
  }
  $gtf->{Total_Conseq} = \%total_conserv;
  #if the sequence is available use it to fix any problems
  my @inframe_stops;
  my $seqname = $gtf->{Sequence};
  my %tx_phase;
  foreach my $tx (@all_txs){
    $tx_phase{$tx->id} = -1;
  }
  my %total_seq = ('A' => 0,'C' => 0,'G' =>0,'T' => 0,'N' => 0,'X' => 0);
  #EVAN need to rewrite all this sequence loading stuff because it is fugly
  if($seqname){
    open(SEQ, "<$seqname")
        or die "Could not open sequence file: \"$seqname\".\n";
    my @seq_info = stat(SEQ);
    #try to fix things with the sequence info here
    #check start codons, stop codons, splice site, internal stop
    my %sequence;
    my %seq_pos;
    my @checks;
    foreach my $tx (@all_txs){
      $sequence{$tx->id} = "";
      $seq_pos{$tx->id} = 0;
      my $starts = $tx->start_codons;
      my $stops = $tx->stop_codons;
      my $utr3 = $tx->utr3;
      my $utr5 = $tx->utr5;
      my $cds = $tx->cds;
      if($#$cds == -1){
        next;
      }
      my $exons = $tx->exons;
      foreach my $start (@$starts){
        push @checks, {pos => $start->start,
                       type => "start_sequence",
                       tx => $tx,
                       object => $start,
                       stop => $start->stop};
      }
      foreach my $stop (@$stops){
        push @checks, {pos => $stop->start,
                       type => "stop_sequence",
                       tx => $tx,
                       object => $stop,
                       stop => $stop->stop};
      }
      foreach my $exon (@$exons){
        push @checks, {pos => $exon->start,
                       type => "exon_sequence",
                       tx => $tx,
                       object => $exon,
                       stop => $exon->stop};
      }
      foreach my $coding (@$cds){
        push @checks, {pos => $coding->start,
                       type => "coding_sequence",
                       tx => $tx,
                       object => $coding,
                       stop => $coding->stop};
      }
    
      my($p_exons, $splice_type1, $splice_type2);
      if ($tx->strand eq "+"){
        $splice_type1 = "donor_splice";
        $splice_type2 = "acceptor_splice";
      }
      else{
        $splice_type1 = "acceptor_splice";
        $splice_type2 = "donor_splice";
      }

      foreach $p_exons ($utr5, $starts, $cds, $stops, $utr3){
        next if ($#$p_exons < 1);

        my($i, $subtype);
        $subtype = ($$p_exons[0]->type =~ /UTR/) ? "utr" : "cds";
        for ($i = 1; $i <= $#$p_exons; $i++){
          push @checks, {pos  => $$p_exons[$i-1]->stop+2, 
                         type => $subtype."_".$splice_type1, 
                         tx => $tx,
                         stop => 0};
          push @checks, {pos  => $$p_exons[$i]->start-1, 
                         type => $subtype."_".$splice_type2, 
                         tx => $tx,
                         stop => 0};
        }
      }

      # this assumes that the innermost start codon is included in CDS
      # therefore, no checks are run on start_codon - CDS junction
      # also, this allows zero-length introns at UTR5 - start_codon 
      # CDS - stop_codon and UTR3 - stop_codon junctions 
      if ($tx->strand eq "+"){
        if (($#$utr5 > -1) && ($#$starts > -1)){
          my $ilen = $$starts[0]->start - $$utr5[$#$utr5]->stop - 1;
          if ($ilen != 0){
            push @checks, {pos  => $$utr5[$#$utr5]->stop+2, 
                           type => "utr_donor_splice", 
                           tx => $tx,
                           stop => 0};
            push @checks, {pos  => $$starts[0]->start-1, 
                           type => "utr_acceptor_splice", 
                           tx => $tx,
                           stop => 0};
          }
        }
        if (($#$cds > -1) && ($#$stops > -1)){
          my $ilen = $$stops[0]->start - $$cds[$#$cds]->stop - 1;
          if ($ilen != 0){
            push @checks, {pos  => $$cds[$#$cds]->stop+2, 
                           type => "cds_donor_splice", 
                           tx => $tx,
                           stop => 0};
            push @checks, {pos  => $$stops[0]->start-1, 
                           type => "cds_acceptor_splice", 
                           tx => $tx,
                           stop => 0};
          }
        }  
        if (($#$stops > -1) && ($#$utr3 > -1)){
          my $ilen = $$utr3[0]->start - $$stops[$#$stops]->stop - 1;
          if ($ilen != 0){
            push @checks, {pos  => $$stops[$#$stops]->stop+2, 
                           type => "utr_donor_splice", 
                           tx => $tx,
                           stop => 0};
            push @checks, {pos  => $$utr3[0]->start-1, 
                           type => "utr_acceptor_splice", 
                           tx => $tx,
                           stop => 0};
          }
        }
        if ($#{$stops} < 0){
          push @checks, {pos => $$cds[$#{$cds}]->stop+1,
                         type => "stop_sequence?",
                         tx => $tx,
                         stop => $$cds[$#{$cds}]->stop+3};
        }
        if ($#{$starts} < 0){
          if ($$cds[0]->start-3 > 0){
            push @checks, {pos => $$cds[0]->start-3,
                           type => "start_sequence?",
                           tx => $tx,
                           stop => $$cds[0]->start-1};
          }
          else{
            $sequence{$tx->id} = "NNN";
          }
        }
      }	    
      else{
        if (($#$utr3 > -1) && ($#$stops > -1)){
          my $ilen = $$stops[0]->start - $$utr3[$#$utr3]->stop - 1;
          if ($ilen != 0){
            push @checks, {pos  => $$utr3[$#$utr3]->stop+2, 
                           type => "utr_acceptor_splice", 
                           tx => $tx,
                           stop => 0};
            push @checks, {pos  => $$stops[0]->start-1, 
                           type => "utr_donor_splice", 
                           tx => $tx,
                           stop => 0};
          }
        }
        if (($#$stops > -1) && ($#$cds > -1)){
          my $ilen = $$cds[0]->start - $$stops[$#$stops]->stop - 1;
          if ($ilen != 0){
            push @checks, {pos  => $$stops[$#$stops]->stop+2, 
                           type => "cds_acceptor_splice", 
                           tx => $tx,
                           stop => 0};
            push @checks, {pos  => $$cds[0]->start-1, 
                           type => "cds_donor_splice", 
                           tx => $tx,
                           stop => 0};
          }
        }  
        if (($#$starts > -1) && ($#$utr5 > -1)){
          my $ilen = $$utr5[0]->start - $$starts[$#$starts]->stop - 1;
          if ($ilen != 0){
            push @checks, {pos  => $$starts[$#$starts]->stop+2, 
                           type => "utr_acceptor_splice", 
                           tx => $tx,
                           stop => 0};
            push @checks, {pos  => $$utr5[0]->start-1, 
                           type => "utr_donor_splice", 
                           tx => $tx,
                           stop => 0};
          }
        }
        if ($#{$stops} < 0){
          if ($$cds[0]->start-3 > 0){
            push @checks, {pos => $$cds[0]->start-3,
                           type => "stop_sequence?",
                           tx => $tx,
                           stop => $$cds[0]->start-1};
          }
        }
        if ($#{$starts} < 0){
          push @checks, {pos => $$cds[$#{$cds}]->stop+1,
                         type => "start_sequence?",
                         tx => $tx,
                         stop => $$cds[$#{$cds}]->stop+3};
        }
      }	    
    }
    
    @checks  = sort{$a->{pos} <=> $b->{pos} || $a->{stop} <=> $b->{stop}} @checks;
    my $file_pos = length(<SEQ>);
    my $base = "X";
    my $nuc_pos = 0;
    my $line = "X";
    my $next = 0;
    my $prev_base = "X";
    my $max_checks = $#checks;
    my %checking;
    my $splice_type;
    
    while((($next <= $max_checks) || ((keys %checking) >= 0)) 
          && ($file_pos < $seq_info[7])){
      $base = getc(SEQ);
      unless(defined($base)){
        print $warnings "Features beyond end of sequence.\n";
        last;
      }
      $file_pos++;		   
      if($base =~ /\S/){
        $base = uc($base);
        if((defined($total_seq{$base})) && ($base ne 'X')){
          $total_seq{$base}++;
        }
        else{
          $total_seq{X}++;
          if($errors[44] < $max_error_count){
            print $warnings "Bad fasta sequence character, $base. Should be".
                " \'A\',\'C\',\'T\',\'G\',or \'N\'.\n";
          }
          $errors[44]++;
        }
        $nuc_pos++;
        while(($next <= $max_checks) && ($checks[$next]{pos} == $nuc_pos)){
          if($checks[$next]{type} =~ /sequence/){
            $checking{$checks[$next]{tx}->id . "\t" . 
                          $checks[$next]{type}} = {
                            check => $checks[$next],
                            A => 0,
                            C => 0,
                            G => 0,
                            T => 0,
                            N => 0};
          }
          elsif(($checks[$next]{type} =~ /^(\w+)_acceptor_splice$/)
                && ($splice_type = $1)) {
            if($checks[$next]{tx}->strand eq "-"){
              unless(($prev_base eq "C") && ($base eq "T")){
                if (($prev_base eq "G") && ($base eq "T")) {
                  if ($errors[50] < $max_error_count) {
                    print $warnings
                        "Possible non-canonical splice site sequence, ".
                            "$prev_base$base, on negative strand transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".\n";
                  }
                  $errors[50]++;
                  if ((_check_errors_against_badlist($gtf,50)) 
                      && ($splice_type ne "utr")) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }
                else {
                  if($errors[26] < $max_error_count){
                    print $warnings
                        "Bad acceptor splice site sequence, $prev_base".
                            "$base, on negative strand transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".  Should be CT or GT.\n";
                  }
                  $errors[26]++;
                  if (_check_errors_against_badlist($gtf,26)) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }
              }
              
            }
            else{
              unless(($prev_base eq "A") && ($base eq "G")){
                if (($prev_base eq "A") && ($base eq "C")) {
                  if ($errors[50] < $max_error_count) {
                    print $warnings
                        "Possible non-canonical splice site sequence, ".
                            "$prev_base$base, on transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".\n";
                  }
                  $errors[50]++;
                  if ((_check_errors_against_badlist($gtf,50)) 
                      && ($splice_type ne "utr")) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }
                else {
                  if($errors[26] < $max_error_count){
                    print $warnings
                        "Bad acceptor splice site sequence, $prev_base".
                            "$base, on transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".  Should be AG or AC.\n";
                  }
                  $errors[26]++;
                  if (_check_errors_against_badlist($gtf,26)) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }
              }
            }
          }
          elsif(($checks[$next]{type} =~ /^(\w+)_donor_splice$/)
                && ($splice_type = $1)) {
            if($checks[$next]{tx}->strand eq "-"){
              unless(($prev_base eq "A") && ($base eq "C")){
                if (($prev_base eq "A") && ($base eq "T")) {
                  if ($errors[50] < $max_error_count) {
                    print $warnings
                        "Possible non-canonical splice site sequence, ".
                            "$prev_base$base, on negative strand transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".\n";
                  }
                  $errors[50]++;
                  if ((_check_errors_against_badlist($gtf,50)) 
                      && ($splice_type ne "utr")) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }
                elsif (($prev_base eq "G") && ($base eq "C")) {
                  if ($errors[50] < $max_error_count) {
                    print $warnings
                        "Possible non-canonical splice site sequence, ".
                            "$prev_base$base, on negative strand transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".\n";
                  }
                  $errors[50]++;
                  if ((_check_errors_against_badlist($gtf,50)) 
                      && ($splice_type ne "utr")) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }					
                else {
                  if ($errors[27] < $max_error_count){
                    print $warnings
                        "Bad donor splice site sequence, $prev_base".
                            "$base, on negative strand transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".  Should be AC, GC, or AT.\n";
                  }
                  $errors[27]++;
                  if (_check_errors_against_badlist($gtf,27)) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }
              }
            }
            else{
              unless(($prev_base eq "G") && ($base eq "T")){
                if (($prev_base eq "G") && ($base eq "C")) {
                  if ($errors[50] < $max_error_count) {
                    print $warnings
                        "Possible non-canonical splice site sequence, ".
                            "$prev_base$base, on transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".\n";
                  }
                  $errors[50]++;
                  if ((_check_errors_against_badlist($gtf,50)) 
                      && ($splice_type ne "utr")) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }
                elsif (($prev_base eq "A") && ($base eq "T")) {
                  if ($errors[50] < $max_error_count) {
                    print $warnings
                        "Possible non-canonical splice site sequence, ".
                            "$prev_base$base, on transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".\n";
                  }
                  $errors[50]++;
                  if ((_check_errors_against_badlist($gtf,50)) 
                      && ($splice_type ne "utr")) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }					
                else {
                  if ($errors[27] < $max_error_count){
                    print $warnings
                        "Bad donor splice site sequence, $prev_base".
                            "$base, on transcript \"".
                                $checks[$next]{tx}->id.
                                    "\".  Should be GT, GC, or AT.\n";
                  }
                  $errors[27]++;
                  if (_check_errors_against_badlist($gtf,27)) {
                    push @bad_list, $checks[$next]{tx}->id;
                  }
                }
              }
            }
          }
          $next++;
        }
        foreach my $check (keys %checking){		
          $check =~ /^(.+)\t.+$/;
          my $tx_id = $1;
          unless(($checking{$check}{check}{type} =~ /exon/) || 
                 ($seq_pos{$tx_id} >= $nuc_pos)){
            $sequence{$tx_id}.= uc($base);
            $seq_pos{$tx_id} = $nuc_pos;
          }
          $checking{$check}{uc($base)}++;
          if($checking{$check}{check}{stop} eq $nuc_pos){			
            my $object = $checking{$check}{check}{object};
            if(defined($object)){
              if($object->strand eq "-"){
                $object->set_bases($checking{$check}{T},
                                   $checking{$check}{G},
                                   $checking{$check}{C},
                                   $checking{$check}{A},
                                   $checking{$check}{N});
                
              }
              else{
                $object->set_bases($checking{$check}{A},
                                   $checking{$check}{C},
                                   $checking{$check}{G},
                                   $checking{$check}{T},
                                   $checking{$check}{N});
              }
            }
            delete $checking{$check};
          }
        }	
        $prev_base = $base;
      }
    }
    foreach my $tx (@all_txs){
      my $cds = $tx->cds;
      unless($#$cds >= 0){
        next;
      }
      my $sec = $tx->sec;
      my $starts = $tx->start_codons;
      my $stops = $tx->stop_codons;
      my $strand = $tx->strand;
      my $tx_id = $tx->id;
      my $phase = $$cds[0]->frame;
      my $old_phase = $phase;
      if($strand eq "-"){
        $phase = $$cds[$#$cds]->frame;
      }
      my $seq = $sequence{$tx->id};	    
      if($seq =~ /^$/){
        print STDERR "No sequence for $tx_id\n";
        next;
      }
      if($tx->strand eq "-"){
        $seq = _rev_comp($seq);
      }
      my $start = 0;
      my $stop = 0;
      my $found_start = -1;
      my $found_stop = -1;	    
      my @seq = split //, $seq;	    
      #check some SEC stuff here;
      foreach my $s (@$sec){
        my $pos = 0;
        if($tx->strand eq '-'){
          foreach my $c (reverse @$cds){
            if($c->start > $s->stop){
              $pos += $c->stop-$c->start+1;
            }
            else{
              $pos += $c->stop-$s->stop;
              last;
            }
          }
          if($#$stops == -1){
            $pos += 3;
          }
        }
        else{
          foreach my $c (@$cds){
            if($c->stop < $s->start){
              $pos += $c->stop-$c->start+1;
            }
            else{
              $pos += $s->start-$c->start;
              last;
            }
          }
          if($#$starts == -1){
            $pos += 3;
          }
        }
        my $codon = substr($seq,$pos,3);
        unless($codon eq "TGA"){
          #bad SEC codon
          $error_msgs[53] = "SEC feature had bad sequence.  Must be \"TGA\"";
          if($errors[53] < $max_error_count){
            print $warnings "SEC feature had bad sequence, $codon, on tx, ".$tx->id.
                ".  Must be \"TGA\"\n";
          }
          $errors[53]++;
          if (_check_errors_against_badlist($gtf,53)) {
            push @bad_list, $tx->id;
          }
        }
        substr($seq,$pos,3) = "NNN";
      }
      if($#$starts >= 0){
        my $sc = substr($seq,0,3);
        unless($sc eq "ATG"){
          if($errors[24] < $max_error_count){
            print $warnings
                "Bad start codon sequence, $sc, on tx ".
                    $tx->id.".  Should be ATG.\n";
          }
          $errors[24]++;
          if (_check_errors_against_badlist($gtf,24)) {
            push @bad_list, $tx->id;
          }
        }
      }
      else{
        my $sc1 = substr($seq,0,3);
        my $sc2 = substr($seq,3,3);
        if($sc2 eq "ATG"){
          $found_start = 3;
          $start = 3;
        }
        elsif($sc1 eq "ATG"){
          $found_start = 0;
        }
        else{
          $start = 3;
        }
      }
      if($#$stops >= 0){
        my $sc = substr($seq,$#seq-2,3);
        unless(($sc eq "TAA") ||
               ($sc eq "TAG") ||
               ($sc eq "TGA")){
          if($errors[25] < $max_error_count){
            print $warnings
                "Bad stop codon sequence, $sc, on tx ".
                    $tx->id.".  Should be TAA, TAG, or TGA.\n";
          }
          $errors[25]++;
          if (_check_errors_against_badlist($gtf,25)) {
            push @bad_list, $tx->id;
          }
        }		       
      }
      else{
        my $sc1 = substr($seq,$#seq - 2,3);
        my $sc2 = substr($seq,$#seq - 5,3);
        if(($sc2 eq "TAA") ||
           ($sc2 eq "TAG") ||
           ($sc2 eq "TGA")){
          $found_stop = $#seq - 5;
          $stop = 3;
        }
        elsif(($sc1 eq "TAA") ||
              ($sc1 eq "TAG") ||
              ($sc1 eq "TGA")){
          $found_stop = $#seq - 2;
        }
        else{
          $stop = 3;
        }
      }
      my @sc = (0,0,0);
      my $scphase = $phase;
      $scphase = 0 if($scphase eq '.');
      for(my $i = $start;$i <= $#seq - $stop - 5;$i++){
        my $codon = substr($seq,$i,3);
        if(($codon eq "TAA") ||
           ($codon eq "TAG") ||
           ($codon eq "TGA")){
          $sc[($i+$scphase) % 3] = 1;
        }
      }
      if(($phase ne ".") &&
         ($sc[0])){
        if($errors[29] < $max_error_count){
          print $warnings "In frame stop codon(s) found on transcript ".
              $tx->id."\n";
        }
        $errors[29]++;
        if (_check_errors_against_badlist($gtf,29)) {
          push @bad_list, $tx->id;
        }
        push @inframe_stops, $tx->id;
      }
      else{
        if(($sc[0] == 1) &&
           ($sc[1] == 1) &&
           ($sc[2] == 1)){
          push @inframe_stops, $tx->id;
          if($errors[37] < $max_error_count){
            print $warnings "Stop Codons in all frames on transcript ".
                $tx->id."\n";
          }
          $errors[37]++;
          if (_check_errors_against_badlist($gtf,37)) {
            push @bad_list, $tx->id;
          }
        }
      }
      # add start or stop if possible
      if($found_stop != -1){
        if(($phase eq ".") ||
           ($phase == (($#seq + 1) % 3))){
          if($sc[($#seq +1) % 3] == 0){
            $phase = ($#seq + 1) % 3;
            if($errors[32] < $max_error_count){
              print $warnings
                  "Unannotated stop codon found on transcript ".
                      $tx->id."\n";
            }
            $errors[32]++;
            if (_check_errors_against_badlist($gtf,32)) {
              push @bad_list, $tx->id;
            }
            if($fix_gtf){
              if($strand eq "+"){
                my $terminal_exon = $$cds[$#$cds];
                my $sc_start = $terminal_exon->stop + 1 - $stop;
                my $stop_codon = 
                  GTF::Feature::new("stop_codon",$sc_start,
                                    $sc_start + 2,0,0);
                $terminal_exon->set_stop($sc_start - 1);
                $tx->add_feature($stop_codon);
              }
              else{
                my $terminal_exon = $$cds[0];
                my $sc_start = $terminal_exon->start - 3 + $stop;
                my $stop_codon = 
                  GTF::Feature::new("stop_codon",$sc_start,
                                    $sc_start + 2,0,0);
                $terminal_exon->set_start($sc_start + 3);
                $tx->add_feature($stop_codon);
              }
            }
          }
        }
      }
      if($found_start != -1){
        if(($phase eq ".") ||
           ($phase == 0)){
          if($sc[0] == 0){
            if($errors[31] < $max_error_count){
              print $warnings
                  "Unannotated start codon found on transcript ".
                      $tx->id."\n";
            }
            $errors[31]++;
            if (_check_errors_against_badlist($gtf,31)) {
              push @bad_list, $tx->id;
            }
            if($fix_gtf){
              if($strand eq "+"){
                my $initial_exon = $$cds[0];
                my $sc_start = $initial_exon->start - 3 + $start;
                my $start_codon = 
                  GTF::Feature::new("start_codon",$sc_start,
                                    $sc_start + 2,0,0);
                $initial_exon->set_start($sc_start);
                $tx->add_feature($start_codon);
              }
              else{
                my $initial_exon = $$cds[$#$cds];
                my $sc_start = $initial_exon->stop + 1 - $start;
                my $start_codon = 
                  GTF::Feature::new("start_codon",$sc_start,
                                    $sc_start + 2,0,0);
                $initial_exon->set_stop($sc_start + 2);
                $tx->add_feature($start_codon);
              }
            }
          }
        }
      }
      # set phase if unset here;
      if(($phase ne ".") && ($old_phase eq ".") && ($fix_gtf)){
        if($strand eq "+"){
          for(my $i = 0;$i <= $#$cds;$i++){
            $$cds[$i]->set_frame($phase);
            $phase = ($$cds[$i]->length - $phase) % 3;
            if($phase == 1){
              $phase = 2;
            }
            elsif($phase == 2){
              $phase = 1;
            }
          }
        }
        else{
          for(my $i = $#$cds;$i >= 0;$i--){
            $$cds[$i]->set_frame($phase);
            $phase = ($$cds[$i]->length - $phase) % 3;
            if($phase == 1){
              $phase = 2;
            }
            elsif($phase == 2){
              $phase = 1;
            }
          }
        }
      }
      if($gtf->{Tx}){
        my $tx_fh = $gtf->{Tx};
        print $tx_fh ">".$tx->id ."\t".$tx->start."\t".$tx->stop."\n".
            substr($seq,$start,$#seq - $stop - $start + 1)."\n";
      }
    }
  }
  $gtf->{Total_Seq} = \%total_seq;
  for(my $i = 0;$i <= $#error_msgs;$i++){
    if(($errors[$i] > 0) && !($$no_warn[$i])){
      print $warnings 
          "\nWarnings encountered:\n",
          "Count\tDescription\n";
      for(my $j = $i;$j <= $#error_msgs;$j++){
        if(($errors[$j] > 0) && !($$no_warn[$j])){
          print $warnings
              $errors[$j],"\t$error_msgs[$j]\n";
        }
      }
      last;
    }
  }
  
  #sort genes, transcripts, cds and utr by start position
  @all_genes  = sort {$a->start <=> $b->start} @all_genes;
  @all_txs  = sort {$a->start <=> $b->start} @all_txs;
  @all_cds  = sort {$a->start <=> $b->start} @all_cds;
  @all_inter = sort {$a->start <=> $b->start} @all_inter;
  @all_cns = sort {$a->start <=> $b->start} @all_cns;

  print $warnings "\nStatistics:\n";
  print $warnings 
      "\t", $#all_genes + 1, " genes with ", $#all_txs + 1,
      " transcripts containing ", $#all_cds + 1 , " cds.\n";
  if($gtf->{Inframe_Stops}){
    print $warnings "Inframe Stop Genes:\n";
    foreach my $stop (@inframe_stops){
      print $warnings "$stop\n";
    }
  }
  if ($gtf->{Bad_List}) {
    print $warnings "Bad Genes:\n";
    my %bad_out;
    foreach my $bad (@bad_list) {
      $bad_out{$bad}++;
    }
    my @out = keys %bad_out; 
    while (@out) {
      print $warnings pop(@out),"\n";
    }
  }
  close(WARN);
  close(GTF);
  $gtf->{Genes}        = \@all_genes;
  $gtf->{Transcripts}  = \@all_txs;
  $gtf->{CDS}          = \@all_cds;
  $gtf->{Inter}        = \@all_inter;
  $gtf->{Inter_CNS}    = \@all_cns;
  $gtf->{Comments}     = $all_line_comments;
  $gtf->{Parse_Errors} = \@errors;
}

# GTF::_get_error_messages()
#    This function returns an array of the error messages associated with
#    each error number.
sub _get_error_messages{
  my @error_msgs;
  $error_msgs[0] = "Not enough fields on line.";
  $error_msgs[1] = "Illegal type of whitespace between fields.  Sould be tab.";
  $error_msgs[2] = "Illegal value for <feature> field.  Should be ".
      "\'CDS\', \'exon\', \'start_codon\', or \'stop_codon\'.";
  $error_msgs[3] = "Illegal value for <start> field.  Should be numerical.";
  $error_msgs[4] = "Illegal value for <stop> field.  Should be numerical.";
  $error_msgs[5] = "Illegal value for <score> field.  Should be numerical or \'.\'.";
  $error_msgs[6] = "Illegal value for <strand> field.  ".
      "Should be \'+\', \'-\', or \'.\'.";
  $error_msgs[7] = "Illegal value for <frame> field.  ".
      "Should be \'0\', \'1\', \'2\', or \'.\'.";
  $error_msgs[8] = "Illegal \'\#\' character in attribute name.";
  $error_msgs[9] = "Illegal tab character in <attributes> field.";
  $error_msgs[10] = "Missing \';\' terminator after attribute in <attributes> field.";
  $error_msgs[11] = "Missing \'\"\'s around attribute values in  <attributes> field.";
  $error_msgs[12] = "Incorrect separator between attribute name-value pairs in ".
      "<attributes> field.  Should be \' \'.";
  $error_msgs[13] = "Missing \'gene_id\' attribute in <attributes> field.";
  $error_msgs[14] = "Missing \'transcript_id\' attribute in <attributes> field.";
  $error_msgs[15] = "Transcript contains no CDS, inter or inter_CNS features.";
  $error_msgs[16] = "Start Codon length is not three.";
  $error_msgs[17] = "Transcript has no start codon.";
  $error_msgs[18] = "Stop Codon length is not three.";
  $error_msgs[19] = "Transcript has no stop codon.";	    
  $error_msgs[20] = "Inconsistent <strand> value across gene_id.";
  $error_msgs[21] = "CDS before start codon.";
  $error_msgs[22] = "CDS after stop codon.";
  $error_msgs[23] = "Multiple genes using the same transcript_id.";
  $error_msgs[24] = "Bad start codon sequence.  Should be ATG.";
  $error_msgs[25] = "Bad stop codon sequence.  Should be TAA, TAG, or TGA.";
  $error_msgs[26] = "Bad acceptor splice site sequence.";
  $error_msgs[27] = "Bad donor splice site sequence.";
  $error_msgs[28] = "In frame stop codon found.";
  $error_msgs[29] = "In frame stop codon in transcript.";
  $error_msgs[30] = "Illegal \'=\' character in <attributes> field.";
  $error_msgs[31] = "Unannotated start codon found.";
  $error_msgs[32] = "Unannotated stop codon found.";
  $error_msgs[33] = "Start codon location inconsistent with initial exon location.";
  $error_msgs[34] = "Stop codon location inconsistent with terminal exon location.";
  $error_msgs[35] = "Start codon length is wrong.";
  $error_msgs[36] = "Start codon length is wrong.";
  $error_msgs[37] = "Transcript does not translate in any frame.";
  $error_msgs[38] = "<start> field is greater than <stop> field.";
  $error_msgs[39] = "Wrong phase value.";
  $error_msgs[40] = "Missing phase value.";
  $error_msgs[41] = "Complete transcript length not multiple of 3.";
  $error_msgs[42] = "Inconsistant phase value accross transcript.";
  $error_msgs[43] = "Bad conservation sequence character.  Should be \'0\',\'1\', ".
      "or \'2\'.";
  $error_msgs[44] = "Bad fasta sequence character. Should be \'A\',\'C\',\'T\',\'G\',".
      "or \'N\'.";
  $error_msgs[45] = "Overlapping CDS features in transcript.";
  $error_msgs[46] = "Stop codon included in terminal CDS.";
  $error_msgs[47] = "Start codon annotated in intron region.";
  $error_msgs[48] = "Transcript contains zero length intron(s).";
  $error_msgs[49] = "Transcript contains short (<20bp) intron(s).";
  $error_msgs[50] = "Possible non-canonical splice site sequence.";
  $error_msgs[51] = "Bad frame on SEC feature.  Must be 0 or \".\"";
  $error_msgs[52] = "SEC feature outside of CDS feature.";
  $error_msgs[53] = "SEC feature had bad sequence.  Must be \"TGA\"";
  return @error_msgs;
}

#this checks to see if the error is one of the errors that
#we want to return the transcript name and remove
sub _check_errors_against_badlist{
  my ($gtf,$error_num) = @_;
  my $bad = $gtf->{Bad_List};
  if($$bad[$error_num]) {
    return 1;
  }
  else {
    return 0;
  }
}


#this loads the gtf file without checking for errors
sub _load_no_check{
  my ($gtf,$warnings) = @_;
  my (@all_genes,@all_txs,@all_cds,@all_inter,@all_cns);
  my %TxObj;
  my %GeneObj;
  my $strict = $gtf->{Strict};
  #open file 
  open(GTF, "<$gtf->{Filename}") 
      or die "Could not open $gtf->{Filename}.\n";
  while(my $in_line = <GTF>){
    chomp $in_line;
    if($in_line =~ /^(.*)\#/){
      $in_line = $1;
    }
    if($in_line =~ /^\s+(.*)$/){
      $in_line = $1;
    }
    if($in_line !~ /\S/){
      next;
    }
    my @data = split /\t/,$in_line;
    if($#data != 8){
      die "Wrong number of fields on line:\n$in_line\n";
    }
    if(!$strict && $data[5] eq "."){
      $data[5] = 0;
    }
    my %attributes;
    while($data[8] =~ /^\s*(\S+) \"(\S*)\";(.*)$/){
      my $key = $1;
      my $val = $2;
      $data[8] = $3;
      $attributes{$key} = $val;
    }
    my $gid;
    if(defined($attributes{gene_id})){
      $gid = $attributes{gene_id};
    }
    else{
      die "Missing gene_id\n";
    }
    my $tid;
    if(defined($attributes{transcript_id})){
      $tid = $attributes{transcript_id};
    }
    else{
      die "Missing transcript_id\n";
    }
    #create objects
    my $feature = 
      GTF::Feature::new($data[2], $data[3], $data[4], $data[5], $data[7]);
    if($feature->type eq "CDS"){
      push @all_cds, $feature;
    }
    elsif($feature->type eq "inter") {
      push @all_inter, $feature;
    }
    elsif($feature->type eq "inter_CNS") {
      push @all_cns, $feature;
    }

    my $tx;
    my $gene;
    unless(defined($TxObj{$tid}) && length($tid)){
      $tx = GTF::Transcript::new($tid);
      $TxObj{$tid} = $tx;
      unless($feature->type eq "inter" || $feature->type eq "inter_CNS") {
        push @all_txs, $tx;
      }
      unless(defined($GeneObj{$gid}) && length($gid)){
        $gene = GTF::Gene::new($gid,$data[0],$data[1],$data[6]);
        $GeneObj{$gid} = $gene;
        unless($feature->type eq "inter" || $feature->type eq "inter_CNS") {
          push @all_genes, $gene;
        }
      }
      $gene = $GeneObj{$gid};
      $gene->add_transcript($tx);
    }
    else{
      $gene = $GeneObj{$gid};
    }
    $tx = $TxObj{$tid};
    if($tx->strand ne $data[6]){
      die "Features on transcript ".$tx->id." have different strands\n";
    }
    $tx->add_feature($feature);
  }
  close(GTF);
  #sort genes, exons, and cds by start position
  @all_genes  = sort {$a->start <=> $b->start} @all_genes;
  @all_txs  = sort {$a->start <=> $b->start} @all_txs;
  @all_cds  = sort {$a->start <=> $b->start} @all_cds;
  @all_inter = sort {$a->start <=> $b->start} @all_inter;
  @all_cns = sort {$a->start <=> $b->start} @all_cns;
  $gtf->{Genes}        = \@all_genes;
  $gtf->{Transcripts}  = \@all_txs;
  $gtf->{CDS}          = \@all_cds;
  $gtf->{Inter_CNS}    = \@all_cns;
  $gtf->{Inter}        = \@all_inter;
}

sub _rev_comp{
  my ($seq) = @_;
  $seq = uc $seq;
  my @bases = split //, $seq;
  my $out = "";
  foreach my $base (@bases){
    if($base eq 'A'){
      $base = 'T';
    }
    elsif($base eq 'C'){
      $base = 'G';
    }
    elsif($base eq 'G'){
      $base = 'C';
    }
    elsif($base eq 'T'){
      $base = 'A';
    }			
    $out = $base.$out;
  }
  return $out;
  
}

###############################################################################
# GTF::Gene
###############################################################################
# The GTF::Gene object stores all gtf data for one gene.  A gene contains one
# or more transcripts
package GTF::Gene;
# GTF::Gene::new(gene_id,seqname,source,strand)
#    This is the contrcutor for a GTF::Gene object.  It has 4 required 
#    parameters.  The are: 
#      gene_id    - The gene_id for this gene
#      seqname   - The <seqname> field of the line.  Should be a string 
#                   containing no whitespace.
#      source     - The <source> field of the line.  Should be a string 
#                   containing no whitespace.
#      strand     - The <strand> field of the line.  Should be either
#                   '+', '-', or '.'.
sub new {
  my $gene = bless {};
  ($gene->{Id},$gene->{Seqname}, $gene->{Source}, $gene->{Strand}) = @_;
  $gene->{Transcripts} = [];
  $gene->{CDS} = [];
  $gene->{Modified} = 0;
  $gene->{Start} = -1;
  $gene->{Stop} = -1;
  return $gene;
}

sub add_transcript{
  my ($gene,$tx) = @_;
  my $txs = $gene->{Transcripts};
  push @$txs, $tx;
  $tx->_set_gene($gene);
  $tx->_update;
  $gene->{Modified} = 1;
}

sub id {shift->{Id}}

sub set_id{
  my ($gene,$id) = @_;
  $gene->{Id} = $id;
}

# GTF::Feature::seqname()
#    This function returns the sequnce name, <seqname> field, that this 
#    feature came from as a string.
sub seqname    {shift->{Seqname}}

# GTF::Feature::set_source()
#    This function sets the sequence name, <seqname> field, of this line
sub set_seqname{
  my ($feature,$seqname) = @_;
  $feature->{Seqname} = $seqname;
}

# GTF::Feature::source()
#    This function returns the source, <source> field, of this line as a 
#    string.
sub source      {shift->{Source}}

# GTF::Feature::set_source()
#    This function sets the source, <source> field, of this line
sub set_source{
  my ($feature,$source) = @_;
  $feature->{Source} = $source;
}

# GTF::Feature::strand()
#    This function returns the strand, <strand> field, of this line as a 
#   string.  Should be '+', '-', or '.'.
sub strand      {shift->{Strand}}

# GTF::Gene::id()
#    This function returns this transcript's unique transcript_id
sub gene_id{
  my ($gene) = @_;
  return $gene->{Id};
}

# GTF::Gene::transcript_ids()
#    This function returns an array of all trnacsripts in this gene
sub transcripts{
  my ($gene) = @_;
  $gene->_update;
  return $gene->{Transcripts};# [sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$gene->{Transcripts}}];
}

# GTF::Gene::remove_transcript()
sub remove_transcript{
  my($gene, $tx_id) = @_;
  my $txs = $gene->transcripts;
  my @new_txs = ();

  @new_txs = grep($_->id ne $tx_id, @$txs);
  $gene->{Transcripts} = \@new_txs if ($#new_txs < $#$txs);
}

sub cds{
  my ($gene) = @_;

  my $txs = $gene->transcripts;
  my @cds;
  foreach my $tx (@$txs){
    push @cds, @{$tx->cds}; 
  }

  # employ a slow sort to avoid perl sort problems (see GTF::Gene::_update) rpz
  my $tmp;
  for(my $a = 0; $a <= $#cds; $a++){
    for(my $b = $a+1; $b <= $#cds; $b++){
      if(($cds[$a]->start > $cds[$b]->start) ||
         (($cds[$a]->start == $cds[$b]->start) &&
          ($cds[$a]->stop > $cds[$b]->stop))){
        $tmp = $cds[$a];
        $cds[$a] = $cds[$b];
        $cds[$b] = $tmp;
      }
    }
  }

  return \@cds;
}

# GTF::Gene::copy()
#    This function returns a new object which is a copy of this object.  Also
#    creates copies of all transcripts of this gene
sub copy{
  my ($gene) = @_;
  my $copy = GTF::Gene::new($gene->id,$gene->seqname,$gene->source,$gene->strand);
  my $txs = $gene->transcripts;
  my @new_txs;
  my $next = 0;
  foreach my $tx (@$txs){	
    $next++;
    my $ntx = $tx->copy;
    $copy->add_transcript($ntx);
  }
  return($copy);
}

# GTF::Gene::offset(ammount)
#    This function will offset all locations in this gene by the given ammount
sub offset{
  my ($gene,$offset) = @_;
  my $txs = $gene->transcripts;
  foreach my $tx (@$txs){
    $tx->offset($offset);
  }
}

# GTF::Gene::output_gtf([file_handle])
#    This function outputs the information for this gene in standard 
#    gtf2 format.  If takes and optional file_handle as the only argument 
#    to which it outputs the data.  If no argument is given it outputs 
#    the data to stdout.
sub output_gtf{
  my ($gene,$out_handle) = @_;
  unless(defined($out_handle)){
    $out_handle = \*STDOUT;
  }
  my $txs = $gene->transcripts;
  foreach my $tx (@$txs){
    $tx->output_gtf($out_handle);
    print $out_handle "\n";
  }
}

# GTF::Gene::output_gff([file_handle])
#    This function outputs the information for this gene in standard 
#    GFF format.  If takes and optional file_handle as the only argument 
#    to which it outputs the data.  If no argument is given it outputs 
#    the data to stdout.
sub output_gff{
  my ($gene,$out_handle) = @_;
  unless(defined($out_handle)){
    $out_handle = \*STDOUT;
  }
  my $txs = $gene->transcripts;
  foreach my $tx (@$txs){
    $tx->output_gff($out_handle);
  }
}

# GTF::Gene::reverse_complement(seq_length)
#    This function takes the length of the sequence this gene came from and 
#    reverse complements everything in the gene.  
sub reverse_complement{
  my ($gene, $seq_length) = @_;
  my $txs = $gene->transcripts;
  foreach my $tx (@$txs){
    $tx->reverse_complement($seq_length);
  }
  $gene->{Modified} = 1;
}

# GTF::Gene::start()
#    This function return the lowest coord of any feature in any tx of this gene
sub start{
  my ($gene) = @_;
  $gene->_update;
  return $gene->{Start};
}

# GTF::Gene::stop()
#    This function returns the highest coord of any feature in any tx of this gene
sub stop{
  my ($gene) = @_;
  $gene->_update;
  return $gene->{Stop};
}

sub equals{
  my ($gene,$compare) = @_;
  $gene->_update;
  unless($gene && $compare){
    return 0;
  }
  unless(($gene->start == $compare->start) &&
         ($gene->stop == $compare->stop) &&
         ($gene->strand eq $compare->strand)){
    return 0;
  }
  my $gene_txs = $gene->transcripts;
  my $compare_txs = $compare->transcripts;
  unless($#$gene_txs == $#$compare_txs){
    return 0;
  }
  for(my $i = 0;$i <= $#$gene_txs;$i++){
    unless($$gene_txs[$i]->equals($$compare_txs[$i])){
      return 0;
    }
  }
  return 1;
}

sub length{
  my ($gene) = @_;
  my $txs = $gene->transcripts;
  if($#$txs == -1){
    return 0;
  }
  my $start = $$txs[0]->start;
  my $stop = $$txs[0]->stop;
  my $total = 0;
  foreach my $tx (@$txs){
    if($tx->start < $start){
      $start = $tx->start;
    }
    if($tx->stop > $stop){
      $stop = $tx->stop;
    }
  }
  return($stop - $start + 1);
}

sub gc_percentage{
  my ($gene) = @_;
  my $txs = $gene->transcripts;
  if($#$txs == -1){
    return 0;
  }
  my $total = 0;
  foreach my $tx (@$txs){
    $total += $tx->gc_percentage;
  }
  return($total/($#$txs+1));
}

sub match_percentage{
  my ($gene) = @_;
  my $txs = $gene->transcripts;
  if($#$txs == -1){
    return 0;
  }
  my $total = 0;
  foreach my $tx (@$txs){
    $total += $tx->match_percentage;
  }
  return($total/($#$txs+1));
}

sub mismatch_percentage{
  my ($gene) = @_;
  my $txs = $gene->transcripts;
  if($#$txs == -1){
    return 0;
  }
  my $total = 0;
  foreach my $tx (@$txs){
    $total += $tx->mismatch_percentage;
  }
  return($total/($#$txs+1));
}

sub unaligned_percentage{
  my ($gene) = @_;
  my $txs = $gene->transcripts;
  if($#$txs == -1){
    return 0;
  }
  my $total = 0;
  foreach my $tx (@$txs){
    $total += $tx->unaligned_percentage;
  }
  return($total/($#$txs+1));
}

sub tag{
  my ($gene) = @_;
  return $gene->{Tag};
}

sub set_tag{
  my ($gene,$tag) = @_;
  $gene->{Tag} = $tag;
}

sub _update{
  my ($gene) = @_;
  unless($gene->{Modified}){
    return;
  }
  $gene->{Modified} = 0;
  my $txs = $gene->{Transcripts};


  # this slow sort is being done because the commented out sort below
  # causes crashes sometimes and I have no idea why since the uncommented
  # sort has no problems
  # -- the reason for the problems is that perl's sort routine does not
  #    afford you the capability of modifying the $a or $b variable in 
  #    the usersub callback, since they are aliases.  rpz
  $gene->{Start} = -1;
  $gene->{Stop} = -1;
  my $tmp;
  for(my $a = 0; $a <= $#$txs; $a++){
    for(my $b = $a+1; $b <= $#$txs; $b++){
      if(($$txs[$a]->start > $$txs[$b]->start) ||
         (($$txs[$a]->start == $$txs[$b]->start) &&
          ($$txs[$a]->stop > $$txs[$b]->stop))){
        $tmp = $$txs[$a];
        $$txs[$a] = $$txs[$b];
        $$txs[$b] = $tmp;
      }
    }
    if($$txs[$a]->stop > $gene->{Stop}){
      # highest stop may not be stop of last tx so we check them all
      $gene->{Stop} = $$txs[$a]->stop;
    }
  }
  if($#$txs != -1){
    # lowest start is start of first tx  
    $gene->{Start} = $$txs[0]->start;
  }
}

###############################################################################
# GTF::Transcript
###############################################################################
# The GTF::Transcript object stores all data for a single transcript of a gtf
# file.  
package GTF::Transcript;
# GTF::Transcript::new(transcript_id)
#    This is the constructor for GTF::Feature objects.  It takes 1 arguments.
#      transciprt_id - the id for this transcript
sub new{
  my $tx = bless {};
  ($tx->{Id}) = @_;
  $tx->{Exons} = [];
  $tx->{CDS} = [];
  $tx->{UTR3} = [];
  $tx->{UTR5} = [];
  $tx->{Introns} = [];
  $tx->{Starts} = [];
  $tx->{Stops} = [];
  $tx->{SEC} = [];
  $tx->{Intron_CNS} = [];
  $tx->{Inter_CNS} = [];
  $tx->{Inter} = [];
  $tx->{Coding_Start} = -1;
  $tx->{Coding_Stop} = -1;
  $tx->{Coding_Length} = -1;
  $tx->{Gene} = -1;
  $tx->{Start} = -1;
  $tx->{Stop} = -1;
  $tx->{Modified} = 1;
  return $tx;
}

sub _set_gene{
  my ($tx,$gene) = @_;
  $tx->{Gene} = $gene;
}

sub add_feature{
  my ($tx,$feature) = @_;
  my $type = $feature->type;
  if($type eq 'CDS'){
    push @{$tx->{CDS}}, $feature; 
    $tx->{CDS} = [sort {$a->start <=> $b->start} @{$tx->{CDS}}];
  }
  elsif($type eq '3UTR'){
    push @{$tx->{UTR3}}, $feature; 
    $tx->{UTR3} = [sort {$a->start <=> $b->start} @{$tx->{UTR3}}];
  }
  elsif($type eq '5UTR'){
    push @{$tx->{UTR5}}, $feature; 
    $tx->{UTR5} = [sort {$a->start <=> $b->start} @{$tx->{UTR5}}];
  }
  elsif($type eq 'start_codon'){
    push @{$tx->{Starts}}, $feature; 
    $tx->{Starts} = [sort {$a->start <=> $b->start} @{$tx->{Starts}}];
  }
  elsif($type eq 'stop_codon'){
    push @{$tx->{Stops}}, $feature; 
    $tx->{Stops} = [sort {$a->start <=> $b->start} @{$tx->{Stops}}];
  }
  elsif($type eq 'exon'){
    push @{$tx->{Exons}}, $feature; 
    $tx->{Exons} = [sort {$a->start <=> $b->start} @{$tx->{Exons}}];
  }
  elsif($type eq 'SEC'){
    push @{$tx->{SEC}}, $feature; 
    $tx->{SEC} = [sort {$a->start <=> $b->start} @{$tx->{SEC}}];
  }
  elsif($type eq 'intron_CNS'){
    push @{$tx->{Intron_CNS}}, $feature;
    $tx->{Intron_CNS} = [sort {$a->start <=> $b->start} @{$tx->{Intron_CNS}}];
  }
  elsif($type eq 'inter_CNS'){
    push @{$tx->{Inter_CNS}}, $feature;
    $tx->{Inter_CNS} = [sort {$a->start <=> $b->start} @{$tx->{Inter_CNS}}];
  }
  elsif($type eq 'inter'){
    push @{$tx->{Inter}}, $feature;
    $tx->{Inter} = [sort {$a->start <=> $b->start} @{$tx->{Inter}}];
  }
  $feature->set_transcript($tx);
  $tx->{Modified} = 1;
  if($tx->{Gene} != -1){ $tx->{Gene}->{Modified} = 1;}
}

sub transfer_missing_utr{
  my($dest, $src, $mode) = @_;
  my $dest_utr3 = $dest->utr3;
  my $dest_utr5 = $dest->utr5;
  my $src_utr3 = $src->utr3;
  my $src_utr5 = $src->utr5;
  
  $mode = "Full" if ($#_ == 1); # default

  if (($mode eq "3UTR") || ($mode eq "Full")) {
    if (($#$dest_utr3 == -1) && ($#$src_utr3 > -1)){
      print "Transfering 3UTR from ".$src->id." to ".$dest->id."\n";
      foreach my $src_exon (@$src_utr3)
      {
        my $utr_exon = $src_exon->copy;
        $utr_exon->set_transcript($dest);
        $dest->add_feature($utr_exon);
      }
    }
  }
  if (($mode eq "5UTR") || ($mode eq "Full")) {  
    if (($#$dest_utr5 == -1) && ($#$src_utr5 > -1)){
      print "Transfering 5UTR from ".$src->id." to ".$dest->id."\n";
      foreach my $src_exon (@$src_utr5)
      {
        my $utr_exon = $src_exon->copy;
        $utr_exon->set_transcript($dest);
        $dest->add_feature($utr_exon);
      }
    }
  }
}

# copies the exon obejcts info into utr5 and utr3 objects
# WARNING: don't call this if you already have utr5 and utr3 info
sub create_utr_objects_from_exons{
  my ($tx) = @_;
  my $exons = $tx->exons;
  my $start = $tx->coding_start;
  my $stop = $tx->coding_stop;
  if($start != -1){
    my $type = "5UTR";
    if($tx->strand eq "-"){
      $type = "3UTR";
    }
    for(my $i = 0; $i <= $#$exons; $i++){
      if($$exons[$i]->start < $start){
        my $utr = $$exons[$i]->copy;
        $utr->set_type($type);
        if($start < $utr->stop){
          $utr->set_stop($start);
        }
        $tx->add_feature($utr);
      }
      else{
        last;
      }
    }
  }
  if($stop != -1){
    my $type = "3UTR";
    if($tx->strand eq "-"){
      $type = "5UTR";
    }
    for(my $i = $#$exons; $i >= 0; $i--){
      if($$exons[$i]->stop > $stop){
        my $utr = $$exons[$i]->copy;
        $utr->set_type($type);
        if($stop > $utr->start){
          $utr->set_start($stop);
        }
        $tx->add_feature($utr);
      }
      else{
        last;
      }
    }    
  }
}

sub exons{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Exons};
}

sub cds{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{CDS};
}

sub utr3{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{UTR3};
}

sub utr5{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{UTR5};
}

sub introns{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Introns};
}

sub start_codons{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Starts};
}

sub stop_codons{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Stops};
}

sub sec{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{SEC};
}

sub intron_cns{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Intron_CNS};
}

sub inter_cns{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Inter_CNS};
}

sub inter{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Inter};
}

sub seqname{
  my ($tx) = @_;
  my $gene = $tx->gene;
  return $gene->seqname;
}

sub source{
  my ($tx) = @_;
  my $gene = $tx->gene;
  return $gene->source;
}

sub strand{
  my ($tx) = @_;
  my $gene = $tx->gene;
  return $gene->strand;
}

sub set_id{
  my ($tx,$id) = @_;
  $tx->{Id} = $id;
}

sub id{
  my ($tx) = @_;
  return $tx->{Id};
}

sub gene_id{
  my ($tx) = @_;
  my $gene = $tx->gene;
  return $gene->{Id};
}

sub gene{
  my ($tx) = @_;
  return $tx->{Gene};
}

# GTF::Transcript::remove_exon(exon start pos)
#    Allows you to remove an exon from the list
#    Removes the all exons at the specified position
sub remove_exon{
  my ($tx, $pos) = @_;
  my $new_exons = [];
  my $removed = 0;
  foreach my $exon (@{$tx->{Exons}}){
    if($exon->start != $pos){
      push @$new_exons, $exon;
    }
    else{
      $removed++;
    }
  }
  $tx->{Exons} = $new_exons;
  return $removed;
}

# GTF::Transcript::remove_cds(cds start pos)
#    Allows you to remove a cds from the list
#    Removes the all cdss at the specified position
sub remove_cds{
  my ($tx, $pos) = @_;
  my $new_cds = [];
  my $removed = 0;
  foreach my $cds (@{$tx->{CDS}}){
    if($cds->start != $pos){
      push @$new_cds, $cds;
    }
    else{
      $removed++;
    }
  }
  $tx->{CDS} = $new_cds;
  return $removed;
}

# GTF::Transcript::length()
#    This function return the length of the tx from the start to the stop.
sub length{
  my ($tx) = @_;
  return($tx->stop - $tx->start + 1);
}

# GTF::Transcript::start()
#    This function returns the start of this tx (lowest coord).  
sub start{    
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Start};
}

# GTF::Transcript::stop()
#    This function return the end of the tx (highest coord). 
sub stop{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Stop};
}

# GTF::Transcript::coding_start()
#    This function returns the start position (lowest coord) of coding region of 
#    this tx.  Same as the start codon's <start> field.
sub coding_start{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Coding_Start};
}

# GTF::Transcript::coding_stop()
#    This function return the end position (highest coord) of the coding region of 
#    the tx.  Same as the stop codon's <stop> field.
sub coding_stop{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Coding_Stop};
}

# GTF::Transcript::coding_length()
#    This function return the coding_length of the tx from the coding_start 
#    to the coding_stop.
sub coding_length{
  my ($tx) = @_;
  $tx->_update;
  return $tx->{Coding_Length};
}

sub score{
  my ($tx) = @_;
  my $features = $tx->_all_features;
  my $score = 0;
  foreach my $feature (@$features){
    if($feature->score ne "."){
      $score += $feature->score;
    }
  }
  return $score;
}

# GTF::Transcript::initial_exon()
#    This function returns a CDS::Feature object representing the  
#    initial_exon (cds) of this tx
#    NOTE:  Isn't exon the wrong word for this?
sub initial_exon{
  my ($tx) = @_;
  my $starts = $tx->start_codons;
  if($#$starts == -1){
    return 0;
  }
  $tx->_update;
  my $exons = $tx->{CDS};
  if($tx->strand eq "-"){
    return $$exons[$#{$exons}];
  }
  else{
    return $$exons[0];
  }
}

# GTF::Transcript::terminal_exon()
#    This function returns a CDS::Feature object representing the  
#    terminal exon (cds) of this tx
#    NOTE:  Isn't exon the wrong word for this?
sub terminal_exon{
  my ($tx) = @_;
  my $stops = $tx->stop_codons;
  if($#$stops == -1){
    return 0;
  }
  $tx->_update;
  my $exons = $tx->cds;
  if($tx->strand eq "-"){
    return $$exons[0];
  }
  else{
    return $$exons[$#{$exons}];
  }
}

sub equals{
  my ($tx,$compare, $mode) = @_;
  $mode = "Full" if ($#_ == 1); # default

  unless($tx && $compare){
    return 0;
  }
  unless(($tx->start == $compare->start) &&
         ($tx->stop == $compare->stop) &&
         ($tx->strand eq $compare->strand)){
    return 0;
  }

  if (($mode eq "CDS") || ($mode eq "Full")) {
    my $tx_cds = $tx->cds;
    my $comp_cds = $compare->cds;
    unless($#$tx_cds == $#$comp_cds){
      return 0;
    }
    for(my $i = 0;$i <= $#$tx_cds;$i++){
      unless($$tx_cds[$i]->equals($$comp_cds[$i])){
        return 0;
      }
    }
    my $tx_starts = $tx->start_codons;
    my $comp_starts = $compare->start_codons;
    unless($#$tx_starts == $#$comp_starts){
      return 0;
    }
    for(my $i = 0;$i <= $#$tx_starts;$i++){
      unless($$tx_starts[$i]->equals($$comp_starts[$i])){
        return 0;
      }
    }
    my $tx_stops = $tx->stop_codons;
    my $comp_stops = $compare->stop_codons;
    unless($#$tx_stops == $#$comp_stops){
      return 0;
    }
    for(my $i = 0;$i <= $#$tx_stops;$i++){
      unless($$tx_stops[$i]->equals($$comp_stops[$i])){
        return 0;
      }
    }
  }

  if (($mode eq "3UTR") || ($mode eq "Full")) {
    my $tx_utr3 = $tx->utr3;
    my $comp_utr3 = $compare->utr3;
    unless($#$tx_utr3 == $#$comp_utr3){
      return 0;
    }
    for(my $i = 0;$i <= $#$tx_utr3;$i++){
      unless($$tx_utr3[$i]->equals($$comp_utr3[$i])){
        return 0;
      }
    }
  }

  if (($mode eq "5UTR") || ($mode eq "Full")) {
    my $tx_utr5 = $tx->utr5;
    my $comp_utr5 = $compare->utr5;
    unless($#$tx_utr5 == $#$comp_utr5){
      return 0;
    }
    for(my $i = 0;$i <= $#$tx_utr5;$i++){
      unless($$tx_utr5[$i]->equals($$comp_utr5[$i])){
        return 0;
      }
    }
  }

  my $tx_exon = $tx->exons;
  my $comp_exon = $compare->exons;
  unless($#$tx_exon == $#$comp_exon){
    return 0;
  }
  for(my $i = 0;$i <= $#$tx_exon;$i++){
    unless($$tx_exon[$i]->equals($$comp_exon[$i])){
      return 0;
    }
  }

  my $tx_sec = $tx->sec;
  my $comp_sec = $compare->sec;
  unless($#$tx_sec == $#$comp_sec){
    return 0;
  }
  for(my $i = 0;$i <= $#$tx_sec;$i++){
    unless($$tx_sec[$i]->equals($$comp_sec[$i])){
      return 0;
    }
  }
  return 1;
}

# GTF::Transcript::offset(ammount)
#    This function will offset all locations in this tx by the given ammount
sub offset{
  my ($tx,$offset) = @_;
  unless(defined($offset)){
    print STDERR "Undefined value passed to GTF::Gene::offset.\n";
    return;
  }
  my $features = $tx->_all_features;
  foreach my $feature (@$features){
    if(defined($feature)){
      $feature->offset($offset);
    }
  }
  $tx->{Modified} = 1;
  if($tx->{Gene} != -1){ $tx->{Gene}->{Modified} = 1;}
}

# GTF::Transcript::output_gtf([file_handle])
#    This function outputs the information for this tx in standard 
#    gtf2 format.  If takes and optional file_handle as the only argument 
#    to which it outputs the data.  If no argument is given it outputs 
#    the data to stdout.
sub output_gtf{
  my ($tx,$out_handle) = @_;
  unless(defined $out_handle){
    $out_handle = \*STDOUT;
  }
  my @features = 
      sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$tx->_all_features};
  foreach my $feature (@features){
    $feature->output_gtf($out_handle);
  }
}

# GTF::Transcript::output_gff([file_handle])
#    This function outputs the information for this tx in standard 
#    GFF format.  If takes and optional file_handle as the only argument 
#    to which it outputs the data.  If no argument is given it outputs 
#    the data to stdout.
sub output_gff{
  my ($tx,$out_handle) = @_;
  unless(defined $out_handle){
    $out_handle = \*STDOUT;
  }
  my @features = 
      sort {$a->start <=> $b->start || $a->stop <=> $b->stop} @{$tx->_all_features};
  foreach my $feature (@features){
    $feature->output_gff($out_handle);
  }
}

# GTF::Transcript::reverse_complement(seq_length)
#    This function takes the length of the sequence this tx came from and 
#    reverse complements everything in the gene.  
sub reverse_complement{
  my ($tx, $seq_length) = @_;
  unless(defined($seq_length)){
    print STDERR 
        "Undefined value for seq_length passed to GTF::Gene::reverse_complement.\n";
  }
  my $features = $tx->_all_features;
  foreach my $feature (@$features){
    $feature->reverse_complement($seq_length);
  }
}

sub _all_features{
  my ($tx) = @_;
  my @features;
  my $starts = $tx->start_codons;    
  push @features, @$starts;
  my $stops = $tx->stop_codons;
  push @features, @$stops;
  my $exons = $tx->exons;
  push @features, @$exons;
  my $cds = $tx->cds;
  push @features, @$cds;
  my $utr3 = $tx->utr3;
  push @features, @$utr3;
  my $utr5 = $tx->utr5;
  push @features, @$utr5;
  my $sec = $tx->sec;
  push @features, @$sec;
  my $cns = $tx->intron_cns;
  push @features, @$cns;
  return \@features;
}

sub copy{
  my ($tx) = @_;
  my $copy = GTF::Transcript::new($tx->id);
  my $starts = $tx->start_codons;
  foreach my $old (@$starts){
    my $new = $old->copy;
    $copy->add_feature($new);
  }
  my $stops = $tx->stop_codons;
  foreach my $old (@$stops){
    my $new = $old->copy;
    $copy->add_feature($new);
  }
  my $exons = $tx->exons;
  foreach my $old (@$exons){
    my $new = $old->copy;
    $copy->add_feature($new);
  }
  my $cds = $tx->cds;
  foreach my $old (@$cds){
    my $new = $old->copy;
    $copy->add_feature($new);
  }
  my $utr3 = $tx->utr3;
  foreach my $old (@$utr3){
    my $new = $old->copy;
    $copy->add_feature($new);
  }
  my $utr5 = $tx->utr5;
  foreach my $old (@$utr5){
    my $new = $old->copy;
    $copy->add_feature($new);
  }
  my $sec = $tx->sec;
  foreach my $old (@$sec){
    my $new = $old->copy;
    $copy->add_feature($new);
  }
  return($copy);
}

sub gc_percentage{
  my ($tx) = @_;
  my $features = $tx->_all_features;
  my $total = 0;
  my $percent = 0;
  foreach my $feature (@$features){
    $percent += $feature->length * $feature->gc_percentage;
    $total += $feature->length;
  }
  $percent /= $total;
  return $percent;
}

sub match_percentage{
  my ($tx) = @_;
  my $features = $tx->_all_features;
  my $total = 0;
  my $percent = 0;
  foreach my $feature (@$features){
    $percent += $feature->length * $feature->match_percentage;
    $total += $feature->length;
  }
  $percent /= $total;
  return $percent;
}

sub mismatch_percentage{
  my ($tx) = @_;
  my $features = $tx->_all_features;
  my $total = 0;
  my $percent = 0;
  foreach my $feature (@$features){
    $percent += $feature->length * $feature->mismatch_percentage;
    $total += $feature->length;
  }
  $percent /= $total;
  return $percent;
}

sub unaligned_percentage{
  my ($tx) = @_;
  my $features = $tx->_all_features;
  my $total = 0;
  my $percent = 0;
  foreach my $feature (@$features){
    $percent += $feature->length * $feature->unaligned_percentage;
    $total += $feature->length;
  }
  $percent /= $total;
  return $percent;
}

sub tag{
  my ($tx) = @_;
  return $tx->{Tag};
}

sub set_tag{
  my ($tx,$tag) = @_;
  $tx->{Tag} = $tag;
}

sub _update{
  my ($tx) = @_;
  unless($tx->{Modified}){
    return;
  }
  $tx->{Modified} = 0;
  my $starts = $tx->start_codons;    
  my $stops = $tx->stop_codons;
  my $cds = $tx->cds;
  my $utr5 = $tx->utr5;
  my $utr3 = $tx->utr3;
  my $exons = $tx->exons;
  foreach my $utr (@$utr5){
    $utr->set_subtype("UTR5");
  }
  foreach my $utr (@$utr3){
    $utr->set_subtype("UTR3");
  }
  #set subtypes
  if($#$cds == 0){
    if(($#$starts >= 0) && ($#$stops >= 0)){
      $$cds[0]->set_subtype("Single");
    }
    elsif($#$starts >= 0){
      $$cds[0]->set_subtype("Initial");
    }
    elsif($#$stops >= 0){
      $$cds[0]->set_subtype("Terminal");
    }
    else{
      $$cds[0]->set_subtype("Internal");	    
    }
  }
  elsif($#$cds >= 0){
    my $strand = $tx->strand;
    my $starts = $tx->start_codons;
    my $stops = $tx->stop_codons;
    my $start = 0;
    my $stop = $#$cds;
    if($strand eq '-'){
      if($#$starts >= 0){
        $$cds[$#$cds]->set_subtype("Initial");
        $stop--;
      }
      if($#$stops >= 0){
        $$cds[0]->set_subtype("Terminal");
        $start++;
      }
    }
    else{
      if($#$starts >= 0){
        $$cds[0]->set_subtype("Initial");
        $start++;
      }
      if($#$stops >= 0){
        $$cds[$#$cds]->set_subtype("Terminal");
        $stop--;
      }
    }
    for(my $i = $start;$i <= $stop;$i++){
      $$cds[$i]->set_subtype("Internal");
    }
  }
  #get introns and coding length
  my $clen = 0;
  my @introns;
  for(my $i = 0;$i < $#$cds;$i++){
    $clen += $$cds[$i]->stop - $$cds[$i]->start +1;
    my $start = $$cds[$i]->stop + 1;
    my $stop = $$cds[$i+1]->start - 1;
    if($start <= $stop){
      my $intron = GTF::Feature::new("Intron",$start,$stop,0,'.');
      $intron->set_subtype("Intron");
      $intron->set_transcript($tx);
      push @introns, $intron;
    }
  }
  if($#$cds >= 0){
    $clen += $$cds[$#$cds]->stop - $$cds[$#$cds]->start +1;
  }
  $tx->{Coding_Length} = $clen;
  $tx->{Introns} = \@introns;
  #get start/coding start
  my $start = -1;
  if(($#{$cds} >= 0) &&
     (($start == -1) || ($start > $$cds[0]->start))){
    $start = $$cds[0]->start;
  }
  $tx->{Coding_Start} = $start;
  if(($#{$starts} >= 0) &&
     (($start == -1) || ($start > $$starts[0]->start))){
    $start = $$starts[0]->start;
  }
  if(($#{$stops} >= 0) &&
     (($start == -1) || ($start > $$stops[0]->start))){
    $start = $$stops[0]->start;
  }
  if(($#{$exons} >= 0) &&
     (($start == -1) || ($start > $$exons[0]->start))){
    $start = $$exons[0]->start;
  }
  if(($#{$utr5} >= 0) &&
     (($start == -1) || ($start > $$utr5[0]->start))){
    $start = $$utr5[0]->start;
  }
  if(($#{$utr3} >= 0) &&
     (($start == -1) || ($start > $$utr3[0]->start))){
    $start = $$utr3[0]->start;
  }
  $tx->{Start} = $start;
  #get stop/coding stop
  my $stop = -1;
  if(($#{$cds} >= 0) &&
     (($stop == -1) || ($stop < $$cds[$#$cds]->stop))){
    $stop = $$cds[$#$cds]->stop;
  }
  $tx->{Coding_Stop} = $stop;
  if(($#{$starts} >= 0) && 
     (($stop == -1) || ($stop < $$starts[$#$starts]->stop))){
    $stop = $$starts[$#$starts]->stop;
  }
  if(($#{$stops} >= 0) &&
     (($stop == -1) || ($stop < $$stops[$#$stops]->stop))){
    $stop = $$stops[$#$stops]->stop;
  }
  if(($#{$exons} >= 0) &&
     (($stop == -1) || ($stop < $$exons[$#$exons]->stop))){
    $stop = $$exons[$#$exons]->stop;
  }
  if(($#{$utr5} >= 0) &&
     (($stop == -1) || ($stop < $$utr5[$#$utr5]->stop))){
    $stop = $$utr5[$#$utr5]->stop;
  }
  if(($#{$utr3} >= 0) &&
     (($stop == -1) || ($stop < $$utr3[$#$utr3]->stop))){
    $stop = $$utr3[$#$utr3]->stop;
  }
  $tx->{Stop} = $stop;
}



###############################################################################
# GTF::Feature
###############################################################################
# The GTF::Feature object stores all data for a single feature of a gtf
# file.  This object basically stores all data for a single non-comment
# line of a gtf file.
package GTF::Feature;
# GTF::Feature::new(feature, start, end, score, frame) 
#    This is the constructor for GTF::Feature objects.  It takes 5 required
#    arguments.  They are:
#      type       - The <feature> field of the line.  Should be either
#                   'cds', 'exon', 'start_codon', or 'stop_codon'.
#      start      - The <start> field of the line.  Should be a number.
#      end        - The <end> field of the line.  Should be a number.
#      score      - The <score> field of the line.  Should be a number or '.'.
#      frame      - The <frame> field of the line.  Should be either
#                   '0', '1', '2', or '.'.
sub new {
  my $feature = bless {};
  ($feature->{Type}, $feature->{Start}, $feature->{End}, $feature->{Score}, 
   $feature->{Frame}) = @_;    
  my $start = $feature->{Start};
  my $stop = $feature->{Stop};
  if(defined($start) && defined($stop) && ($start > $stop)){
    $feature->{Start} = $stop;
    $feature->{Stop} = $start;
  }
  $feature->{Transcript} = 0;
  $feature->{Seq} = 0;
  $feature->{Conseq} = 0;
  $feature->{Subtype} = 0;
  $feature->{ASE} = 0;          # rpz - Alternative Splicing Event
  $feature->{Match} = -1;
  $feature->{Mismatch} = -1;
  $feature->{Unaligned} = -1;
  $feature->{GC} = -1;
  $feature->{A} = -1;
  $feature->{C} = -1;
  $feature->{G} = -1;
  $feature->{T} = -1;
  $feature->{N} = -1;
  $feature->{0} = -1;
  $feature->{1} = -1;
  $feature->{2} = -1;
  return $feature;
}

# GTF::Feature::copy()
#    This function returns a new object which is a copy of this object
sub copy{
  my ($feature) = @_;
  my $copy = GTF::Feature::new($feature->type,$feature->start,$feature->stop,
                               $feature->score,$feature->frame);
  $copy->set_bases($feature->get_a_count,$feature->get_c_count,$feature->get_g_count,
                   $feature->get_t_count,$feature->get_n_count);
  $copy->set_conseq($feature->get_match_count,$feature->get_mismatch_count,
                    $feature->get_unaligned_count);
  return($copy);
}


# GTF::Feature::offset(ammount)
#    This function will offset all locations in this Feature by the given ammount
sub offset{
  my ($feature,$offset) = @_;
  unless(defined($offset)){
    print STDERR "Undefined value passed to GTF::Feature::offset.\n";
    return;
  }
  $feature->{Start} += $offset;
  $feature->{End} += $offset;
}

# GTF::Feature::reverse_complement(seq_length)
#    This function takes the length of the sequence this feature came from and 
#    reverse complements it.  
sub reverse_complement{
  my ($feature, $seq_length) = @_;
  unless(defined($seq_length)){
    print STDERR 
        "Undefined value for seq_length passed to ",
        "GTF::Feature::reverse_complement.\n";
  }
  my $start = $seq_length - $feature->{Start} + 1;
  my $stop = $seq_length - $feature->{End} + 1;
  if($start < $stop){
    $feature->{Start} = $start;
    $feature->{End} = $stop;
  }
  else{
    $feature->{Start} = $stop;
    $feature->{End} = $start;
  }
}   

# GTF::Feature::type()
#    This function returns the feature type, <feature> field of this line as
#    a string.  Should be either 'CDS', 'exon', 'start_codon', or 'stop_codon'.
sub type     {shift->{Type}}

sub set_type{
  my ($feature,$type) = @_;
  $feature->{Type} = $type;
}

# GTF::Feature::type()
#    This function returns the feature sub type, currently inly implemented for 
#    cds, should be 'Initial', 'Internal', 'Terminal', or 'Single'
sub subtype     {shift->{Subtype}}

sub set_subtype{
  my ($feature,$subtype) = @_;
  $feature->{Subtype} = $subtype;
}

# GTF::Feature::ase()
#   This function returns the feature's alternative splicing event, 
#   if there is one.  Currently the only implemented ASE is 'Optional'
# rpz
sub ase         {shift->{ASE}}

sub set_ase {
    my ($feature, $ASE) = @_;
    $feature->{ASE} = $ASE;
}

# GTF::Feature::length()
#    This function returns the length of this feature in base pairs
#    as a number.
sub length{
  my ($feature) = @_;
  return($feature->stop - $feature->start + 1);
}

# GTF::Feature::start()
#    This function returns the start location ,<start> field, of this feature
#    as a number.
sub start       {shift->{Start}}

# GTF::Feature::set_start()
#    This function sets the start location ,<start> field, of this feature
#    as a number.
sub set_start{
  my ($feature,$start) = @_;
  $feature->{Start} = $start;
}

# GTF::Feature::end()
#    This function returns the end location, <end> field, of this feature
#    as a number.
sub end         {shift->{End}}

# GTF::Feature::stop()
#    This function returns the end location, <end> field, of this feature
#    as a number.
#    Same as GTF::Feature::end();
sub stop         {shift->{End}}

# GTF::Feature::set_stop()
#    This function sets the stop location ,<stop> field, of this feature
#    as a number.
sub set_stop{
  my ($feature,$stop) = @_;
  $feature->{End} = $stop;
}

# GTF::Feature::score()
#    This function returns the score, <score> field, of this line.  This will 
#    be either a number or '.'.
sub score       {shift->{Score}}


# GTF::Feature::frame()
#    This function returns the frame, <frame> field, of this line.  Should be
#    '0', '1', '2', or '.'.
sub frame       {shift->{Frame}}

# GTF::Feature::set_frame()
#    This function sets the frame, <frame> field, of this line.  Should be
#    '0', '1', '2', or '.'.
sub set_frame{
  my ($feature,$frame) = @_;
  if(($frame eq '0') || ($frame eq '1') ||
     ($frame eq '2') || ($frame eq '.')){
    $feature->{Frame} = $frame;
  }
  else{
    print STDERR "Bad Frame Value \'$frame\' should be \'0\', \'1\',".
        "\'2\', or \'.\'.\n";
  }
}

# GTF::Feature::transcript_id()
#    This function returns the transcript_id of this exon's transcript
sub transcript_id{
  my ($feature) = @_;
  my $tx = $feature->{Transcript};
  return $tx->id;
}

# GTF::Feature::transcript()
#    This function returns the transcript of this exon's transcript
sub transcript{
  my ($feature) = @_;
  return $feature->{Transcript};
}

# GTF::Feature::transcript()
#    This function returns the transcript of this exon's transcript
sub set_transcript{
  my ($feature,$tx) = @_;
  $feature->{Transcript} = $tx;
}

# GTF::Feature::transcript_id()
#    This function returns the transcript_id of this exon's transcript
sub gene_id{
  my ($feature) = @_;
  my $tx = $feature->transcript;
  my $gene = $tx->gene;
  return $gene->id;
}

# GTF::Feature::transcript()
#    This function returns the transcript of this exon's transcript
sub gene{
  my ($feature) = @_;
  my $tx = $feature->{Transcript};
  return $tx->gene;
}


# GTF::Feature::seqname()
#    This function returns the sequnce name, <seqname> field, that this 
#    feature came from as a string.
sub seqname{
  my ($feature) = @_;
  my $tx = $feature->{Transcript};
  return $tx->seqname;
}    

# GTF::Feature::source()
#    This function returns the source, <source> field, of this line as a 
#    string.
sub source{
  my ($feature) = @_;
  my $tx = $feature->{Transcript};
  return $tx->source;
}

# GTF::Feature::strand()
#    This function returns the strand, <strand> field, of this line as a 
#   string.  Should be '+', '-', or '.'.
sub strand{
  my ($feature) = @_;
  my $tx = $feature->{Transcript};
  return $tx->strand;
}

sub equals{
  my ($feature,$compare) = @_;
  unless($feature && $compare){
    return 0;
  }
  if(($feature->start == $compare->start) &&
     ($feature->stop == $compare->stop) &&
     ($feature->strand eq $compare->strand) &&
     ($feature->type eq $compare->type)){
    return 1;
  }
  return 0;
}

# GTF::Feature::output_gtf([file_handle])
#    This function outputs the information for this feature in standard 
#    gtf2 format.  If takes and optional file_handle as the only argument 
#    to which it outputs the data.  If no argument is given it outputs 
#    the data to stdout.
sub output_gtf{
  my ($feature,$out_handle) = @_;
  unless(defined $out_handle){
    $out_handle = \*STDOUT;
  }
  print $out_handle
      "".$feature->seqname."\t".$feature->source."\t$feature->{Type}\t",
      "$feature->{Start}\t$feature->{End}\t$feature->{Score}\t".$feature->strand."\t",
      "$feature->{Frame}\tgene_id \"".$feature->gene_id."\"; transcript_id \"",
      $feature->transcript_id."\";\n";
}   

# GTF::Feature::output_gff([file_handle])
#    This function outputs the information for this feature in standard 
#    GFF format.  If takes and optional file_handle as the only argument 
#    to which it outputs the data.  If no argument is given it outputs 
#    the data to stdout.
sub output_gff{
  my ($feature,$out_handle) = @_;
  unless(defined $out_handle){
    $out_handle = \*STDOUT;
  }
  
  print $out_handle
      "".$feature->seqname."\t".$feature->source."\t$feature->{Type}\t",
      "$feature->{Start}\t$feature->{End}\t$feature->{Score}\t".$feature->strand."\t",
      "$feature->{Frame}\t".$feature->transcript_id."\n";
}   

# GTF::Feature::set_bases(A,C,G,T,N)
#    This funtion allows you to set the number of A,C,G,T nucleotides 
#    in this feature
sub set_bases{    
  my ($feature,$a,$c,$g,$t,$n) = @_;
  $feature->{A} = $a;
  $feature->{C} = $c;
  $feature->{G} = $g;
  $feature->{T} = $t;    
  $feature->{N} = $n;    
  $feature->{Seq} = 1;
}

# GTF::Feature::set_conseq(n0,n1,n2)
#    This funtion allows you to set the number of 0,1,2 in the  
#    conservation sequnce for this feature
sub set_conseq{
  my ($feature,$n0,$n1,$n2) = @_;
  $feature->{0} = $n0;
  $feature->{1} = $n1;
  $feature->{2} = $n2;
  $feature->{Conseq} = 1;
}

# GTF::Feature::gc_percentage()
#    This funciton returns the gc_percentage for this feature if the 
#    sequence was loaded otherwise it returns -1.
sub gc_percentage{
  my ($feature) = @_;
  if($feature->{GC} != -1){
    return $feature->{GC};
  }
  unless($feature->{Seq}){
    return -1;
  }
  my $a = $feature->{A};
  my $c = $feature->{C};
  my $g = $feature->{G};
  my $t = $feature->{T};
  my $n = $feature->{N};
  my $total = $a + $c + $g + $t + $n;
  my $gc = $g + $c;
  my $percent = $gc/$total;
  $feature->{GC} = $percent;
  return $percent;
}

# GTF::Feature::match_percentage()
#    This funciton returns the percentage of matches in the conservation sequence
#    for this feature if it was loaded otherwise it returns -1.
sub match_percentage{
  my ($feature) = @_;
  if($feature->{Match} != -1){
    return $feature->{Match};
  }
  unless($feature->{Conseq}){
    return -1;
  }
  my $match = $feature->{1};
  my $mismatch = $feature->{0};
  my $unaligned = $feature->{2};
  my $total = $match + $mismatch + $unaligned;
  my $percent = $match/$total;
  $feature->{Match} = $percent;
  return $percent;
}

# GTF::Feature::mismatch_percentage()
#    This funciton returns the percentage of mismatches in the conservation sequence
#    for this feature if it was loaded otherwise it returns -1.
sub mismatch_percentage{
  my ($feature) = @_;
  if($feature->{Mismatch} != -1){
    return $feature->{Mismatch};
  }
  unless($feature->{Conseq}){
    return -1;
  }
  my $match = $feature->{1};
  my $mismatch = $feature->{0};
  my $unaligned = $feature->{2};
  my $total = $match + $mismatch + $unaligned;
  my $percent = $mismatch/$total;
  $feature->{Mismatch} = $percent;
  return $percent;
}

# GTF::Feature::unaligned_percentage()
#    This funciton returns the percentage of unaligned nucs in the conservation sequence
#    for this feature if it was loaded otherwise it returns -1.
sub unaligned_percentage{
  my ($feature) = @_;
  if($feature->{Unaligned} != -1){
    return $feature->{Unaligned};
  }
  unless($feature->{Conseq}){
    return -1;
  }
  my $match = $feature->{1};
  my $mismatch = $feature->{0};
  my $unaligned = $feature->{2};
  my $total = $match + $mismatch + $unaligned;
  my $percent = $unaligned/$total;
  $feature->{Unaligned} = $percent;
  return $percent;
}

# GTF::Feature::get_a_count()
#    This funciton returns the number of A nucleotides in this features sequence
#    if the sequence was loaded, otherwise it returns -1.
sub get_a_count{   
  my ($feature) = @_;
  unless($feature->{Seq}){
    return -1;
  }
  return $feature->{A};
}

# GTF::Feature::get_c_count()
#    This funciton returns the number of C nucleotides in this features sequence
#    if the sequence was loaded, otherwise it returns -1.
sub get_c_count{
  my ($feature) = @_;
  unless($feature->{Seq}){
    return -1;
  }
  return $feature->{C};
}

# GTF::Feature::get_g_count()
#    This funciton returns the number of G nucleotides in this features sequence
#    if the sequence was loaded, otherwise it returns -1.
sub get_g_count{
  my ($feature) = @_;
  unless($feature->{Seq}){
    return -1;
  }
  return $feature->{G};
}

# GTF::Feature::get_t_count()
#    This funciton returns the number of T nucleotides in this features sequence
#    if the sequence was loaded, otherwise it returns -1.
sub get_t_count{
  my ($feature) = @_;
  unless($feature->{Seq}){
    return -1;
  }
  return $feature->{T};
}

# GTF::Feature::get_n_count()
#    This funciton returns the number of N nucleotides in this features sequence
#    if the sequence was loaded, otherwise it returns -1.
sub get_n_count{
  my ($feature) = @_;
  unless($feature->{Seq}){
    return -1;
  }
  return $feature->{N};
}

# GTF::Feature::get_match_count()
#    This funciton returns the number of matched nucleotides in this features 
#    conservation sequence if it was loaded, otherwise it returns -1.
sub get_match_count{
  my ($feature) = @_;
  unless($feature->{Conseq}){
    return -1;
  }
  return $feature->{1};
}

# GTF::Feature::get_mismatch_count()
#    This funciton returns the number of mismatched nucleotides in this features 
#    conservation sequence if it was loaded, otherwise it returns -1.
sub get_mismatch_count{
  my ($feature) = @_;
  unless($feature->{Conseq}){
    return -1;
  }
  return $feature->{0};
}

# GTF::Feature::get_unaligned_count()
#    This funciton returns the number of unaligned nucleotides in this features 
#    conservation sequence if it was loaded, otherwise it returns -1.
sub get_unaligned_count{
  my ($feature) = @_;
  unless($feature->{Conseq}){
    return -1;
  }
  return $feature->{2};
}

sub tag{
  my ($feature) = @_;
  return $feature->{Tag};
}

sub set_tag{
  my ($feature,$tag) = @_;
  $feature->{Tag} = $tag;
}

1;
__END__