File: gmap2gff3

package info (click to toggle)
genometools 1.5.10%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 53,852 kB
  • sloc: ansic: 355,882; ruby: 30,295; python: 4,880; sh: 3,190; makefile: 1,197; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (146 lines) | stat: -rwxr-xr-x 4,483 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#!/usr/bin/env ruby
#
# Copyright (c) 2006-2007 Gordon Gremme <gordon@gremme.org>
# Copyright (c) 2006-2007 Center for Bioinformatics, University of Hamburg
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#

$:.push(File.dirname($0))
require 'gff3'

class Scanner
  def initialize(input)
    @input = input
    @line_buf = nil
  end
  def peek
    if @line_buf then
      return @line_buf
    end
    @line_buf = @input.gets
  end
  def gets
    if @line_buf then
      buf = @line_buf
      @line_buf = nil
      return buf
    end
    @input.gets
  end
  def lineno
    @input.lineno
  end
  def filename
    @input.filename
  end
end

class GMAPParser
  def initialize(sequences)
    @sequences = sequences
    @current_accession = nil
    @current_strand = nil
  end
  def parse(scanner)
    while scanner.peek
      if scanner.gets =~ /^>/ then
        if parse_paths(scanner) then
          parse_alignment(scanner)
        end
      end
    end
  end
  private
  def parse_paths(scanner)
    line = scanner.gets
    if line =~ /^Paths/ then
      # XXX: change parser to handle multiple paths
      if line =~ /^Paths \(1\):/ then
        while line = scanner.gets do
          if line =~ /Genomic pos:/ then
            if line =~ /^.*\(([+-])/ then
              @current_strand = $1
            else
              raise "could not parse strand on line #{scanner.lineno} of " +
                    "file #{scanner.filename}"
            end
          elsif line =~ /Accessions:/ then
            accession_parts = line.split(':')
            @current_accession = accession_parts[1].strip
            return true
          end
        end
      elsif line !~ /^Paths \(0\):/
        STDERR.puts "warning: skipping multiple path alignment on line " +
                    "#{scanner.lineno} of file #{scanner.filename}"
        return false
      end
    else
      raise "expecting 'Paths' on line #{scanner.lineno} of file " +
            "#{scanner.filename}"
    end
  end
  def parse_alignment(scanner)
    while line = scanner.gets do
      if line =~ /^Alignments:/ then
        # skip two lines
        scanner.gets
        scanner.gets
        exons = []
        while (line = scanner.gets) =~ /:/ do
          # process all alignment lines
          line =~ /^.*:(\d+)-(\d+)/
          exon = Range.new($1.to_i, $2.to_i)
          if exon.begin > exon.end then
            exon = Range.new(exon.end, exon.begin)
          end
          exons.push(exon)
        end
        if exons.size == 0 then
          raise "could not parse exon up to line #{scanner.lineno} in file " +
                "#{scanner.filename}"
        end
        # construct gene
        exon_border = []
        exon_border.push(exons.first.begin)
        exon_border.push(exons.first.end)
        exon_border.push(exons.last.begin)
        exon_border.push(exons.last.end)
        gene_range = Range.new(exon_border.min, exon_border.max)
        raise if gene_range.begin > gene_range.end
        if current_sequence = @sequences[@current_accession] then
          # the current sequence exists already -> make sure it is maximal
          current_sequence.update_range(gene_range.begin, gene_range.end)
        else
          # the current sequence does not exist -> add it
          @sequences[@current_accession] = Sequence.new(gene_range.begin,
                                                        gene_range.end)
        end
        current_sequence =  @sequences[@current_accession]
        gene = Gene.new(gene_range, @current_strand)
        exons.each { |exon| gene.add_exon(exon) }
        current_sequence.add_gene(gene)
        return
      end
    end
  end
end

# read input
sequences = {}
gmap_parser = GMAPParser.new(sequences)
gmap_parser.parse(Scanner.new(ARGF))

# output
gff3_output(sequences, "gmap")