1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
|
#!/usr/bin/env ruby
#
# Copyright (c) 2006-2007 Gordon Gremme <gordon@gremme.org>
# Copyright (c) 2006-2007 Center for Bioinformatics, University of Hamburg
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#
$:.push(File.dirname($0))
require 'gff3'
class Scanner
def initialize(input)
@input = input
@line_buf = nil
end
def peek
if @line_buf then
return @line_buf
end
@line_buf = @input.gets
end
def gets
if @line_buf then
buf = @line_buf
@line_buf = nil
return buf
end
@input.gets
end
def lineno
@input.lineno
end
def filename
@input.filename
end
end
class GMAPParser
def initialize(sequences)
@sequences = sequences
@current_accession = nil
@current_strand = nil
end
def parse(scanner)
while scanner.peek
if scanner.gets =~ /^>/ then
if parse_paths(scanner) then
parse_alignment(scanner)
end
end
end
end
private
def parse_paths(scanner)
line = scanner.gets
if line =~ /^Paths/ then
# XXX: change parser to handle multiple paths
if line =~ /^Paths \(1\):/ then
while line = scanner.gets do
if line =~ /Genomic pos:/ then
if line =~ /^.*\(([+-])/ then
@current_strand = $1
else
raise "could not parse strand on line #{scanner.lineno} of " +
"file #{scanner.filename}"
end
elsif line =~ /Accessions:/ then
accession_parts = line.split(':')
@current_accession = accession_parts[1].strip
return true
end
end
elsif line !~ /^Paths \(0\):/
STDERR.puts "warning: skipping multiple path alignment on line " +
"#{scanner.lineno} of file #{scanner.filename}"
return false
end
else
raise "expecting 'Paths' on line #{scanner.lineno} of file " +
"#{scanner.filename}"
end
end
def parse_alignment(scanner)
while line = scanner.gets do
if line =~ /^Alignments:/ then
# skip two lines
scanner.gets
scanner.gets
exons = []
while (line = scanner.gets) =~ /:/ do
# process all alignment lines
line =~ /^.*:(\d+)-(\d+)/
exon = Range.new($1.to_i, $2.to_i)
if exon.begin > exon.end then
exon = Range.new(exon.end, exon.begin)
end
exons.push(exon)
end
if exons.size == 0 then
raise "could not parse exon up to line #{scanner.lineno} in file " +
"#{scanner.filename}"
end
# construct gene
exon_border = []
exon_border.push(exons.first.begin)
exon_border.push(exons.first.end)
exon_border.push(exons.last.begin)
exon_border.push(exons.last.end)
gene_range = Range.new(exon_border.min, exon_border.max)
raise if gene_range.begin > gene_range.end
if current_sequence = @sequences[@current_accession] then
# the current sequence exists already -> make sure it is maximal
current_sequence.update_range(gene_range.begin, gene_range.end)
else
# the current sequence does not exist -> add it
@sequences[@current_accession] = Sequence.new(gene_range.begin,
gene_range.end)
end
current_sequence = @sequences[@current_accession]
gene = Gene.new(gene_range, @current_strand)
exons.each { |exon| gene.add_exon(exon) }
current_sequence.add_gene(gene)
return
end
end
end
end
# read input
sequences = {}
gmap_parser = GMAPParser.new(sequences)
gmap_parser.parse(Scanner.new(ARGF))
# output
gff3_output(sequences, "gmap")
|