File: blat2gff3

package info (click to toggle)
genometools 1.6.1+ds-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 50,412 kB
  • sloc: ansic: 271,241; ruby: 30,339; python: 4,880; sh: 3,193; makefile: 1,194; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (59 lines) | stat: -rwxr-xr-x 2,096 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env ruby
#
# Copyright (c) 2006-2007 Gordon Gremme <gordon@gremme.org>
# Copyright (c) 2006-2007 Center for Bioinformatics, University of Hamburg
#
# Permission to use, copy, modify, and distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
# OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
#

require 'getoptlong'

$:.push(File.dirname($0))
require 'gff3'

# Converts the argument of --max-mismatches into an Integer.
#
# String#to_i would silently map a non-numeric value such as "abc" to 0, and
# the filter below would then drop every hit that has at least one mismatch.
# Parsing strictly (base 10, so "010" is ten, not octal eight) turns such a
# typo into an immediate, explicit error instead.
#
# arg:: the option argument exactly as given on the command line
#
# Returns the parsed Integer.
# Raises ArgumentError if +arg+ is not a base-10 integer.
def parse_max_mismatches(arg)
  Integer(arg, 10)
rescue ArgumentError
  raise ArgumentError, "invalid value for --max-mismatches: #{arg.inspect}"
end

# parse options
opts = GetoptLong.new(
  [ "--max-mismatches", "-m", GetoptLong::REQUIRED_ARGUMENT ]
)

# nil means that no mismatch limit was requested; the read loop treats a nil
# limit as "accept every hit".
max_mismatches = nil
opts.each do |opt, arg|
  # only one option is declared above, anything else is a programming error
  raise "unexpected option #{opt}" if opt != "--max-mismatches"
  max_mismatches = parse_max_mismatches(arg)
end

# Number of whitespace-separated columns in a PSL alignment record.
PSL_FIELD_COUNT = 21

# Tells whether a whitespace-split input line is a PSL alignment record.
#
# Alignment records start with the numeric match count. Blank lines and the
# header that BLAT writes unless -noHead is given ("psLayout version 3", the
# column captions, the dashed rule) do not, and are skipped by the caller.
#
# fields:: the Array returned by String#split for one input line
#
# Returns true or false.
def psl_alignment_line?(fields)
  !!(fields.first.to_s =~ /\A\d+\z/)
end

# Converts the PSL target interval into a 1-based inclusive Range.
#
# PSL stores tStart zero-based and tEnd exclusive, so only the start has to
# be shifted; the end is already the last covered base in 1-based terms.
# This matches the convention used for the exon ranges below.
#
# t_start:: tStart column as Integer
# t_end::   tEnd column as Integer
#
# Returns a Range.
def psl_gene_range(t_start, t_end)
  Range.new(t_start + 1, t_end)
end

# Builds one 1-based inclusive Range per alignment block.
#
# t_starts::    tStarts column, comma separated zero-based block starts
# block_sizes:: blockSizes column, comma separated block lengths
# block_count:: blockCount column as Integer
#
# Returns an Array of Range with +block_count+ entries. A block that is
# missing from the lists is treated as start 0 / size 0 (nil.to_i).
def psl_exon_ranges(t_starts, block_sizes, block_count)
  starts = t_starts.split(',')
  sizes = block_sizes.split(',')
  (0...block_count).map do |i|
    first = starts[i].to_i
    Range.new(first + 1, first + sizes[i].to_i)
  end
end

# read input
sequences = {}
ARGF.each do |line|
  fields = line.split
  next unless psl_alignment_line?(fields)
  if fields.size < PSL_FIELD_COUNT
    raise "malformed PSL line #{ARGF.lineno}: expected #{PSL_FIELD_COUNT} " \
          "fields, got #{fields.size}"
  end

  # column 1 is misMatches; a nil limit accepts every hit
  next if max_mismatches && fields[1].to_i > max_mismatches

  strand = fields[8]
  t_name = fields[13]
  t_start = fields[15].to_i
  t_end = fields[16].to_i
  block_count = fields[17].to_i
  block_sizes = fields[18]
  t_starts = fields[20]

  # NOTE(review): the sequence range is passed through in raw PSL coordinates
  # (zero-based start), as before; Sequence lives in the gff3 library and its
  # expected convention is not visible here -- confirm.
  if sequences[t_name]
    sequences[t_name].update_range(t_start, t_end)
  else
    sequences[t_name] = Sequence.new(t_start, t_end)
  end

  # Only the first strand character is used.
  # NOTE(review): translated BLAT output carries two strand characters and
  # presumably needs different handling of the target coordinates -- verify.
  gene = Gene.new(psl_gene_range(t_start, t_end), strand[0, 1])
  psl_exon_ranges(t_starts, block_sizes, block_count).each do |exon|
    gene.add_exon(exon)
  end
  sequences[t_name].add_gene(gene)
end

# hand the collected per-target sequences to gff3_output (provided by the
# gff3 library required above), passing "blat" as its second argument
gff3_output(sequences, 'blat')