File: SEmatch.rb

package info (click to toggle)
genometools 1.6.6%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 50,576 kB
  • sloc: ansic: 271,876; ruby: 29,930; python: 5,106; sh: 3,083; makefile: 1,213; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (151 lines) | stat: -rw-r--r-- 4,615 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
class SEmatch
  def initialize(matchfile)
    @matchfile = matchfile
  end
  def each()
    begin
      @fp = File.new(@matchfile)
    rescue => err
      STDERR.puts "#{$0}: cannot open #{@matchfile}"
      exit 1
    end
    @options = nil
    @fields = nil
    @runtime = nil
    @spacepeak = nil
    @fp.each_line do |line|
      if line.match(/^#/)
        if m = line.match(/^# Options:(.*)$/)
          @options = m[1]
        elsif m = line.match(/^# Fields:\s(.*)$/)
          @fields = m[1].gsub(/\./,"_").split(/, /)
          @fields.map!{|f| f.gsub(/\s/,"_").gsub(/__/,"_").gsub(/%_/,"")}
        end
        if m = line.match(/^# TIME.*\s(\S+)$/)
          @runtime = m[1].to_f
        elsif m = line.match(/^# combined space peak in megabytes:\s(\S+)$/)
          @spacepeak = m[1].to_f
        end
      else
        if @fields.nil?
          STDERR.puts "#{$0}: fields: not defined"
          exit 1
        end
        value_hash = Hash.new()
        line.gsub(/^\s+/,"").split(/\s+/).each_with_index do |value,idx|
          if value.match(/^\d+\.\d+$/)
            value_hash[@fields[idx].to_sym] = value.to_f
          elsif value.match(/^\d+$/)
            value_hash[@fields[idx].to_sym] = value.to_i
          else
            value_hash[@fields[idx].to_sym] = value
          end
        end
        if value_hash.has_key?(:s_start)
          if value_hash.has_key?(:s_len)
            value_hash[:s_end] = value_hash[:s_start] + value_hash[:s_len] - 1
          elsif value_hash.has_key?(:s_end)
            if value_hash[:s_start] < value_hash[:s_end]
              value_hash[:s_len] = value_hash[:s_end] - value_hash[:s_start] + 1
            else
              value_hash[:s_len] = value_hash[:s_start] - value_hash[:s_end] + 1
            end
          else
            STDERR.puts "#{$0}: either length of match on subject or end " +
                        "position must be given"
            exit 1
          end
        else
          STDERR.puts "#{$0}: start of match on subject must be given"
          exit 1
        end
        if value_hash.has_key?(:q_start)
          if value_hash.has_key?(:q_len)
            value_hash[:q_end] = value_hash[:q_start] + value_hash[:q_len] - 1
          elsif value_hash.has_key?(:q_end)
            if value_hash[:q_start] < value_hash[:q_end]
              value_hash[:q_len] = value_hash[:q_end] - value_hash[:q_start] + 1
            else
              value_hash[:q_len] = value_hash[:q_start] - value_hash[:q_end] + 1
            end
          else
            value_hash[:q_len] = value_hash[:s_len]
          end
        else
          STDERR.puts "#{$0}: start of match on query must be given"
          exit 1
        end
        value_hash[:origline] = line.chomp
        yield value_hash
      end
    end
  end
  def spacepeak_get()
    return @spacepeak
  end
  def runtime_get()
    return @runtime
  end
end

def match_is_identical(m0,m1)
  if m0[:s_seqnum] != m1[:s_seqnum] or m0[:q_seqnum] != m1[:q_seqnum]
    STDERR.puts "#{$0}: expect same sequence numbers"
    exit 1
  end
  if [m0[:s_start],m0[:s_end],m0[:q_start],m0[:q_end]] ==
     [m1[:s_start],m1[:s_end],m1[:q_start],m1[:q_end]]
    return true
  end
  return false
end

def coords_contained_in(start0,end0,start1,end1)
  if start1 <= start0 and end0 <= end1
    return true
  end
  return false
end

def match_proper_contained_in(m0,m1)
  if m0[:s_seqnum] != m1[:s_seqnum] or m0[:q_seqnum] != m1[:q_seqnum]
    STDERR.puts "#{$0}: expect same sequence numbers"
    exit 1
  end
  if coords_contained_in(m0[:s_start],m0[:s_end],m1[:s_start],m1[:s_end]) and
     coords_contained_in(m0[:q_start],m0[:q_end],m1[:q_start],m1[:q_end]) and
     not match_is_identical(m0,m1)
    return true
  end
  return false
end

def coords_overlap_size(start0,end0,start1,end1)
  if start0 > start1
    STDERR.puts "#{$0}: start0=#{start0} > #{start1}=start1 not expected"
    exit 1
  end
  if end0 < start1
    return 0
  elsif end0 < end1
    return end0 - start1 + 1
  else
    return end1 - start1 + 1
  end
end

def matches_overlap(m0,m1)
  ovl = 0
  if m0[:s_start] <= m1[:s_start]
    ovl += coords_overlap_size(m0[:s_start],m0[:s_end],m1[:s_start],m1[:s_end])
  else
    ovl += coords_overlap_size(m1[:s_start],m1[:s_end],m0[:s_start],m0[:s_end])
  end
  if m0[:q_start] <= m1[:q_start]
    ovl += coords_overlap_size(m0[:q_start],m0[:q_end],m1[:q_start],m1[:q_end])
  else
    ovl += coords_overlap_size(m1[:q_start],m1[:q_end],m0[:q_start],m0[:q_end])
  end
  len = (m0[:s_len] + m0[:q_len])/2 + (m1[:s_len] + m1[:q_len])/2
  return (ovl.to_f/len.to_f).round
end