File: evalseedhash.rb

package info (click to toggle)
genometools 1.6.1%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 50,412 kB
  • sloc: ansic: 271,241; ruby: 30,339; python: 4,880; sh: 3,193; makefile: 1,194; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (345 lines) | stat: -rw-r--r-- 9,691 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
require "set"

Match = Struct.new("Match",:len1,:seq1,:start1,:len2,:seq2,:start2,:score,
                            :distance,:identity,:other,:result1,:result2)

def match_to_s(m)
  return [m.len1,m.seq1,m.start1,m.len2,m.seq2,m.start2,m.score,m.distance].
                      join(" ") + sprintf(" %.2f",m.identity)
end

def prefixlength_get(prjfile)
  prefixlength = nil
  File.open(prjfile).each_line do |line|
    if m = line.match(/^prefixlength=(\d+)$/)
      prefixlength = m[1].to_i
    end
  end
  if prefixlength.nil?
    STDERR.puts "#{prjfile} does not contain definition of prefixlength"
    exit 1
  end
  return prefixlength
end

def createrepfindcall(indexname,seedlength,extend_opt,optionlist,
                      emptyenv,
                      verbose = true)
  verboseoption = if verbose then "-v" else "" end
  repfindcall = "bin/gt repfind -scan #{verboseoption} -minid 80 " +
                "-seedlength #{seedlength} -#{extend_opt} -ii #{indexname} " + 
                optionlist.join(" ")
  if emptyenv 
    return "env -i #{repfindcall}" 
  else 
    return repfindcall 
  end
end

def match_new_without_result(a)
  return Match.new(a[0].to_i,a[1].to_i,a[2].to_i,a[4].to_i,a[5].to_i,a[6].to_i,
                   a[7].to_i,a[8].to_i,a[9].to_f,nil,nil,nil)
end

def makeseedhash(indexname,seedlength,extend_opt,optionlist,gencall = false)
  seedhash = Hash.new()
  key = nil
  if seedlength == 0
    seedlength = 2 * prefixlength_get("#{indexname}.prj")
  end
  repfindcall = createrepfindcall(indexname,seedlength,extend_opt,optionlist,
                                  true,if gencall then false else true end)
  if gencall
    return repfindcall
  end
  STDERR.puts "\# #{repfindcall}"
  IO.popen(repfindcall.split(/\s/)).each_line do |line|
    if line.match(/# seed:/)
      key = line.chomp.gsub(/# seed:\s+/,"")
    elsif line.match(/^#/)
      next
    elsif line.match(/^\d/)
      a = line.split(/\s/)
      if key.nil?
        STDERR.puts "#{$0}: expect that key is defined"
        exit 1
      end
      if seedhash.has_key?(key)
        STDERR.puts "#{$0}: key #{key} already occurs"
        exit 1
      end
      seedhash[key] = match_new_without_result(a)
      key = nil
    end
  end
  if "#{$?}" != "" and not "#{$?}".match(/exit 0$/)
    STDERR.puts "FAILURE: #{repfindcall}: \"#{$?}\""
    exit 1
  end
  return seedhash
end

def inputseedhash(matchfile)
  seedhash = Hash.new()
  seed = nil
  File.open(matchfile,"r").each_line do |line|
    if line.match(/# seed:/)
      seed = line.chomp.gsub(/# seed:\s+/,"")
    elsif line.match(/^#/)
      next
    elsif line.match(/^\d/)
      a = line.split(/\s/)
      if seed.nil?
        STDERR.puts "#{$0}: expect that seed is defined"
        exit 1
      end
      if seedhash.has_key?(seed)
        STDERR.puts "#{$0}: seed #{seed} already occurs"
        exit 1
      end
      seedhash[seed] = match_new_without_result(a)
      seed = nil
    end
  end
  return seedhash
end

def is_contained(start1,end1,start2,end2)
  if start2 <= start1 and end1 <= end2
    return true
  else
    return false
  end
end

def overlap(start1,end1,start2,end2)
  if start1 <= start2
    if start2 <= end1
      return end1 - start2 + 1
    else
      return 0
    end
  elsif start1 <= end2
    return end2 - start1 + 1
  else
    return 0
  end
end

Results = Struct.new("Result",:contained,:containing,:ovperc)

def evaluate_itv(start1,len1,start2,len2)
  end1 = start1 + len1 - 1
  end2 = start2 + len2 - 1
  contained = false
  containing = false
  if is_contained(start1,end1,start2,end2)
    contained = true
  end
  if is_contained(start2,end2,start1,end1)
    containing = true
  end
  ovperc = nil
  if not contained and not containing
    ov = overlap(start1,end1,start2,end2)
    ovperc = 100.0 * ov.to_f/[len1,len2].max.to_f
  end
  return Results.new(contained,containing,ovperc)
end

def addpercentage(h,val)
  sval = sprintf("%.1f",val)
  if h.has_key?(sval)
    h[sval] += 1
  else
    h[sval] = 1
  end
end

def seedhash2seqnum_pairs(seedhash)
  return Set.new(seedhash.values.map {|m| [m.seq1,m.seq2]})
end

def showcomment(size,sum_size,comment)
  if sum_size > 0
    perc = 100.0 * size.to_f/sum_size.to_f
    STDERR.printf("# %d (%.0f%%) of %d sequence pairs %s\n",size,perc,
                                                  sum_size,comment)
    return perc.to_i
  else
    return 0
  end
end

def calcdifference(seqnumpair_set1,seqnumpair_set2)
  sum_size = seqnumpair_set1.length + seqnumpair_set2.length
  size_both = (seqnumpair_set1 & seqnumpair_set2).length
  perc_both = showcomment(2 * size_both,sum_size,"occur in set 1 and set 2")
  size_only_1 = (seqnumpair_set1 - seqnumpair_set2).length
  if size_only_1 > 0
    perc_only_1 = showcomment(size_only_1,sum_size,
                                   "occur in set 1 but not set 2")
  else
    perc_only_1 = 0
  end
  size_only_2 = (seqnumpair_set2 - seqnumpair_set1).length
  if size_only_2 > 0
    perc_only_2 = showcomment(size_only_2,sum_size,
                                  "occur in set 2 but not set 1")
  else
    perc_only_2 = 0
  end
  return [sum_size,perc_both,perc_only_1,perc_only_2]
end

def fill_other_hash(h1,h2)
  h1.each_pair do |k,v1|
    if h2.has_key?(k)
      v2 = h2[k]
      if [v1.seq1,v1.seq2] != [v2.seq1,v2.seq2]
        STDERR.puts "seq: #{taglist[0]}=[#{v1.seq1},#{v1.seq2}] != " +
                    "     [#{v2.seq1},#{v2.seq2}]=#{taglist[1]}"
        exit 1
      end
      h1[k].result1 = evaluate_itv(v1.start1,v1.len1,v2.start1,v2.len1)
      h1[k].result2 = evaluate_itv(v1.start2,v1.len2,v2.start2,v2.len2)
      h1[k].other = v2
    end
  end
end

def percent(a,b)
  return 100.0 * a.to_f/b.to_f
end

def cmpseedhashes(checkbetter,minidentity,taglist,h1,h2,silent = false)
  fill_other_hash(h1,h2)
  nobrother = 0
  h1.each_pair do |k,v1|
    if v1.identity >= minidentity and not h2.has_key?(k)
      if not silent
        puts "#{taglist[0]}: #{k}=>#{match_to_s(v1)}, #{taglist[1]}=[]"
      end
      nobrother += 1
    end
  end
  count_identical = 0
  count_contained = 0
  count_containing = 0
  ovperc_hash = Hash.new()
  miniddiff = 3
  commentcheckbetter = "matches in #{taglist[1]} >= #{miniddiff}%-points " +
                       "better than the corresponding matches in " +
                       "#{taglist[0]}:"
  h1.each_pair do |k,v|
    if v.result2.nil?
      next
    end
    if (v.result1.contained and v.result2.contained) or
       (v.result1.containing and v.result2.containing)
      count_identical += 1
    elsif v.result1.contained or v.result2.contained
      count_contained += 1
    elsif v.result1.containing or v.result2.containing
      count_containing += 1
    elsif checkbetter
      if v.other.identity >= minidentity
        if (v.other.len1 > v.len1 or v.other.len2 > v.len2) and
           (v.other.identity > v.identity) and 
           (v.other.identity - v.identity >= miniddiff.to_f)
          if not commentcheckbetter.nil?
             puts commentcheckbetter
             commentcheckbetter = nil
          end
          puts "#{taglist[0]}=#{match_to_s(v)}"
          puts "#{taglist[1]}=#{match_to_s(v.other)}"
          addpercentage(ovperc_hash,v.result1.ovperc)
          addpercentage(ovperc_hash,v.result2.ovperc)
        end
      end
    end
  end
  h1perc = percent(count_identical,h1.length)
  h2perc = percent(count_identical,h2.length)
  puts "matches in #{taglist[0]}: #{h1.length}"
  puts "matches in #{taglist[1]}: #{h2.length}"
  printf("identical=%d: %.2f of %s/%.2f of %s\n",count_identical,
             h1perc,taglist[0],h2perc,taglist[1])
  puts "contained=#{count_contained}"
  puts "containing=#{count_containing}"
  ovperc_hash.sort.each do |k|
    puts "#{k[0]}%\t#{k[1]}"
  end
  puts "matches in #{taglist[0]} with no corresponding matches in " +
       "#{taglist[1]}=#{nobrother}"
end

def cmpextendedlength(minidentity,h1,h2)
  fill_other_hash(h1,h2)
  lendiff_dist = Hash.new()
  h1.each_pair do |k,v1|
    if v1.identity < minidentity
      next
    end
    h1len = (v1.len1 + v1.len2)/2
    if not h2.has_key?(k)
      h2len = 0
    else
      h2len = (h2[k].len1 + h2[k].len2)/2
    end
    lendiff = (100.0 * (h1len - h2len).to_f/[h1len,h2len].max.to_f).to_i
    if not lendiff_dist.has_key?(lendiff)
      lendiff_dist[lendiff] = 1
    else
      lendiff_dist[lendiff] += 1
    end
  end
  h2.each_pair do |k,v2|
    if v2.identity < minidentity
      next
    end
    h2len = (v2.len1 + v2.len2)/2
    if not h1.has_key?(k)
      lendiff = -100
      if not lendiff_dist.has_key?(lendiff)
        lendiff_dist[lendiff] = 1
      else
        lendiff_dist[lendiff] += 1
      end
    end
  end
  lendiff_dist.sort.each do |diff,count|
    puts "diff #{diff}\t#{count}"
  end
end

def preprocess_index(inputfile)
  suffixeratorcall = "env -i bin/gt suffixerator -suftabuint " +
                     "-db #{inputfile} " +
                     "-dna -suf -tis -lcp -md5 no -des no -sds no "
  indexname = File.basename(inputfile)
  if not File.exist?("#{indexname}.prj") or
    File.stat("#{indexname}.prj").mtime < File.stat(inputfile).mtime
    puts "# #{suffixeratorcall}"
    if not system(suffixeratorcall)
      STDERR.puts "FAILURE: #{suffixeratorcall}"
      exit 1
    end
  end
  return indexname
end

def readmatchesfromfile(resultmatchfile)
  thishash = Hash.new()
  key = 0
  File.open(resultmatchfile,"r").each_line do |line|
    if line.match(/^\d/)
      thishash[key] = match_new_without_result(line.split(/\s/))
      key += 1
    elsif not line.match(/^\#/)
      STDERR.print "#{$0}: #{resultfile}: illegal line #{line}"
      exit 1
    end
  end
  return thishash
end