File: coverage-MEM.rb

package info (click to toggle)
genometools 1.6.1%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 50,412 kB
  • sloc: ansic: 271,241; ruby: 30,339; python: 4,880; sh: 3,193; makefile: 1,194; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (97 lines) | stat: -rwxr-xr-x 3,287 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
#!/usr/bin/env ruby

# Determine how many MEMs computed by repfind are contained in local alignments
# computed by seed_extend.

require "scripts/evalseedhash.rb"

def makesystemcall(argstring, proceed=false)
  if not system(argstring)
    STDERR.puts "system \"#{argstring}\" failed: errorcode #{$?}"
    exit 1 unless proceed
  end
end

# binary search and sort decisions
def comparefunc(mem, ali, sorting=false)
  if mem[:seq1] != ali[:seq1] then return mem[:seq1] <=> ali[:seq1] end
  if mem[:seq2] != ali[:seq2] then return mem[:seq2] <=> ali[:seq2] end
  if sorting then
    return mem[:start1] <=> ali[:start1]
  elsif mem[:start1] < ali[:start1] then  # alignment starts behind MEM
    return -1
  else
    return 0
  end
end

def calculateratio(repfindmatches, seedextmatches)
  miss = 0
  succ = 0
  alignments = seedextmatches.values.sort{|a,b|comparefunc(a,b,true)}
  repfindmatches.each_value{|mem|
    found = false
    idx = (0...alignments.size).bsearch{|ali| comparefunc(mem, alignments[ali])}
    if not idx.nil? then
      while idx >= 0 and comparefunc(mem, alignments[idx]) == 0
        idx -= 1
      end
      idx += 1
      while not found and comparefunc(mem, alignments[idx]) == 0
        found = (alignments[idx][:start1] + alignments[idx][:len1] >=
                 mem[:start1] + mem[:len1] and
                 alignments[idx][:start2] <= mem[:start2] and
                 alignments[idx][:start2] + alignments[idx][:len2] >=
                 mem[:start2] + mem[:len2])
        idx += 1
      end
    end
    if found then
      succ += 1
    else
      miss += 1
      puts "MEM not found (#{mem[:seq1]}, #{mem[:seq2]}, " +
           "#{mem[:start1]}+#{mem[:len1]}, #{mem[:start2]}+#{mem[:len2]})"
    end
  }
  puts "success=#{succ}  miss=#{miss}  total=#{repfindmatches.size}  " +
       "ratio=#{succ*100/repfindmatches.size}%"
end

if __FILE__ == $0
  use_apos = ""
  if ARGV.size == 4 and ARGV[3] == "a" then
    use_apos = "-use-apos "
  elsif ARGV.size != 3 then
    puts "Usage: #{$0} <referencefile> <queryfile> <leastlen> [a]"
    exit 1
  end
  referencefile = ARGV[0]
  queryfile = ARGV[1]
  leastlen = ARGV[2].to_i
  filedir="coverage-MEM"
  makesystemcall("rm -rf #{filedir}")
  makesystemcall("mkdir -p #{filedir}")
  
  # create index files
  makesystemcall("bin/gt suffixerator -tis -sds no -des no -md5 -suf -lcp " +
                 "-indexname #{filedir}/reference -db #{referencefile}")
  makesystemcall("bin/gt encseq encode -des no -sds no -md5 -indexname " +
                 "#{filedir}/query #{queryfile}")
  # run repfind and seed_extend
  makesystemcall("bin/gt repfind -ii #{filedir}/reference -q #{queryfile} " +
                 "-l #{leastlen} -p -f > #{filedir}/repfind.out")
  makesystemcall("bin/gt seed_extend -ii #{filedir}/reference " +
                 "-qii #{filedir}/query -l #{leastlen} -mincoverage 1 " +
                 "-seedlength #{[leastlen, 32].min} -minidentity 99 " +
                 "#{use_apos} > #{filedir}/seedext.out")
  # compare results
  repfindmatches = readmatchesfromfile("#{filedir}/repfind.out")
  if repfindmatches.size > 0 then
    seedextmatches = readmatchesfromfile("#{filedir}/seedext.out")
    calculateratio(repfindmatches, seedextmatches)
  else
    puts "no results"
  end
end