File: gfa2lint.rb

package info (click to toggle)
genometools 1.6.1%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites: bullseye
  • size: 50,412 kB
  • sloc: ansic: 271,241; ruby: 30,339; python: 4,880; sh: 3,193; makefile: 1,194; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (148 lines) | stat: -rwxr-xr-x 3,852 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env ruby

require "set"

def show_difference(tag,set1,set2)
  STDERR.puts tag
  set1.difference(set2).each do |elem|
    STDERR.puts "#{elem}"
  end
end

def lint_trace(trace_delta,ulen,vlen,trace_string)
  usum = 0
  vsum = 0
  trace_string.split(/,/).each do |vspec|
    usum += trace_delta
    vsum += vspec.to_i
  end
  if vsum != vlen
    STDERR.puts "#{$0}: #{__method__}: vsum = #{vsum} != #{vlen} = vlen"
    exit 1
  end
  usum -= trace_delta
  if usum > ulen
    STDERR.puts "#{$0}: #{__method__}: usum = #{usum} > #{ulen} = ulen"
    exit 1
  end
end

class String
  def segment_name
    return self.gsub(/[-+]$/,"")
  end
end

set_E_lines = Set.new()
set_S_lines = Set.new()
set_S_ids = Set.new()
set_E_ids = Set.new()
segment_hash = Hash.new()

if ARGV.length != 1
  STDERR.puts "Usage: #{$0} <inputfile in gfa2 format>"
  exit 1
end

inputfile = ARGV[0]
trace_delta = nil
File.new(inputfile,"r").each_line do |line|
  if line.match(/^E/)
    e_elems = line.split(/\t/)
    if e_elems.length != 9
      STDERR.puts "#{$0}: expect 9 columns in E-lines"
      exit 1
    end
    id = e_elems[1]
    if set_E_ids.include?(id)
      STDERR.puts "#{$0}: edge identifier #{id} is not unique"
      exit 1
    end
    set_E_ids.add(id)
    set_E_lines.add(e_elems[2].segment_name)
    set_E_lines.add(e_elems[3].segment_name)
  elsif line.match(/^S/)
    s_elems = line.split(/\t/)
    if s_elems.length != 4
      STDERR.puts "#{$0}: expect 4 columns in S-lines"
      exit 1
    end
    key = s_elems[1]
    if segment_hash.has_key?(key)
      STDERR.puts "#{$0}: duplicated segment: #{key}"
      exit 1
    end
    len = s_elems[2].to_i
    segment = s_elems[3].chomp
    if segment.length != len
      STDERR.puts "#{$0}: segment.length = #{segment.length} != #{len} = len"
      exit 1
    end
    segment_hash[key] = len
    set_S_lines.add(key)
    id = s_elems[1]
    if set_S_ids.include?(id)
      STDERR.puts "#{$0}: segment identifier #{id} is not unique"
      exit 1
    end
    set_S_ids.add(id)
  elsif line.match(/^H/)
    if m = line.match(/TS:i:(\d+)/)
      trace_delta = m[1].to_i
    end
  end
end

if set_S_lines != set_E_lines
  show_difference("elems in E but not in S",set_E_lines,set_S_lines)
  show_difference("elems in S but not in E",set_S_lines,set_E_lines)
  exit 1
end

linenum = 1
File.new(inputfile,"r").each_line do |line|
  if line.match(/^E/)
    e_elems = line.split(/\t/)
    lseg = e_elems[2].segment_name
    lstart = e_elems[4].to_i
    lend = e_elems[5].to_i
    if lstart > lend
      STDERR.print "#{$0}: file #{inputfile}, line #{linenum}: "
      STDERR.puts "lstart = #{lstart} > #{lend} = lend"
      exit 1
    end
    if not segment_hash.has_key?(lseg)
      STDERR.print "#{$0}: file #{inputfile}, line #{linenum}: "
      STDERR.puts "lseg=#{lseg} is not a valid segment name"
      exit 1
    end
    len_lseg = segment_hash[lseg]
    if lend >= len_lseg
      STDERR.print "#{$0}: file #{inputfile}, line #{linenum}: "
      STDERR.puts "lend=#{lend} >= #{len_lseg} = len_lseg"
    end
    rseg = e_elems[3].segment_name
    rstart = e_elems[6].to_i
    rend = e_elems[7].to_i
    if rstart > rend
      STDERR.print "#{$0}: file #{inputfile}, line #{linenum}: "
      STDERR.puts "rstart = #{rstart} > #{rend} = rend"
      exit 1
    end
    if not segment_hash.has_key?(rseg)
      STDERR.print "#{$0}: file #{inputfile}, line #{linenum}: "
      STDERR.puts "rseg=#{rseg} is not a valid segment name"
      exit 1
    end
    len_rseg = segment_hash[rseg]
    if rend >= len_rseg
      STDERR.print "#{$0}: file #{inputfile}, line #{linenum}: "
      STDERR.puts "rend=#{rend} >= #{len_rseg} = len_rseg"
    end
    if not trace_delta.nil?
      lint_trace(trace_delta,lend - lstart + 1,rend - rstart + 1,
                 e_elems[8].chomp)
    end
  end
  linenum += 1
end