File: seqselect.rb

package info (click to toggle)
genometools 1.6.6%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 50,576 kB
  • sloc: ansic: 271,876; ruby: 29,930; python: 5,106; sh: 3,083; makefile: 1,213; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (69 lines) | stat: -rwxr-xr-x 1,554 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env ruby

require "set"

# Picks numtoselect distinct sequence numbers from the range
# 0...numofsequences.
#
# Numbers are drawn with Kernel#rand until enough distinct ones have been
# collected, so the result is reproducible when the caller has seeded the
# generator with srand beforehand.
#
# numofsequences:: number of sequences available
# numtoselect::    number of distinct sequence numbers wanted; zero or a
#                  negative value yields an empty set
# inputfile::      optional file name, only used to make the error message
#                  more specific
#
# Returns a Set of Integer sequence numbers. If more sequences are requested
# than are available, a message is written to STDERR and the process exits
# with status 1.
def getselectedseqnums(numofsequences,numtoselect,inputfile=nil)
  if numofsequences < numtoselect
    source = inputfile ? "file #{inputfile}" : "the input"
    STDERR.puts "#{$0}: cannot select #{numtoselect} sequences from " +
                "#{source}: this contains #{numofsequences} sequences"
    exit 1
  end
  selectedseqnums = Set.new
  # The size is checked before every draw so that a request for zero
  # sequences returns immediately instead of looping forever. Set#add
  # ignores values that are already present, so a duplicate draw simply
  # leads to another attempt.
  while selectedseqnums.size < numtoselect
    selectedseqnums.add(rand(numofsequences))
  end
  selectedseqnums
end

# Counts the sequences stored in inputfile.
#
# Every line whose first character is ">" is taken as the header of one
# sequence; all other lines are ignored.
#
# inputfile:: path of the file to scan
#
# Returns the number of header lines as an Integer (0 for an empty file).
def countnumofsequences(inputfile)
  seqcount = 0
  # File.foreach closes the file once the iteration is over, so no file
  # handle is left open after the count.
  File.foreach(inputfile) do |line|
    seqcount += 1 if line.start_with?(">")
  end
  seqcount
end

# Prints the selected sequences of inputfile to fp.
#
# Sequences are numbered from 0 in the order of their header lines (lines
# starting with ">"). For every sequence whose number is a member of
# selectedseqnums, the header line and the lines following it up to the next
# header are printed unchanged. Blank lines and lines whose first
# non-whitespace character is "#" are never printed, and lines in front of
# the first header are skipped as well.
#
# selectedseqnums:: collection responding to member? that holds the numbers
#                   of the sequences to print
# inputfile::       path of the file to read
# fp::              object responding to print that receives the output
#                   (STDOUT by default)
def outputselectedsequences(selectedseqnums,inputfile,fp=STDOUT)
  currentseqnum = 0
  dooutseq = false
  # File.foreach closes the file once the iteration is over, so no file
  # handle is left open afterwards.
  File.foreach(inputfile) do |line|
    next if line =~ /^\s*$/    # discard blank line
    next if line =~ /^\s*#/    # discard comment line
    if line.start_with?(">")
      # A header decides whether the lines up to the next header are printed.
      dooutseq = selectedseqnums.member?(currentseqnum)
      currentseqnum += 1
    end
    fp.print line if dooutseq
  end
end

# Command line driver: select <num of seq to select> random sequences from
# <fastafile> and print them to standard output.
if ARGV.length != 2
  STDERR.puts "Usage: #{$0} <num of seq to select> <fastafile>"
  exit 1
end

# Integer() rejects malformed counts, which String#to_i would silently read
# as 0.
numtoselect = begin
  Integer(ARGV[0], 10)
rescue ArgumentError
  nil
end
if numtoselect.nil? || numtoselect < 0
  STDERR.puts "#{$0}: number of sequences to select must be a " +
              "non-negative integer, got \"#{ARGV[0]}\""
  exit 1
end

inputfile = ARGV[1]
# The file is read twice (once to count, once to print), so it has to be a
# regular readable file.
unless File.file?(inputfile) && File.readable?(inputfile)
  STDERR.puts "#{$0}: cannot read file #{inputfile}"
  exit 1
end

numofsequences = countnumofsequences(inputfile)

# Report an oversized request here, where the file name is known.
if numofsequences < numtoselect
  STDERR.puts "#{$0}: cannot select #{numtoselect} sequences from file " +
              "#{inputfile}: this contains #{numofsequences} sequences"
  exit 1
end

# An empty selection prints nothing, so there is nothing to sample.
exit 0 if numtoselect == 0

# Fixed seed: the same input always yields the same selection.
srand(37739292920)

selectedseqnums = getselectedseqnums(numofsequences,numtoselect)
outputselectedsequences(selectedseqnums,inputfile)