File: splitmultifasta.rb

package info (click to toggle)
genometools 1.6.6%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 50,576 kB
  • sloc: ansic: 271,876; ruby: 29,930; python: 5,106; sh: 3,083; makefile: 1,213; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (93 lines) | stat: -rwxr-xr-x 2,291 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
#!/usr/bin/env ruby

# Split a fasta file into different files
# Stefan Kurtz, October 10, 2009.

# first parameter is prefix of files to generate. 
# second parameter is number of files to generate, 0 means to generate
# one file per sequence
# third parameter is input file.

# let infile be the name of the input file in fasta format. Suppose 
# that it contain 1952 sequences. Then

# splitmultifasta.rb tmp 0 infile

# generates 1952 files named tmp-0000, tmp-0001, ..., tmp-1951.
# each containing one sequence
# To check that the split was correct, execute the following commands:
# cat tmp-* > ALL
# diff ALL infile
# when nothing is reported by diff, then everything is fine. Otherwise
# check if there are other (not generated files) that begin with
# tmp-

def countnumofsequences(inputfile)
  seqcount = 0
  File.open(inputfile).each_line do |line|
    if line.match(/^>/)
      seqcount+=1
    end
  end
  return seqcount
end

def openoutfile(filename)
begin
  outfp = File.new(filename,"w")
rescue => error
  STDERR.puts "#{$0}: cannot open \"#{filename}\": #{error}"
  exit 1
end
  return outfp
end

def log10func(n)
  return (Math.log(n.to_f)/Math.log(10.0)).to_i
end

def splitfiles(inputfile,splitprefix,numoffiles,numofsequences)
  seqcount = 0
  filenum = 0
  fh = nil
  maxseqnum = nil
  numwidth = nil

  if numoffiles == 0
    maxseqnum = 1
    numwidth = 1+log10func(numofsequences-1)
  else
    maxseqnum = numofsequences/numoffiles + numofsequences % numoffiles
    numwidth = 1+log10func(numoffiles-1)
  end
  File.open(inputfile).each_line do |line|
    if line.match(/^\s*$/)     # discard blank line
      next
    elsif line.match(/^\s*#/)  # discard comment line
      next
    elsif line.match(/^>/)
      if seqcount >= maxseqnum
        seqcount = 0
      end
      if seqcount == 0
        outfilename = sprintf("%s-%0*d",splitprefix,numwidth,filenum)
        fh = openoutfile(outfilename)
        filenum+=1
      end
      seqcount += 1
    end
    fh.print line
  end
end

if ARGV.length != 3
  STDERR.puts "Usage: #{$0} <splitprefix> <numoffiles> <fastafile>"
  exit 1
end

splitprefix = ARGV[0]
numoffiles = ARGV[1].to_i
inputfile = ARGV[2]

numofsequences = countnumofsequences(inputfile)
splitfiles(inputfile,splitprefix,numoffiles,numofsequences)