File: findfasta.rb

package info (click to toggle)
genometools 1.6.1%2Bds-3
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 50,412 kB
  • sloc: ansic: 271,241; ruby: 30,339; python: 4,880; sh: 3,193; makefile: 1,194; perl: 219; pascal: 159; haskell: 37; sed: 5
file content (112 lines) | stat: -rwxr-xr-x 3,016 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
#!/usr/bin/ruby

require 'optparse'
require 'ostruct'

# Walks +directory+ recursively and yields the path of every regular file
# found below it.
#
# Paths are built as "<parent>/<entry>", so they are relative or absolute
# exactly as +directory+ was given. Symbolic links are followed. Entries that
# are neither a regular file nor a directory (dangling symlinks, FIFOs,
# sockets, devices) are skipped instead of aborting the traversal.
#
# directory:: path of the directory to start from
#
# Raises the usual Errno::* error from Dir.foreach if +directory+ itself (or a
# sub-directory reached during the walk) cannot be read.
def listdirectory(directory)
  # explicit stack instead of recursion; sub-directories are visited LIFO
  pending = [directory]
  until pending.empty?
    current = pending.pop
    Dir.foreach(current) do |entry|
      # Dir.foreach also reports the self/parent links; never descend into them
      next if entry == "." || entry == ".."
      path = "#{current}/#{entry}"
      if File.file?(path)
        yield path
      elsif File.directory?(path)
        pending.push(path)
      end
      # anything else is not traversable and not a candidate file: ignore it
    end
  end
end

# Yields every file below +dirname+ whose name ends in one of the recognised
# sequence-file suffixes and whose basename is not excluded.
#
# dirname::      directory to search (handed to listdirectory)
# excludelist::  collection of basenames to skip; must respond to member?
# includefastq:: when true, files ending in ".fastq" are reported as well
# includegzip::  when true, files ending in ".fsa.gz" or ".FASTA.gz" are
#                reported as well
#
# Each selected file is yielded exactly once.
def listselected(dirname,excludelist,includefastq,includegzip)
  suffixes = ["fasta","fna","fa","fsa","FASTA"]
  suffixes.push("fastq") if includefastq
  suffixes.push("fsa.gz", "FASTA.gz") if includegzip
  # Compare literally against the end of the name: building a regexp from the
  # suffix would turn the "." inside "fsa.gz" into a wildcard.
  endings = suffixes.map { |suffix| ".#{suffix}" }
  listdirectory(dirname) do |filename|
    next unless filename.end_with?(*endings)
    next if excludelist.member?(File.basename(filename))
    yield filename
  end
end

# Parses the command-line arguments in +argv+ and returns the settings as an
# OpenStruct with these fields:
#
# withgttestdata:: true unless -n/--no-gttestdata was given
# excludelist::    basenames collected from every -e/--excludelist option
#                  (each value is split on commas)
# includefastq::   true unless -q/--no-fastq was given
# includegzip::    true unless -g/--no-gzip was given
#
# -h/--help prints the usage text to standard output and exits with status 0.
# An option the parser rejects, or any leftover positional argument, prints a
# usage line to standard error and exits with status 1.
def parseargs(argv)
  options = OpenStruct.new
  options.withgttestdata = true
  options.excludelist = []
  options.includefastq = true
  options.includegzip = true
  opts = OptionParser.new
  # one banner shared by --help and by the error paths below
  opts.banner = "Usage: #{$0} [options]"
  opts.on("-n","--no-gttestdata","exclude gttestdata") do
    options.withgttestdata = false
  end
  opts.on("-e","--excludelist STRING",
          "list of files (basenames) to exclude") do |x|
    options.excludelist.concat(x.split(/,/))
  end
  opts.on("-q","--no-fastq",
          "exclude files ending with .fastq") do
    options.includefastq = false
  end
  opts.on("-g","--no-gzip",
          "exclude files ending with .gz") do
    options.includegzip = false
  end
  opts.on( '-h', '--help', 'Display this screen' ) do
    # opts already renders the banner followed by the option summary
    puts opts
    exit 0
  end
  begin
    rest = opts.parse(argv)
  rescue OptionParser::ParseError => e
    # unknown option, missing argument, ...: report it instead of a backtrace
    $stderr.puts e.message
    $stderr.puts opts.banner
    exit 1
  end
  unless rest.empty?
    # the banner lives on the parser, not on the OpenStruct holding the result
    $stderr.puts opts.banner
    exit 1
  end
  options
end

# Script body: print, one path per line, the sequence files found below
# $GTDIR/testdata and (unless disabled on the command line) below $GTTESTDATA.
options = parseargs(ARGV)

# Basenames that are never reported from $GTDIR/testdata, extended by whatever
# the user passed via --excludelist.
# NOTE(review): the user-supplied list is only applied to the GTDIR tree, not
# to the GTTESTDATA tree below -- confirm that this is intended.
testdata_exclude = %w[
  solid_color_reads.fastq
  test2_wrong_begin.fastq
  test9_uneven_length.fastq
  test7_empty_seq.fastq
  test6_premature_end.fastq
  test4_different_seqlengths.fastq
  test3_different_seqnames.fastq
  corruptpatternfile.fna
  TTT-small-wrongchar.fna
  sw100K1.fsa
  sw100K2.fsa
] + options.excludelist

# The GTDIR tree is listed whenever the variable is set.
if ENV.key?("GTDIR")
  testdata_dir = "#{ENV["GTDIR"]}/testdata"
  listselected(testdata_dir,
               testdata_exclude,
               options.includefastq,
               options.includegzip) { |path| puts path }
end

# The GTTESTDATA tree is listed only if it was not switched off and the
# variable is set.
if options.withgttestdata && ENV.key?("GTTESTDATA")
  gttestdata_exclude = ["trembl-section.fsa.gz"]
  listselected(ENV["GTTESTDATA"],
               gttestdata_exclude,
               options.includefastq,
               options.includegzip) { |path| puts path }
end