File: fastagrep.rb

package info (click to toggle)
ruby-bio 2.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,108 kB
  • sloc: ruby: 68,331; perl: 13; makefile: 11; sh: 1
file content (72 lines) | stat: -rwxr-xr-x 1,825 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
#!/usr/bin/env ruby
#
# fastagrep: Greps a FASTA file (in fact it can use any flat file input supported
#            by BioRuby) and outputs the matching records as FASTA
#
#   Copyright (C) 2008 KATAYAMA Toshiaki <k@bioruby.org> & Pjotr Prins
#
#  This program is free software; you can redistribute it and/or modify
#  it under the terms of the GNU General Public License as published by
#  the Free Software Foundation; either version 2 of the License, or
#  (at your option) any later version.
#
#  This program is distributed in the hope that it will be useful,
#  but WITHOUT ANY WARRANTY; without even the implied warranty of
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#  GNU General Public License for more details.
#
#  $Id: fastagrep.rb,v 1.1 2008/05/19 12:22:05 pjotr Exp $
#

require 'bio'

include Bio

# Help text printed when the command line is empty or the regex argument is
# missing/invalid. The synopsis names the option the parser really accepts
# (-v) and shows the regex as mandatory, because the script exits without it.
usage = <<USAGE

Usage: fastagrep.rb [-v] regex infiles

    -v            Invert the sense of matching, to select non-matching lines.

  Examples:

    Output all sequence descriptors containing 'Arabidopsis' or 'Drosophila'
    regardless of case
		
	    fastagrep.rb "/Arabidopsis|Drosophila/i" *.seq > reduced.fasta

    As the result is a FASTA stream you could pipe it for sorting:
		
	    fastagrep.rb "/Arabidopsis|Drosophila/i" *.seq | fastasort.rb
USAGE

# Nothing on the command line: show the help text and report failure.
if ARGV.empty?
  print usage
  exit 1
end

# An optional leading '-v' requests inverted matching; consume it so that the
# regex is always the next argument.
skip = ARGV.first == '-v'
ARGV.shift if skip

# ---- The next argument must start with '/' (a regex literal) and must not be
#      the name of an existing file; otherwise show the help text and fail.
regex = ARGV.first
looks_like_pattern = (regex =~ /^\//) && !File.exist?(regex)
unless looks_like_pattern
  print usage
  exit 1
end
ARGV.shift

# Stream every input file and print, as FASTA, the records selected by the
# regex given on the command line.
#
# NOTE(security): the pattern is still obtained by eval'ing the command-line
# argument (expected to be a Ruby regex literal such as "/foo/i"), so that
# argument can execute arbitrary Ruby code. Only run this script with
# arguments you trust.
#
# The pattern is built lazily on the first input file, so an invocation
# without input files evaluates nothing; afterwards the same object is reused
# for every record instead of re-running eval once per record.
pattern = nil

ARGV.each do |fn|
  pattern ||= eval(regex)

  # The block form of Bio::FlatFile.auto closes the file when the block ends,
  # so file handles do not accumulate when many input files are given.
  Bio::FlatFile.auto(fn) do |flatfile|
    flatfile.each do |item|
      matched = item.definition =~ pattern
      # Default mode keeps matching records; -v (skip) keeps the others.
      next if skip ? matched : !matched

      # Re-wrap the entry as FASTA: stripped definition line, then the data.
      rec = Bio::FastaFormat.new("> #{item.definition.strip}\n#{item.data}")
      print rec
    end
  end
end