File: goslim.rb

package info (click to toggle)
ruby-bio 2.0.6-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 7,108 kB
  • sloc: ruby: 68,331; perl: 13; makefile: 11; sh: 1
file content (303 lines) | stat: -rwxr-xr-x 6,775 bytes parent folder | download | duplicates (10)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
#!/usr/bin/env ruby
#
# goslim.rb - making a GO slim histogram
#
#  Usage:
#
#    % goslim.rb -p process.ontology -f function.ontology \
#       -c component.ontology -s goslim_goa.2002 -g gene_association.mgi \
#       -o mgi -r
#    % R < mgi.R
#    % gv mgi.pdf
#
# Copyright::  Copyright (C) 2003
#              Mitsuteru C. Nakao <n@bioruby.org>
# License::    The Ruby License
#
#  $Id: goslim.rb,v 1.5 2007/04/05 23:35:42 trevor Exp $
#



# CVS revision string printed by -v/--version.
SCRIPT_VERSION = '$Id: goslim.rb,v 1.5 2007/04/05 23:35:42 trevor Exp $'

# Help text printed by -h/--help or when required inputs are missing.
#
# Every backslash is written as "\\": inside a double-quoted string a bare
# backslash before a newline joins the two lines, and "\d" loses its
# backslash, so neither the shell line continuations nor the regexp would
# otherwise appear as intended.
USAGE = "#{__FILE__} - GO slim
Usage:
  #{__FILE__} -p process.ontology -f function.ontology \\
     -c component.ontology -g gene_association.mgi -s goslim_goa.2002 \\
     -o goslim.uniqued.out -r

  #{__FILE__} -p process.ontology -f function.ontology \\
     -c component.ontology -l gene_association.list -s goslim_goa.2002 \\
     -o mgi.out -r

  #{__FILE__} -p process.ontology -f function.ontology \\
     -c component.ontology -g gene_association.mgi -s goslim_goa.2002 >\\
     go_goslim.paired.list



Options:
 -p,--process <go/ontology/process.ontology>
 -f,--function <go/ontology/function.ontology>
 -c,--component <go/ontology/component.ontology>
 -g,--ga <go/gene-associations/gene_association.someone>
 -l,--galist <a GO_ID list>
 -s,--goslim <go/GO_slim/goslim_someone>
 -o,--output <file_name> -- output file name.
 -r,--r_script -- Writing a R script in <file_name>.R to plot a barplot.
 -h,--help
 -v,--version

Format:
  GO ID list: /^GO:\\d{7}/ for each line

Mitsuteru C. Nakao <n@bioruby.org>
"



require 'getoptlong'

# Parses the command line (ARGV) with GetoptLong and stores every option that
# is present in a global named after its long form: --process sets
# $OPT_PROCESS, --r_script sets $OPT_R_SCRIPT, and so on.  Options that take
# no argument are stored as "" (which is still truthy in Ruby).
#
# Errors raised by GetoptLong (unknown option, missing argument) propagate to
# the caller.
def parse_goslim_options
  parser = GetoptLong.new
  parser.set_options(
    ['--process',   '-p', GetoptLong::REQUIRED_ARGUMENT],
    ['--function',  '-f', GetoptLong::REQUIRED_ARGUMENT],
    ['--component', '-c', GetoptLong::REQUIRED_ARGUMENT],
    ['--ga',        '-g', GetoptLong::REQUIRED_ARGUMENT],
    ['--galist',    '-l', GetoptLong::REQUIRED_ARGUMENT],
    ['--goslim',    '-s', GetoptLong::REQUIRED_ARGUMENT],
    ['--output',    '-o', GetoptLong::REQUIRED_ARGUMENT],
    ['--r_script',  '-r', GetoptLong::NO_ARGUMENT],
    ['--help',      '-h', GetoptLong::NO_ARGUMENT],
    ['--version',   '-v', GetoptLong::NO_ARGUMENT])

  parser.each_option do |name, arg|
    global = "$OPT_#{name.sub(/^--/, '').gsub(/-/, '_').upcase}"
    # Ruby has no API for assigning a global by name, so eval is still used.
    # Only the variable name (derived from the fixed option table above) is
    # spliced into the evaluated code; the value is read from the local
    # +arg+, so quotes or other special characters in a file name can
    # neither break the assignment nor inject code.
    eval "#{global} = arg"
  end
end

begin
  parse_goslim_options
rescue
  # GetoptLong has already reported the problem on stderr.
  exit(1)
end

# -v/--version: print the revision string and stop.
if $OPT_VERSION
  puts SCRIPT_VERSION
  exit(0)
end

# Print the usage text on -h/--help, and also whenever one of the inputs the
# rest of the script reads unconditionally is missing: the three ontology
# files, the GO slim file, and either a gene_association file (-g) or a GO ID
# list (-l).  Checking that only *some* input was given would let the script
# continue and fail later when it calls +read+ on a file that was never
# opened.
inputs_complete = $OPT_PROCESS && $OPT_FUNCTION && $OPT_COMPONENT &&
                  $OPT_GOSLIM && ($OPT_GA || $OPT_GALIST)
if $OPT_HELP || !inputs_complete
  puts USAGE
  exit(0)
end




# subroutines

# Returns the source of an R script, as one newline-terminated String, that
# reads the tab-delimited table +datname+ and draws its "count" column as a
# horizontal bar plot into <datname>.pdf.
#
# Each line of the script is listed separately so that the trailing blanks
# and the tab-indented line of the generated text are visible here.
def slim2r(datname)
  script_lines = [
    "# usage: % R --vanilla < #{datname}.R",
    "data <- read.delim2('#{datname}')",
    'dat <- data$count',
    'names(dat) <- paste(data$GO.Term, dat)',
    '# set graphc format',
    "pdf('#{datname}.pdf') ",
    "#postscript('#{datname}.ps') ",
    '# outside margins',
    'par(mai = c(1,2.8,1,0.7))',
    'barplot(dat, ',
    '        cex.names = 0.6,  # row names font size',
    '        las = 2,          # set horizontal row names',
    '        horiz = T,        # set horizontal ',
    "        main = 'GO slim', # main title",
    '        # set color schema, proc, blue(3); func, red(2); comp, green(4)',
    "        col = cbind(c(data$aspect == 'process'), ",
    "\t\t    c(data$aspect == 'function'), ",
    "                    c(data$aspect == 'component')) %*% c(4,2,3)) # color",
    'dev.off()'
  ]
  script_lines.join("\n") + "\n"
end


# build GOslim uniqued list
#
# Stores in tmp[aspect] a Hash (default value 0) that maps the term name of
# each GO slim ID to the number of entries of +ga+ for which
# ontology.bfs_shortest_path(slim_id, gaid) returns a non-nil first element.
#
# ontology:: object responding to goid2term(id) and bfs_shortest_path(from, to)
# slim_ids:: GO slim IDs to count for; IDs without a term are skipped
# tmp::      Hash that receives the result under the key +aspect+
# ga::       GO IDs to be assigned to the slim terms
# aspect::   key used in +tmp+ and quoted in the warning message
#
# If the path search raises NameError for a slim ID, a warning is written to
# $stderr, the term is removed from the result and the remaining +ga+ entries
# are not tried for that slim ID.
def slim(ontology, slim_ids, tmp, ga, aspect)
  counts = tmp[aspect] = Hash.new(0)
  slim_ids.each do |slim_id|
    term = ontology.goid2term(slim_id)
    next unless term

    # Make the term appear in the result even if nothing maps to it.
    counts[term] = 0

    ga.each do |gaid|
      begin
        res = ontology.bfs_shortest_path(slim_id, gaid)
        counts[term] += 1 if res[0]
      rescue NameError
        $stderr.puts "Warning: GO:#{slim_id} (#{term}) doesn't exist in the #{aspect}.ontology."
        counts.delete(term)
        break
      end
    end
  end
end


# build GO-GOslim uniqued list
#
# Stores in tmp[aspect] a Hash keyed by GO slim term name.  Each value is
#   {'GOslim' => [slim IDs that carry this term name],
#    'GO'     => [entries of +ga+ for which
#                 ontology.bfs_shortest_path(slim_id, gaid) returns a
#                 non-nil first element]}
# Entries of +ga+ are appended once per matching slim ID and are not
# de-duplicated, so 'GO'.size counts every occurrence in +ga+.
#
# ontology:: object responding to goid2term(id) and bfs_shortest_path(from, to)
# slim_ids:: GO slim IDs; IDs without a term are skipped and repeated IDs are
#            processed only once, so a duplicate cannot inflate the 'GO' list
# tmp::      Hash that receives the result under the key +aspect+
# ga::       GO IDs to be assigned to the slim terms
# aspect::   key used in +tmp+
#
# If the path search raises NameError, the remaining +ga+ entries are not
# tried for that slim ID; what was collected so far is kept.
def slim2(ontology, slim_ids, tmp, ga, aspect)
  by_term = tmp[aspect] = Hash.new
  slim_ids.uniq.each do |slim_id|
    term = ontology.goid2term(slim_id)
    next unless term

    # Create the entry explicitly instead of rescuing the NoMethodError that
    # indexing a missing (nil) entry would raise.
    entry = (by_term[term] ||= {'GOslim' => [], 'GO' => []})
    entry['GOslim'] << slim_id unless entry['GOslim'].include?(slim_id)

    ga.each do |gaid|
      begin
        res = ontology.bfs_shortest_path(slim_id, gaid)
        entry['GO'] << gaid if res[0]
      rescue NameError
        break
      end
    end
  end
end



#
# main
#

require 'bio/db/go'

# The three GO aspects, in the order in which their ontologies are loaded.
aspects = ['process', 'function', 'component']
# Per aspect, the GO ID (digits only, no "GO:" prefix) used as the start node
# of the path searches that assign slim terms to an aspect.
rootids = {
  'process'   => '0008150',
  'function'  => '0003674',
  'component' => '0005575'}

# files open

ios = {}
files = {
  'process'   => $OPT_PROCESS,
  'function'  => $OPT_FUNCTION,
  'component' => $OPT_COMPONENT,
  'ga'   => $OPT_GA,            # gene-association
  'list' => $OPT_GALIST,        # gene-association list
  'slim' => $OPT_GOSLIM}        # GO slim

# Open only the inputs that were given on the command line.
files.each do |key, file_name|
  ios[key] = File.open(file_name) if file_name
end

if $OPT_OUTPUT
  ios['output'] = File.new($OPT_OUTPUT, "w+")
  # Create <output>.R only when -r was requested.  Opening it unconditionally
  # would create the file, or truncate an existing one, even though nothing
  # is ever written to it without -r.
  ios['r_script'] = File.new("#{$OPT_OUTPUT}.R", "w+") if $OPT_R_SCRIPT
else
  ios['r_script'] = ios['output'] = $stdout
end


# start

# ontology
# Parse the three aspect ontologies given with -p, -f and -c.
ontology = {}
aspects.each do |aspect|
  ontology[aspect] = Bio::GO::Ontology.new(ios[aspect].read)
end


# GO slim
goslim = Bio::GO::Ontology.new(ios['slim'].read)

# assign an aspect to terms in the GO slim.
#
# The default block gives every aspect its own array.  With Hash.new([]) all
# aspects would share a single default array: `slim_ids[aspect] << goid`
# would never store a key, and every aspect would see every ID once for each
# root it was tried against.
slim_ids = Hash.new {|hash, aspect| hash[aspect] = [] }
goslim.to_list.map {|ent| ent.node }.flatten.uniq.each do |goid|
  rootids.each do |aspect, rootid|
    begin
      # The search result itself is not used; the call is kept because a
      # NameError raised by it is what keeps the ID out of this aspect.
      # NOTE(review): whether bfs_shortest_path raises for an ID that is not
      # part of the ontology depends on the BioRuby version -- confirm.
      ontology[aspect].bfs_shortest_path(rootid, goid)
      slim_ids[aspect] << goid
    rescue NameError
      $stderr.puts "Error: (#{rootid}, #{goid})"
    end
  end
end




# gene-associations

# Collect the GO IDs that are to be mapped onto the GO slim terms.
ga_ids = []
if $OPT_GA
  # -g: a gene_association file; one ID is taken from every parsed entry.
  ga = Bio::GO::GeneAssociation.parser(ios['ga'].read)
  ga_ids = ga.map {|ent| ent.goid }

elsif $OPT_GALIST
  # -l: a plain list.  Only lines starting with "GO:" followed by seven
  # digits contribute, and only the digits are kept.
  ios['list'].each_line do |line|
    ga_ids << $1 if /^GO:(\d{7})/ =~ line
  end
else
  # Report on stderr so the message cannot end up in the data written to
  # stdout, and exit with a failure status.
  $stderr.puts "Error: -l or -g options"
  exit(1)
end


# count number

count = Hash.new(0)

# slim2 stores its result for each aspect under count[aspect].
aspects.each do |aspect|
  slim2(ontology[aspect], slim_ids[aspect], count, ga_ids, aspect)
end




# output

# The first row is the column header.  With both -r and -o the table holds
# one row per slim term with its count; otherwise it holds one row per
# assigned GO ID.
tmp =
  if $OPT_R_SCRIPT && $OPT_OUTPUT
    [['aspect', 'count', 'GO Term'].join("\t")]
  else
    [['aspect', 'GO ID', 'GOslim Term', 'GOslim ID'].join("\t")]
  end

['component','function','process'].each do |aspect|
  # Terms with the most assigned GO IDs come first.
  ranked = count[aspect].sort {|x, y| y[1]['GO'].size <=> x[1]['GO'].size }
  ranked.each do |term, value|
    next if term == ""

    if $OPT_R_SCRIPT && $OPT_OUTPUT
      tmp << [aspect, value['GO'].size, term].join("\t")
    else
      # The slim ID column is the same for every row of this term.
      slim_column = value['GOslim'].map {|e| "GO:#{e}" }.join(' ')
      value['GO'].each do |goid|
        tmp << [aspect, "GO:#{goid}", term, slim_column].join("\t")
      end
    end
  end
end
ios['output'].puts tmp.join("\n")


# The R script is written only when an output file name is known, because
# the script refers to that file.
ios['r_script'].puts slim2r($OPT_OUTPUT) if $OPT_R_SCRIPT && $OPT_OUTPUT


#