File: prosite.rb

package info (click to toggle)
ruby-bio 1.5.0-2
  • links: PTS, VCS
  • area: main
  • in suites: stretch
  • size: 7,480 kB
  • ctags: 9,428
  • sloc: ruby: 74,117; xml: 3,383; makefile: 17; perl: 13; sh: 1
file content (504 lines) | stat: -rw-r--r-- 11,233 bytes parent folder | download | duplicates (8)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
#
# = bio/db/prosite.rb - PROSITE database class
#
# Copyright::  Copyright (C) 2001 Toshiaki Katayama <k@bioruby.org>
# License::    The Ruby License
#
# $Id:$
#

require 'bio/db'

module Bio

class PROSITE < EMBLDB

  # Delimiter
  DELIMITER = "\n//\n"

  # Delimiter
  RS = DELIMITER

  # Bio::DB API
  TAGSIZE = 5


  def initialize(entry)
    super(entry, TAGSIZE)
  end


  # ID  Identification                     (Begins each entry; 1 per entry)
  #
  #  ID   ENTRY_NAME; ENTRY_TYPE.  (ENTRY_TYPE : PATTERN, MATRIX, RULE)
  #
  # Returns
  def name
    unless @data['ID']
      @data['ID'], @data['TYPE'] = fetch('ID').chomp('.').split('; ')
    end
    @data['ID']
  end

  # Returns
  def division
    unless @data['TYPE']
      name
    end
    @data['TYPE']
  end


  # AC  Accession number                   (1 per entry)
  #
  #  AC   PSnnnnn;
  #
  # Returns
  def ac
    unless @data['AC']
      @data['AC'] = fetch('AC').chomp(';')
    end
    @data['AC']
  end

  alias entry_id ac


  # DT  Date                               (1 per entry)
  #
  #  DT   MMM-YYYY (CREATED); MMM-YYYY (DATA UPDATE); MMM-YYYY (INFO UPDATE).
  #
  # Returns
  def dt
    field_fetch('DT')
  end

  alias date dt


  # DE  Short description                  (1 per entry)
  #
  #  DE   Description.
  #
  # Returns
  def de
    field_fetch('DE')
  end

  alias definition de


  # PA  Pattern                            (>=0 per entry)
  #
  #  see - pa2re method
  #
  # Returns
  def pa
    field_fetch('PA')
    @data['PA'] = fetch('PA') unless @data['PA']
    @data['PA'].gsub!(/\s+/, '') if @data['PA']
    @data['PA']
  end

  alias pattern pa


  # MA  Matrix/profile                     (>=0 per entry)
  #
  #  see - ma2re method
  #
  # Returns
  def ma
    field_fetch('MA')
  end

  alias profile ma


  # RU  Rule                               (>=0 per entry)
  #
  #  RU   Rule_Description.
  #
  #  The rule is described in ordinary English and is free-format.
  #
  # Returns
  def ru
    field_fetch('RU')
  end

  alias rule ru


  # NR  Numerical results                  (>=0 per entry)
  #
  #   - SWISS-PROT scan statistics of true and false positives/negatives
  #
  # /RELEASE     SWISS-PROT release  number and  total  number  of  sequence
  #              entries in that release.
  # /TOTAL       Total number of hits in SWISS-PROT.
  # /POSITIVE    Number of  hits on proteins that are known to belong to the
  #              set in consideration.
  # /UNKNOWN     Number of  hits on  proteins that  could possibly belong to
  #              the set in consideration.
  # /FALSE_POS   Number of false hits (on unrelated proteins).
  # /FALSE_NEG   Number of known missed hits.
  # /PARTIAL     Number of  partial sequences  which belong  to the  set  in
  #              consideration, but  which  are  not  hit  by the pattern or
  #              profile because they are partial (fragment) sequences.
  #
  # Returns
  def nr
    unless @data['NR']
      hash = {}			# temporal hash
      fetch('NR').scan(%r{/(\S+)=([^;]+);}).each do |k, v|
        if v =~ /^(\d+)\((\d+)\)$/
          hits = $1.to_i		# the number of hits
          seqs = $2.to_i		# the number of sequences
          v = [hits, seqs]
        elsif v =~ /([\d\.]+),(\d+)/
          sprel = $1			# the number of SWISS-PROT release
          spseq = $2.to_i		# the number of SWISS-PROT sequences
          v = [sprel, spseq]
        else
          v = v.to_i
        end
        hash[k] = v
      end
      @data['NR'] = hash
    end
    @data['NR']
  end

  alias statistics nr

  # Returns
  def release
    statistics['RELEASE']
  end

  # Returns
  def swissprot_release_number
    release.first
  end

  # Returns
  def swissprot_release_sequences
    release.last
  end

  # Returns
  def total
    statistics['TOTAL']
  end

  # Returns
  def total_hits
    total.first
  end

  # Returns
  def total_sequences
    total.last
  end

  # Returns
  def positive
    statistics['POSITIVE']
  end

  # Returns
  def positive_hits
    positive.first
  end

  # Returns
  def positive_sequences
    positive.last
  end

  # Returns
  def unknown
    statistics['UNKNOWN']
  end

  # Returns
  def unknown_hits
    unknown.first
  end

  # Returns
  def unknown_sequences
    unknown.last
  end

  # Returns
  def false_pos
    statistics['FALSE_POS']
  end

  # Returns
  def false_positive_hits
    false_pos.first
  end

  # Returns
  def false_positive_sequences
    false_pos.last
  end

  # Returns
  def false_neg
    statistics['FALSE_NEG']
  end
  alias false_negative_hits false_neg

  # Returns
  def partial
    statistics['PARTIAL']
  end


  # CC  Comments                           (>=0 per entry)
  #
  #  CC   /QUALIFIER=data; /QUALIFIER=data; .......
  #
  # /TAXO-RANGE  Taxonomic range.
  # /MAX-REPEAT  Maximum known  number of  repetitions of  the pattern  in a
  #              single protein.
  # /SITE        Indication of an `interesting' site in the pattern.
  # /SKIP-FLAG   Indication of  an entry that can be, in some cases, ignored
  #              by a program (because it is too unspecific).
  #
  # Returns
  def cc
    unless @data['CC']
      hash = {}			# temporal hash
      fetch('CC').scan(%r{/(\S+)=([^;]+);}).each do |k, v|
        hash[k] = v
      end
      @data['CC'] = hash
    end
    @data['CC']
  end

  alias comment cc

  # Returns
  def taxon_range(expand = nil)
    range = comment['TAXO-RANGE']
    if range and expand
      expand = []
      range.scan(/./) do |x|
        case x
        when 'A'; expand.push('archaebacteria')
        when 'B'; expand.push('bacteriophages')
        when 'E'; expand.push('eukaryotes')
        when 'P'; expand.push('prokaryotes')
        when 'V'; expand.push('eukaryotic viruses')
        end
      end
      range = expand
    end
    return range
  end

  # Returns
  def max_repeat
    comment['MAX-REPEAT'].to_i
  end

  # Returns
  def site
    if comment['SITE']
      num, desc = comment['SITE'].split(',')
    end
    return [num.to_i, desc]
  end

  # Returns
  def skip_flag
    if comment['SKIP-FLAG'] == 'TRUE'
      return true
    end
  end


  # DR  Cross-references to SWISS-PROT     (>=0 per entry)
  #
  #  DR   AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C; AC_NB, ENTRY_NAME, C;
  #
  # - `AC_NB' is the SWISS-PROT primary accession number of the entry to
  #   which reference is being made.
  # - `ENTRY_NAME' is the SWISS-PROT entry name.
  # - `C' is a one character flag that can be one of the following:
  #
  # T For a true positive.
  # N For a false negative; a sequence which belongs to the set under
  #   consideration, but which has not been picked up by the pattern or
  #   profile.
  # P For a `potential' hit; a sequence that belongs to the set under
  #   consideration, but which was not picked up because the region(s) that
  #   are used as a 'fingerprint' (pattern or profile) is not yet available
  #   in the data bank (partial sequence).
  # ? For an unknown; a sequence which possibly could belong to the set under
  #   consideration.
  # F For a false positive; a sequence which does not belong to the set in
  #   consideration.
  #
  # Returns
  def dr
    unless @data['DR']
      hash = {}			# temporal hash
      if fetch('DR')
        fetch('DR').scan(/(\w+)\s*, (\w+)\s*, (.);/).each do |a, e, c|
          hash[a] = [e, c]	# SWISS-PROT : accession, entry, true/false
        end
      end
      @data['DR'] = hash
    end
    @data['DR']
  end

  alias sp_xref dr

  # Returns
  def list_xref(flag, by_name = nil)
    ary = []
    sp_xref.each do |sp_acc, value|
      if value[1] == flag
        if by_name
          sp_name = value[0]
          ary.push(sp_name)
        else
          ary.push(sp_acc)
        end
      end
    end
    return ary
  end

  # Returns
  def list_truepositive(by_name = nil)
    list_xref('T', by_name)
  end

  # Returns
  def list_falsenegative(by_name = nil)
    list_xref('F', by_name)
  end

  # Returns
  def list_falsepositive(by_name = nil)
    list_xref('P', by_name)
  end

  # Returns
  def list_potentialhit(by_name = nil)
    list_xref('P', by_name)
  end

  # Returns
  def list_unknown(by_name = nil)
    list_xref('?', by_name)
  end


  # 3D  Cross-references to PDB            (>=0 per entry)
  #
  #  3D   name; [name2;...]
  #
  # Returns
  def pdb_xref
    unless @data['3D']
      @data['3D'] = fetch('3D').split(/; */)
    end
    @data['3D']
  end


  # DO  Pointer to the documentation file  (1 per entry)
  #
  #  DO   PDOCnnnnn;
  #
  # Returns
  def pdoc_xref
    @data['DO'] = fetch('DO').chomp(';')
  end


  ### prosite pattern to regular expression
  #
  # prosite/prosuser.txt:
  #
  # The PA (PAttern) lines contains the definition of a PROSITE pattern. The
  # patterns are described using the following conventions:
  #
  # 0) The standard IUPAC one-letter codes for the amino acids are used.
  # 0) Ambiguities are indicated by listing the acceptable amino acids for a
  #   given position, between square parentheses `[ ]'. For example: [ALT]
  #   stands for Ala or Leu or Thr.
  # 1) A period ends the pattern.
  # 2) When a pattern is restricted to either the N- or C-terminal of a
  #   sequence, that pattern either starts with a `<' symbol or respectively
  #   ends with a `>' symbol.
  # 3) Ambiguities are also indicated by listing between a pair of curly
  #   brackets `{ }' the amino acids that are not accepted at a given
  #   position. For example: {AM} stands for any amino acid except Ala and
  #   Met.
  # 4) Repetition of an element of the pattern can be indicated by following
  #   that element with a numerical value or a numerical range between
  #   parenthesis. Examples: x(3) corresponds to x-x-x, x(2,4) corresponds to
  #   x-x or x-x-x or x-x-x-x.
  # 5) The symbol `x' is used for a position where any amino acid is accepted.
  # 6) Each element in a pattern is separated from its neighbor by a `-'.
  #
  # Examples:
  #
  # PA  [AC]-x-V-x(4)-{ED}.
  #
  # This pattern is translated as: [Ala or Cys]-any-Val-any-any-any-any-{any
  # but Glu or Asp}
  #
  # PA  <A-x-[ST](2)-x(0,1)-V.
  #
  # This pattern, which must be in the N-terminal of the sequence (`<'), is
  # translated as: Ala-any-[Ser or Thr]-[Ser or Thr]-(any or none)-Val
  #
  def self.pa2re(pattern)
    pattern.gsub!(/\s/, '')	# remove white spaces
    pattern.sub!(/\.$/, '')	# (1) remove trailing '.'
    pattern.sub!(/^</, '^')	# (2) restricted to the N-terminal : `<'
    pattern.sub!(/>$/, '$')	# (2) restricted to the C-terminal : `>'
    pattern.gsub!(/\{(\w+)\}/) { |m|
      '[^' + $1 + ']'		# (3) not accepted at a given position : '{}'
    }
    pattern.gsub!(/\(([\d,]+)\)/) { |m|
      '{' + $1 + '}'		# (4) repetition of an element : (n), (n,m)
    }
    pattern.tr!('x', '.')	# (5) any amino acid is accepted : 'x'
    pattern.tr!('-', '')	# (6) each element is separated by a '-'
    Regexp.new(pattern, Regexp::IGNORECASE)
  end

  def pa2re(pattern)
    self.class.pa2re(pattern)
  end

  def re
    self.class.pa2re(self.pa)
  end


  ### prosite profile to regular expression
  #
  # prosite/profile.txt:
  #
  # Returns
  def ma2re(matrix)
    raise NotImplementedError
  end

end # PROSITE

end # Bio