File: NCBI_ScoreMat.mod.dtd

package info (click to toggle)
python-biopython 1.68%2Bdfsg-3~bpo8%2B1
  • links: PTS, VCS
  • area: main
  • in suites: jessie-backports
  • size: 46,856 kB
  • sloc: python: 160,306; xml: 93,216; ansic: 9,118; sql: 1,208; makefile: 155; sh: 63
file content (579 lines) | stat: -rw-r--r-- 19,503 bytes parent folder | download | duplicates (7)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
<!-- ============================================
     ::DATATOOL:: Generated from "scoremat.asn"
     ::DATATOOL:: by application DATATOOL version 2.3.1
     ::DATATOOL:: on 03/06/2012 23:04:40
     ============================================ -->

<!-- ============================================ -->
<!-- This section is mapped from module "NCBI-ScoreMat"
================================================= -->

<!--
$Id: scoremat.asn 347837 2011-12-21 15:28:42Z boratyng $
 ===========================================================================

                            PUBLIC DOMAIN NOTICE
               National Center for Biotechnology Information

  This software/database is a "United States Government Work" under the
  terms of the United States Copyright Act.  It was written as part of
  the author's official duties as a United States Government employee and
  thus cannot be copyrighted.  This software/database is freely available
  to the public for use. The National Library of Medicine and the U.S.
  Government have not placed any restriction on its use or reproduction.

  Although all reasonable efforts have been taken to ensure the accuracy
  and reliability of the software and data, the NLM and the U.S.
  Government do not and cannot warrant the performance or results that
  may be obtained by using this software or data. The NLM and the U.S.
  Government disclaim all warranties, express or implied, including
  warranties of performance, merchantability or fitness for any particular
  purpose.

  Please cite the author in any work or product based on this material.

 ===========================================================================

 Author:  Christiam Camacho

 File Description:
      ASN.1 definitions for scoring matrix

 ===========================================================================
-->

<!-- Elements used by other modules:
          Pssm,
          PssmIntermediateData,
          PssmFinalData,
          PssmParameters,
          PssmWithParameters -->

<!-- Elements referenced from other modules:
          Object-id FROM NCBI-General,
          Seq-entry FROM NCBI-Seqset -->
<!-- ============================================ -->

<!--
 A rudimentary block/core-model, to be used with block-based alignment 
 routines and threading.

 BlockProperty: a typed property of a core block (referenced from
 CoreBlock_property). The type is required; an integer value and a text
 value may each be supplied optionally.
-->
<!ELEMENT BlockProperty (
        BlockProperty_type, 
        BlockProperty_intvalue?, 
        BlockProperty_textvalue?)>

<!--
 Type of the property. The element content is an integer; the optional
 "value" attribute declared below names the type symbolically.
-->
<!ELEMENT BlockProperty_type (%INTEGER;)>

<!--
    threshold    -  score threshold for heuristics
    minscore     -  observed minimum score in CD
    maxscore     -  observed maximum score in CD
    meanscore    -  observed mean score in CD
    variance     -  observed score variance
    name         -  just name the block
    is-optional  -  block may not have to be used
-->
<!ATTLIST BlockProperty_type value (
        unassigned |
        threshold |
        minscore |
        maxscore |
        meanscore |
        variance |
        name |
        is-optional |
        other
        ) #IMPLIED >


<!-- optional integer value of the property -->
<!ELEMENT BlockProperty_intvalue (%INTEGER;)>

<!-- optional text value of the property -->
<!ELEMENT BlockProperty_textvalue (#PCDATA)>


<!--
 CoreBlock: one block of the core model (listed in CoreDef_blocks).
 start and stop are required; minstart, maxstop and the property list
 are optional.
-->
<!ELEMENT CoreBlock (
        CoreBlock_start, 
        CoreBlock_stop, 
        CoreBlock_minstart?, 
        CoreBlock_maxstop?, 
        CoreBlock_property?)>

<!-- begin of block on query -->
<!ELEMENT CoreBlock_start (%INTEGER;)>

<!-- end of block on query -->
<!ELEMENT CoreBlock_stop (%INTEGER;)>

<!-- optional N-terminal extension -->
<!ELEMENT CoreBlock_minstart (%INTEGER;)>

<!-- optional C-terminal extension -->
<!ELEMENT CoreBlock_maxstop (%INTEGER;)>

<!-- zero or more BlockProperty entries attached to this block -->
<!ELEMENT CoreBlock_property (BlockProperty*)>


<!--
 LoopConstraint: length limits for an unaligned region (listed in
 CoreDef_loops). Both limits are optional.
-->
<!ELEMENT LoopConstraint (
        LoopConstraint_minlength?, 
        LoopConstraint_maxlength?)>

<!-- minimum length of unaligned region -->
<!ELEMENT LoopConstraint_minlength (%INTEGER;)>

<!-- maximum length of unaligned region -->
<!ELEMENT LoopConstraint_maxlength (%INTEGER;)>


<!--
 CoreDef: the core definition, made of blocks and the loop constraints
 between them (referenced from PssmParameters_constraints).
 nblocks, blocks and loops are required; isDiscontinuous and insertions
 are optional.
-->
<!ELEMENT CoreDef (
        CoreDef_nblocks, 
        CoreDef_blocks, 
        CoreDef_loops, 
        CoreDef_isDiscontinuous?, 
        CoreDef_insertions?)>

<!-- number of core elements/blocks -->
<!ELEMENT CoreDef_nblocks (%INTEGER;)>

<!-- nblocks locations -->
<!ELEMENT CoreDef_blocks (CoreBlock*)>

<!-- (nblocks+1) constraints -->
<!ELEMENT CoreDef_loops (LoopConstraint*)>

<!-- is it a discontinuous domain (boolean carried in the required "value" attribute) -->
<!ELEMENT CoreDef_isDiscontinuous EMPTY>
<!ATTLIST CoreDef_isDiscontinuous value ( true | false ) #REQUIRED >


<!-- positions of long insertions -->
<!ELEMENT CoreDef_insertions (CoreDef_insertions_E*)>


<!-- one integer entry of CoreDef_insertions -->
<!ELEMENT CoreDef_insertions_E (%INTEGER;)>


<!--
 Site-annot: annotation of a site located by start and stop position in
 the PSSM. Only the two positions are required; all other fields are
 optional.
-->
<!ELEMENT Site-annot (
        Site-annot_startPosition, 
        Site-annot_stopPosition, 
        Site-annot_description?, 
        Site-annot_type?, 
        Site-annot_aliases?, 
        Site-annot_motif?, 
        Site-annot_motifuse?)>

<!-- location of the annotation: start position in the PSSM -->
<!ELEMENT Site-annot_startPosition (%INTEGER;)>

<!-- location of the annotation: stop position in the PSSM -->
<!ELEMENT Site-annot_stopPosition (%INTEGER;)>

<!--
 holds description or names, that
 can be used for labels in
 visualization
-->
<!ELEMENT Site-annot_description (#PCDATA)>

<!--
 type of the annotated feature,
 similarly to Align-annot in
 NCBI-Cdd
-->
<!ELEMENT Site-annot_type (%INTEGER;)>

<!--
 additional names for
 the annotation
-->
<!ELEMENT Site-annot_aliases (Site-annot_aliases_E*)>


<!-- one alias (text entry of Site-annot_aliases) -->
<!ELEMENT Site-annot_aliases_E (#PCDATA)>

<!-- motif to validate mapping of sites -->
<!ELEMENT Site-annot_motif (#PCDATA)>

<!--
 how the motif is used:
 0 for validation
 1 for motif in seqloc
 2 for multiple motifs in seqloc
-->
<!ELEMENT Site-annot_motifuse (%INTEGER;)>


<!-- Site-annot-set: zero or more Site-annot entries (referenced from PssmParameters_annotatedSites) -->
<!ELEMENT Site-annot-set (Site-annot*)>

<!--
 ===========================================================================
 PSI-BLAST, formatrpsdb, RPS-BLAST workflow:
 ===========================================

 Two possible inputs to PSI-BLAST and formatrpsdb:
 1) PssmWithParams where pssm field contains intermediate PSSM data (matrix 
    of frequency ratios)
 2) PssmWithParams where pssm field contains final PSSM data (matrix of 
    scores and statistical parameters) - such as written by cddumper

 In case 1, PSI-BLAST's PSSM engine is invoked to create the PSSM and perform
 the PSI-BLAST search or build the PSSM to then build the RPS-BLAST database.
 In case 2, PSI-BLAST's PSSM engine is not invoked and the matrix of scores
 and statistical parameters are used to perform the search in PSI-BLAST; the
 same data and the data in PssmWithParams::params::rpsdbparams are used to
 build the PSSM and ultimately the RPS-BLAST database
 
 
                 reads    ++++++++++++++ writes
 PssmWithParams  ====>    + PSI-BLAST  + =====> PssmWithParams
                          ++++++++++++++             |  ^
         ^                                           |  |
         |                                           |  |
         +===========================================+  |
                                                     |  |
         +===========================================+  |
         |                                              |
 reads   |                                              | 
         v                                              |
  +++++++++++++++ writes +++++++++++++++++++++++        |
  | formatrpsdb | =====> | RPS-BLAST databases |        |
  +++++++++++++++        +++++++++++++++++++++++        |
                                   ^                    |
                                   |                    |
                                   | reads              |
                             +++++++++++++              |
                             | RPS-BLAST |              |
                             +++++++++++++              |
                                                        |
       reads  ++++++++++++               writes         |
  Cdd ======> | cddumper | =============================+
              ++++++++++++

 ===========================================================================
 Contains the PSSM's scores and its associated statistical parameters. 
 Dimensions and order in which scores are stored must be the same as that 
 specified in Pssm::numRows, Pssm::numColumns, and Pssm::byRow
-->
<!ELEMENT PssmFinalData (
        PssmFinalData_scores, 
        PssmFinalData_lambda, 
        PssmFinalData_kappa, 
        PssmFinalData_h, 
        PssmFinalData_scalingFactor?, 
        PssmFinalData_lambdaUngapped?, 
        PssmFinalData_kappaUngapped?, 
        PssmFinalData_hUngapped?)>

<!-- PSSM's scores -->
<!ELEMENT PssmFinalData_scores (PssmFinalData_scores_E*)>


<!-- one integer score (entry of PssmFinalData_scores) -->
<!ELEMENT PssmFinalData_scores_E (%INTEGER;)>

<!-- Karlin & Altschul parameter (lambda) produced during the PSSM's calculation -->
<!ELEMENT PssmFinalData_lambda (%REAL;)>

<!-- Karlin & Altschul parameter (kappa) produced during the PSSM's calculation -->
<!ELEMENT PssmFinalData_kappa (%REAL;)>

<!-- Karlin & Altschul parameter (h) produced during the PSSM's calculation -->
<!ELEMENT PssmFinalData_h (%REAL;)>

<!--
 scaling factor used to obtain more precision when building the PSSM.
 (i.e.: scores are scaled by this value). By default, PSI-BLAST's PSSM
 engine generates PSSMs which are not scaled-up, however, if PSI-BLAST is
 given a PSSM which contains a scaled-up PSSM (indicated by having a
 scalingFactor greater than 1), then it will scale down the PSSM to
 perform the initial stages of the search with it.
 N.B.: When building RPS-BLAST databases, if formatrpsdb is provided 
 scaled-up PSSMs, it will ensure that all PSSMs used to build the 
 RPS-BLAST database are scaled by the same factor (otherwise, RPS-BLAST 
 will silently produce incorrect results).
-->
<!ELEMENT PssmFinalData_scalingFactor (%INTEGER;)>

<!-- Karlin & Altschul parameter (lambda, ungapped; optional) produced during the PSSM's calculation -->
<!ELEMENT PssmFinalData_lambdaUngapped (%REAL;)>

<!-- Karlin & Altschul parameter (kappa, ungapped; optional) produced during the PSSM's calculation -->
<!ELEMENT PssmFinalData_kappaUngapped (%REAL;)>

<!-- Karlin & Altschul parameter (h, ungapped; optional) produced during the PSSM's calculation -->
<!ELEMENT PssmFinalData_hUngapped (%REAL;)>

<!--
 Contains the PSSM's intermediate data used to create the PSSM's scores 
 and statistical parameters. Dimensions and order in which scores are 
 stored must be the same as that specified in Pssm::numRows, 
 Pssm::numColumns, and Pssm::byRow

 Only freqRatios is required; every other field is optional. Each
 list-valued field below is a sequence of zero or more elements named
 after the field with an "_E" suffix, one element per value.
-->
<!ELEMENT PssmIntermediateData (
        PssmIntermediateData_resFreqsPerPos?, 
        PssmIntermediateData_weightedResFreqsPerPos?, 
        PssmIntermediateData_freqRatios, 
        PssmIntermediateData_informationContent?, 
        PssmIntermediateData_gaplessColumnWeights?, 
        PssmIntermediateData_sigma?, 
        PssmIntermediateData_intervalSizes?, 
        PssmIntermediateData_numMatchingSeqs?, 
        PssmIntermediateData_numIndeptObsr?)>

<!--
 observed residue frequencies (or counts) per position of the PSSM 
 (prior to application of pseudocounts)
-->
<!ELEMENT PssmIntermediateData_resFreqsPerPos (PssmIntermediateData_resFreqsPerPos_E*)>


<!ELEMENT PssmIntermediateData_resFreqsPerPos_E (%INTEGER;)>

<!--
 Weighted observed residue frequencies per position of the PSSM.
 (N.B.: each position's weights should add up to 1.0).
 This field corresponds to f_i (f sub i) in equation 2 of 
 Nucleic Acids Res. 2001 Jul 15;29(14):2994-3005.
 NOTE: this is needed for diagnostics information only (i.e.:
 -out_ascii_pssm option in psiblast)
-->
<!ELEMENT PssmIntermediateData_weightedResFreqsPerPos (PssmIntermediateData_weightedResFreqsPerPos_E*)>


<!ELEMENT PssmIntermediateData_weightedResFreqsPerPos_E (%REAL;)>

<!-- PSSM's frequency ratios (the only required field of PssmIntermediateData) -->
<!ELEMENT PssmIntermediateData_freqRatios (PssmIntermediateData_freqRatios_E*)>


<!ELEMENT PssmIntermediateData_freqRatios_E (%REAL;)>

<!--
 Information content per position of the PSSM
 NOTE: this is needed for diagnostics information only (i.e.:
 -out_ascii_pssm option in psiblast)
-->
<!ELEMENT PssmIntermediateData_informationContent (PssmIntermediateData_informationContent_E*)>


<!ELEMENT PssmIntermediateData_informationContent_E (%REAL;)>

<!--
 Relative weight for columns of the PSSM without gaps to pseudocounts
 NOTE: this is needed for diagnostics information only (i.e.:
 -out_ascii_pssm option in psiblast)
-->
<!ELEMENT PssmIntermediateData_gaplessColumnWeights (PssmIntermediateData_gaplessColumnWeights_E*)>


<!ELEMENT PssmIntermediateData_gaplessColumnWeights_E (%REAL;)>

<!--
 Used in sequence weights computation
 NOTE: this is needed for diagnostics information only (i.e.:
 -out_ascii_pssm option in psiblast)
-->
<!ELEMENT PssmIntermediateData_sigma (PssmIntermediateData_sigma_E*)>


<!ELEMENT PssmIntermediateData_sigma_E (%REAL;)>

<!--
 Length of the aligned regions per position of the query sequence
 NOTE: this is needed for diagnostics information only (i.e.:
 -out_ascii_pssm option in psiblast)
-->
<!ELEMENT PssmIntermediateData_intervalSizes (PssmIntermediateData_intervalSizes_E*)>


<!ELEMENT PssmIntermediateData_intervalSizes_E (%INTEGER;)>

<!--
 Number of matching sequences per position of the PSSM (including the
 query)
 NOTE: this is needed for diagnostics information only (i.e.:
 -out_ascii_pssm option in psiblast)
-->
<!ELEMENT PssmIntermediateData_numMatchingSeqs (PssmIntermediateData_numMatchingSeqs_E*)>


<!ELEMENT PssmIntermediateData_numMatchingSeqs_E (%INTEGER;)>

<!--
 Number of independent observations per position of the PSSM
 NOTE: this is needed for building CDD database for DELTA-BLAST
-->
<!ELEMENT PssmIntermediateData_numIndeptObsr (PssmIntermediateData_numIndeptObsr_E*)>


<!ELEMENT PssmIntermediateData_numIndeptObsr_E (%REAL;)>

<!--
 Position-specific scoring matrix

 Column indices on the PSSM refer to the positions corresponding to the
 query/master sequence, i.e. the number of columns (N) is the same
 as the length of the query/master sequence. 
 Row indices refer to individual amino acid types, i.e. the number of 
 rows (M) is the same as the number of different residues in the 
 alphabet we use. Consequently, row labels are amino acid identifiers.

 PSSMs are stored as linear arrays of integers. By default, we store
 them column-by-column, M values for the first column followed by M
 values for the second column, and so on. In order to provide
 flexibility for external applications, the boolean field "byRow" is 
 provided to specify the storage order.

 Only numRows and numColumns are required by this content model.
-->
<!ELEMENT Pssm (
        Pssm_isProtein?, 
        Pssm_identifier?, 
        Pssm_numRows, 
        Pssm_numColumns, 
        Pssm_rowLabels?, 
        Pssm_byRow?, 
        Pssm_query?, 
        Pssm_intermediateData?, 
        Pssm_finalData?)>

<!-- Is this a protein or nucleotide scoring matrix? (attribute default: "true") -->
<!ELEMENT Pssm_isProtein EMPTY>
<!ATTLIST Pssm_isProtein value ( true | false ) "true" >


<!-- PSSM identifier -->
<!ELEMENT Pssm_identifier (Object-id)>

<!--
 number of rows
 (The dimensions of the matrix are returned so the client can
 verify that all data was received.)
-->
<!ELEMENT Pssm_numRows (%INTEGER;)>

<!-- number of columns -->
<!ELEMENT Pssm_numColumns (%INTEGER;)>

<!--
 row-labels is given to note the order of residue types so that it can
 be cross-checked between applications.
 If this field is not given, the matrix values are presented in 
 order of the alphabet ncbistdaa is used for protein, ncbi4na for nucl.
 for proteins the values returned correspond to 
 (-,-), (-,A), (-,B), (-,C) ... (A,-), (A,A), (A,B), (A,C) ...
-->
<!ELEMENT Pssm_rowLabels (Pssm_rowLabels_E*)>


<!-- one row label (text entry of Pssm_rowLabels) -->
<!ELEMENT Pssm_rowLabels_E (#PCDATA)>

<!-- are matrices stored row by row? (attribute default: "false") -->
<!ELEMENT Pssm_byRow EMPTY>
<!ATTLIST Pssm_byRow value ( true | false ) "false" >


<!-- PSSM representative sequence (master)  -->
<!ELEMENT Pssm_query (Seq-entry)>

<!--
 Intermediate data for the PSSM.
 Both intermediateData and finalData can be provided, but at least one of
 them must be provided.
 N.B.: by default PSI-BLAST will return the PSSM in its PssmIntermediateData 
 representation. 
 NOTE(review): the Pssm content model above marks both fields as optional,
 so the "at least one" rule is not enforced by this DTD.
-->
<!ELEMENT Pssm_intermediateData (PssmIntermediateData)>

<!-- Final representation for the PSSM -->
<!ELEMENT Pssm_finalData (PssmFinalData)>

<!--
 This structure is used to create the RPS-BLAST database auxiliary file 
 (*.aux) and it contains parameters set at creation time of the PSSM.
 Also, the matrixName field is used by formatrpsdb to build a PSSM from 
 a Pssm structure which only contains PssmIntermediateData.

 matrixName is required; gapOpen and gapExtend are optional.
-->
<!ELEMENT FormatRpsDbParameters (
        FormatRpsDbParameters_matrixName, 
        FormatRpsDbParameters_gapOpen?, 
        FormatRpsDbParameters_gapExtend?)>

<!--
 name of the underlying score matrix whose frequency ratios were
 used in PSSM construction (e.g.: BLOSUM62)
-->
<!ELEMENT FormatRpsDbParameters_matrixName (#PCDATA)>

<!-- gap opening penalty corresponding to the matrix above -->
<!ELEMENT FormatRpsDbParameters_gapOpen (%INTEGER;)>

<!-- gap extension penalty corresponding to the matrix above -->
<!ELEMENT FormatRpsDbParameters_gapExtend (%INTEGER;)>

<!--
 Populated by PSSM engine of PSI-BLAST, original source for these values 
 are the PSI-BLAST options specified using the BLAST options API

 All fields of this structure are optional.
-->
<!ELEMENT PssmParameters (
        PssmParameters_pseudocount?, 
        PssmParameters_rpsdbparams?, 
        PssmParameters_constraints?, 
        PssmParameters_bitScoreThresh?, 
        PssmParameters_annotatedSites?)>

<!--
 pseudocount constant used for PSSM. This field corresponds to beta in 
 equation 2 of Nucleic Acids Res. 2001 Jul 15;29(14):2994-3005.
-->
<!ELEMENT PssmParameters_pseudocount (%INTEGER;)>

<!--
 data needed by formatrpsdb to create RPS-BLAST databases. matrixName is
 populated by PSI-BLAST
-->
<!ELEMENT PssmParameters_rpsdbparams (FormatRpsDbParameters)>

<!--
 alignment constraints needed by sequence-structure threader
 and other global or local block-alignment algorithms
-->
<!ELEMENT PssmParameters_constraints (CoreDef)>

<!-- bit score threshold for specific conserved domain hits -->
<!ELEMENT PssmParameters_bitScoreThresh (%REAL;)>

<!-- conserved functional sites with annotations -->
<!ELEMENT PssmParameters_annotatedSites (Site-annot-set)>

<!--
 Envelope containing PSSM and the parameters used to create it. 
 Provided for use in PSI-BLAST, formatrpsdb, and for the structure group.

 pssm is required; params is optional.
-->
<!ELEMENT PssmWithParameters (
        PssmWithParameters_pssm, 
        PssmWithParameters_params?)>

<!--
 This field is applicable to PSI-BLAST and formatrpsdb.
 When both the intermediate and final PSSM data are provided in this
 field, the final data (matrix of scores and associated statistical
 parameters) takes precedence and that data is used for further
 processing. The rationale for this is that the PSSM's scores and
 statistical parameters might have been calculated by other applications
 and it might not be possible to recreate it by using PSI-BLAST's PSSM 
 engine.
-->
<!ELEMENT PssmWithParameters_pssm (Pssm)>

<!--
 This field's rpsdbparams is used to specify the values of options 
 for processing by formatrpsdb. If these are not set, the command 
 line defaults of formatrpsdb are applied. This field is used
 by PSI-BLAST to verify that the underlying score matrix used to build
 the PSSM is the same as the one being specified through the BLAST
 Options API. If this field is omitted, no verification will be
 performed, so be careful to keep track of what matrix was used to build
 the PSSM or else the results produced by PSI-BLAST will be unreliable.
-->
<!ELEMENT PssmWithParameters_params (PssmParameters)>