File: cdd_format.py

package info (click to toggle)
python-biopython 1.42-2
  • links: PTS
  • area: main
  • in suites: etch, etch-m68k
  • size: 17,584 kB
  • ctags: 12,272
  • sloc: python: 80,461; xml: 13,834; ansic: 7,902; cpp: 1,855; sql: 1,144; makefile: 203
file content (154 lines) | stat: -rw-r--r-- 6,445 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# Copyright 2001 by Katharine Lindner.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

r"""Martel based parser to read CDD formatted files.

This is a huge regular expression for CDD, built using
the 'regular expressions on steroids' capabilities of Martel.

http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml
Notes:
Just so I remember -- the new end of line syntax is:
  New regexp syntax - \R
     \R    means "\n|\r\n?"
     [\R]  means "[\n\r]"

This helps us have endlines be consistent across platforms.

# standard library
http://www.ncbi.nlm.nih.gov/Structure/cdd/cdd.shtml
"""
import string

# Martel
import Martel
from Martel import RecordReader
from Martel import Str
from Martel import AnyEol
from Martel import ToEol
from Martel import Group
from Martel import Alt
from Martel import Opt
from Martel import Rep
from Martel import Rep1
from Martel import Any
from Martel import AnyBut
from Martel import Assert
from Martel import AssertNot





# --- first set up some helper constants and functions
# Copyright 2002 by Katharine Lindner.  All rights reserved.
# This code is part of the Biopython distribution and governed by its
# license.  Please see the LICENSE file that should have been included
# as part of this package.

# Single-character classes reused by the line patterns in this module.
upper_alpha = Any("ABCDEFGHIJKLMNOPQRSTUVWXYZ")
white_space = Any("\t ")

# The two end-of-line characters: carriage return (13) then line feed (10).
eols = "\r\n"

# Zero or more tabs/spaces.
white_spaces = Rep(white_space)

# Opening line of a record: the literal "CD summary" plus the rest of the line.
summary_line = Str("CD summary") + ToEol()

def _tag(group_name, label):
    """Return a Martel group named group_name matching the literal label."""
    return Group(group_name, Str(label))


# One named group per section heading of a CDD record.
cd_tag = _tag("cd_tag", "CD:")
description_tag = _tag("description_tag", "Description:")
status_tag = _tag("status_tag", "CD status:")
source_tag = _tag("source_tag", "Source:")
date_tag = _tag("date_tag", "Created:")
reference_tag = _tag("reference_tag", "References:")
taxonomy_tag = _tag("taxonomy_tag", "Taxonomy spanned:")
aligned_tag = _tag("aligned_tag", "Aligned sequences:")
representative_tag = _tag("representative_tag", "Representative:")
range_tag = _tag("range_tag", "Aligned range:")
sequence_tag = _tag("sequence_tag", "Sequence:")

# Matches any one of the section headings above (alternatives tried in order).
has_tag = Alt(
    cd_tag,
    description_tag,
    status_tag,
    source_tag,
    date_tag,
    reference_tag,
    taxonomy_tag,
    aligned_tag,
    representative_tag,
    range_tag,
    sequence_tag
)

def _key_line(tag):
    """Return a pattern for a heading line: tag, optional blanks, end of line."""
    return tag + white_spaces + AnyEol()


# Heading ("key") lines: each section tag alone on its own line.
cd_key_line = _key_line(cd_tag)
description_key_line = _key_line(description_tag)
status_key_line = _key_line(status_tag)
source_key_line = _key_line(source_tag)
date_key_line = _key_line(date_tag)
reference_key_line = _key_line(reference_tag)
taxonomy_key_line = _key_line(taxonomy_tag)
aligned_key_line = _key_line(aligned_tag)
representative_key_line = _key_line(representative_tag)
range_key_line = _key_line(range_tag)
sequence_key_line = _key_line(sequence_tag)

def _untagged_line():
    """Return a new pattern for one whole line that does not start with a tag."""
    return AssertNot(has_tag) + ToEol()


# Body lines of each section.  Only the CD body line is wrapped in a named
# group; the remaining free-text body lines are left unnamed.
cd_contents_line = Group("cd_contents_line", _untagged_line())
description_contents_line = _untagged_line()
status_contents_line = _untagged_line()
source_contents_line = _untagged_line()
date_contents_line = _untagged_line()
reference_contents_line = _untagged_line()
taxonomy_contents_line = _untagged_line()
aligned_contents_line = _untagged_line()
representative_contents_line = _untagged_line()
range_contents_line = _untagged_line()

# A sequence body line is a run of upper-case letters, optionally padded with
# blanks on either side, followed by the end of line.
sequence_contents_line = Group(
    "sequence_contents_line",
    white_spaces + Rep1(upper_alpha) + white_spaces + AnyEol()
)
# The word "Definition" alone on a line (blanks allowed on either side).
sentinel_line = white_spaces + Str("Definition") + white_spaces + AnyEol()

# Any line that is not the sentinel line.
boiler_plate = AssertNot(sentinel_line) + ToEol()

# Text free of end-of-line characters and "[", then the marker "[CD]",
# optional blanks and the end of line.
definition_line = Group(
    "definition_line",
    Rep(AnyBut(eols + "[")) + Str("[CD]") + white_spaces + AnyEol()
)

# One or more lines that are not definition lines.
pdb_id_line = AssertNot(definition_line) + ToEol()
pdb_id_multiline = Group("pdb_id_multiline", Rep1(pdb_id_line))

# A table entry is the non-definition lines followed by their definition
# line; the table is one or more such entries.
table_entry = Group("table_entry", pdb_id_multiline + definition_line)
table = Group("table", Rep1(table_entry))

def _multiline(group_name, line):
    """Return a group named group_name matching zero or more repeats of line."""
    return Group(group_name, Rep(line))


# The body of each section: zero or more of that section's body lines,
# collected under a named group.
cd_contents_multiline = _multiline(
    "cd_contents_multiline", cd_contents_line)
description_contents_multiline = _multiline(
    "description_contents_multiline", description_contents_line)
status_contents_multiline = _multiline(
    "status_contents_multiline", status_contents_line)
source_contents_multiline = _multiline(
    "source_contents_multiline", source_contents_line)
date_contents_multiline = _multiline(
    "date_contents_multiline", date_contents_line)
reference_contents_multiline = _multiline(
    "reference_contents_multiline", reference_contents_line)
taxonomy_contents_multiline = _multiline(
    "taxonomy_contents_multiline", taxonomy_contents_line)
aligned_contents_multiline = _multiline(
    "aligned_contents_multiline", aligned_contents_line)
representative_contents_multiline = _multiline(
    "representative_contents_multiline", representative_contents_line)
range_contents_multiline = _multiline(
    "range_contents_multiline", range_contents_line)
sequence_contents_multiline = _multiline(
    "sequence_contents_multiline", sequence_contents_line)

# Each section block is its heading line followed by its body lines.
cd_block = cd_key_line + cd_contents_multiline
description_block = description_key_line + description_contents_multiline
status_block = status_key_line + status_contents_multiline
source_block = source_key_line + source_contents_multiline
date_block = date_key_line + date_contents_multiline

# The references block additionally starts with an assertion on its tag.
reference_block = (
    Assert(reference_tag)
    + reference_key_line
    + reference_contents_multiline
)

taxonomy_block = taxonomy_key_line + taxonomy_contents_multiline
aligned_block = aligned_key_line + aligned_contents_multiline
representative_block = (
    representative_key_line + representative_contents_multiline
)
range_block = range_key_line + range_contents_multiline
sequence_block = sequence_key_line + sequence_contents_multiline

# Matches through the end of a line, whatever it contains.
trailer_line = ToEol()

# A complete CDD record: the summary line, the section blocks in fixed order
# (the references block may be absent), any lines before the "Definition"
# sentinel, the sentinel line itself, and finally the table.
cdd_record = (
    summary_line
    + cd_block
    + description_block
    + status_block
    + source_block
    + date_block
    + Opt(reference_block)
    + taxonomy_block
    + aligned_block
    + representative_block
    + range_block
    + sequence_block
    + Rep(boiler_plate)
    + sentinel_line
    + table
)