File: StructuredText.py

package info (click to toggle)
python-gendoc 0.73-3
  • links: PTS
  • area: main
  • in suites: slink
  • size: 312 kB
  • ctags: 844
  • sloc: python: 2,609; makefile: 123; sh: 26
file content (304 lines) | stat: -rw-r--r-- 9,267 bytes parent folder | download | duplicates (5)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
#! /usr/bin/env python -- # -*- python -*-
# $What$
'''Structured Text Manipulation

Parse a structured text string into a form that can be used with 
structured formats, like html.

Structured text is text that uses indentation and simple
symbology to indicate the structure of a document.  

A structured string consists of a sequence of paragraphs separated by
one or more blank lines.  Each paragraph has a level which is defined
as the minimum indentation of the paragraph.  A paragraph is a
sub-paragraph of another paragraph if the other paragraph is the last
preceding paragraph that has a lower level.

Special symbology is used to indicate special constructs:

- A paragraph that begins with a '-', '*', or 'o' is treated as an
  unordered list (bullet) element.

- A paragraph that begins with a sequence of digits followed by a
  white-space character is treated as an ordered list element.

- A paragraph that begins with a sequence of sequences, where each
  sequence is a sequence of digits or a sequence of letters followed
  by a period, is treated as an ordered list element.

- A paragraph with a first line that contains some text, followed by
  some white-space and '--' is treated as
  a descriptive list element. The leading text is treated as the
  element title.

- Sub-paragraphs of a paragraph that ends in the word 'example' or the
  word 'examples', followed by a colon, are treated as example code and
  are output as is.

- Text enclosed in single quotes (with white-space to the left of the
  first quote and whitespace or punctuation to the right of the second quote)
  is treated as example code.

- Text surrounded by '*' characters (with white-space to the left of the
  first '*' and whitespace or punctuation to the right of the second '*')
  is emphasized.

- Text surrounded by '**' characters (with white-space to the left of the
  first '**' and whitespace or punctuation to the right of the second '**')
  is strongly emphasized.

$Id: StructuredText.py,v 1.1 1996/08/26 20:31:46 omfadmin Exp $'''
#     Copyright 
#
#       Copyright 1996 Digital Creations, L.C., 910 Princess Anne
#       Street, Suite 300, Fredericksburg, Virginia 22401 U.S.A. All
#       rights reserved.  Copyright in this software is owned by DCLC,
#       unless otherwise indicated. Permission to use, copy and
#       distribute this software is hereby granted, provided that the
#       above copyright notice appear in all copies and that both that
#       copyright notice and this permission notice appear. Note that
#       any product, process or technology described in this software
#       may be the subject of other Intellectual Property rights
#       reserved by Digital Creations, L.C. and are not licensed
#       hereunder.
#
#     Trademarks 
#
#       Digital Creations & DCLC, are trademarks of Digital Creations, L.C..
#       All other trademarks are owned by their respective companies. 
#
#     No Warranty 
#
#       The software is provided "as is" without warranty of any kind,
#       either express or implied, including, but not limited to, the
#       implied warranties of merchantability, fitness for a particular
#       purpose, or non-infringement. This software could include
#       technical inaccuracies or typographical errors. Changes are
#       periodically made to the software; these changes will be
#       incorporated in new editions of the software. DCLC may make
#       improvements and/or changes in this software at any time
#       without notice.
#
#     Limitation Of Liability 
#
#       In no event will DCLC be liable for direct, indirect, special,
#       incidental, economic, cover, or consequential damages arising
#       out of the use of or inability to use this software even if
#       advised of the possibility of such damages. Some states do not
#       allow the exclusion or limitation of implied warranties or
#       limitation of liability for incidental or consequential
#       damages, so the above limitation or exclusion may not apply to
#       you.
#  
#
# If you have questions regarding this software,
# contact:
#
#   Jim Fulton, jim@digicool.com
#
#   (540) 371-6909
#
# $Log: /Gendoc/StructuredText.py $
# 
# 1     98-04-01 13:15 Daniel
# Revision 1.1  1996/08/26  20:31:46  omfadmin
# Initial revision
#
#
# 

import regex, regsub

# These patterns are written for the old 'regex' module (Emacs-style
# syntax, per that module's documentation): '\(' and '\)' delimit a
# group and '\|' separates alternatives.

# A tab used as indentation.  Group 1 is the newline in front of the line
# (empty when the line is matched through '^' instead) and group 2 is the
# run of spaces between the line start and the tab.
indent_tab  =regex.compile('\(\n\|^\)\( *\)\t')
# A newline together with the spaces that follow it (group 1).
indent_space=regex.compile('\n\( *\)')
# A line break followed by one or more lines that are empty or hold only
# spaces; used to split the text into paragraphs.
paragraph_divider=regex.compile('\(\n *\)+\n')

def untabify(aString):
    '''\
    Convert indentation tabs to spaces.

    Only tabs found in the leading white space of a line (preceded by
    nothing but spaces) are expanded; each one moves the indentation to
    the next multiple of eight columns.  Tabs that follow other text on
    a line are left alone.

    Arguments:

      aString -- The text to convert.

    Returns a copy of aString with its indentation tabs expanded.
    '''
    result=''
    rest=aString
    while 1:
        start=indent_tab.search(rest)
        if start < 0:
            return result+rest
        # Group 1 is the newline in front of the indentation; it is empty
        # when the indentation opens the string itself.
        lead=indent_tab.group(1)
        indent=len(indent_tab.group(2))
        result=result+rest[:start]
        # Put back exactly what preceded the indentation rather than an
        # unconditional newline, so text that starts with a tab does not
        # gain a line break.  The expanded line stays in 'rest' and is
        # searched again, which handles several tabs in a row.
        rest="%s%s%s" % (lead,
                         ' ' * ((indent/8+1)*8),
                         rest[start+indent+1+len(lead):])

def indent_level(aString):
    '''\
    Find the minimum indentation for a string, not counting blank lines.

    Arguments:

      aString -- The text of one paragraph.

    Returns a tuple (indent, aString), where indent is the smallest
    number of leading spaces found on any non-blank line.  When every
    line is blank, indent is one more than the length of aString.
    '''
    # A leading newline lets the first line be measured like the others.
    padded='\n'+aString
    total=len(padded)
    smallest=total
    pos=0
    while 1:
        pos=indent_space.search(padded,pos)
        if pos < 0:
            return (smallest,aString)
        width=len(indent_space.group(1))
        # Step over the newline and its spaces to the first real character.
        pos=pos+width+1
        if pos >= total or padded[pos] == '\n':
            continue                            # Skip blank lines
        if width == 0:
            # Nothing can be lower than zero, so stop looking.
            return (0,aString)
        if width < smallest:
            smallest=width

def paragraphs(list,start):
    '''\
    Count the sub-paragraphs of the paragraph at position start.

    Arguments:

      list -- A sequence of (level, text) tuples.
      start -- The index of the paragraph whose sub-paragraphs are counted.

    Returns the number of consecutive entries after start whose level is
    greater than the level of the entry at start.
    '''
    base=list[start][0]
    total=len(list)
    count=0
    while start+1+count < total and list[start+1+count][0] > base:
        count=count+1
    return count

def structure(list):
    '''\
    Turn a flat sequence of paragraphs into a nested structure.

    Arguments:

      list -- A sequence of (level, text) tuples.

    Returns a list of (text, sub-structure) tuples, where sub-structure
    is the same kind of list built from the paragraphs that directly
    follow the entry and have a greater level.
    '''
    if not list: return []
    tree=[]
    total=len(list)
    pos=0
    while pos < total:
        nested=paragraphs(list,pos)
        first_child=pos+1
        children=list[first_child:first_child+nested]
        tree.append((list[pos][1],structure(children)))
        # Continue with the next paragraph that is not a sub-paragraph.
        pos=first_child+nested
    return tree

# Paragraph and inline markup patterns, in the Emacs-style syntax of the
# 'regex' module: '\(' ... '\)' is a group, '\|' an alternative, and an
# unescaped '(' or ')' stands for itself.

# Bullet element: optional white space, one of 'o', '*' or '-', white
# space, then the element text (group 1).
bullet=regex.compile('[ \t\n]*[o*-][ \t\n]+\([^\0]*\)')
# Text ending in 'example:' or 'examples:' (after white space), with only
# white space allowed behind the colon.
example=regex.compile('[ \t\n]examples?:[ \t\n]*$')
# Descriptive list element: title text on the first line (group 1),
# white space, '--', white space, then the description (group 2).
dl=regex.compile('\([^\n]+\)[ \t]+--[ \t\n]+\([^\0]*\)')
# A newline; searched for to tell whether a paragraph has more than one line.
nl=regex.compile('\n')
# Ordered list element: one or more runs of digits or letters, each
# followed by a period, then white space and the element text (group 3).
ol=regex.compile('[ \t]*\(\([0-9]+\|[a-zA-Z]+\)\.\)+[ \t\n]+\([^\0]*\|$\)')
# Ordered list element written as digits inside literal parentheses,
# followed by white space and the element text (group 1).
olp=regex.compile('[ \t]*([0-9]+)[ \t\n]+\([^\0]*\|$\)')
# Emphasis: white space, '*', the text (group 1), '*', then white space
# or punctuation (group 2).
em=regex.compile("[ \t\n]\*\([^ \t][^\n*]*[^ \t]\)\*\([ \t\n,.:;!?]\)")
# Code: white space or '(', a single quote, the text (group 1), a single
# quote, then ')', white space or punctuation (group 3).
code=regex.compile("[ \t\n(]'\([^ \t']\([^\n']*[^ \t']\)?\)'\([) \t\n,.:;!?]\)")
# Strong emphasis: like em, but delimited by '**' on both sides.
strong=regex.compile("[ \t\n]\*\*\([^ \t][^\n*]*[^ \t]\)\*\*\([ \t\n,.:;!?]\)")
# A list closing tag directly followed by an opening tag of the same kind.
extra_dl=regex.compile("</dl>\n<dl>")
extra_ul=regex.compile("</ul>\n<ul>")
extra_ol=regex.compile("</ol>\n<ol>")

class StructuredText:

    '''\
    Model text as a structured collection of paragraphs.

    Structure is implied by the indentation level.

    This class is intended as a base class for classes that do the
    actual text output formatting.
    '''

    def __init__(self,aStructuredString, level=1):
        '''\
        Convert a string containing structured text into a structured
        text object.

        Arguments:

          aStructuredString -- The string to be parsed.
          level -- The level of top level headings to be created.

        The parsed result is stored in the 'structure' attribute as a
        nested list of (text, sub-structure) tuples.
        '''
        self.level=level
        text=untabify(aStructuredString)
        chunks=regsub.split(text,paragraph_divider)
        # Pair every paragraph with its indentation level.
        leveled=[]
        for chunk in chunks:
            leveled.append(indent_level(chunk))
        self.structure=structure(leveled)

    def __str__(self):
        '''\
        Return the string form of the parsed paragraph structure.
        '''
        return str(self.structure)
	

class HTML(StructuredText):

    '''\
    An HTML structured text formatter.

    Converting an instance to a string renders the paragraph structure
    built by the base class as HTML.
    '''

    def __str__(self):
        '''\
        Return an HTML string representation of the structured text data.
        '''
        s=self._str(self.structure,self.level)
        if s is None: s=''
        # Every list element is rendered as a complete list of its own, so
        # remove the close/open tag pairs between neighbouring elements to
        # merge them back into a single list.
        s=regsub.gsub(extra_dl,'\n',s)
        s=regsub.gsub(extra_ul,'\n',s)
        s=regsub.gsub(extra_ol,'\n',s)
        # Inline markup.  strong runs before em; the em pattern would
        # otherwise also match text wrapped in '**'.
        s=regsub.gsub(strong,' <strong>\\1</strong>\\2',s)
        s=regsub.gsub(code,' <code>\\1</code>\\3',s)
        s=regsub.gsub(em,' <em>\\1</em>\\2',s)
        return s

    def ul(self, before, p, after):
        '''\
        Return before followed by a one-element unordered list.

        Arguments:

          before -- The output produced so far.
          p -- The element text; wrapped in a paragraph unless empty.
          after -- The rendered sub-paragraphs of the element.
        '''
        if p: p="<p>%s</p>" % p
        return ('%s<ul><li>%s\n%s\n</ul>\n'
                % (before,p,after))

    def ol(self, before, p, after):
        '''\
        Return before followed by a one-element ordered list.

        Arguments:

          before -- The output produced so far.
          p -- The element text; wrapped in a paragraph unless empty.
          after -- The rendered sub-paragraphs of the element.
        '''
        if p: p="<p>%s</p>" % p
        return ('%s<ol><li>%s\n%s\n</ol>\n'
                % (before,p,after))

    def dl(self, before, t, d, after):
        '''\
        Return before followed by a one-element descriptive list.

        Arguments:

          before -- The output produced so far.
          t -- The element title.
          d -- The element description.
          after -- The rendered sub-paragraphs of the element.
        '''
        return ('%s<dl><dt>%s<dd><p>%s</p>\n%s\n</dl>\n'
                % (before,t,d,after))

    def head(self, before, t, level, d):
        '''\
        Return before followed by a heading and its content.

        Arguments:

          before -- The output produced so far.
          t -- The heading text.
          level -- The heading level.  It is not used at present: the
                   heading is always written as a strong title in a
                   descriptive list.
          d -- The rendered sub-paragraphs under the heading.
        '''
        t="<p><strong>%s</strong><p>" % t
        return ('%s<dl><dt>%s\n<dd>%s\n</dl>\n'
                % (before,t,d))

    def normal(self,before,p,after):
        '''\
        Return before followed by an ordinary paragraph.

        Arguments:

          before -- The output produced so far.
          p -- The paragraph text.
          after -- The rendered sub-paragraphs of the paragraph.
        '''
        return '%s<p>%s</p>\n%s\n' % (before,p,after)

    def _str(self,structure,level):
        '''\
        Render a paragraph structure as HTML.

        Arguments:

          structure -- A list of (text, sub-structure) tuples.
          level -- The heading level to use for headings found here;
                   paragraphs below a heading are rendered one level
                   deeper.

        Each paragraph is checked against the markup patterns in turn
        and the first one that applies decides how it is rendered.
        Returns the HTML text, which is empty for an empty structure.
        '''
        r=''
        for s in structure:
            if bullet.match(s[0]) >= 0:
                p=bullet.group(1)
                r=self.ul(r,p,self._str(s[1],level))
            elif ol.match(s[0]) >= 0:
                # Elements such as '1.' or 'a.' are ordered list
                # elements, so they go to ol (not ul), just like the
                # parenthesized form handled next.
                p=ol.group(3)
                r=self.ol(r,p,self._str(s[1],level))
            elif olp.match(s[0]) >= 0:
                p=olp.group(1)
                r=self.ol(r,p,self._str(s[1],level))
            elif dl.match(s[0]) >= 0:
                t,d=dl.group(1,2)
                r=self.dl(r,t,d,self._str(s[1],level))
            elif example.search(s[0]) >= 0 and s[1]:
                # Introduce an example, using pre tags:
                r=self.normal(r,s[0],self.pre(s[1]))
            elif nl.search(s[0]) < 0 and s[1]:
                # A one-line paragraph with sub-paragraphs is treated
                # as a heading.
                t=s[0]
                r=self.head(r,t,level,self._str(s[1],level+1))
            else:
                r=self.normal(r,s[0],self._str(s[1],level))
        return r

    def pre(self,structure,tagged=0):
        '''\
        Return the paragraphs in structure, and all of their
        sub-paragraphs, as preformatted text.

        Arguments:

          structure -- A list of (text, sub-structure) tuples.
          tagged -- True when the caller already supplies the pre tags
                    (as the recursive calls do), so none are added.

        Returns an empty string for an empty structure.
        '''
        if not structure: return ''
        if tagged:
            r=''
        else:
            r='<pre>\n'
        for s in structure:
            r="%s%s\n\n%s" % (r,s[0],self.pre(s[1],1))
        if not tagged: r=r+'</pre>\n'
        return r
	

def main():
    '''\
    Read structured text from standard input and print it as HTML.
    '''
    import sys

    source=sys.stdin.read()
    document=HTML(source)
    print(document)

if __name__=="__main__":
    main()