File: gen_perf_cases.py

package info (click to toggle)
python-markdown2 2.4.11-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 6,104 kB
  • sloc: python: 5,416; perl: 1,493; php: 865; makefile: 34
file content (275 lines) | stat: -rwxr-xr-x 9,726 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
#!/usr/bin/env python2.5

import ast
import codecs
import operator
import os
import re
import shutil
import sys
from glob import glob
from os.path import *


# Prefix for all generated output directories (lets them be rm'd as a group).
TMP = "tmp-"

def gen_aspn_cases(limit=0):
    """Generate Markdown perf-test files from a local ASPN recipes dump.

    Writes one ".text" file per recipe description and one per recipe
    comment (in publication-date order) into the "tmp-aspn-cases" dir.

    "limit" is the maximum recipe index to process (0 means no limit).

    Skips all work if the output dir already exists.
    """
    base_dir = TMP+'aspn-cases'
    if exists(base_dir):
        print("'%s' exists, skipping" % base_dir)
        return
    os.makedirs(base_dir)
    sys.stdout.write("generate %s" % base_dir); sys.stdout.flush()
    recipes_path = expanduser("~/as/code.as.com/db/aspn/recipes.pprint")
    # The dump is a pprint'd Python literal, so `ast.literal_eval` is
    # equivalent to the previous `eval` but cannot execute arbitrary code.
    # `with` closes the handle (it used to leak).
    with open(recipes_path) as recipes_file:
        recipe_dicts = ast.literal_eval(recipes_file.read())
    for i, r in enumerate(recipe_dicts):
        sys.stdout.write('.'); sys.stdout.flush()
        with codecs.open(join(base_dir, "r%04d.text" % i), "w", "utf-8") as f:
            f.write(r["desc"])

        # One file per comment, ordered by publication date so the
        # numbering is stable across runs.
        for j, c in enumerate(sorted(r["comments"],
                        key=operator.itemgetter("pub_date"))):
            text = _markdown_from_aspn_html(c["comment"])
            headline = c["title"].strip()
            if headline:
                if headline[-1] not in ".!?,:;'\"":
                    headline += '.'
                headline = _markdown_from_aspn_html(headline).strip()
                text = "**" + headline + "**  " + text
            with codecs.open(join(base_dir, "r%04dc%02d.text" % (i, j)),
                             'w', "utf-8") as f:
                f.write(text)

        if limit and i >= limit:
            break
    sys.stdout.write('\n')

def gen_test_cases():
    """Collect every "*.text" file from ../test/*-cases into one dir."""
    base_dir = TMP+"test-cases"
    if exists(base_dir):
        print("'%s' exists, skipping" % base_dir)
        return
    os.makedirs(base_dir)
    print("generate %s" % base_dir)
    case_dirs = glob(join("..", "test", "*-cases"))
    for case_dir in case_dirs:
        for src_path in glob(join(case_dir, "*.text")):
            dst_path = join(base_dir, basename(src_path))
            shutil.copy(src_path, dst_path)


#---- internal support stuff

br_pat = re.compile(r"</?br ?/?>", re.I)
br_eol_pat = re.compile(r"</?br ?/?>$", re.I | re.MULTILINE)
pre_pat = re.compile(r"<pre>(.*?)</pre>", re.I | re.DOTALL)
single_line_code_pat = re.compile(r"<(tt|code)>(.*?)</\1>", re.I)
a_pat = re.compile(r'''<a(\s+[\w:-]+=["'].*?["'])*>(.*?)</a>''', re.I | re.S | re.U)
href_attr_pat = re.compile(r'''href=(["'])(.*?)\1''', re.I)
title_attr_pat = re.compile(r'''title=(["'])(.*?)\1''', re.I)
i_pat = re.compile(r"<(i)>(.*?)</\1>", re.I)

def _markdown_from_aspn_html(html):
    markdown = html

    markdown = br_eol_pat.sub('\n', markdown)  # <br>EOL
    markdown = br_pat.sub('\n', markdown)  # <br>

    while True: # <code>, <tt> on a single line
        match = single_line_code_pat.search(markdown)
        if not match:
            break
        markdown = single_line_code_pat.sub(r"`\2`", markdown)

    while True: # <i> on a single line
        match = i_pat.search(markdown)
        if not match:
            break
        markdown = i_pat.sub(r"*\2*", markdown)

    while True: # <a>
        match = a_pat.search(markdown)
        if not match:
            break
        start, end = match.span()
        attrs, content = match.group(1), match.group(2)
        href_match = href_attr_pat.search(attrs)
        if href_match:
            href = href_match.group(2)
        else:
            href = None
        title_match = title_attr_pat.search(attrs)
        if title_match:
            title = title_match.group(2)
        else:
            title = None
        escaped_href = href.replace('(', '\\(').replace(')', '\\)')
        if title is None:
            replacement = '[%s](%s)' % (content, escaped_href)
        else:
            replacement = '[%s](%s "%s")' % (content, escaped_href, 
                                             title.replace('"', "'"))
        markdown = markdown[:start] + replacement + markdown[end:]
        
    markdown = markdown.replace("&nbsp;", ' ')

    # <pre> part 1: Pull out <pre>-blocks and put in placeholders
    pre_marker = "THIS_IS_MY_PRE_MARKER_BLAH"
    pre_blocks = []
    while True: # <pre>
        match = pre_pat.search(markdown)
        if not match:
            break
        start, end = match.span()
        lines = match.group(1).splitlines(0)
        if lines and not lines[0].strip():
            del lines[0]
        _dedentlines(lines)
        pre_blocks.append(lines)
        marker = pre_marker + str(len(pre_blocks) - 1)
        markdown = markdown[:start].rstrip() + marker + markdown[end:].lstrip()

    # <pre> part 2: Put <pre>-blocks back in.
    for i, pre_block in enumerate(pre_blocks):
        marker = pre_marker + str(i)
        try:
            idx = markdown.index(marker)
        except ValueError:
            print("marker: %r" % marker)
            raise
        if not markdown[:idx].strip():
            #TODO: Correct this false diagnosis. Problem is not limited
            #      to <h1>
            #TODO: problem with 1203#c6 "Frozen dictionaries": comment title
            #      insertion onto start of an indented-pre/code block
            #
            # There is a bug in python-markdown with an indented block
            # at the start of a buffer: the first line can get rendered
            # as a <h1>. Workaround that by adding a '.' paragraph
            # before.
            # At the time of this writing those comments affected are:
            #    16#c9, 31#c3, 155#c1, 203#c20, 230#c3, 356#c2, 490#c1,
            #    504#c2, 1127#c12
            #log.warn("adding '.'-para Python Markdown hack")
            prefix = ['.']
        else:
            prefix = []
        lines = prefix + ['', ''] + ['    '+ln for ln in lines] + ['', '']
        replacement = '\n'.join(lines)
        markdown = markdown.replace(marker, replacement, 1)

    lines = markdown.splitlines(0)

    # Removing empty lines at start and end.
    while lines and not lines[0].strip():
        del lines[0]
    while lines and not lines[-1].strip():
        del lines[-1]

    # Strip trailing whitespace because don't want auto-<br>'s.
    for i in range(len(lines)):
        lines[i] = lines[i].rstrip()

    markdown = '\n'.join(lines) + '\n'

    #TODO: manual fixes:
    # - comment 1, recipe 7

    return markdown

# Recipe: dedent (0.1.2)
def _dedentlines(lines, tabsize=8, skip_first_line=False):
    """_dedentlines(lines, tabsize=8, skip_first_line=False) -> dedented lines
    
        "lines" is a list of lines to dedent.
        "tabsize" is the tab width to use for indent width calculations.
        "skip_first_line" is a boolean indicating if the first line should
            be skipped for calculating the indent width and for dedenting.
            This is sometimes useful for docstrings and similar.
    
    Same as dedent() except operates on a sequence of lines. Note: the
    lines list is modified **in-place**.
    """
    DEBUG = False
    if DEBUG: 
        print("dedent: dedent(..., tabsize=%d, skip_first_line=%r)"\
              % (tabsize, skip_first_line))
    indents = []
    margin = None
    for i, line in enumerate(lines):
        if i == 0 and skip_first_line: continue
        indent = 0
        for ch in line:
            if ch == ' ':
                indent += 1
            elif ch == '\t':
                indent += tabsize - (indent % tabsize)
            elif ch in '\r\n':
                continue # skip all-whitespace lines
            else:
                break
        else:
            continue # skip all-whitespace lines
        if DEBUG: print("dedent: indent=%d: %r" % (indent, line))
        if margin is None:
            margin = indent
        else:
            margin = min(margin, indent)
    if DEBUG: print("dedent: margin=%r" % margin)

    if margin is not None and margin > 0:
        for i, line in enumerate(lines):
            if i == 0 and skip_first_line: continue
            removed = 0
            for j, ch in enumerate(line):
                if ch == ' ':
                    removed += 1
                elif ch == '\t':
                    removed += tabsize - (removed % tabsize)
                elif ch in '\r\n':
                    if DEBUG: print("dedent: %r: EOL -> strip up to EOL" % line)
                    lines[i] = lines[i][j:]
                    break
                else:
                    raise ValueError("unexpected non-whitespace char %r in "
                                     "line %r while removing %d-space margin"
                                     % (ch, line, margin))
                if DEBUG:
                    print("dedent: %r: %r -> removed %d/%d"\
                          % (line, ch, removed, margin))
                if removed == margin:
                    lines[i] = lines[i][j+1:]
                    break
                elif removed > margin:
                    lines[i] = ' '*(removed-margin) + lines[i][j+1:]
                    break
            else:
                if removed:
                    lines[i] = lines[i][removed:]
    return lines

def _dedent(text, tabsize=8, skip_first_line=False):
    """Remove the common leading-whitespace margin from "text".

    "text" is the text to dedent.
    "tabsize" is the tab width to use for indent width calculations.
    "skip_first_line" excludes the first line from the margin
        calculation and from dedenting (useful for docstrings).

    Like textwrap.dedent(s), but tabs are not expanded to spaces.
    """
    dedented = text.splitlines(1)
    _dedentlines(dedented, tabsize=tabsize, skip_first_line=skip_first_line)
    return ''.join(dedented)


#---- mainline

if __name__ == "__main__":
    # Optional first argument: max ASPN recipe index to process.
    try:
        limit = int(sys.argv[1])
    except (IndexError, ValueError):
        # No argument given, or not an integer: process everything.
        # (Was a bare `except:`, which also swallowed KeyboardInterrupt.)
        limit = 0
    gen_aspn_cases(limit)
    gen_test_cases()