File: md_in_html.py

package info (click to toggle)
python-markdown 3.9-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,168 kB
  • sloc: python: 12,708; makefile: 66; sh: 7
file content (440 lines) | stat: -rw-r--r-- 19,785 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
# Python-Markdown Markdown in HTML Extension
# ===============================

# An implementation of [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/)'s
# parsing of Markdown syntax in raw HTML.

# See https://Python-Markdown.github.io/extensions/raw_html
# for documentation.

# Copyright The Python Markdown Project

# License: [BSD](https://opensource.org/licenses/bsd-license.php)

"""
Parse Markdown syntax within raw HTML.
Based on the implementation in [PHP Markdown Extra](http://michelf.com/projects/php-markdown/extra/).

See the [documentation](https://Python-Markdown.github.io/extensions/raw_html)
for details.
"""

from __future__ import annotations

from . import Extension
from ..blockprocessors import BlockProcessor
from ..preprocessors import Preprocessor
from ..postprocessors import RawHtmlPostprocessor
from .. import util
from ..htmlparser import HTMLExtractor, blank_line_re
import xml.etree.ElementTree as etree
from typing import TYPE_CHECKING, Literal, Mapping

if TYPE_CHECKING:  # pragma: no cover
    from markdown import Markdown


class HTMLExtractorExtra(HTMLExtractor):
    """
    Override `HTMLExtractor` and create `etree` `Elements` for any elements which should have content parsed as
    Markdown.
    """

    def __init__(self, md: Markdown, *args, **kwargs):
        # All block-level tags.
        self.block_level_tags = set(md.block_level_elements.copy())
        # Block-level tags in which the content only gets span level parsing
        self.span_tags = set(
            ['address', 'dd', 'dt', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'legend', 'li', 'p', 'summary', 'td', 'th']
        )
        # Block-level tags which never get their content parsed.
        self.raw_tags = set(['canvas', 'math', 'option', 'pre', 'script', 'style', 'textarea'])

        super().__init__(md, *args, **kwargs)

        # Block-level tags in which the content gets parsed as blocks
        self.block_tags = set(self.block_level_tags) - (self.span_tags | self.raw_tags | self.empty_tags)
        self.span_and_blocks_tags = self.block_tags | self.span_tags

    def reset(self):
        """Reset this instance.  Loses all unprocessed data."""
        self.mdstack: list[str] = []  # When markdown=1, stack contains a list of tags
        self.treebuilder = etree.TreeBuilder()
        self.mdstate: list[Literal['block', 'span', 'off', None]] = []
        self.mdstarted: list[bool] = []
        super().reset()

    def close(self):
        """Handle any buffered data."""
        super().close()
        # Handle any unclosed tags.
        if self.mdstack:
            # Close the outermost parent. `handle_endtag` will close all unclosed children.
            self.handle_endtag(self.mdstack[0])

    def get_element(self) -> etree.Element:
        """ Return element from `treebuilder` and reset `treebuilder` for later use. """
        element = self.treebuilder.close()
        self.treebuilder = etree.TreeBuilder()
        return element

    def get_state(self, tag, attrs: Mapping[str, str]) -> Literal['block', 'span', 'off', None]:
        """ Return state from tag and `markdown` attribute. One of 'block', 'span', or 'off'. """
        md_attr = attrs.get('markdown', '0')
        if md_attr == 'markdown':
            # `<tag markdown>` is the same as `<tag markdown='1'>`.
            md_attr = '1'
        parent_state = self.mdstate[-1] if self.mdstate else None
        if parent_state == 'off' or (parent_state == 'span' and md_attr != '0'):
            # Only use the parent state if it is more restrictive than the markdown attribute.
            md_attr = parent_state
        if ((md_attr == '1' and tag in self.block_tags) or
                (md_attr == 'block' and tag in self.span_and_blocks_tags)):
            return 'block'
        elif ((md_attr == '1' and tag in self.span_tags) or
              (md_attr == 'span' and tag in self.span_and_blocks_tags)):
            return 'span'
        elif tag in self.block_level_tags:
            return 'off'
        else:  # pragma: no cover
            return None

    def handle_starttag(self, tag, attrs):
        # Handle tags that should always be empty and do not specify a closing tag
        if tag in self.empty_tags and (self.at_line_start() or self.intail):
            attrs = {key: value if value is not None else key for key, value in attrs}
            if "markdown" in attrs:
                attrs.pop('markdown')
                element = etree.Element(tag, attrs)
                data = etree.tostring(element, encoding='unicode', method='html')
            else:
                data = self.get_starttag_text()
            self.handle_empty_tag(data, True)
            return

        if (
            tag in self.block_level_tags and
            (self.at_line_start() or self.intail or self.mdstarted and self.mdstarted[-1])
        ):
            # Valueless attribute (ex: `<tag checked>`) results in `[('checked', None)]`.
            # Convert to `{'checked': 'checked'}`.
            attrs = {key: value if value is not None else key for key, value in attrs}
            state = self.get_state(tag, attrs)
            if self.inraw or (state in [None, 'off'] and not self.mdstack):
                # fall back to default behavior
                attrs.pop('markdown', None)
                super().handle_starttag(tag, attrs)
            else:
                if 'p' in self.mdstack and tag in self.block_level_tags:
                    # Close unclosed 'p' tag
                    self.handle_endtag('p')
                self.mdstate.append(state)
                self.mdstack.append(tag)
                self.mdstarted.append(True)
                attrs['markdown'] = state
                self.treebuilder.start(tag, attrs)

        else:
            # Span level tag
            if self.inraw:
                super().handle_starttag(tag, attrs)
            else:
                text = self.get_starttag_text()
                if self.mdstate and self.mdstate[-1] == "off":
                    self.handle_data(self.md.htmlStash.store(text))
                else:
                    self.handle_data(text)
                if tag in self.CDATA_CONTENT_ELEMENTS:
                    # This is presumably a standalone tag in a code span (see #1036).
                    self.clear_cdata_mode()

    def handle_endtag(self, tag):
        if tag in self.block_level_tags:
            if self.inraw:
                super().handle_endtag(tag)
            elif tag in self.mdstack:
                # Close element and any unclosed children
                while self.mdstack:
                    item = self.mdstack.pop()
                    self.mdstate.pop()
                    self.mdstarted.pop()
                    self.treebuilder.end(item)
                    if item == tag:
                        break
                if not self.mdstack:
                    # Last item in stack is closed. Stash it
                    element = self.get_element()
                    # Get last entry to see if it ends in newlines
                    # If it is an element, assume there is no newlines
                    item = self.cleandoc[-1] if self.cleandoc else ''
                    # If we only have one newline before block element, add another
                    if not item.endswith('\n\n') and item.endswith('\n'):
                        self.cleandoc.append('\n')

                    # Flatten the HTML structure of "markdown" blocks such that when they
                    # get parsed, content will be parsed similar inside the blocks as it
                    # does outside the block. Having real HTML elements in the tree before
                    # the content adjacent content is processed can cause unpredictable
                    # issues for extensions.
                    current = element
                    last = []
                    while current is not None:
                        for child in list(current):
                            current.remove(child)
                            text = current.text if current.text is not None else ''
                            tail = child.tail if child.tail is not None else ''
                            child.tail = None
                            state = child.attrib.get('markdown', 'off')

                            # Add a newline to tail if it is not just a trailing newline
                            if tail != '\n':
                                tail = '\n' + tail.rstrip('\n')

                            # Ensure there is an empty new line between blocks
                            if not text.endswith('\n\n'):
                                text = text.rstrip('\n') + '\n\n'

                            # Process the block nested under the span appropriately
                            if state in ('span', 'block'):
                                current.text = f'{text}{self.md.htmlStash.store(child)}{tail}'
                                last.append(child)
                            else:
                                # Non-Markdown HTML will not be recursively parsed for Markdown,
                                # so we can just remove markers and leave them unflattened.
                                # Additionally, we don't need to append to our list for further
                                # processing.
                                child.attrib.pop('markdown')
                                [c.attrib.pop('markdown', None) for c in child.iter()]
                                current.text = f'{text}{self.md.htmlStash.store(child)}{tail}'
                        # Target the child elements that have been expanded.
                        current = last.pop(0) if last else None

                    self.cleandoc.append(self.md.htmlStash.store(element))
                    self.cleandoc.append('\n\n')
                    self.state = []
                    # Check if element has a tail
                    if not blank_line_re.match(
                            self.rawdata[self.line_offset + self.offset + len(self.get_endtag_text(tag)):]):
                        # More content exists after `endtag`.
                        self.intail = True
            else:
                # Treat orphan closing tag as a span level tag.
                text = self.get_endtag_text(tag)
                if self.mdstate and self.mdstate[-1] == "off":
                    self.handle_data(self.md.htmlStash.store(text))
                else:
                    self.handle_data(text)
        else:
            # Span level tag
            if self.inraw:
                super().handle_endtag(tag)
            else:
                text = self.get_endtag_text(tag)
                if self.mdstate and self.mdstate[-1] == "off":
                    self.handle_data(self.md.htmlStash.store(text))
                else:
                    self.handle_data(text)

    def handle_startendtag(self, tag, attrs):
        if tag in self.empty_tags:
            attrs = {key: value if value is not None else key for key, value in attrs}
            if "markdown" in attrs:
                attrs.pop('markdown')
                element = etree.Element(tag, attrs)
                data = etree.tostring(element, encoding='unicode', method='html')
            else:
                data = self.get_starttag_text()
        else:
            data = self.get_starttag_text()
        self.handle_empty_tag(data, is_block=self.md.is_block_level(tag))

    def handle_data(self, data):
        if self.intail and '\n' in data:
            self.intail = False
        if self.inraw or not self.mdstack:
            super().handle_data(data)
        else:
            self.mdstarted[-1] = False
            self.treebuilder.data(data)

    def handle_empty_tag(self, data, is_block):
        if self.inraw or not self.mdstack:
            super().handle_empty_tag(data, is_block)
        else:
            if self.at_line_start() and is_block:
                self.handle_data('\n' + self.md.htmlStash.store(data) + '\n\n')
            elif self.mdstate and self.mdstate[-1] == "off":
                self.handle_data(self.md.htmlStash.store(data))
            else:
                self.handle_data(data)

    def parse_pi(self, i: int) -> int:
        if self.at_line_start() or self.intail or self.mdstack:
            # The same override exists in `HTMLExtractor` without the check
            # for `mdstack`. Therefore, use parent of `HTMLExtractor` instead.
            return super(HTMLExtractor, self).parse_pi(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<?')
        return i + 2

    def parse_html_declaration(self, i: int) -> int:
        if self.at_line_start() or self.intail or self.mdstack:
            if self.rawdata[i:i+3] == '<![' and not self.rawdata[i:i+9] == '<![CDATA[':
                # We have encountered the bug in #1534 (Python bug `gh-77057`).
                # Provide an override until we drop support for Python < 3.13.
                result = self.parse_bogus_comment(i)
                if result == -1:
                    self.handle_data(self.rawdata[i:i + 1])
                    return i + 1
                return result
            # The same override exists in `HTMLExtractor` without the check
            # for `mdstack`. Therefore, use parent of `HTMLExtractor` instead.
            return super(HTMLExtractor, self).parse_html_declaration(i)
        # This is not the beginning of a raw block so treat as plain data
        # and avoid consuming any tags which may follow (see #1066).
        self.handle_data('<!')
        return i + 2


class HtmlBlockPreprocessor(Preprocessor):
    """Remove html blocks from the text and store them for later retrieval."""

    def run(self, lines: list[str]) -> list[str]:
        source = '\n'.join(lines)
        parser = HTMLExtractorExtra(self.md)
        parser.feed(source)
        parser.close()
        return ''.join(parser.cleandoc).split('\n')


class MarkdownInHtmlProcessor(BlockProcessor):
    """Process Markdown Inside HTML Blocks which have been stored in the `HtmlStash`."""

    def test(self, parent: etree.Element, block: str) -> bool:
        # Always return True. `run` will return `False` it not a valid match.
        return True

    def parse_element_content(self, element: etree.Element) -> None:
        """
        Recursively parse the text content of an `etree` Element as Markdown.

        Any block level elements generated from the Markdown will be inserted as children of the element in place
        of the text content. All `markdown` attributes are removed. For any elements in which Markdown parsing has
        been disabled, the text content of it and its children are wrapped in an `AtomicString`.
        """

        md_attr = element.attrib.pop('markdown', 'off')

        if md_attr == 'block':
            # Parse the block elements content as Markdown
            if element.text:
                block = element.text.rstrip('\n')
                element.text = ''
                self.parser.parseBlocks(element, block.split('\n\n'))

        elif md_attr == 'span':
            # Span elements need to be recursively processed for block elements and raw HTML
            # as their content is not normally accessed by block processors, so expand stashed
            # HTML under the span. Span content itself will not be parsed here, but will await
            # the inline parser.
            block = element.text if element.text is not None else ''
            element.text = ''
            child = None
            start = 0

            # Search the content for HTML placeholders and process the elements
            for m in util.HTML_PLACEHOLDER_RE.finditer(block):
                index = int(m.group(1))
                el = self.parser.md.htmlStash.rawHtmlBlocks[index]
                end = m.start()

                if isinstance(el, etree.Element):
                    # Replace the placeholder with the element and process it.
                    # Content after the placeholder should be attached to the tail.
                    if child is None:
                        element.text += block[start:end]
                    else:
                        child.tail += block[start:end]
                    element.append(el)
                    self.parse_element_content(el)
                    child = el
                    if child.tail is None:
                        child.tail = ''
                    self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
                    self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')

                else:
                    # Not an element object, so insert content back into the element
                    if child is None:
                        element.text += block[start:end]
                    else:
                        child.tail += block[start:end]
                start = end

            # Insert anything left after last element
            if child is None:
                element.text += block[start:]
            else:
                child.tail += block[start:]

        else:
            # Disable inline parsing for everything else
            if element.text is None:
                element.text = ''
            element.text = util.AtomicString(element.text)
            for child in list(element):
                self.parse_element_content(child)
                if child.tail:
                    child.tail = util.AtomicString(child.tail)

    def run(self, parent: etree.Element, blocks: list[str]) -> bool:
        m = util.HTML_PLACEHOLDER_RE.match(blocks[0])
        if m:
            index = int(m.group(1))
            element = self.parser.md.htmlStash.rawHtmlBlocks[index]
            if isinstance(element, etree.Element):
                # We have a matched element. Process it.
                block = blocks.pop(0)
                parent.append(element)
                self.parse_element_content(element)
                # Cleanup stash. Replace element with empty string to avoid confusing postprocessor.
                self.parser.md.htmlStash.rawHtmlBlocks.pop(index)
                self.parser.md.htmlStash.rawHtmlBlocks.insert(index, '')
                content = block[m.end(0):]
                # Ensure the rest of the content gets handled
                if content:
                    blocks.insert(0, content)
                # Confirm the match to the `blockparser`.
                return True
        # No match found.
        return False


class MarkdownInHTMLPostprocessor(RawHtmlPostprocessor):
    def stash_to_string(self, text: str | etree.Element) -> str:
        """ Override default to handle any `etree` elements still in the stash. """
        if isinstance(text, etree.Element):
            return self.md.serializer(text)
        else:
            return str(text)


class MarkdownInHtmlExtension(Extension):
    """Add Markdown parsing in HTML to Markdown class."""

    def extendMarkdown(self, md):
        """ Register extension instances. """

        # Replace raw HTML preprocessor
        md.preprocessors.register(HtmlBlockPreprocessor(md), 'html_block', 20)
        # Add `blockprocessor` which handles the placeholders for `etree` elements
        md.parser.blockprocessors.register(
            MarkdownInHtmlProcessor(md.parser), 'markdown_block', 105
        )
        # Replace raw HTML postprocessor
        md.postprocessors.register(MarkdownInHTMLPostprocessor(md), 'raw_html', 30)


def makeExtension(**kwargs):  # pragma: no cover
    return MarkdownInHtmlExtension(**kwargs)