import re
from collections.abc import Iterable
from typing import Optional, Dict, Pattern, Match, List

from sybil import Document, Region, Lexeme
from sybil.parsers.abstract.lexers import BlockLexer, strip_prefix

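# Matches a line that opens or closes a fence: optional indentation is captured
# as ``prefix`` and a run of three or more backticks or tildes as ``fence``.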
FENCE = re.compile(r"^(?P<prefix>[ \t]*)(?P<fence>`{3,}|~{3,})", re.MULTILINE)


class RawFencedCodeBlockLexer:
    """
    A :class:`~sybil.typing.Lexer` for Markdown fenced code blocks that allows
    flexible lexing of the whole `info` line, along with more complicated prefixes.

    The following lexemes are extracted:

    - ``source`` as a :class:`~sybil.Lexeme`.
    - any other named groups specified in ``info_pattern`` as :class:`strings <str>`.

    :param info_pattern:
        a :class:`re.Pattern` to match the `info` line and any required prefix
        that follows it.

    :param mapping:
        If provided, this is used to rename lexemes from the keys in the mapping to
        their values. Only mapped lexemes will be returned in any
        :class:`~sybil.Region` objects.
    """

def __init__(
self,
info_pattern: Pattern[str] = re.compile(r'$\n', re.MULTILINE),
mapping: Optional[Dict[str, str]] = None,
) -> None:
self.info_pattern = info_pattern
self.mapping = mapping
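
    # A fence closes an open block only when it uses the same fence character,
    # is at least as long as the opening fence, and has the same indentation.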
@staticmethod
def match_closes_existing(current: Match[str], existing: Match[str]) -> bool:
current_fence = current.group('fence')
existing_fence = existing.group('fence')
same_type = current_fence[0] == existing_fence[0]
okay_length = len(current_fence) >= len(existing_fence)
same_prefix = len(current.group('prefix')) == len(existing.group('prefix'))
return same_type and okay_length and same_prefix

    def make_region(
        self, opening: Match[str], document: Document, closing: Optional[Match[str]]
    ) -> Optional[Region]:
        if closing is None:
            # An unclosed block runs to the end of the document.
            content_end = region_end = len(document.text)
        else:
            content_end = closing.start()
            region_end = closing.end()
        content = document.text[opening.end():content_end]
        info = self.info_pattern.match(content)
        if info is None:
            return None
        lexemes = info.groupdict()
        lexemes['source'] = Lexeme(
            strip_prefix(content[info.end():], opening.group('prefix')),
            offset=len(opening.group(0)) + info.end(),
            line_offset=0,
        )
        if self.mapping:
            lexemes = {dest: lexemes[source] for source, dest in self.mapping.items()}
        return Region(opening.start(), region_end, lexemes=lexemes)

    def __call__(self, document: Document) -> Iterable[Region]:
        open_blocks: List[Match[str]] = []
        index = 0
        while True:
            match = FENCE.search(document.text, index)
            if match is None:
                break
            index = match.end()
            # Does this fence close any open block?
            for i in range(len(open_blocks)):
                existing = open_blocks[i]
                if self.match_closes_existing(match, existing):
                    maybe_region = self.make_region(existing, document, match)
                    if maybe_region is not None:
                        yield maybe_region
                    # Drop the closed block and anything opened inside it:
                    open_blocks = open_blocks[:i]
                    break
            else:
                # No open block was closed, so this fence opens a new one.
                open_blocks.append(match)
        if open_blocks:
            # The outermost block was never closed; it runs to the end of the document.
            maybe_region = self.make_region(open_blocks[0], document, closing=None)
            if maybe_region is not None:
                yield maybe_region
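

# A minimal usage sketch (illustrative, not part of the module): run the lexer
# over a hand-built Document. ``Document(text, path)`` is Sybil's own
# constructor; the sample text and this helper's name are invented.
def _example_raw_fenced_usage() -> None:
    text = "```\nsome text\n```\n"
    lexer = RawFencedCodeBlockLexer()
    (region,) = lexer(Document(text, "example.md"))
    # The default info_pattern accepts a bare opening fence, so everything
    # between the fences becomes the ``source`` lexeme:
    assert region.lexemes["source"] == "some text\n"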


class FencedCodeBlockLexer(RawFencedCodeBlockLexer):
    """
    A :class:`~sybil.typing.Lexer` for Markdown fenced code blocks where a language
    is specified. :class:`RawFencedCodeBlockLexer` can be used if the whole `info`
    line, or a more complicated prefix, is required.

    The following lexemes are extracted:

    - ``language`` as a :class:`str`.
    - ``source`` as a :class:`~sybil.Lexeme`.

    :param language:
        a :class:`str` containing a regular expression pattern to match language names.

    :param mapping:
        If provided, this is used to rename lexemes from the keys in the mapping to
        their values. Only mapped lexemes will be returned in any
        :class:`~sybil.Region` objects.
    """

def __init__(self, language: str, mapping: Optional[Dict[str, str]] = None) -> None:
super().__init__(
info_pattern=re.compile(f'(?P<language>{language})$\n', re.MULTILINE),
mapping=mapping,
)
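

# A hedged sketch of the common case (not part of the module): the ``language``
# group is captured from the info line and the block body becomes ``source``.
# Sample text and helper name are invented.
def _example_fenced_usage() -> None:
    text = '```python\nprint("hi")\n```\n'
    lexer = FencedCodeBlockLexer(language="python")
    (region,) = lexer(Document(text, "example.md"))
    assert region.lexemes["language"] == "python"
    assert region.lexemes["source"] == 'print("hi")\n'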


DIRECTIVE_IN_HTML_COMMENT_START = (
    r"^(?P<prefix>[ \t]*)<!--+\s*(?:;\s*)?(?P<directive>{directive}):?"
    r"[ \t]*(?P<arguments>{arguments})[ \t]*"
    r"(?:$\n|(?=--+>))"
)
DIRECTIVE_IN_HTML_COMMENT_END = r"(?:(?<=\n){prefix})?--+>"
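# The templates above are .format()-ed with concrete ``directive`` and
# ``arguments`` patterns: the start matches e.g. "<!--- my-directive: args"
# (two or more dashes, with an optional ";" before the directive name), and
# the end matches the closing "-->", optionally preceded by the opening
# line's indentation.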


class DirectiveInHTMLCommentLexer(BlockLexer):
    """
    A :class:`~sybil.parsers.abstract.lexers.BlockLexer` for faux directives in
    HTML-style Markdown comments such as:

    .. code-block:: markdown

        <!--- not-really-a-directive: some-argument
        Source here...
        --->

    It extracts the following lexemes:

    - ``directive`` as a :class:`str`.
    - ``arguments`` as a :class:`str`.
    - ``source`` as a :class:`~sybil.Lexeme`.

    :param directive:
        a :class:`str` containing a regular expression pattern to match directive names.

    :param arguments:
        a :class:`str` containing a regular expression pattern to match directive
        arguments.

    :param mapping:
        If provided, this is used to rename lexemes from the keys in the mapping to
        their values. Only mapped lexemes will be returned in any
        :class:`~sybil.Region` objects.
    """

def __init__(
self, directive: str, arguments: str = '.*?', mapping: Optional[Dict[str, str]] = None
) -> None:
super().__init__(
start_pattern=re.compile(
DIRECTIVE_IN_HTML_COMMENT_START.format(directive=directive, arguments=arguments),
re.MULTILINE
),
end_pattern_template=DIRECTIVE_IN_HTML_COMMENT_END,
mapping=mapping,
)
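

# A hedged usage sketch (not part of the module): assuming the abstract
# BlockLexer surfaces the named groups from the start pattern plus ``source``,
# a faux directive in an HTML comment lexes like this. Sample text, directive
# name, and helper name are all invented.
def _example_html_comment_usage() -> None:
    text = '<!--- clear-namespace: now\nprint("hi")\n--->\n'
    lexer = DirectiveInHTMLCommentLexer(directive="clear-namespace")
    (region,) = lexer(Document(text, "example.md"))
    assert region.lexemes["directive"] == "clear-namespace"
    assert region.lexemes["arguments"] == "now"
    assert region.lexemes["source"] == 'print("hi")\n'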