File: markdown2html

package info (click to toggle)
mutt 2.2.13-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 25,908 kB
  • sloc: ansic: 95,475; sh: 5,169; perl: 2,137; python: 548; makefile: 485; sed: 16
file content (309 lines) | stat: -rw-r--r-- 9,750 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
#!/usr/bin/python3
#
# markdown2html.py — simple Markdown-to-HTML converter for use with Mutt
#
# Mutt recently learnt [how to compose `multipart/alternative`
# emails][1]. This script assumes a message has been composed using Markdown
# (with a lot of pandoc extensions enabled), and translates it to `text/html`
# for Mutt to tie into such a `multipart/alternative` message.
#
# [1]: https://gitlab.com/muttmua/mutt/commit/0e566a03725b4ad789aa6ac1d17cdf7bf4e7e354)
#
# Configuration:
#   muttrc:
#     set send_multipart_alternative=yes
#     set send_multipart_alternative_filter=/path/to/markdown2html.py
#
# Optionally, Custom CSS styles will be read from `~/.mutt/markdown2html.css`,
# if present.
#
# Requirements:
#   - python3
#   - PyPandoc (and pandoc installed, or downloaded)
#   - Pynliner
#
# Optional:
#   - Pygments, if installed, then syntax highlighting is enabled
#
# Latest version:
#   https://git.madduck.net/etc/mutt.git/blob_plain/HEAD:/.mutt/markdown2html
#
# Copyright © 2019 martin f. krafft <madduck@madduck.net>
# Released under the GPL-2+ licence, just like Mutt itself.
#

import pypandoc
import pynliner
import re
import os
import sys

try:
    from pygments.formatters import get_formatter_by_name
    formatter = get_formatter_by_name('html', style='default')
    DEFAULT_CSS = formatter.get_style_defs('.sourceCode')

except ImportError:
    DEFAULT_CSS = ""


DEFAULT_CSS += '''
.quote {
    padding: 0 0.5em;
    margin: 0;
    font-style: italic;
    border-left: 2px solid #ccc;
    color: #999;
    font-size: 80%;
}
.quotelead {
    font-style: italic;
    margin-bottom: -1em;
    color: #999;
    font-size: 80%;
}
.quotechar { display: none; }
.footnote-ref, .footnote-back { text-decoration: none;}
.signature {
    color: #999;
    font-family: monospace;
    white-space: pre;
    margin: 1em 0 0 0;
    font-size: 80%;
}
table, th, td {
    border-collapse: collapse;
    border: 1px solid #999;
}
th, td { padding: 0.5em; }
.header {
    background: #eee;
}
.even { background: #eee; }
'''

STYLESHEET = os.path.join(os.path.expanduser('~/.mutt'),
                          'markdown2html.css')
if os.path.exists(STYLESHEET):
    DEFAULT_CSS += open(STYLESHEET).read()

HTML_DOCUMENT = '''<!DOCTYPE html>
<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>
<meta charset="utf-8"/>
<meta name="viewport" content="width=device-width, initial-scale=1.0, user-scalable=yes"/>
<title>HTML E-Mail</title>
</head><body class="email">
{htmlbody}
</body></html>'''


SIGNATURE_HTML = \
        '<div class="signature"><span class="leader">-- </span>{sig}</div>'


def _preprocess_markdown(mdwn):
    '''
    Preprocess Markdown for handling by the converter.
    '''
    # convert hard line breaks within paragraphs to 2 trailing spaces, which
    # is the markdown way of representing hard line breaks. Note how the
    # regexp will not match between paragraphs.
    ret = re.sub(r'(\S)\n(\s*\S)', r'\g<1>  \n\g<2>', mdwn, flags=re.MULTILINE)

    # Clients like Thunderbird need the leading '>' to be able to properly
    # create nested quotes, so we duplicate the symbol, the first instance
    # will tell pandoc to create a blockquote, while the second instance will
    # be a <span> containing the character, along with a class that causes CSS
    # to actually hide it from display. However, this does not work with the
    # text-mode HTML2text converters, and so it's left commented for now.
    #ret = re.sub(r'\n>', r'  \n>[>]{.quotechar}', ret, flags=re.MULTILINE)

    return ret


def _identify_quotes_for_later(mdwn):
    '''
    Email quoting such as:

    ```
    On 1970-01-01, you said:
    > The Flat Earth Society has members all around the globe.
    ```

    isn't really properly handled by Markdown, so let's do our best to
    identify the individual elements, and mark them, using a syntax similar to
    what pandoc uses already in some cases. As pandoc won't actually use these
    data (yet?), we call `self._reformat_quotes` later to use these markers
    to slap the appropriate classes on the HTML tags.
    '''

    def generate_lines_with_context(mdwn):
        '''
        Iterates the input string line-wise, returning a triplet of
        previous, current, and next line, the first and last of which
        will be None on the first and last line of the input data
        respectively.
        '''
        prev = cur = nxt = None
        lines = iter(mdwn.splitlines())
        cur = next(lines)
        for nxt in lines:
            yield prev, cur, nxt
            prev = cur
            cur = nxt
        yield prev, cur, None

    ret = []
    for prev, cur, nxt in generate_lines_with_context(mdwn):

        # The lead-in to a quote is a single line immediately preceding the
        # quote, and ending with ':'. Note that there could be multiple of
        # these:
        if nxt is not None and re.match(r'^\s*[^>].+:\s*$', cur) and nxt.startswith('>'):
            ret.append(f'{{.quotelead}}{cur.strip()}')
            # pandoc needs an empty line before the blockquote, so
            # we enter one for the purpose of HTML rendition:
            ret.append('')
            continue

        # The first blockquote after such a lead-in gets marked as the
        # "initial" quote:
        elif prev is not None and re.match(r'^\s*[^>].+:\s*$', prev) and cur.startswith('>'):
            ret.append(re.sub(r'^(\s*>\s*)+(.+)',
                              r'\g<1>{.quoteinitial}\g<2>',
                              cur, flags=re.MULTILINE))

        # All other occurrences of blockquotes get the "subsequent" marker:
        elif cur.startswith('>') and prev is not None and not prev.startswith('>'):
            ret.append(re.sub(r'^((?:\s*>\s*)+)(.+)',
                              r'\g<1>{.quotesubsequent}\g<2>',
                              cur, flags=re.MULTILINE))

        else: # pass through everything else.
            ret.append(cur)

    return '\n'.join(ret)


def _reformat_quotes(html):
    '''
    Earlier in the pipeline, we marked email quoting, using markers, which we
    now need to turn into HTML classes, so that we can use CSS to style them.
    '''
    ret = html.replace('{.quotelead}', '<p class="quotelead">')
    ret = re.sub(r'<blockquote>\n((?:<blockquote>\n)*)<p>(?:\{\.quote(\w+)\})',
                 r'<blockquote class="quote \g<2>">\n\g<1><p>', ret, flags=re.MULTILINE)
    return ret



def _convert_with_pandoc(mdwn, inputfmt='markdown', outputfmt='html5',
                         ext_enabled=None, ext_disabled=None,
                         standalone=True, title="HTML E-Mail"):
    '''
    Invoke pandoc to do the actual conversion of Markdown to HTML5.
    '''
    if not ext_enabled:
        ext_enabled = [ 'backtick_code_blocks',
                       'line_blocks',
                       'fancy_lists',
                       'startnum',
                       'definition_lists',
                       'example_lists',
                       'table_captions',
                       'simple_tables',
                       'multiline_tables',
                       'grid_tables',
                       'pipe_tables',
                       'all_symbols_escapable',
                       'intraword_underscores',
                       'strikeout',
                       'superscript',
                       'subscript',
                       'fenced_divs',
                       'bracketed_spans',
                       'footnotes',
                       'inline_notes',
                       'emoji',
                       'tex_math_double_backslash',
                       'autolink_bare_uris'
                      ]
    if not ext_disabled:
        ext_disabled = [ 'tex_math_single_backslash',
                         'tex_math_dollars',
                         'smart',
                         'raw_html'
                       ]

    enabled = '+'.join(ext_enabled)
    disabled = '-'.join(ext_disabled)
    inputfmt = f'{inputfmt}+{enabled}-{disabled}'

    args = []
    if standalone:
        args.append('--standalone')
    if title:
        args.append(f'--metadata=pagetitle:"{title}"')

    return pypandoc.convert_text(mdwn, format=inputfmt, to=outputfmt,
                                 extra_args=args)


def _apply_styling(html):
    '''
    Inline all styles defined and used into the individual HTML tags.
    '''
    return pynliner.Pynliner().from_string(html).with_cssString(DEFAULT_CSS).run()


def _postprocess_html(html):
    '''
    Postprocess the generated and styled HTML.
    '''
    return html


def convert_markdown_to_html(mdwn):
    '''
    Converts the input Markdown to HTML, handling separately the body, as well
    as an optional signature.
    '''
    parts = re.split(r'^-- $', mdwn, 1, flags=re.MULTILINE)
    body = parts[0]
    if len(parts) == 2:
        sig = parts[1]
    else:
        sig = None

    html=''
    if body:
        body = _preprocess_markdown(body)
        body = _identify_quotes_for_later(body)
        html = _convert_with_pandoc(body, standalone=False)
        html = _reformat_quotes(html)

    if sig:
        sig = _preprocess_markdown(sig)
        html += SIGNATURE_HTML.format(sig='<br/>'.join(sig.splitlines()))

    html = HTML_DOCUMENT.format(htmlbody=html)
    html = _apply_styling(html)
    html = _postprocess_html(html)

    return html


def main():
    '''
    Convert text on stdin to HTML, and print it to stdout, like mutt would
    expect.
    '''
    html = convert_markdown_to_html(sys.stdin.read())
    if html:
        # mutt expects the content type in the first line, so:
        print(f'text/html\n\n{html}')


if __name__ == '__main__':
    main()