File: markdowncheck.py

package info (click to toggle)
linkchecker 10.6.0-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 3,132 kB
  • sloc: python: 13,154; makefile: 134; sh: 71; xml: 36; sql: 20; javascript: 19; php: 2
file content (218 lines) | stat: -rw-r--r-- 7,567 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
#
# Copyright (C) 2014 Vadym Khokhlov
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
"""
Parse links in Markdown files.

Supported links are:
        <http://autolink.com>
        [name](http://link.com "Optional title")
        [id]: http://link.com "Optional title"
"""

# Some ideas and code were borrowed from the markdown2 project:
# https://pypi.python.org/pypi/markdown2

import re

from . import _ContentPlugin
from .. import log, LOG_PLUGIN


class MarkdownCheck(_ContentPlugin):
    """Markdown parsing plugin.

    Extracts URLs from Markdown content. Autolinks and reference-style link
    definitions are found with the patterns in ``_link_res``; inline links
    are found by the bracket scan in ``_check_inline_links``.
    """

    # Name of the configuration option holding a custom filename pattern.
    _filename_re_key = "filename_re"
    # Default pattern: names ending in .markdown, .md, .mdown, .mkd or .mkdn.
    _default_filename_re = re.compile(r'.*\.(markdown|md(own)?|mkdn?)$')

    # Patterns applied by _check_by_re(); group 1 of every pattern is the URL.
    _link_res = [
        # Autolink: <http://...>, <https://...> or <ftp:...> (case-insensitive).
        re.compile(r'<((https?|ftp):[^\'">\s]+)>', re.I),
        # Reference-style definition: [id]: url "Optional title"
        re.compile(
            r"""
                    \[.+\]: # id
                    [ \t]*\n?               # maybe *one* newline
                    [ \t]*
                    <?(.+?)>?           # url = \1
                    [ \t]*
                    (?:
                    \n?             # maybe one newline
                    [ \t]*
                    (?<=\s)         # lookbehind for whitespace
                    ['"(]
                    [^\n]*          # title
                    ['")]
                    [ \t]*
                    )?  # title is optional
                    (?:\n+|\Z)
                    """,
            re.X | re.M | re.U,
        ),
    ]

    # Matches a (possibly empty) run of whitespace; used to skip the blanks
    # that follow the opening parenthesis of an inline link.
    _whitespace = re.compile(r'\s*')

    # Captures the text enclosed in angle brackets and discards whatever
    # follows the closing bracket.
    _strip_anglebrackets = re.compile(r'<(.*)>.*')

    # Matches the optional title of an inline link together with the closing
    # parenthesis at the end of the searched range; the URL is the text in
    # front of this match.
    _inline_link_title = re.compile(
        r'''
            (                   # \1
              [ \t]+
              (['"])            # quote char
              (.*?)
            )?                  # title is optional
          \)$
        ''',
        re.X | re.S,
    )

    def __init__(self, config):
        """Initialise the plugin and select the filename pattern.

        The default pattern is used unless ``config`` carries a non-empty
        custom pattern. A custom pattern that fails to compile is reported
        with a warning and the default pattern stays in effect.

        :param config: plugin configuration mapping
        """
        super().__init__(config)
        self.filename_re = self._default_filename_re
        custom_pattern = config.get(self._filename_re_key)
        if not custom_pattern:
            return
        try:
            compiled = re.compile(custom_pattern)
        except re.error as msg:
            log.warn(
                LOG_PLUGIN, _("Invalid regex pattern %r: %s"), custom_pattern, msg
            )
        else:
            self.filename_re = compiled

    @classmethod
    def read_config(cls, configparser):
        """Read configuration file options."""
        config = dict()
        config[cls._filename_re_key] = (
            configparser.get(cls.__name__, cls._filename_re_key)
            if configparser.has_option(cls.__name__, cls._filename_re_key)
            else None
        )
        return config

    def applies_to(self, url_data, pagetype=None):
        """Check for Markdown file."""
        return self.filename_re.search(url_data.base_url) is not None

    def check(self, url_data):
        """Extracts urls from the file."""
        content = url_data.get_content()
        self._check_by_re(url_data, content)
        self._check_inline_links(url_data, content)

    def _save_url(self, url_data, content, url_text, url_pos):
        """Saves url. Converts url to 1-line text and url position as offset
        from the file beginning to (line, column).

        :param url_data: object for url storing
        :param content: file content
        :param url_text: url text
        :param url_pos: url position from the beginning
        """
        line = content.count('\n', 0, url_pos) + 1
        column = url_pos - content.rfind('\n', 0, url_pos)
        url_data.add_url(
            url_text.translate(str.maketrans("", "", '\n ')), line=line, column=column
        )

    def _check_by_re(self, url_data, content):
        """ Finds urls by re.

        :param url_data: object for url storing
        :param content: file content
        """
        for link_re in self._link_res:
            for u in link_re.finditer(content):
                self._save_url(url_data, content, u.group(1), u.start(1))

    def _find_balanced(self, text, start, open_c, close_c):
        """Returns the index where the open_c and close_c characters balance
        out - the same number of open_c and close_c are encountered - or the
        end of string if it's reached before the balance point is found.
        """
        i = start
        n = len(text)
        count = 1
        while count > 0 and i < n:
            if text[i] == open_c:
                count += 1
            elif text[i] == close_c:
                count -= 1
            i += 1
        return i

    def _extract_url_and_title(self, text, start):
        """Extracts the url from the tail of a link."""
        # text[start] equals the opening parenthesis
        idx = self._whitespace.match(text, start + 1).end()
        if idx == len(text):
            return None, None
        end_idx = idx
        has_anglebrackets = text[idx] == "<"
        if has_anglebrackets:
            end_idx = self._find_balanced(text, end_idx + 1, "<", ">")
        end_idx = self._find_balanced(text, end_idx, "(", ")")
        match = self._inline_link_title.search(text, idx, end_idx)
        if not match:
            return None, None
        url = text[idx:match.start()]
        if has_anglebrackets:
            url = self._strip_anglebrackets.sub(r'\1', url)
        return url, end_idx

    def _check_inline_links(self, url_data, content):
        """Checks inline links.

        :param url_data: url_data object
        :param content: content for processing
        """
        MAX_LINK_TEXT_SENTINEL = 3000
        curr_pos = 0
        content_length = len(content)
        while True:  # Handle the next link.
            # The next '[' is the start of:
            # - an inline anchor:   [text](url "title")
            # - an inline img:      ![text](url "title")
            # - not markup:         [...anything else...
            try:
                start_idx = content.index('[', curr_pos)
            except ValueError:
                break

            # Find the matching closing ']'.
            bracket_depth = 0
            for p in range(
                start_idx + 1, min(start_idx + MAX_LINK_TEXT_SENTINEL, content_length)
            ):
                if content[p] == ']':
                    bracket_depth -= 1
                    if bracket_depth < 0:
                        break
                elif content[p] == '[':
                    bracket_depth += 1
            else:
                # Closing bracket not found within sentinel length. This isn't markup.
                curr_pos = start_idx + 1
                continue

            # Now determine what this is by the remainder.
            p += 1
            if p >= content_length:
                return

            if content[p] == '(':
                url, url_end_idx = self._extract_url_and_title(content, p)
                if url is not None:
                    self._save_url(url_data, content, url, p)
                    start_idx = url_end_idx

            # Otherwise, it isn't markup.
            curr_pos = start_idx + 1