File: doc.rb

package info (click to toggle)
ruby-mdl 0.13.0-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 196 kB
  • sloc: ruby: 1,485; python: 40; makefile: 4; sh: 3
file content (309 lines) | stat: -rw-r--r-- 9,791 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
require 'kramdown'
require_relative 'kramdown_parser'

module MarkdownLint
  ##
  # Representation of the markdown document passed to rule checks
  class Doc
    ##
    # Read-only accessors for the state built by the constructor:
    #
    # +lines+::    A list of raw markdown source lines. Note that the list is
    #              0-indexed, while line numbers in the parsed source are
    #              1-indexed, so you need to subtract 1 from a line number to
    #              get the correct line. The element_line* methods take care
    #              of this for you.
    # +parsed+::   A Kramdown::Document object containing the parsed markdown
    #              document.
    # +elements+:: A list of top level Kramdown::Element objects from the
    #              parsed document.
    # +offset+::   The line number offset, which is greater than zero when
    #              the markdown file contains YAML front matter that should
    #              be ignored.

    attr_reader :lines, :parsed, :elements, :offset

    ##
    # Create a new document given a string containing the markdown source.
    #
    # +text+ is the markdown source; the string passed in is not modified.
    # When +ignore_front_matter+ is true and the source begins with a YAML
    # front matter block (delimited by "---" lines), that block and one
    # optional blank line after it are removed before parsing, and the offset
    # is set to the number of source lines removed. Otherwise the offset is 0.

    def initialize(text, ignore_front_matter = false)
      # \A rather than ^: front matter is only recognised at the very start
      # of the source, so a later pair of "---" horizontal rules is never
      # mistaken for it.
      front_matter =
        ignore_front_matter && /\A---\n(.*?)---\n\n?/m.match(text)
      if front_matter
        # Every removed line ends in a newline, so counting newlines also
        # accounts for the optional blank line, which split("\n") would drop.
        @offset = front_matter.to_s.count("\n")
        # Continue with the remainder instead of editing the caller's string
        # in place (which would also fail for frozen strings).
        text = front_matter.post_match
      else
        @offset = 0
      end
      # The -1 is to cause split to preserve an extra entry in the array so we
      # can tell if there's a final newline in the file or not.
      @lines = text.split(/\R/, -1)
      @parsed = Kramdown::Document.new(text, :input => 'MarkdownLint')
      @elements = @parsed.root.children
      add_annotations(@elements)
    end

    ##
    # Builds a document from a file rather than from a string. A +filename+
    # of '-' means the markdown is read from standard input; anything else is
    # read from disk as UTF-8. +ignore_front_matter+ is handed to the regular
    # constructor unchanged.

    def self.new_from_file(filename, ignore_front_matter = false)
      markdown =
        if filename == '-'
          $stdin.read
        else
          File.read(filename, :encoding => 'UTF-8')
        end
      new(markdown, ignore_front_matter)
    end

    ##
    # Looks up every element of the given type and returns the options hash
    # of each one. The options hash carries most of the useful data about an
    # element, so rules can often work with it directly.
    #
    #   # Returns [ { :location => 1, :element_level => 2 }, ... ]
    #   elements = find_type(:li)
    #
    # Pass false for +nested+ to restrict the search to top level elements.

    def find_type(type, nested = true)
      matching = find_type_elements(type, nested)
      matching.map { |element| element.options }
    end

    ##
    # Collects the element objects (not their options) whose type matches.
    #
    # +type+ is either a single Symbol or a list of Symbols; with a list,
    # elements of any of the listed types are returned.
    #
    # The search walks +elements+ in document order and, unless +nested+ is
    # false, descends into each element's children right after visiting the
    # element itself. With +nested+ set to false only the elements passed in
    # (the top level elements by default) are considered.

    def find_type_elements(type, nested = true, elements = @elements)
      wanted = type.instance_of?(Symbol) ? [type] : type
      elements.each_with_object([]) do |node, found|
        found << node if wanted.include?(node.type)
        next unless nested
        next if node.children.empty?

        found.concat(find_type_elements(wanted, nested, node.children))
      end
    end

    ##
    # Like find_type_elements, but lets the caller name element types whose
    # children must not be searched.
    #
    # +type+ is a single Symbol or a list of Symbols to look for.
    # +nested_except+ is a single Symbol or a list of Symbols; an element of
    # one of these types can still be returned itself, but nothing below it
    # is visited.
    #
    # This method always descends into children (other than those excluded),
    # so there is no +nested+ switch.

    def find_type_elements_except(
      type, nested_except = [], elements = @elements
    )
      wanted = type.instance_of?(Symbol) ? [type] : type
      skipped = nested_except
      skipped = [skipped] if skipped.instance_of?(Symbol)
      elements.flat_map do |node|
        own = wanted.include?(node.type) ? [node] : []
        if skipped.include?(node.type) || node.children.empty?
          own
        else
          own + find_type_elements_except(wanted, skipped, node.children)
        end
      end
    end

    ##
    # Gives the source line number recorded for an element. Accepts either a
    # Kramdown::Element (its options hash is consulted) or an options hash
    # directly, and returns whatever is stored under :location.

    def element_linenumber(element)
      options = element.is_a?(Kramdown::Element) ? element.options : element
      options[:location]
    end

    ##
    # Fetches the raw source line an element starts on. Accepts an element
    # object or an options hash. Handy when a rule needs details that only
    # exist in the source text and not in the parsed document.

    def element_line(element)
      # Line numbers are 1-indexed while the list of lines is 0-indexed.
      index = element_linenumber(element) - 1
      @lines[index]
    end

    ##
    # Maps a list of elements to their line numbers, in the same order. The
    # list may hold element objects or options hashes.

    def element_linenumbers(elements)
      elements.map do |item|
        element_linenumber(item)
      end
    end

    ##
    # Maps a list of elements to the raw source line each one starts on, in
    # the same order. The list may hold element objects or options hashes.

    def element_lines(elements)
      elements.map do |item|
        element_line(item)
      end
    end

    ##
    # Returns the header 'style' - :atx (hashes at the beginning), :atx_closed
    # (atx header style, but with hashes at the end of the line also), :setext
    # (underlined).
    #
    # +header+ must be a header element object, because it is asked for its
    # +type+; a bare options hash is not accepted. Raises a RuntimeError when
    # the element is of any other type.
    #
    # Indentation and blockquote markers in front of the header are ignored,
    # so a header nested inside a list item or blockquote is classified by
    # its own markup instead of always being reported as :setext.

    def header_style(header)
      if header.type != :header
        raise 'header_style called with non-header element'
      end

      # The raw source line still carries whatever precedes a nested header
      # ("> # Title", "  # Title"); drop it so the check sees the header.
      line = element_line(header).sub(/\A\s*(?:>\s*)*/, '')
      return :setext unless line.start_with?('#')

      line.strip.end_with?('#') ? :atx_closed : :atx
    end

    ##
    # Returns the list style for a list item: :asterisk, :plus, :dash,
    # :ordered or :ordered_paren depending on which symbol is used to denote
    # the list item, or :unknown when the line starts with none of them.
    #
    # +item+ must be a list item element object, because it is asked for its
    # +type+; a bare options hash is not accepted. Raises a RuntimeError when
    # the element is not of type :li.
    #
    # Only the start of the line is inspected (after removing indentation and
    # any number of blockquote markers), so digits followed by a period or
    # parenthesis later in the item text do not affect the result.

    def list_style(item)
      raise 'list_style called with non-list element' if item.type != :li

      # Drop indentation and every leading blockquote marker so the first
      # character left is the list marker itself.
      line = element_line(item).strip.sub(/\A(?:>\s*)+/, '')
      case line
      when /\A\*/ then :asterisk
      when /\A\+/ then :plus
      when /\A-/ then :dash
      when /\A[0-9]+\./ then :ordered
      when /\A[0-9]+\)/ then :ordered_paren
      else :unknown
      end
    end

    ##
    # Measures the indentation of a raw source line: the length of its
    # leading whitespace, with every hard tab counted as 8 spaces.

    def indent_for(line)
      leading = line[/^\s*/]
      # A tab already contributes 1 to the length; add the other 7 columns.
      leading.length + leading.count("\t") * 7
    end

    ##
    # Scans every raw source line and returns the (1-indexed) line numbers
    # of those the given regular expression matches.

    def matching_lines(regex)
      line_numbers = []
      @lines.each_with_index do |text, index|
        line_numbers << (index + 1) if regex.match(text)
      end
      line_numbers
    end

    ##
    # Returns line numbers for lines that match the given regular expression,
    # looking only at the values of 'text' elements (i.e. regular markdown
    # text and not code or other elements). Text below elements of the types
    # listed in +exclude_nested+ (links, by default) is not considered.
    def matching_text_element_lines(regex, exclude_nested = [:a])
      text_elements = find_type_elements_except(:text, exclude_nested)
      text_elements.flat_map do |text_element|
        start = text_element.options[:location]
        # kramdown does not always record a location; skipping such elements
        # is preferable to crashing on the arithmetic below.
        next [] if start.nil?

        hits = []
        text_element.value.split("\n").each_with_index do |content, delta|
          hits << (start + delta) if regex.match(content)
        end
        hits
      end
    end

    ##
    # Gathers the text of an element whose children are a mix of text
    # elements and other inline elements, and returns it as a list of lines.
    #
    # When +restore_whitespace+ is true, the first line of the result is
    # replaced by the element's raw source line with the first occurrence of
    # +prefix+ removed. The same +prefix+ and +restore_whitespace+ values are
    # used when recursing into nested elements.

    def extract_text(element, prefix = '', restore_whitespace = true)
      smart_quotes = {
        :rdquo => '"',
        :ldquo => '"',
        :lsquo => "'",
        :rsquo => "'",
      }
      # Children of any other type yield nil, which contributes nothing once
      # the pieces are joined; that part of the text is simply not captured.
      pieces = element.children.map do |child|
        case child.type
        when :text
          child.value
        when :strong, :em, :p, :codespan
          extract_text(child, prefix, restore_whitespace).join("\n")
        when :smart_quote
          smart_quotes[child.value]
        end
      end
      text_lines = pieces.join.split("\n")
      # Text blocks have their leading whitespace stripped by the parser, so
      # take the first line from the source instead. The element may sit in
      # something like a blockquote, hence the optional prefix removal.
      if restore_whitespace
        text_lines[0] = element_line(element).sub(prefix, '')
      end
      text_lines
    end

    ##
    # Flattens an element's children into plain text and returns it as a
    # list of lines. Text and code spans contribute their value, emphasis,
    # paragraphs and links contribute the text of their own children, and
    # smart quotes are turned back into plain quote characters.

    def extract_as_text(element)
      smart_quotes = {
        :rdquo => '"',
        :ldquo => '"',
        :lsquo => "'",
        :rsquo => "'",
      }
      # Children of any other type yield nil, which contributes nothing once
      # the pieces are joined; that part of the text is simply not captured.
      pieces = element.children.map do |child|
        case child.type
        when :text, :codespan
          child.value
        when :strong, :em, :p, :a
          extract_as_text(child).join("\n")
        when :smart_quote
          smart_quotes[child.value]
        end
      end
      pieces.join.split("\n")
    end

    private

    ##
    # Walks the element tree and records, in every element's options hash,
    # how deeply it is nested (:element_level, starting at 1 for the elements
    # passed in) and which element contains it (:parent, nil at the top).

    def add_annotations(elements, level = 1, parent = nil)
      elements.each do |node|
        node.options.update({ :element_level => level, :parent => parent })
        add_annotations(node.children, level + 1, node)
      end
    end
  end
end