1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309
|
require 'kramdown'
require_relative 'kramdown_parser'
module MarkdownLint
##
# Representation of the markdown document passed to rule checks
class Doc
##
# Read-only accessors for the state built by +initialize+:
#
# +lines+::    A list of raw markdown source lines. Note that the list is
#              0-indexed, while line numbers in the parsed source are
#              1-indexed, so you need to subtract 1 from a line number to
#              get the correct line. The element_line* methods take care of
#              this for you.
# +parsed+::   A Kramdown::Document object containing the parsed markdown
#              document.
# +elements+:: A list of top level Kramdown::Element objects from the parsed
#              document.
# +offset+::   The line number offset which is greater than zero when the
#              markdown file contains YAML front matter that should be
#              ignored.
attr_reader :lines, :parsed, :elements, :offset
##
# Create a new document given a string containing the markdown source.
#
# +text+ is the raw markdown. When +ignore_front_matter+ is true and +text+
# contains a block delimited by <tt>---</tt> lines, that block (plus one
# optional blank line after it) is removed before parsing and +offset+ is
# set to the number of source lines removed; otherwise +offset+ is 0.
#
# The string passed in by the caller is never modified, so frozen strings
# are accepted.
def initialize(text, ignore_front_matter = false)
  regex = /^---\n(.*?)---\n\n?/m
  front_matter = ignore_front_matter ? regex.match(text) : nil
  if front_matter
    # Every removed line ends in "\n", so counting newlines gives the exact
    # number of lines dropped, including the optional trailing blank line
    # (which split("\n").length would silently discard).
    @offset = front_matter.to_s.count("\n")
    # Non-destructive sub: do not mutate the caller's string.
    text = text.sub(regex, '')
  else
    @offset = 0
  end
  # The -1 is to cause split to preserve an extra entry in the array so we
  # can tell if there's a final newline in the file or not.
  @lines = text.split(/\R/, -1)
  @parsed = Kramdown::Document.new(text, :input => 'MarkdownLint')
  @elements = @parsed.root.children
  add_annotations(@elements)
end
##
# Alternate 'constructor' that reads the markdown source from +filename+.
# A filename of '-' reads the source from standard input instead; any other
# name is read from disk as UTF-8. +ignore_front_matter+ is passed through
# to +new+ unchanged.
def self.new_from_file(filename, ignore_front_matter = false)
  source =
    if filename == '-'
      $stdin.read
    else
      File.read(filename, :encoding => 'UTF-8')
    end
  new(source, ignore_front_matter)
end
##
# Find all elements of a given type, returning the options hash of each
# one rather than the element itself. The options hash has most of the
# useful data about an element, so rules can often work with it directly.
#
#   # Returns [ { :location => 1, :element_level => 2 }, ... ]
#   elements = find_type(:li)
#
# +nested+ is handed to find_type_elements; when it is false only top level
# elements of the given type are returned.
def find_type(type, nested = true)
  matches = find_type_elements(type, nested)
  matches.map { |element| element.options }
end
##
# Find all elements of a given type, returning the element objects
# themselves. +type+ may be a single Symbol or a list of types, in which
# case elements of any listed type are returned.
#
# Results are in document order: a matching element comes before any
# matches found among its own children. If +nested+ is false, only the
# elements in +elements+ (by default the top level elements) are checked.
def find_type_elements(type, nested = true, elements = @elements)
  wanted = type.instance_of?(Symbol) ? [type] : type
  elements.flat_map do |el|
    found = wanted.include?(el.type) ? [el] : []
    if nested && !el.children.empty?
      found.concat(find_type_elements(wanted, nested, el.children))
    end
    found
  end
end
##
# A variation on find_type_elements that never descends into the children
# of the element types listed in +nested_except+ (a Symbol or a list of
# Symbols). An element of an excepted type can still be returned itself if
# it matches +type+; only its children are skipped.
#
# +type+ may be a single Symbol or a list of types. Unlike
# find_type_elements, nested elements are always searched.
def find_type_elements_except(
  type, nested_except = [], elements = @elements
)
  wanted = type.instance_of?(Symbol) ? [type] : type
  skipped = nested_except.instance_of?(Symbol) ? [nested_except] : nested_except
  elements.each_with_object([]) do |el, found|
    found << el if wanted.include?(el.type)
    descend = !skipped.include?(el.type) && !el.children.empty?
    if descend
      found.concat(find_type_elements_except(wanted, skipped, el.children))
    end
  end
end
##
# Returns the line number a given element is located on in the source
# file, i.e. the :location entry of its options. +element+ may be a
# Kramdown::Element (its options hash is looked up for you) or an options
# hash directly.
def element_linenumber(element)
  options = element.is_a?(Kramdown::Element) ? element.options : element
  options[:location]
end
##
# Returns the actual source line for a given element. You can pass in an
# element object or an options hash here. This is useful when a rule needs
# information that isn't present in the parsed document and has to look at
# the raw source instead.
def element_line(element)
  # Line numbers are 1-indexed while @lines is 0-indexed.
  index = element_linenumber(element) - 1
  @lines[index]
end
##
# Returns a list of line numbers, one per entry of +elements+ and in the
# same order. Each entry may be an element object or an options hash, as
# accepted by element_linenumber.
def element_linenumbers(elements)
  elements.map(&method(:element_linenumber))
end
##
# Returns the actual source lines for a list of elements, one per entry
# and in the same order. Each entry may be an element object or an options
# hash, as accepted by element_line.
def element_lines(elements)
  elements.map(&method(:element_line))
end
##
# Returns the header 'style', decided from the header's source line:
#
# :atx::        the line starts with a hash
# :atx_closed:: the line starts with a hash and, ignoring surrounding
#               whitespace, also ends with one
# :setext::     anything else (underlined headers)
#
# +header+ must be an element object whose +type+ is :header; a
# RuntimeError is raised otherwise.
def header_style(header)
  unless header.type == :header
    raise 'header_style called with non-header element'
  end

  source = element_line(header)
  return :setext unless source.start_with?('#')

  source.strip.end_with?('#') ? :atx_closed : :atx
end
##
# Returns the list style for a list item: :asterisk, :plus, :dash, :ordered
# or :ordered_paren depending on which marker starts the item, or :unknown
# if no marker is recognised.
#
# +item+ must be an element object whose +type+ is :li; a RuntimeError is
# raised otherwise. The marker is read from the item's source line after
# removing surrounding whitespace and any leading blockquote markers
# (including nested ones such as "> > ").
def list_style(item)
  raise 'list_style called with non-list element' if item.type != :li

  line = element_line(item).strip.sub(/\A(?:>\s*)+/, '')
  # Every pattern is anchored at the start of the line so that text later in
  # the item (e.g. "1) see the 2.0 notes") cannot be mistaken for the marker.
  case line
  when /\A\*/ then :asterisk
  when /\A\+/ then :plus
  when /\A-/ then :dash
  when /\A[0-9]+\./ then :ordered
  when /\A[0-9]+\)/ then :ordered_paren
  else :unknown
  end
end
##
# Returns how much a given line is indented, i.e. the width of its leading
# whitespace. Hard tabs are treated as an indent of 8 spaces. You need to
# pass in the raw string here.
def indent_for(line)
  leading = line[/^\s*/]
  # Each tab counts as 8 columns instead of 1, i.e. 7 extra per tab.
  leading.length + (leading.count("\t") * 7)
end
##
# Returns the 1-indexed line numbers of all source lines that match the
# given regular expression, in ascending order.
def matching_lines(regex)
  hits = []
  @lines.each.with_index(1) do |text, linenum|
    hits << linenum if regex.match(text)
  end
  hits
end
##
# Returns line numbers for lines that match the given regular expression.
# Only considers text inside of 'text' elements (i.e. regular markdown
# text and not code/links or other elements). Text elements nested inside
# the element types listed in +exclude_nested+ are not searched.
#
# A multi-line text element is checked line by line, counting up from the
# element's own location.
def matching_text_element_lines(regex, exclude_nested = [:a])
  find_type_elements_except(:text, exclude_nested).flat_map do |el|
    start = el.options[:location]
    # We'd error out if kramdown doesn't have location information for the
    # current element. It's better to just not match in these cases rather
    # than crash.
    next [] if start.nil?

    el.value.split("\n").each_with_index
      .select { |text, _idx| regex.match(text) }
      .map { |_text, idx| start + idx }
  end
end
##
# Extracts the text from an element whose children consist of text
# elements and other things, returning it as a list of lines.
#
# Text children contribute their value, smart quotes contribute the plain
# quote character, and strong/em/p/codespan children are processed
# recursively with the same +prefix+ and +restore_whitespace+. Any other
# child type contributes nothing.
#
# When +restore_whitespace+ is true, the first line of the result is
# replaced by the element's raw source line with the first occurrence of
# +prefix+ removed.
def extract_text(element, prefix = '', restore_whitespace = true)
  quote_chars = {
    :rdquo => '"',
    :ldquo => '"',
    :lsquo => "'",
    :rsquo => "'",
  }
  # If anything goes amiss here, e.g. unknown type, the case below yields
  # nil and we just don't catch that part of the text, which seems like a
  # sensible failure mode.
  pieces = element.children.map do |child|
    case child.type
    when :text
      child.value
    when :strong, :em, :p, :codespan
      extract_text(child, prefix, restore_whitespace).join("\n")
    when :smart_quote
      quote_chars[child.value]
    end
  end
  lines = pieces.join.split("\n")
  # Text blocks have whitespace stripped, so we need to add it back in at
  # the beginning. Because this might be in something like a blockquote,
  # we optionally strip off a prefix given to the function.
  lines[0] = element_line(element).sub(prefix, '') if restore_whitespace
  lines
end
##
# Returns the element as plaintext, as a list of lines.
#
# Text and codespan children contribute their value, smart quotes
# contribute the plain quote character, and strong/em/p/a children are
# processed recursively. Any other child type contributes nothing.
def extract_as_text(element)
  quote_chars = {
    :rdquo => '"',
    :ldquo => '"',
    :lsquo => "'",
    :rsquo => "'",
  }
  # If anything goes amiss here, e.g. unknown type, the case below yields
  # nil and we just don't catch that part of the text, which seems like a
  # sensible failure mode.
  pieces = element.children.map do |child|
    case child.type
    when :text, :codespan
      child.value
    when :strong, :em, :p, :a
      extract_as_text(child).join("\n")
    when :smart_quote
      quote_chars[child.value]
    end
  end
  pieces.join.split("\n")
end
private
##
# Adds an :element_level and a :parent option to every element in
# +elements+ and, recursively, to all of their descendants. +level+ is the
# nesting depth recorded for +elements+ themselves (children get
# level + 1) and +parent+ is the element that contains them (nil at the
# top).
def add_annotations(elements, level = 1, parent = nil)
  deeper = level + 1
  elements.each do |node|
    annotations = node.options
    annotations[:element_level] = level
    annotations[:parent] = parent
    add_annotations(node.children, deeper, node)
  end
end
end
end
|