# frozen_string_literal: true
require 'nokogiri'
require 'active_support/xml_mini/nokogiri' # convert Documents to hashes
module HTML
# GitHub HTML processing filters and utilities. This module includes a small
# framework for defining DOM based content filters and applying them to user
# provided content.
#
# See HTML::Pipeline::Filter for information on building filters.
#
# Construct a Pipeline for running multiple HTML filters. A pipeline is created once
# with one to many filters, and it then can be `call`ed many times over the course
# of its lifetime with input.
#
# filters - Array of Filter objects. Each must respond to call(doc,
# context) and return the modified DocumentFragment or a
# String containing HTML markup. Filters are performed in the
# order provided.
# default_context - The default context hash. Values specified here are merged
# with the context passed to each individual pipeline run (per-run
# values win on conflict). Can NOT be nil. Default: empty Hash.
# result_class - The default Class of the result object for individual
# calls. Default: Hash. Protip: Pass in a Struct to get
# some semblance of type safety.
class Pipeline
# Lazily loaded constants: each name below is registered with autoload so its
# file is only required the first time the constant is referenced.
{
  VERSION: 'html/pipeline/version',
  Filter: 'html/pipeline/filter',
  AbsoluteSourceFilter: 'html/pipeline/absolute_source_filter',
  BodyContent: 'html/pipeline/body_content',
  AutolinkFilter: 'html/pipeline/autolink_filter',
  CamoFilter: 'html/pipeline/camo_filter',
  EmailReplyFilter: 'html/pipeline/email_reply_filter',
  EmojiFilter: 'html/pipeline/emoji_filter',
  HttpsFilter: 'html/pipeline/https_filter',
  ImageFilter: 'html/pipeline/image_filter',
  ImageMaxWidthFilter: 'html/pipeline/image_max_width_filter',
  MarkdownFilter: 'html/pipeline/markdown_filter',
  MentionFilter: 'html/pipeline/@mention_filter',
  TeamMentionFilter: 'html/pipeline/@team_mention_filter',
  PlainTextInputFilter: 'html/pipeline/plain_text_input_filter',
  SanitizationFilter: 'html/pipeline/sanitization_filter',
  SyntaxHighlightFilter: 'html/pipeline/syntax_highlight_filter',
  TextileFilter: 'html/pipeline/textile_filter',
  TableOfContentsFilter: 'html/pipeline/toc_filter',
  TextFilter: 'html/pipeline/text_filter'
}.each do |constant_name, path|
  autoload constant_name, path
end
class MissingDependencyError < RuntimeError; end
def self.require_dependency(name, requirer)
require name
rescue LoadError => e
raise MissingDependencyError,
"Missing dependency '#{name}' for #{requirer}. See README.md for details.\n#{e.class.name}: #{e}"
end
# Our DOM implementation: the Nokogiri fragment class that HTML::Pipeline.parse
# uses to turn HTML strings into document fragments.
DocumentFragment = Nokogiri::HTML::DocumentFragment
# Parse a String into a DocumentFragment object.
#
# document_or_html - A String of HTML markup, an already parsed document
#                    object, or nil/false (treated as an empty String).
#
# Returns the parsed DocumentFragment for String input; any non-String
# argument is handed back verbatim.
def self.parse(document_or_html)
  source = document_or_html || ''
  return source unless source.is_a?(String)

  DocumentFragment.parse(source)
end
# Public: Returns the Array of Filter objects for this Pipeline (flattened and
# frozen by the constructor).
attr_reader :filters
# Public: Instrumentation service for the pipeline.
# Set an ActiveSupport::Notifications compatible object to enable. While this
# is nil/false, instrumented blocks are simply run without instrumentation.
attr_accessor :instrumentation_service
# Public: String name for this Pipeline. The matching reader falls back to the
# Class name when no value has ever been assigned.
attr_writer :instrumentation_name
# Public: String name reported for this Pipeline in instrumentation payloads.
#
# The first read without a prior assignment memoizes and returns the class
# name. Any explicitly assigned value, including nil, is returned as-is.
#
# Returns the assigned name, or the class name String by default.
def instrumentation_name
  unless instance_variable_defined?(:@instrumentation_name)
    @instrumentation_name = self.class.name
  end
  @instrumentation_name
end
# Public: Default instrumentation service for new pipeline objects.
#
# Returns the configured service, or nil when none has been set.
def self.default_instrumentation_service
  @default_instrumentation_service
end

# Public: Set the default instrumentation service for new pipeline objects.
#
# service - The service object to use by default (nil clears it).
#
# Returns the assigned value.
def self.default_instrumentation_service=(service)
  @default_instrumentation_service = service
end
# Build a pipeline.
#
# filters         - Array (possibly nested) of filters; stored flattened and
#                   frozen.
# default_context - Hash of default context values; frozen in place and
#                   stored. Must not be nil.
# result_class    - Class instantiated for the result of each run when the
#                   caller supplies none. Default: Hash.
#
# The instrumentation service starts out as the class-level default.
#
# Raises ArgumentError when default_context is nil.
def initialize(filters, default_context = {}, result_class = nil)
  if default_context.nil?
    raise ArgumentError, 'default_context cannot be nil'
  end

  @instrumentation_service = self.class.default_instrumentation_service
  @result_class = result_class || Hash
  @default_context = default_context.freeze
  @filters = filters.flatten.freeze
end
# Apply all filters in the pipeline to the given HTML.
#
# html    - A String containing HTML or a DocumentFragment object.
# context - The context hash passed to each filter. It is merged over the
#           pipeline's default context and frozen before any filter runs, so
#           filters MUST NOT try to modify it in place. Use the result for
#           passing state back.
# result  - The result object passed to each filter for modification. This is
#           where filters store extracted information from the content. A new
#           instance of the pipeline's result class is created when omitted.
#
# The whole run is instrumented as 'call_pipeline.html_pipeline'.
#
# Returns the result after being filtered by this Pipeline. Its :output entry
# holds whatever the last filter returned (or the input itself when the
# pipeline has no filters).
def call(html, context = {}, result = nil)
  frozen_context = @default_context.merge(context).freeze
  result ||= @result_class.new
  run_payload = default_payload(
    filters: @filters.map { |filter| filter.name },
    context: frozen_context,
    result: result
  )

  instrument('call_pipeline.html_pipeline', run_payload) do
    doc = html
    # Thread the document through the filters in order; each filter receives
    # the previous filter's return value.
    @filters.each do |filter|
      doc = perform_filter(filter, doc, frozen_context, result)
    end
    result[:output] = doc
  end

  result
end
# Internal: Applies a specific filter to the supplied doc.
#
# filter  - The filter to run; must respond to name and call.
# doc     - The document (or markup) handed to the filter.
# context - The context hash passed through to the filter.
# result  - The result object passed through to the filter.
#
# The filter call is instrumented as 'call_filter.html_pipeline'.
#
# Returns the result of the filter.
def perform_filter(filter, doc, context, result)
  filter_payload = default_payload(
    filter: filter.name,
    context: context,
    result: result
  )
  instrument('call_filter.html_pipeline', filter_payload) { filter.call(doc, context, result) }
end
# Like call, but the pipeline's :output is passed through
# HTML::Pipeline.parse before being returned. Pipelines may produce a
# DocumentFragment or a String; callers that need a DocumentFragment should
# use this method.
#
# input   - A String containing HTML or a DocumentFragment object.
# context - The context hash for this run.
# result  - Optional result object to be populated by the run.
#
# Returns the value of HTML::Pipeline.parse for the pipeline output.
def to_document(input, context = {}, result = nil)
  pipeline_output = call(input, context, result)[:output]
  HTML::Pipeline.parse(pipeline_output)
end
# Like call but guarantee the value returned is a string of HTML markup.
#
# input   - A String containing HTML or a DocumentFragment object.
# context - The context hash for this run.
# result  - Optional result object. It is forwarded to call, so information
#           stored by the filters is visible to the caller afterwards.
#
# Returns the pipeline output converted with to_html when the output responds
# to it, otherwise converted with to_s.
def to_html(input, context = {}, result = nil)
  # Forward the caller's result object. The previous code passed
  # `result = nil` here, which discarded it before the pipeline ran.
  result = call(input, context, result)
  output = result[:output]
  output.respond_to?(:to_html) ? output.to_html : output.to_s
end
# Public: setup instrumentation for this pipeline.
#
# name    - String name to report for this pipeline (default: nil, which is
#           assigned as-is).
# service - Instrumentation service to use; the class-level default service
#           is used when this is nil/false.
#
# Callers should not rely on the return value.
def setup_instrumentation(name = nil, service = nil)
  self.instrumentation_name = name
  self.instrumentation_service =
    if service
      service
    else
      self.class.default_instrumentation_service
    end
end
# Internal: if the `instrumentation_service` object is set, instruments the
# block, otherwise the block is run without instrumentation.
#
# event   - String event name handed to the service.
# payload - Hash payload for the event; the default payload is used when this
#           is nil/false.
#
# Yields the payload (as provided by the service when one is set).
#
# Returns the result of the uninstrumented block, or whatever the service's
# instrument call returns.
def instrument(event, payload = nil)
  payload ||= default_payload
  if instrumentation_service
    instrumentation_service.instrument(event, payload) do |service_payload|
      yield service_payload
    end
  else
    yield payload
  end
end
# Internal: Default payload for instrumentation.
#
# payload - Hash of additional payload data merged over the default. An
#           explicit :pipeline entry replaces the pipeline's name.
#
# Returns a new Hash whose first key is :pipeline.
def default_payload(payload = {})
  base = { pipeline: instrumentation_name }
  base.update(payload)
end
end
end
# XXX Nokogiri monkey patches for Ruby 1.8. They are only applied when String
# does not respond to force_encoding.
unless ''.respond_to?(:force_encoding)
class Nokogiri::XML::Node
# Work around an issue with UTF-8 encoded data being erroneously converted to
# some other encoding when replacing text nodes. See 'utf-8 output 2' in
# user_content_test.rb for details.
#
# replacement - The new content. String-like values (anything responding to
# to_str) are first parsed inside a wrapper <div> via
# document.fragment, and the wrapper's children are used in
# their place; other values are passed through untouched.
#
# Returns whatever the original replace implementation returns.
def replace_with_encoding_fix(replacement)
if replacement.respond_to?(:to_str)
replacement = document.fragment("<div>#{replacement}</div>").children.first.children
end
replace_without_encoding_fix(replacement)
end
# Keep the original implementation reachable under a new name, then route
# replace through the fixed version. The order matters: the first alias must
# capture the original before the second one overwrites it.
alias replace_without_encoding_fix replace
alias replace replace_with_encoding_fix
# Replace this node with `replacement` through the patched replace.
#
# Returns self.
def swap(replacement)
replace(replacement)
self
end
end
end
|