// Copyright 2017 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/**
* @fileoverview Contains utility methods to extract text content from HTML.
* @supported IE 10+, Chrome 26+, Firefox 22+, Safari 7.1+, Opera 15+
*/
goog.provide('goog.html.textExtractor');
goog.require('goog.array');
goog.require('goog.dom.TagName');
goog.require('goog.html.sanitizer.HtmlSanitizer');
goog.require('goog.object');
goog.require('goog.userAgent');
/**
 * Extracts the user-visible text from an untrusted HTML string. The markup is
 * first run through the HtmlSanitizer, and the text is then collected from the
 * resulting DOM. Unlike goog.html.utils.stripHtmlTags, this attempts to emit
 * newlines between blocks and to omit content that is never displayed to the
 * user (such as the bodies of SCRIPT and STYLE tags).
 *
 * Returns the empty string when the current browser is not supported (see
 * goog.html.textExtractor.isSupported).
 *
 * @param {string} html The untrusted HTML string.
 * @return {string} The extracted text, with leading and trailing whitespace
 *     removed.
 */
// TODO(pelizzi): consider an optional bool parameter to also extract the text
// content of alt attributes and such.
goog.html.textExtractor.extractTextContent = function(html) {
  if (!goog.html.textExtractor.isSupported()) {
    return '';
  }

  // Only the style attribute is let through; dropping every other attribute
  // protects against DOM clobbering.
  var builder = new goog.html.sanitizer.HtmlSanitizer.Builder();
  builder = builder.onlyAllowAttributes(['style']);
  builder = builder.allowCssStyles();

  // With its default policy the sanitizer strips the content of tags such as
  // SCRIPT and STYLE, so their non-textual content never reaches the output.
  var sanitizedRoot = builder.build().sanitizeToDomNode(html);

  // Neither textContent nor innerText spaces block elements properly, hence
  // the custom traversal that accounts for spacing between blocks.
  var extracted =
      goog.html.textExtractor.extractTextContentFromNode_(sanitizedRoot);
  return extracted.trim();
};
/**
 * Produces the text of a DOM node and, recursively, of all its descendants.
 *
 * - Text nodes: every run of whitespace is collapsed to a single space and
 *   the result is trimmed.
 * - BR elements: a single newline.
 * - Other elements: the concatenated text of their children, wrapped in
 *   newlines when the element is a block element.
 * - Any other node type: the empty string.
 *
 * @param {!Node} node
 * @return {string}
 * @private
 */
goog.html.textExtractor.extractTextContentFromNode_ = function(node) {
  var kind = node.nodeType;

  if (kind === Node.TEXT_NODE) {
    return node.nodeValue.replace(/\s+/g, ' ').trim();
  }
  if (kind !== Node.ELEMENT_NODE) {
    return '';
  }

  var element = /** @type {!Element} */ (node);
  // NOTE(review): loose equality is intentional here; goog.dom.TagName members
  // may not be primitive strings in every build -- confirm before tightening.
  if (element.tagName == goog.dom.TagName.BR) {
    return '\n';
  }

  var children = node.childNodes;
  var childCount = children.length;
  var pieces = [];
  for (var i = 0; i < childCount; i++) {
    pieces.push(
        goog.html.textExtractor.extractTextContentFromNode_(children[i]));
  }
  var inner = pieces.join('');

  // Block elements are visually separated from their neighbours, so their
  // text is surrounded by newlines.
  return goog.html.textExtractor.isBlockElement_(element) ?
      '\n' + inner + '\n' :
      inner;
};
/**
 * Tag names that are treated as block elements regardless of their inline
 * style. Stored as a set (tag name -> true) for constant-time lookup.
 * @private @const {!Object<!goog.dom.TagName, boolean>}
 */
goog.html.textExtractor.BLOCK_ELEMENTS_ = goog.object.createSet([
  goog.dom.TagName.ADDRESS,
  goog.dom.TagName.BLOCKQUOTE,
  goog.dom.TagName.CENTER,
  goog.dom.TagName.DIV,
  goog.dom.TagName.DL,
  goog.dom.TagName.FIELDSET,
  goog.dom.TagName.FORM,
  goog.dom.TagName.H1,
  goog.dom.TagName.H2,
  goog.dom.TagName.H3,
  goog.dom.TagName.H4,
  goog.dom.TagName.H5,
  goog.dom.TagName.H6,
  goog.dom.TagName.HR,
  goog.dom.TagName.OL,
  goog.dom.TagName.P,
  goog.dom.TagName.PRE,
  goog.dom.TagName.TABLE,
  goog.dom.TagName.UL
]);
/**
 * Returns whether the given element is a block element, i.e. whether the
 * browser would visually separate its text content from the text content of
 * the previous node. An element qualifies if its inline style sets
 * display to 'block' or if its tag name is listed in BLOCK_ELEMENTS_.
 * @param {!Element} element
 * @return {boolean}
 * @private
 */
goog.html.textExtractor.isBlockElement_ = function(element) {
  // The inline style is checked first, as in the original short-circuit order.
  if (element.style.display == 'block') {
    return true;
  }
  var blockTags = goog.html.textExtractor.BLOCK_ELEMENTS_;
  return Object.prototype.hasOwnProperty.call(blockTags, element.tagName);
};
/**
 * Reports whether the text extractor can run in the current browser. The
 * extractor relies on the HTML Sanitizer, which supports IE only from
 * version 10 onwards; every non-IE browser is reported as supported.
 * Visible for testing.
 * @return {boolean}
 * @package
 */
goog.html.textExtractor.isSupported = function() {
  if (goog.userAgent.IE) {
    return goog.userAgent.isVersionOrHigher(10);
  }
  return true;
};