File: textextractor.js

package info (click to toggle)
aseba-plugin-blockly 20180211%2Bgit-2
links: PTS
area: non-free
in suites: buster
size: 64,472 kB
sloc: xml: 7,976; python: 2,314; sh: 261; lisp: 24; makefile: 10
file content (127 lines) | stat: -rw-r--r-- 4,656 bytes
parent folder | download | duplicates (8)
// Copyright 2017 The Closure Library Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS-IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


/**
 * @fileoverview Contains utility methods to extract text content from HTML.
 * @supported IE 10+, Chrome 26+, Firefox 22+, Safari 7.1+, Opera 15+
 */

goog.provide('goog.html.textExtractor');

goog.require('goog.array');
goog.require('goog.dom.TagName');
goog.require('goog.html.sanitizer.HtmlSanitizer');
goog.require('goog.object');
goog.require('goog.userAgent');


/**
 * Produces the plain-text rendition of an untrusted HTML string. The markup is
 * first run through the HtmlSanitizer and the resulting DOM tree is then walked
 * to collect its text. Unlike goog.html.utils.stripHtmlTags, this attempts to
 * emit newlines between blocks and to omit content the user would never see
 * (such as the contents of SCRIPT and STYLE tags).
 * @param {string} html The untrusted HTML string.
 * @return {string} The extracted text with leading and trailing whitespace
 *     trimmed, or the empty string if the browser is not supported.
 */
// TODO(pelizzi): consider an optional bool parameter to also extract the text
// content of alt attributes and such.
goog.html.textExtractor.extractTextContent = function(html) {
  // Unsupported browsers get an empty result instead of an exception.
  if (!goog.html.textExtractor.isSupported()) {
    return '';
  }

  // Only the style attribute survives sanitization; dropping every other
  // attribute protects against DOM clobbering, while style is kept so that
  // the display property can be consulted during block detection.
  var builder = new goog.html.sanitizer.HtmlSanitizer.Builder();
  builder = builder.onlyAllowAttributes(['style']);
  builder = builder.allowCssStyles();
  var sanitizer = builder.build();

  // The sanitizer's default policy already discards the content of tags such
  // as SCRIPT and STYLE, so their non-textual content never reaches the
  // extracted text.
  var sanitizedRoot = sanitizer.sanitizeToDomNode(html);

  // Neither textContent nor innerText spaces block elements properly, hence
  // the custom tree walk that inserts the separators itself.
  var extracted =
      goog.html.textExtractor.extractTextContentFromNode_(sanitizedRoot);
  return extracted.trim();
};


/**
 * Walks the given DOM node depth-first and returns the text it contains.
 * Text nodes have runs of whitespace collapsed to a single space and are then
 * trimmed; BR elements become a newline; elements classified as blocks have
 * their content wrapped in newlines; any other node type contributes nothing.
 * @param {!Node} node The node to extract text from.
 * @return {string} The text of the node and its descendants.
 * @private
 */
goog.html.textExtractor.extractTextContentFromNode_ = function(node) {
  var kind = node.nodeType;

  if (kind === Node.TEXT_NODE) {
    // Collapse every whitespace run, then drop leading/trailing whitespace.
    return node.nodeValue.replace(/\s+/g, ' ').trim();
  }

  if (kind !== Node.ELEMENT_NODE) {
    // Comments, processing instructions, etc. carry no visible text.
    return '';
  }

  var element = /** @type {!Element} */ (node);

  // Loose equality is kept on purpose.
  // NOTE(review): goog.dom.TagName constants may be objects that stringify to
  // the tag name in this library version; confirm before switching to ===.
  if (element.tagName == goog.dom.TagName.BR) {
    return '\n';
  }

  // Recurse into the children and concatenate their text in document order.
  var childTexts = goog.array.map(
      node.childNodes, goog.html.textExtractor.extractTextContentFromNode_);
  var text = childTexts.join('');

  // Block elements are visually separated from their siblings, so surround
  // their content with newlines.
  return goog.html.textExtractor.isBlockElement_(element) ?
      '\n' + text + '\n' :
      text;
};


/**
 * Tag names that are always treated as block elements, stored as a set (an
 * object whose keys are the tag names) so that membership can be tested with
 * hasOwnProperty.
 * @private @const {!Object<!goog.dom.TagName, boolean>}
 */
goog.html.textExtractor.BLOCK_ELEMENTS_ = goog.object.createSet(
    goog.dom.TagName.ADDRESS,
    goog.dom.TagName.BLOCKQUOTE,
    goog.dom.TagName.CENTER,
    goog.dom.TagName.DIV,
    goog.dom.TagName.DL,
    goog.dom.TagName.FIELDSET,
    goog.dom.TagName.FORM,
    goog.dom.TagName.H1,
    goog.dom.TagName.H2,
    goog.dom.TagName.H3,
    goog.dom.TagName.H4,
    goog.dom.TagName.H5,
    goog.dom.TagName.H6,
    goog.dom.TagName.HR,
    goog.dom.TagName.OL,
    goog.dom.TagName.P,
    goog.dom.TagName.PRE,
    goog.dom.TagName.TABLE,
    goog.dom.TagName.UL);


/**
 * Returns whether this is a block element, i.e. whether the browser would
 * visually separate its text content from the text content of the previous
 * node. An element qualifies if its style sets display to 'block' or if its
 * tag name is listed in BLOCK_ELEMENTS_.
 * @param {!Element} element The element to classify.
 * @return {boolean} True if the element is treated as a block.
 * @private
 */
goog.html.textExtractor.isBlockElement_ = function(element) {
  // An explicit display:block style wins regardless of the tag name.
  if (element.style.display == 'block') {
    return true;
  }
  // Otherwise fall back to the fixed list of block-level tags.
  var blockTags = goog.html.textExtractor.BLOCK_ELEMENTS_;
  return blockTags.hasOwnProperty(element.tagName);
};


/**
 * Reports whether the text extractor can run in the current browser. The
 * extractor relies on the HTML Sanitizer, which only supports IE starting
 * from version 10; every non-IE browser is considered supported.
 * Visible for testing.
 * @return {boolean} True if the extractor is usable in this browser.
 * @package
 */
goog.html.textExtractor.isSupported = function() {
  if (goog.userAgent.IE) {
    // On IE the answer depends on the version: 10 or newer is required.
    return goog.userAgent.isVersionOrHigher(10);
  }
  return true;
};