// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
#include "third_party/blink/renderer/modules/content_extraction/document_chunker.h"
#include "third_party/blink/renderer/core/dom/node_traversal.h"
#include "third_party/blink/renderer/core/dom/text.h"
#include "third_party/blink/renderer/core/frame/local_frame.h"
#include "third_party/blink/renderer/core/html/html_element.h"
#include "third_party/blink/renderer/core/html/html_iframe_element.h"
#include "third_party/blink/renderer/core/html_names.h"
#include "third_party/blink/renderer/core/svg/svg_element.h"
#include "third_party/blink/renderer/core/svg_names.h"
#include "third_party/blink/renderer/platform/wtf/text/character_visitor.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
namespace blink {
namespace {
// Checks for excluded tags. Text within these will be excluded from passages.
bool IsExcludedElement(const Node& node) {
const Element* element = DynamicTo<Element>(node);
if (!element) {
return false;
}
return element->HasTagName(html_names::kNoscriptTag) ||
element->HasTagName(html_names::kScriptTag) ||
element->HasTagName(html_names::kStyleTag) ||
element->HasTagName(svg_names::kDefsTag) ||
element->HasTagName(svg_names::kStyleTag) ||
element->HasTagName(svg_names::kScriptTag);
}
// Checks for tags that indicate a section break. Sibling nodes will not be
// greedily aggregated into a chunk across one of these tags.
bool IsSectionBreak(const Node& node) {
const HTMLElement* html_element = DynamicTo<HTMLElement>(node);
if (!html_element) {
return false;
}
return html_element->HasTagName(html_names::kArticleTag) ||
html_element->HasTagName(html_names::kBrTag) ||
html_element->HasTagName(html_names::kDivTag) ||
html_element->HasTagName(html_names::kH1Tag) ||
html_element->HasTagName(html_names::kH2Tag) ||
html_element->HasTagName(html_names::kH3Tag) ||
html_element->HasTagName(html_names::kH4Tag) ||
html_element->HasTagName(html_names::kH5Tag) ||
html_element->HasTagName(html_names::kH6Tag) ||
html_element->HasTagName(html_names::kHrTag) ||
html_element->HasTagName(html_names::kFooterTag) ||
html_element->HasTagName(html_names::kHeaderTag) ||
html_element->HasTagName(html_names::kMainTag) ||
html_element->HasTagName(html_names::kNavTag);
}
} // namespace
// Returns whether the content of `iframe_element` should be traversed for
// content extraction: only non-ad, same-origin local frames whose document
// has a body are included.
bool ShouldContentExtractionIncludeIFrame(const HTMLIFrameElement& iframe_element) {
  // Ad-related frames are never extracted.
  if (iframe_element.IsAdRelated()) {
    return false;
  }
  // Remote (out-of-process) or cross-origin frames are not accessible here.
  auto* content_frame = DynamicTo<LocalFrame>(iframe_element.ContentFrame());
  if (!content_frame) {
    return false;
  }
  if (content_frame->IsCrossOriginToParentOrOuterDocument()) {
    return false;
  }
  // A document without a body has no extractable content.
  return content_frame->GetDocument()->body() != nullptr;
}
// Constructs a chunker with the given sizing policy.
// `max_words_per_aggregate_passage`: word cap when merging a node with its
//   descendants (or siblings) into one passage.
// `greedily_aggregate_sibling_nodes`: if true, consecutive sibling nodes are
//   merged into a single passage until a section break or the word cap.
// `max_passages`: hard cap on the number of passages produced; 0 = unlimited.
// `min_words_per_passage`: passages with fewer words are dropped.
DocumentChunker::DocumentChunker(size_t max_words_per_aggregate_passage,
                                 bool greedily_aggregate_sibling_nodes,
                                 uint32_t max_passages,
                                 uint32_t min_words_per_passage)
    : max_words_per_aggregate_passage_(max_words_per_aggregate_passage),
      greedily_aggregate_sibling_nodes_(greedily_aggregate_sibling_nodes),
      max_passages_(max_passages),
      min_words_per_passage_(min_words_per_passage) {}
// Chunks the DOM subtree rooted at `tree` into a list of text passages.
Vector<String> DocumentChunker::Chunk(const Node& tree) {
  AggregateNode root = ProcessNode(tree, 0, 0);
  // If traversal produced no passages, fall back to one passage covering the
  // whole tree (still subject to the minimum word count).
  if (root.passage_list.passages.empty()) {
    root.passage_list.AddPassageForNode(root, min_words_per_passage_);
  }
  Vector<String> passages(root.passage_list.passages);
  // Enforce the passage-count cap; max_passages_ == 0 means unlimited.
  if (max_passages_ != 0 && passages.size() > max_passages_) {
    passages.Shrink(max_passages_);
  }
  // NOTE: passages are intentionally not truncated by character count here.
  // The earlier per-passage Truncate(1024) silently dropped content, could
  // split a word (or a UTF-16 surrogate pair) mid-way, and contradicted the
  // word-based sizing policy (max_words_per_aggregate_passage_ /
  // min_words_per_passage_) that governs passage length everywhere else.
  return passages;
}
// Recursively walks `node`'s subtree, returning an AggregateNode holding the
// subtree's text segments and word count. When limits force descendants into
// their own passages, those finished passages are carried back up in the
// returned node's passage_list. `depth` caps recursion; `passage_count` is
// the number of passages already emitted, used to enforce max_passages_
// during traversal.
DocumentChunker::AggregateNode DocumentChunker::ProcessNode(
    const Node& node,
    int depth,
    uint32_t passage_count) {
  if (depth > 96 || (max_passages_ != 0 && passage_count >= max_passages_)) {
    // Limit processing of deep trees, and passages beyond the max.
    return {};
  }
  AggregateNode current_node;
  if (IsExcludedElement(node) || node.getNodeType() == Node::kCommentNode) {
    // Exclude text within these nodes.
    return current_node;
  }
  // Eligible same-origin iframes are traversed as if their document were
  // inlined at this point in the tree.
  if (const HTMLIFrameElement* iframe = DynamicTo<HTMLIFrameElement>(&node)) {
    if (!ShouldContentExtractionIncludeIFrame(*iframe)) {
      return current_node;
    }
    const LocalFrame* local_frame = To<LocalFrame>(iframe->ContentFrame());
    return ProcessNode(*local_frame->GetDocument(), depth + 1, passage_count);
  }
  // Leaf case: a text node contributes one whitespace-simplified segment and
  // its word count (words approximated as space-separated runs).
  if (const Text* text = DynamicTo<Text>(node)) {
    String simplified_text = text->data().SimplifyWhiteSpace();
    if (!simplified_text.empty()) {
      current_node.num_words =
          WTF::VisitCharacters(simplified_text, [](auto chars) {
            return std::count(chars.begin(), chars.end(), ' ') + 1;
          });
      current_node.segments.push_back(simplified_text);
    }
    return current_node;
  }
  // Will hold the aggregate of this node and all its unchunked descendants
  // after we've recursed over all of its children.
  AggregateNode current_aggregating_node;
  // As above, but this holds the current greedy aggregate, which can be reset
  // when starting a new greedy aggregate passage (if the current greedy
  // aggregate is over max words, we hit a section break, or we hit a node
  // that is already part of another passage).
  AggregateNode current_greedy_aggregating_node;
  // Indicates whether we should attempt to aggregate the node being processed
  // in this function with its children. We only attempt to aggregate if we
  // can include all of its descendants in the aggregate.
  bool should_aggregate_current_node = true;
  // Will hold a list of descendant passages that should be added to this
  // current_node.passage_list if we do not end up aggregating the
  // current_node into a passage with its descendants.
  PassageList passage_list;
  for (const Node& child : NodeTraversal::ChildrenOf(node)) {
    AggregateNode child_node = ProcessNode(
        child, depth + 1, passage_count + passage_list.passages.size());
    if (!child_node.passage_list.passages.empty()) {
      // The child already finalized its own passages, so this node cannot be
      // aggregated whole; flush any in-progress greedy aggregate first so
      // ordering of passages matches document order.
      should_aggregate_current_node = false;
      if (greedily_aggregate_sibling_nodes_) {
        passage_list.AddPassageForNode(current_greedy_aggregating_node,
                                       min_words_per_passage_);
        current_greedy_aggregating_node = AggregateNode();
      }
      passage_list.Extend(child_node.passage_list);
    } else {
      current_aggregating_node.AddNode(child_node);
      if (greedily_aggregate_sibling_nodes_) {
        // Merge consecutive siblings until a section break or the word cap,
        // then emit the accumulated aggregate and start a new one.
        if (!IsSectionBreak(child) &&
            current_greedy_aggregating_node.Fits(
                child_node, max_words_per_aggregate_passage_)) {
          current_greedy_aggregating_node.AddNode(child_node);
        } else {
          passage_list.AddPassageForNode(current_greedy_aggregating_node,
                                         min_words_per_passage_);
          current_greedy_aggregating_node = child_node;
        }
      } else {
        passage_list.AddPassageForNode(child_node, min_words_per_passage_);
      }
    }
  }
  // Flush the trailing greedy aggregate, if any.
  if (greedily_aggregate_sibling_nodes_) {
    passage_list.AddPassageForNode(current_greedy_aggregating_node,
                                   min_words_per_passage_);
  }
  // If we should not or cannot aggregate this node, add passages for this
  // node and its descendant passages.
  if (!should_aggregate_current_node ||
      !current_node.Fits(current_aggregating_node,
                         max_words_per_aggregate_passage_)) {
    current_node.passage_list.AddPassageForNode(current_node,
                                                min_words_per_passage_);
    current_node.passage_list.Extend(passage_list);
    return current_node;
  }
  // Add this node to the aggregate.
  current_node.AddNode(current_aggregating_node);
  return current_node;
}
// Appends a passage built from `node`, unless the node is below the minimum
// word count or produces an empty string.
void DocumentChunker::PassageList::AddPassageForNode(
    const AggregateNode& node,
    size_t min_words_per_passage) {
  // Fragments shorter than the minimum are dropped entirely.
  if (node.num_words < min_words_per_passage) {
    return;
  }
  if (String passage = node.CreatePassage(); !passage.empty()) {
    passages.push_back(std::move(passage));
  }
}
void DocumentChunker::PassageList::Extend(const PassageList& passage_list) {
passages.AppendVector(passage_list.passages);
}
bool DocumentChunker::AggregateNode::Fits(const AggregateNode& node,
size_t max_words) {
return num_words + node.num_words <= max_words;
}
// Merges `node` into this aggregate: its segments are appended and its word
// count added.
void DocumentChunker::AggregateNode::AddNode(const AggregateNode& node) {
  segments.AppendVector(node.segments);
  num_words += node.num_words;
}
// Joins the accumulated text segments into one passage, separated by single
// spaces. Returns a null String when there are no segments.
String DocumentChunker::AggregateNode::CreatePassage() const {
  if (segments.empty()) {
    return String();
  }
  StringBuilder builder;
  bool needs_separator = false;
  for (const String& segment : segments) {
    if (needs_separator) {
      builder.Append(' ');
    }
    builder.Append(segment);
    needs_separator = true;
  }
  return builder.ReleaseString();
}
} // namespace blink