File: inner_text_builder.cc

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (142 lines) | stat: -rw-r--r-- 5,439 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
// Copyright 2023 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/blink/renderer/modules/content_extraction/inner_text_builder.h"

#include "third_party/blink/renderer/core/dom/document.h"
#include "third_party/blink/renderer/core/frame/local_frame.h"
#include "third_party/blink/renderer/core/html/html_body_element.h"
#include "third_party/blink/renderer/core/html/html_iframe_element.h"
#include "third_party/blink/renderer/modules/content_extraction/document_chunker.h"

namespace blink {

// static
// Builds the InnerTextFrame for `frame`: records the frame's token and, when
// the frame has a document with a body, fills in the segments produced by
// walking that body. Returns a frame with a token and no segments when there
// is no document or no body.
mojom::blink::InnerTextFramePtr InnerTextBuilder::Build(
    LocalFrame& frame,
    const mojom::blink::InnerTextParams& params) {
  auto inner_text_frame = mojom::blink::InnerTextFrame::New();
  inner_text_frame->token = frame.GetLocalFrameToken();
  // Guard against a frame without a document before dereferencing it;
  // InnerTextPassagesBuilder::Build() applies the same check to
  // frame.GetDocument().
  Document* document = frame.GetDocument();
  if (!document) {
    return inner_text_frame;
  }
  auto* body = document->body();
  if (!body) {
    return inner_text_frame;
  }
  // The builder appends every iframe it encounters to this vector, so it has
  // to outlive the builder.Build() call below.
  HeapVector<Member<ChildIFrame>> child_iframes;
  InnerTextBuilder builder(params, child_iframes);
  builder.Build(*body, *inner_text_frame);
  return inner_text_frame;
}

// Initializes `params_` and `child_iframes_` from the arguments. WillVisit()
// appends to `child_iframes_` and Build() iterates over it.
// NOTE(review): the member declarations live in the header; if they are
// references, both arguments must outlive this builder - confirm.
InnerTextBuilder::InnerTextBuilder(
    const mojom::blink::InnerTextParams& params,
    HeapVector<Member<ChildIFrame>>& child_iframes)
    : params_(params), child_iframes_(child_iframes) {}

// Fills `frame.segments` for `body`: text runs interleaved with one nested
// frame segment per included iframe, in the order the iframes were recorded.
void InnerTextBuilder::Build(HTMLElement& body,
                             mojom::blink::InnerTextFrame& frame) {
  // Passing `this` lets innerText() report visited nodes back to this object;
  // presumably that is what populates `child_iframes_` through WillVisit().
  const String text = body.innerText(this);
  // How much of `text` has already been emitted as segments.
  unsigned consumed = 0;
  for (auto& entry : child_iframes_) {
    const HTMLIFrameElement* element = entry->iframe;
    if (ShouldContentExtractionIncludeIFrame(*element)) {
      // Flush the text that precedes this iframe.
      AddNextNonFrameSegments(text, entry->offset, consumed, frame);

      // ShouldContentExtractionIncludeIFrame only returns true if all of the
      // CHECKed conditions below hold.
      LocalFrame* local_child = DynamicTo<LocalFrame>(element->ContentFrame());
      CHECK(local_child);
      auto* child_document = element->contentDocument();
      CHECK(child_document);
      CHECK(child_document->body());

      auto child_result = mojom::blink::InnerTextFrame::New();
      child_result->token = local_child->GetLocalFrameToken();

      // Recurse with a fresh builder and its own iframe list for the child
      // document.
      HeapVector<Member<ChildIFrame>> nested_iframes;
      InnerTextBuilder nested_builder(params_, nested_iframes);
      nested_builder.Build(*child_document->body(), *child_result);
      frame.segments.push_back(
          mojom::blink::InnerTextSegment::NewFrame(std::move(child_result)));
    }
  }
  // Emit whatever text remains after the last included iframe.
  AddNextNonFrameSegments(text, text.length(), consumed, frame);
}

// Appends to `frame` the text of `text` in [text_offset, next_child_offset),
// advancing `text_offset` as text is emitted. If a matching node location is
// pending at or before `next_child_offset`, the text is split there and a
// kStart node-location segment is inserted; the pending location is then
// cleared.
void InnerTextBuilder::AddNextNonFrameSegments(
    const String& text,
    unsigned next_child_offset,
    unsigned& text_offset,
    mojom::blink::InnerTextFrame& frame) {
  const bool node_in_range = matching_node_location_.has_value() &&
                             *matching_node_location_ <= next_child_offset;
  if (node_in_range) {
    const unsigned node_offset = *matching_node_location_;
    if (node_offset != text_offset) {
      // Text that sits between the current position and the matched node.
      const unsigned run_length = node_offset - text_offset;
      frame.segments.push_back(mojom::blink::InnerTextSegment::NewText(
          text.Substring(text_offset, run_length)));
      text_offset = node_offset;
    }
    frame.segments.push_back(mojom::blink::InnerTextSegment::NewNodeLocation(
        mojom::blink::NodeLocationType::kStart));
    matching_node_location_.reset();
  }
  if (text_offset < next_child_offset) {
    // Remaining text up to the next child (or the end of the text).
    const unsigned run_length = next_child_offset - text_offset;
    frame.segments.push_back(mojom::blink::InnerTextSegment::NewText(
        text.Substring(text_offset, run_length)));
    text_offset = next_child_offset;
  }
}

void InnerTextBuilder::WillVisit(const Node& element, unsigned offset) {
  if (const auto* iframe = DynamicTo<HTMLIFrameElement>(&element)) {
    auto* child_iframe = MakeGarbageCollected<ChildIFrame>();
    child_iframe->offset = offset;
    child_iframe->iframe = iframe;
    child_iframes_.push_back(child_iframe);
  }
  if (params_.node_id && Node::FromDomNodeId(*params_.node_id) == &element) {
    matching_node_location_ = offset;
  }
}

// Reports the `iframe` field to `visitor`. `iframe` is the only field traced
// here; `offset` is assigned a plain unsigned value in WillVisit().
// NOTE(review): presumably `iframe` is a garbage-collected handle declared in
// the header - confirm there.
void InnerTextBuilder::ChildIFrame::Trace(Visitor* visitor) const {
  visitor->Trace(iframe);
}

////////////////////////////////////////////////////////////////////////////////

// static
// Builds an InnerTextFrame for `frame` whose segments are the text passages
// DocumentChunker produces for the frame's document. Returns a frame with a
// token and no segments when the frame has no document.
mojom::blink::InnerTextFramePtr InnerTextPassagesBuilder::Build(
    LocalFrame& frame,
    const mojom::blink::InnerTextParams& params) {
  // Fallbacks used when the corresponding field of `params` is unset.
  constexpr int kDefaultMaxWordsPerAggregatePassage = 200;
  constexpr bool kDefaultGreedilyAggregateSiblingNodes = true;
  constexpr int kDefaultMinWordsPerPassage = 0;

  auto result = mojom::blink::InnerTextFrame::New();
  result->token = frame.GetLocalFrameToken();

  if (Document* document = frame.GetDocument()) {
    // Chunk the document node rather than the body: the head may hold useful
    // information such as the title.
    DocumentChunker chunker(
        params.max_words_per_aggregate_passage.value_or(
            kDefaultMaxWordsPerAggregatePassage),
        params.greedily_aggregate_sibling_nodes.value_or(
            kDefaultGreedilyAggregateSiblingNodes),
        params.max_passages,
        params.min_words_per_passage.value_or(kDefaultMinWordsPerPassage));
    auto passages = chunker.Chunk(*document);
    result->segments.ReserveInitialCapacity(passages.size());
    for (const String& passage : passages) {
      result->segments.push_back(
          mojom::blink::InnerTextSegment::NewText(passage));
    }
  }

  return result;
}

// Constructs a passages builder. `params` is accepted but not used here: the
// constructor has no initializer list and an empty body.
InnerTextPassagesBuilder::InnerTextPassagesBuilder(
    const mojom::blink::InnerTextParams& params) {}

}  // namespace blink