File: document_chunker.h

package info (click to toggle)
chromium 138.0.7204.183-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,908 kB
  • sloc: cpp: 34,937,088; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (103 lines) | stat: -rw-r--r-- 4,271 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
// Copyright 2024 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_
#define THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_

#include <cstddef>
#include <cstdint>

#include "third_party/blink/renderer/core/dom/node.h"
#include "third_party/blink/renderer/platform/wtf/text/wtf_string.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"

namespace blink {

class HTMLIFrameElement;

// Returns true if the content of `iframe_element` should be included for
// inner text or document passages.
bool ShouldContentExtractionIncludeIFrame(
    const HTMLIFrameElement& iframe_element);

// Chunks documents into text passages. Each passage contains either a single
// node of text, or the text of the node and its siblings and descendants if the
// total number of words is less than max_words_per_aggregate_passage. This is
// done by recursively walking the document tree, gathering the content of
// individual text nodes ("segments") and then aggregating these into longer
// strings ("passages"), each containing whitespace-joined segments from zero or
// more siblings and descendants.
class DocumentChunker {
 public:
  // Parameters:
  // max_words_per_aggregate_passage: Maximum number of words in a passage
  //  comprised of multiple nodes. A passage with text from only a single
  //  node may exceed this max.
  // greedily_aggregate_sibling_nodes: If true, sibling nodes are greedily
  //  aggregated into passages under max_words_per_aggregate_passage words. If
  //  false, each sibling node is output as a separate passage if they cannot
  //  all be combined into a single passage under
  //  max_words_per_aggregate_passage words.
  // max_passages: Upper bound on the number of passages produced by Chunk().
  //  Presumably enforced via the passage_count threaded through ProcessNode()
  //  — TODO confirm against the .cc definition.
  // min_words_per_passage: Minimum word count for an aggregation to be
  //  emitted as a passage (see PassageList::AddPassageForNode).
  DocumentChunker(size_t max_words_per_aggregate_passage,
                  bool greedily_aggregate_sibling_nodes,
                  uint32_t max_passages,
                  uint32_t min_words_per_passage);

  // Chunks the node and its descendants into text passages.
  // Returns a vector of text passages.
  Vector<String> Chunk(const Node& tree);

 private:
  // Forward-declared so PassageList can reference it; defined below.
  struct AggregateNode;

  // A list of finished aggregations of text segments, built from the leaves up.
  struct PassageList {
    // Creates and adds a text passage for the input node, if it is non-empty,
    // and contains more words than the given minimum.
    void AddPassageForNode(const AggregateNode& node,
                           size_t min_words_per_passage);

    // Extends this PassageList from another given |passage_list|.
    void Extend(const PassageList& passage_list);

    // Passages are completed aggregations of text segments. It is possible
    // for a single passage to exceed max_words_per_aggregate_passage but the
    // aggregation process tries to avoid it. This has an inline capacity of
    // 32 to avoid excessive per-node reallocations during the recursive walk.
    Vector<String, 32> passages;
  };

  // Contains aggregate information about a node and its descendants.
  struct AggregateNode {
    // Returns true if |node| can be added without exceeding |max_words|.
    bool Fits(const AggregateNode& node, size_t max_words);

    // Adds the input node to this AggregateNode.
    void AddNode(const AggregateNode& node);

    // Returns a text passage built from joined |segments|.
    String CreatePassage() const;

    // Segments of text that are part of this AggregateNode.
    // These are accumulated as work in progress toward creating full passages.
    Vector<String, 32> segments;

    // Total number of words in |segments|.
    size_t num_words = 0;

    // Completed passages for this node and its descendants.
    PassageList passage_list;
  };

  // Recursively processes a node and its descendants, returning early if
  // a maximum |depth| is reached.
  // |passage_count| is the number of passages accumulated before this call;
  // presumably compared against max_passages_ to cut the walk short — TODO
  // confirm against the .cc definition.
  AggregateNode ProcessNode(const Node& node,
                            int depth,
                            uint32_t passage_count);

  // Configuration captured from the constructor; see the constructor comment
  // for the semantics of each value.
  size_t max_words_per_aggregate_passage_;
  bool greedily_aggregate_sibling_nodes_;
  uint32_t max_passages_;
  uint32_t min_words_per_passage_;
};

}  // namespace blink

#endif  // THIRD_PARTY_BLINK_RENDERER_MODULES_CONTENT_EXTRACTION_DOCUMENT_CHUNKER_H_