File: document_metadata_extractor.cc

package info (click to toggle)
chromium 139.0.7258.138-1
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid
  • size: 6,120,676 kB
  • sloc: cpp: 35,100,869; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (330 lines) | stat: -rw-r--r-- 11,501 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
// Copyright 2017 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/blink/renderer/modules/document_metadata/document_metadata_extractor.h"

#include <algorithm>
#include <memory>
#include <utility>

#include "base/notreached.h"
#include "components/schema_org/common/metadata.mojom-blink.h"
#include "third_party/blink/public/mojom/document_metadata/document_metadata.mojom-blink.h"
#include "third_party/blink/renderer/core/dom/document.h"
#include "third_party/blink/renderer/core/dom/element_traversal.h"
#include "third_party/blink/renderer/core/frame/local_frame.h"
#include "third_party/blink/renderer/core/html/html_element.h"
#include "third_party/blink/renderer/core/html_names.h"
#include "third_party/blink/renderer/platform/instrumentation/tracing/trace_event.h"
#include "third_party/blink/renderer/platform/json/json_parser.h"
#include "third_party/blink/renderer/platform/json/json_values.h"
#include "third_party/blink/renderer/platform/wtf/text/atomic_string.h"
#include "third_party/blink/renderer/platform/wtf/text/string_builder.h"
#include "third_party/blink/renderer/platform/wtf/vector.h"

namespace blink {

namespace {

// Shorthand for the mojom types this file builds: the WebPage result and the
// Entity / Property / Values tree stored inside it.
using mojom::blink::WebPage;
using mojom::blink::WebPagePtr;
using schema_org::mojom::blink::Entity;
using schema_org::mojom::blink::EntityPtr;
using schema_org::mojom::blink::Property;
using schema_org::mojom::blink::PropertyPtr;
using schema_org::mojom::blink::Values;
using schema_org::mojom::blink::ValuesPtr;

// App Indexing enforces a max nesting depth of 5. Our top level message
// corresponds to the WebPage, so this only leaves 4 more levels. We will parse
// entities up to this depth, and ignore any further nesting. If an object at
// the max nesting depth has a property corresponding to an entity, that
// property will be dropped. Note that we will still parse json-ld blocks deeper
// than this, but it won't be passed to App Indexing.
constexpr int kMaxDepth = 4;
// Some strings are very long, and we don't currently use those, so limit string
// length to something reasonable to avoid undue pressure on Icing. Note that
// App Indexing supports strings up to length 20k.
constexpr wtf_size_t kMaxStringLength = 200;
// Enforced by App Indexing, so stop processing early if possible.
constexpr wtf_size_t kMaxNumFields = 20;
// Enforced by App Indexing, so stop processing early if possible.
constexpr wtf_size_t kMaxRepeatedSize = 100;

// JSON-LD keys that get special treatment in this file: "@type" supplies an
// entity's type name, and "@graph" on a top-level object is read as an array
// of further top-level entities.
constexpr char kJSONLDKeyType[] = "@type";
constexpr char kJSONLDKeyGraph[] = "@graph";
// Returns true if |type| names one of the entity types that are extracted at
// the top level. A null |type| is never supported.
bool IsSupportedType(AtomicString type) {
  DEFINE_STATIC_LOCAL(HashSet<AtomicString>, supported_types,
                      ({
                          // Common types that include addresses.
                          AtomicString("AutoDealer"),
                          AtomicString("Hotel"),
                          AtomicString("LocalBusiness"),
                          AtomicString("Organization"),
                          AtomicString("Person"),
                          AtomicString("Place"),
                          AtomicString("PostalAddress"),
                          AtomicString("Product"),
                          AtomicString("Residence"),
                          AtomicString("Restaurant"),
                          AtomicString("SingleFamilyResidence"),
                          // Common types including phone numbers
                          AtomicString("Store"),
                          AtomicString("ContactPoint"),
                          AtomicString("LodgingBusiness"),
                      }));
  if (!type) {
    // Null strings are rejected before the set is consulted.
    return false;
  }
  return supported_types.Contains(type);
}

// Forward declaration: ParseRepeatedValue() and ExtractEntity() call each
// other, so one of them has to be declared before it is defined.
void ExtractEntity(const JSONObject&, int recursion_level, Entity&);

bool ParseRepeatedValue(const JSONArray& arr,
                        int recursion_level,
                        ValuesPtr& values) {
  if (arr.size() < 1) {
    return false;
  }

  const JSONValue::ValueType type = arr.at(0)->GetType();
  switch (type) {
    case JSONValue::ValueType::kTypeNull:
      return false;
    case JSONValue::ValueType::kTypeBoolean:
      values = Values::NewBoolValues({});
      break;
    case JSONValue::ValueType::kTypeInteger:
      values = Values::NewLongValues({});
      break;
    // App Indexing doesn't support double type, so just encode its decimal
    // value as a string instead.
    case JSONValue::ValueType::kTypeDouble:
    case JSONValue::ValueType::kTypeString:
      values = Values::NewStringValues({});
      break;
    case JSONValue::ValueType::kTypeObject:
      if (recursion_level + 1 >= kMaxDepth) {
        return false;
      }
      values = Values::NewEntityValues({});
      break;
    case JSONArray::ValueType::kTypeArray:
      // App Indexing doesn't support nested arrays.
      return false;
  }

  const wtf_size_t arr_size = std::min(arr.size(), kMaxRepeatedSize);
  for (wtf_size_t i = 0; i < arr_size; ++i) {
    const JSONValue* const element = arr.at(i);
    if (element->GetType() != type) {
      // App Indexing doesn't support mixed types. If there are mixed
      // types in the parsed object, we will drop the property.
      return false;
    }
    switch (type) {
      case JSONValue::ValueType::kTypeBoolean: {
        bool v;
        element->AsBoolean(&v);
        values->get_bool_values().push_back(v);
        continue;
      }
      case JSONValue::ValueType::kTypeInteger: {
        int v;
        element->AsInteger(&v);
        values->get_long_values().push_back(v);
        continue;
      }
      case JSONValue::ValueType::kTypeDouble: {
        // App Indexing doesn't support double type, so just encode its decimal
        // value as a string instead.
        double v;
        element->AsDouble(&v);
        String s = String::Number(v);
        s.Truncate(kMaxStringLength);
        values->get_string_values().push_back(s);
        continue;
      }
      case JSONValue::ValueType::kTypeString: {
        String v;
        element->AsString(&v);
        v.Truncate(kMaxStringLength);
        values->get_string_values().push_back(v);
        continue;
      }
      case JSONValue::ValueType::kTypeObject: {
        auto entity = Entity::New();
        ExtractEntity(*(JSONObject::Cast(element)), recursion_level + 1,
                      *entity);
        values->get_entity_values().push_back(std::move(entity));
        continue;
      }
      case JSONValue::ValueType::kTypeNull:
      case JSONValue::ValueType::kTypeArray:
        NOTREACHED();
    }
  }
  return true;
}

void ExtractEntity(const JSONObject& val, int recursion_level, Entity& entity) {
  if (recursion_level >= kMaxDepth) {
    return;
  }

  String type;
  val.GetString(kJSONLDKeyType, &type);
  if (!type) {
    type = "Thing";
  }
  entity.type = type;
  for (wtf_size_t i = 0; i < std::min(val.size(), kMaxNumFields); ++i) {
    PropertyPtr property = Property::New();
    const JSONObject::Entry& entry = val.at(i);
    property->name = entry.first;
    if (property->name == kJSONLDKeyType) {
      continue;
    }

    bool add_property = true;

    switch (entry.second->GetType()) {
      case JSONValue::ValueType::kTypeBoolean: {
        bool v;
        val.GetBoolean(entry.first, &v);
        property->values = Values::NewBoolValues({v});
      } break;
      case JSONValue::ValueType::kTypeInteger: {
        int v;
        val.GetInteger(entry.first, &v);
        property->values = Values::NewLongValues({v});
      } break;
      case JSONValue::ValueType::kTypeDouble: {
        double v;
        val.GetDouble(entry.first, &v);
        String s = String::Number(v);
        s.Truncate(kMaxStringLength);
        property->values = Values::NewStringValues({s});
      } break;
      case JSONValue::ValueType::kTypeString: {
        String v;
        val.GetString(entry.first, &v);
        v.Truncate(kMaxStringLength);
        property->values = Values::NewStringValues({v});
      } break;
      case JSONValue::ValueType::kTypeObject: {
        if (recursion_level + 1 >= kMaxDepth) {
          add_property = false;
          break;
        }
        Vector<EntityPtr> entities;
        entities.push_back(Entity::New());
        ExtractEntity(*(val.GetJSONObject(entry.first)), recursion_level + 1,
                      *entities[0]);
        property->values = Values::NewEntityValues(std::move(entities));
      } break;
      case JSONValue::ValueType::kTypeArray:
        add_property = ParseRepeatedValue(*(val.GetArray(entry.first)),
                                          recursion_level, property->values);
        break;
      case JSONValue::ValueType::kTypeNull:
        add_property = false;
        break;
    }
    if (add_property)
      entity.properties.push_back(std::move(property));
  }
}

void ExtractTopLevelEntity(const JSONObject& val, Vector<EntityPtr>& entities) {
  // Now we have a JSONObject which corresponds to a single (possibly nested)
  // entity.
  EntityPtr entity = Entity::New();
  String type;
  val.GetString(kJSONLDKeyType, &type);
  if (!IsSupportedType(AtomicString(type))) {
    return;
  }
  ExtractEntity(val, 0, *entity);
  entities.push_back(std::move(entity));
}

void ExtractEntitiesFromArray(const JSONArray& arr,
                              Vector<EntityPtr>& entities) {
  for (wtf_size_t i = 0; i < arr.size(); ++i) {
    const JSONValue* val = arr.at(i);
    if (val->GetType() == JSONValue::ValueType::kTypeObject) {
      ExtractTopLevelEntity(*(JSONObject::Cast(val)), entities);
    }
  }
}

void ExtractEntityFromTopLevelObject(const JSONObject& val,
                                     Vector<EntityPtr>& entities) {
  const JSONArray* graph = val.GetArray(kJSONLDKeyGraph);
  if (graph) {
    ExtractEntitiesFromArray(*graph, entities);
  }
  ExtractTopLevelEntity(val, entities);
}

// Result of scanning a document for JSON-LD metadata.
//
// These values are persisted to logs. Entries should not be renumbered and
// numeric values should never be reused.
enum ExtractionStatus {
  // At least one entity was extracted.
  kOK = 0,
  // The scan finished without extracting any entity.
  kEmpty = 1,
  // A JSON-LD script block could not be parsed as JSON.
  kParseFailure = 2,
  // A JSON-LD script block parsed to something other than an array or object.
  kWrongType = 3,
  kMaxValue = kWrongType,
};

// Visits every descendant of |root|, parses the text of each
// <script type="application/ld+json"> element as JSON and appends the entities
// found in it to |entities|.
//
// Returns kParseFailure as soon as one such script fails to parse, and
// kWrongType as soon as one parses to something other than an array or an
// object. In both cases entities appended from earlier scripts stay in
// |entities|. Otherwise returns kEmpty when |entities| is empty after the
// walk, and kOK when it is not.
ExtractionStatus ExtractMetadata(const Element& root,
                                 Vector<EntityPtr>& entities) {
  for (Element& element : ElementTraversal::DescendantsOf(root)) {
    const bool is_json_ld_script =
        element.HasTagName(html_names::kScriptTag) &&
        element.FastGetAttribute(html_names::kTypeAttr) ==
            "application/ld+json";
    if (!is_json_ld_script) {
      continue;
    }

    // TODO(crbug.com/1264024): Deprecate JSON comments here, if possible.
    std::unique_ptr<JSONValue> parsed =
        ParseJSONWithCommentsDeprecated(element.textContent());
    if (!parsed) {
      LOG(ERROR) << "Failed to parse json.";
      return ExtractionStatus::kParseFailure;
    }

    const JSONValue::ValueType root_type = parsed->GetType();
    if (root_type == JSONValue::ValueType::kTypeArray) {
      ExtractEntitiesFromArray(*(JSONArray::Cast(parsed.get())), entities);
    } else if (root_type == JSONValue::ValueType::kTypeObject) {
      ExtractEntityFromTopLevelObject(*(JSONObject::Cast(parsed.get())),
                                      entities);
    } else {
      return ExtractionStatus::kWrongType;
    }
  }

  return entities.empty() ? ExtractionStatus::kEmpty : ExtractionStatus::kOK;
}

}  // namespace

// Builds a WebPage holding the document's URL, title and the entities
// extracted from its JSON-LD script blocks.
//
// Returns nullptr when the document has no frame, the frame is not the main
// frame, the document has no document element, or extraction ends with any
// status other than kOK.
WebPagePtr DocumentMetadataExtractor::Extract(const Document& document) {
  TRACE_EVENT0("blink", "DocumentMetadataExtractor::Extract");

  const LocalFrame* const frame = document.GetFrame();
  if (!frame || !frame->IsMainFrame()) {
    return nullptr;
  }

  Element* const root_element = document.documentElement();
  if (!root_element) {
    return nullptr;
  }

  WebPagePtr page = WebPage::New();

  // Traverse the DOM tree and collect the metadata into the page.
  if (ExtractMetadata(*root_element, page->entities) !=
      ExtractionStatus::kOK) {
    return nullptr;
  }

  page->url = document.Url();
  page->title = document.title();
  return page;
}

}  // namespace blink