File: script_run_iterator.cc

package info (click to toggle)
chromium 138.0.7204.183-1
  • links: PTS, VCS
  • area: main
  • in suites: trixie
  • size: 6,071,908 kB
  • sloc: cpp: 34,937,088; ansic: 7,176,967; javascript: 4,110,704; python: 1,419,953; asm: 946,768; xml: 739,971; pascal: 187,324; sh: 89,623; perl: 88,663; objc: 79,944; sql: 50,304; cs: 41,786; fortran: 24,137; makefile: 21,806; php: 13,980; tcl: 13,166; yacc: 8,925; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (513 lines) | stat: -rw-r--r-- 18,870 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
// Copyright 2015 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#include "third_party/blink/renderer/platform/fonts/script_run_iterator.h"

#include <algorithm>

#include "base/compiler_specific.h"
#include "base/containers/contains.h"
#include "base/logging.h"
#include "base/notreached.h"
#include "third_party/blink/renderer/platform/runtime_enabled_features.h"
#include "third_party/blink/renderer/platform/text/icu_error.h"
#include "third_party/blink/renderer/platform/wtf/text/character_names.h"
#include "third_party/blink/renderer/platform/wtf/threading.h"

namespace blink {

namespace {

// UScriptCode and OpenType script are not 1:1; specifically, both Hiragana and
// Katakana map to 'kana' in OpenType. They will be mapped correctly in
// HarfBuzz, but normalizing earlier helps to reduce splitting runs between
// these scripts.
// https://docs.microsoft.com/en-us/typography/opentype/spec/scripttags
inline UScriptCode GetScriptForOpenType(UChar32 ch, UErrorCode* status) {
  const UScriptCode icu_script = uscript_getScript(ch, status);
  // On an ICU failure, hand back ICU's value untouched.
  if (U_FAILURE(*status)) [[unlikely]] {
    return icu_script;
  }
  // Both Katakana flavours are folded into Hiragana; every other script is
  // passed through as-is.
  switch (icu_script) {
    case USCRIPT_KATAKANA:
    case USCRIPT_KATAKANA_OR_HIRAGANA:
      return USCRIPT_HIRAGANA;
    default:
      return icu_script;
  }
}

// True for the scripts this file treats as "Han": Han itself, Hiragana and
// Bopomofo.
inline bool IsHanScript(UScriptCode script) {
  switch (script) {
    case USCRIPT_HAN:
    case USCRIPT_HIRAGANA:
    case USCRIPT_BOPOMOFO:
      return true;
    default:
      return false;
  }
}

// Returns the first entry of |list| for which IsHanScript() holds, or
// USCRIPT_INVALID_CODE when the list has no such entry.
inline UScriptCode FirstHanScript(
    const ScriptRunIterator::UScriptCodeList& list) {
  for (const UScriptCode candidate : list) {
    if (IsHanScript(candidate)) {
      return candidate;
    }
  }
  return USCRIPT_INVALID_CODE;
}

// Asks ICU for the script extensions of |kLeftCornerBracket| and returns them.
// A failing ICU call is treated as unreachable.
ScriptRunIterator::UScriptCodeList GetHanScriptExtensions() {
  ICUError status;
  ScriptRunIterator::UScriptCodeList han_list;
  han_list.resize(ScriptRunIterator::kMaxScriptCount - 1);
  // The queried code point is a CJK punctuation character from the CJK
  // Symbols and Punctuation block.
  const int num_scripts = uscript_getScriptExtensions(
      kLeftCornerBracket, &han_list[0], han_list.size(), &status);
  if (U_FAILURE(status)) {
    NOTREACHED();
  }
  DCHECK_GT(num_scripts, 0);
  // Trim the buffer down to the entries ICU actually reported.
  han_list.resize(num_scripts);
  return han_list;
}

// This function updates the script list to the Han ideographic-based scripts if
// the East Asian Width property[1] indicates it is an East Asian character.
//
// Most East Asian punctuation characters have East Asian scripts in the script
// extensions. However, not all of them are so. For example, when they are
// halfwidth/fullwidth forms, they must have the same properties as their
// canonical equivalent[2] code points that are not East Asian. Such code points
// can split runs in the middle of consecutive CJK punctuation characters when
// they are preceded by non-CJK characters, and prevent applying font features
// to consecutive CJK punctuation characters.
//
// TODO(crbug.com/1273998): This function is not needed if Unicode changes the
// script extension for these code points.
//
// [1]: https://www.unicode.org/reports/tr11/
// [2]: https://unicode.org/reports/tr15/#Canon_Compat_Equivalence
void FixScriptsByEastAsianWidth(UChar32 ch,
                                ScriptRunIterator::UScriptCodeList* set) {
  DCHECK(!set->empty());
  // Only a list holding nothing but `COMMON` is eligible for replacement.
  // Any other list is expected not to contain `COMMON` at all.
  const bool is_common_only =
      set->size() <= 1 && set->front() == USCRIPT_COMMON;
  if (!is_common_only) {
    DCHECK(!set->Contains(USCRIPT_COMMON));
    return;
  }

  // A character counts as East Asian when its EAW property is W, F, or H.
  // https://www.unicode.org/reports/tr11/#Set_Relations
  switch (static_cast<UEastAsianWidth>(
      u_getIntPropertyValue(ch, UCHAR_EAST_ASIAN_WIDTH))) {
    case U_EA_WIDE:
    case U_EA_FULLWIDTH:
    case U_EA_HALFWIDTH:
      break;
    default:
      return;
  }

  // Swap in the Han ideographic script list, i.e. the script extensions of
  // U+300C in https://www.unicode.org/Public/UNIDATA/ScriptExtensions.txt.
  DEFINE_STATIC_LOCAL(ScriptRunIterator::UScriptCodeList, han_scripts,
                      (GetHanScriptExtensions()));
  if (han_scripts.empty()) [[unlikely]] {
    // Replacing |set| with an empty list would only crash later on, where the
    // cause is much harder to analyze, so stop right here instead.
    NOTREACHED();
  }
  set->Shrink(0);
  set->AppendVector(han_scripts);
}

}  // namespace

// Shorthand for the bracket-type enum declared inside ScriptData.
using PairedBracketType = ScriptData::PairedBracketType;

ScriptData::~ScriptData() = default;

// Fills |dst| with the scripts ICU associates with |ch|: the script extensions
// combined with the primary script (Katakana already folded into Hiragana by
// GetScriptForOpenType()). The entries are reordered so that the head of the
// list is the script the branches below pick as preferred. If ICU reports an
// error other than a buffer overflow, |dst| is left empty.
void ICUScriptData::GetScripts(UChar32 ch, UScriptCodeList& dst) const {
  ICUError status;
  // Leave room to insert primary script. It's not strictly necessary but
  // it ensures that the result won't ever be greater than kMaxScriptCount,
  // which some client someday might expect.
  dst.resize(kMaxScriptCount - 1);
  // Note, ICU convention is to return the number of available items
  // regardless of the capacity passed to the call. So count can be greater
  // than dst.size(), if a later version of the Unicode data has more
  // than kMaxScriptCount items.

  // The result of |uscript_getScriptExtensions| does not need to be collated
  // to USCRIPT_HIRAGANA because when ScriptExtensions contains Kana, it
  // contains Hira as well, and Hira is always before Kana.
  int count = uscript_getScriptExtensions(ch, &dst[0], dst.size(), &status);
  if (status == U_BUFFER_OVERFLOW_ERROR) {
    // Allow this, we'll just use what we have: clamp |count| to the buffer
    // size and reset |status| so the failure check below ignores the overflow.
    DLOG(ERROR) << "Exceeded maximum script count of " << kMaxScriptCount
                << " for 0x" << std::hex << ch;
    count = dst.size();
    status = U_ZERO_ERROR;
  }
  UScriptCode primary_script = GetScriptForOpenType(ch, &status);

  if (U_FAILURE(status)) {
    // An empty |dst| is how a failure is reported to the caller.
    DLOG(ERROR) << "Could not get icu script data: " << status << " for 0x"
                << std::hex << ch;
    dst.clear();
    return;
  }

  // Drop the unused tail of the buffer.
  dst.resize(count);

  if (primary_script == dst.at(0)) {
    // Only one script (might be common or inherited -- these are never in
    // the extensions unless they're the only script), or extensions are in
    // priority order already.
    return;
  }

  if (primary_script != USCRIPT_INHERITED && primary_script != USCRIPT_COMMON &&
      primary_script != USCRIPT_INVALID_CODE) {
    // Not common or inherited, with extensions that are not in order. We know
    // the primary, so we insert it at the front and swap the previous front
    // to somewhere else in the list.
    auto it =
        std::find(UNSAFE_TODO(dst.begin() + 1), dst.end(), primary_script);
    if (it == dst.end()) {
      // The primary is not among the extensions: append it, then swap it with
      // the head.
      dst.push_back(primary_script);
      std::swap(dst.front(), dst.back());
    } else {
      // The primary is already listed: exchange it with the head.
      std::swap(*dst.begin(), *it);
    }
    return;
  }

  if (primary_script == USCRIPT_COMMON) {
    if (count == 1) {
      // Common with a preferred script. Keep common at head.
      dst.push_front(primary_script);
      return;
    }

    // Ignore common. Find the preferred script of the multiple scripts that
    // remain, and ensure it is at the head. Just keep swapping them in,
    // there aren't likely to be many. An entry replaces the head when the
    // head is Latin or when the entry has a smaller script code.
    for (wtf_size_t i = 1; i < dst.size(); ++i) {
      if (dst.at(0) == USCRIPT_LATIN || dst.at(i) < dst.at(0)) {
        std::swap(dst.at(0), dst.at(i));
      }
    }
    return;
  }

  // The primary is inherited, and there are other scripts. Put inherited at
  // the front, the true primary next, and then the others in random order.
  // Note that USCRIPT_INVALID_CODE also reaches this point, because the
  // branches above only handle common and "real" primary scripts.
  // TODO: Take into account the language of a document if available.
  // Otherwise, use Unicode block as a tie breaker. Comparing
  // ScriptCodes as integers is not meaningful because 'old' scripts are
  // just sorted in alphabetic order.
  dst.push_back(dst.at(0));
  dst.at(0) = primary_script;
  // Same selection rule as for common, applied to slot 1 instead of slot 0.
  for (wtf_size_t i = 2; i < dst.size(); ++i) {
    if (dst.at(1) == USCRIPT_LATIN || dst.at(i) < dst.at(1)) {
      std::swap(dst.at(1), dst.at(i));
    }
  }
}

// Forwards to ICU's u_getBidiPairedBracket() for |ch|.
UChar32 ICUScriptData::GetPairedBracket(UChar32 ch) const {
  const UChar32 paired = u_getBidiPairedBracket(ch);
  return paired;
}

// Reads the UCHAR_BIDI_PAIRED_BRACKET_TYPE property of |ch| from ICU and
// converts it to PairedBracketType.
PairedBracketType ICUScriptData::GetPairedBracketType(UChar32 ch) const {
  const auto raw_type =
      u_getIntPropertyValue(ch, UCHAR_BIDI_PAIRED_BRACKET_TYPE);
  return static_cast<PairedBracketType>(raw_type);
}

// Returns a pointer to the ICUScriptData object declared below through
// DEFINE_THREAD_SAFE_STATIC_LOCAL (a function-local static, going by the
// macro name). The object is default-constructed: the argument list is empty.
const ICUScriptData* ICUScriptData::Instance() {
  DEFINE_THREAD_SAFE_STATIC_LOCAL(const ICUScriptData, icu_script_data_instance,
                                  ());
  return &icu_script_data_instance;
}

// Sets up iteration over |text|, looking up script information through
// |data|. For non-empty text the first character is decoded right away and
// its script list is stored as the look-ahead set.
ScriptRunIterator::ScriptRunIterator(base::span<const UChar> text,
                                     const ScriptData* data)
    : text_(text.data()),
      length_(base::checked_cast<wtf_size_t>(text.size())),
      brackets_fixup_depth_(0),
      next_set_(std::make_unique<UScriptCodeList>()),
      ahead_set_(std::make_unique<UScriptCodeList>()),
      // The initial value of ahead_character_ is not used.
      ahead_character_(0),
      ahead_pos_(0),
      common_preferred_(USCRIPT_COMMON),
      script_data_(data) {
  DCHECK(text.data());
  DCHECK(data);

  // Empty text: there is no first character to prime the state with.
  if (ahead_pos_ >= length_) {
    return;
  }

  // Seed current_set_ with USCRIPT_COMMON so that the very first resolution
  // between current_set_ and next_set_ in MergeSets() picks the script of the
  // first consumed character.
  current_set_.clear();
  current_set_.push_back(USCRIPT_COMMON);
  UNSAFE_TODO(U16_NEXT(text_, ahead_pos_, length_, ahead_character_));
  script_data_->GetScripts(ahead_character_, *ahead_set_);
}

// Convenience overload: delegates to the two-argument constructor, passing
// the ScriptData returned by ICUScriptData::Instance().
ScriptRunIterator::ScriptRunIterator(base::span<const UChar> text)
    : ScriptRunIterator(text, ICUScriptData::Instance()) {}

bool ScriptRunIterator::Consume(unsigned* limit, UScriptCode* script) {
  if (current_set_.empty()) {
    return false;
  }

  wtf_size_t pos;
  UChar32 ch;
  while (Fetch(&pos, &ch)) {
    PairedBracketType paired_type = script_data_->GetPairedBracketType(ch);
    switch (paired_type) {
      case PairedBracketType::kBracketTypeOpen:
        OpenBracket(ch);
        break;
      case PairedBracketType::kBracketTypeClose:
        CloseBracket(ch);
        break;
      default:
        break;
    }
    if (!MergeSets()) {
      *limit = pos;
      *script = ResolveCurrentScript();
      // If the current character is an open bracket, do not assign the resolved
      // script to it yet because it will belong to the next run.
      const bool exclude_last =
          paired_type == PairedBracketType::kBracketTypeOpen;
      FixupStack(*script, exclude_last);
      current_set_ = *next_set_;
      return true;
    }
  }

  *limit = length_;
  *script = ResolveCurrentScript();
  current_set_.clear();
  return true;
}

// Records the opening bracket |ch| on the bracket stack with a provisional
// script of USCRIPT_COMMON, and counts it towards the pending fixups.
void ScriptRunIterator::OpenBracket(UChar32 ch) {
  const bool stack_is_full = brackets_.size() == kMaxBrackets;
  if (stack_is_full) {
    // Make room by discarding the oldest entry. If every entry was awaiting a
    // fixup, one of the pending fixups disappears with it.
    brackets_.pop_front();
    if (brackets_fixup_depth_ == kMaxBrackets) {
      brackets_fixup_depth_ -= 1;
    }
  }
  FixScriptsByEastAsianWidth(ch, next_set_.get());
  brackets_.push_back(BracketRec({ch, USCRIPT_COMMON}));
  brackets_fixup_depth_ += 1;
}

// Handles the closing bracket |ch|: searches the bracket stack from the top
// for the paired opening bracket and, when one is found, makes the closing
// bracket adopt that bracket's script. Without a match, nothing changes.
void ScriptRunIterator::CloseBracket(UChar32 ch) {
  if (brackets_.empty()) {
    return;
  }

  const UChar32 opener = script_data_->GetPairedBracket(ch);
  // Counts the stack entries lying above the one currently inspected.
  int entries_above = 0;
  for (auto it = brackets_.rbegin(); it != brackets_.rend();
       ++it, ++entries_above) {
    if (it->ch != opener) {
      continue;
    }

    // Matched: start from the script recorded for the opening bracket.
    UScriptCode resolved = it->script;
    // Han languages are multi-script, and some font features apply to runs of
    // consecutive punctuation. So if the opener resolved to a Han script and
    // current_set_ offers a Han script too, prefer the latter rather than
    // insisting on the opener's script; that avoids breaking the run.
    if (IsHanScript(resolved)) {
      const UScriptCode han_in_current = FirstHanScript(current_set_);
      if (han_in_current != USCRIPT_INVALID_CODE) {
        resolved = han_in_current;
      }
    }
    if (resolved != USCRIPT_COMMON) {
      next_set_->clear();
      next_set_->push_back(resolved);
    }

    // Discard the entries that sit above the matched one.
    // TODO: No resize operation in WTF::Deque?
    for (int remaining = entries_above; remaining > 0; --remaining) {
      brackets_.pop_back();
    }
    // Shrink the fixup depth by the same amount, never going below zero.
    const int depth_left =
        static_cast<int>(brackets_fixup_depth_) - entries_above;
    brackets_fixup_depth_ = static_cast<wtf_size_t>(std::max(0, depth_left));
    return;
  }
  // No matching opener: the stack is left alone.
}

// Keep items in current_set_ that are in next_set_.
//
// If the sets are disjoint, return false and leave current_set_ unchanged. Else
// return true and make current set the intersection. Make sure to maintain
// current priority script as priority if it remains, else retain next priority
// script if it remains.
//
// Also maintain a common preferred script.  If current and next are both
// common, and there is no common preferred script and next has a preferred
// script, set the common preferred script to that of next.
bool ScriptRunIterator::MergeSets() {
  // With either set empty there is nothing to intersect; report "disjoint".
  if (next_set_->empty() || current_set_.empty()) {
    return false;
  }

  auto current_set_it = current_set_.begin();
  auto current_end = current_set_.end();
  // Most of the time, this is the only one.
  // Advance the current iterator, we won't need to check it again later.
  UScriptCode priority_script = UNSAFE_TODO(*current_set_it++);

  // If next is common or inherited, the only thing that might change
  // is the common preferred script.
  if (next_set_->at(0) <= USCRIPT_INHERITED) {
    // A two-entry next set carries a preferred script in its second slot.
    // Adopt it only when current is common/inherited as well and no common
    // preferred script has been chosen yet.
    if (next_set_->size() == 2 && priority_script <= USCRIPT_INHERITED &&
        common_preferred_ == USCRIPT_COMMON) {
      common_preferred_ = next_set_->at(1);
    }
    return true;
  }

  // If current is common or inherited, use the next script set.
  if (priority_script <= USCRIPT_INHERITED) {
    current_set_ = *next_set_;
    return true;
  }

  // Neither is common or inherited. If current is a singleton,
  // just see if it exists in the next set. This is the common case.
  bool have_priority = base::Contains(*next_set_, priority_script);
  if (current_set_it == current_end) {
    return have_priority;
  }

  // Establish the priority script, if we have one.
  // First try current priority script.
  auto next_it = next_set_->begin();
  auto next_end = next_set_->end();
  if (!have_priority) {
    // So try next priority script.
    // Skip the first current script, we already know it's not there.
    // Advance the next iterator, later we won't need to check it again.
    priority_script = UNSAFE_TODO(*next_it++);
    have_priority =
        std::find(current_set_it, current_end, priority_script) != current_end;
  }

  // Note that we can never write more scripts into the current vector than
  // it already contains, so current_write_it won't ever exceed the
  // size/capacity.
  auto current_write_it = current_set_.begin();
  if (have_priority) {
    // keep the priority script.
    UNSAFE_TODO(*current_write_it++ = priority_script);
  }

  if (next_it != next_end) {
    // Iterate over the remaining current scripts, and keep them if
    // they occur in the remaining next scripts.
    while (current_set_it != current_end) {
      UScriptCode sc = UNSAFE_TODO(*current_set_it++);
      if (std::find(next_it, next_end, sc) != next_end) {
        UNSAFE_TODO(*current_write_it++ = sc);
      }
    }
  }

  // Only change current if the run continues. |written| is the number of
  // scripts kept at the front of current_set_; zero means the sets were
  // disjoint.
  int written =
      static_cast<int>(std::distance(current_set_.begin(), current_write_it));
  if (written > 0) {
    current_set_.resize(written);
    return true;
  }
  return false;
}

// When we hit the end of the run, and resolve the script, we now know the
// resolved script of any open bracket that was pushed on the stack since
// the start of the run. Fixup depth records how many of these there
// were. We've maintained this count during pushes, and taken care to
// adjust it if the stack got overfull and open brackets were pushed off
// the bottom. This sets the script of the fixup_depth topmost entries of the
// stack to the resolved script.
void ScriptRunIterator::FixupStack(UScriptCode resolved_script,
                                   bool exclude_last) {
  wtf_size_t count = brackets_fixup_depth_;
  // |count| is unsigned, so "nothing pending" means exactly zero.
  if (count == 0) {
    return;
  }
  if (count > brackets_.size()) {
    // Should never happen unless someone breaks the code.
    DLOG(ERROR) << "Brackets fixup depth exceeds size of bracket vector.";
    count = brackets_.size();
    if (count == 0) {
      // The stack is empty, so there is no entry to update. Stop here: the
      // |exclude_last| branch below would otherwise step past rend() and wrap
      // |count| around.
      brackets_fixup_depth_ = 0;
      return;
    }
  }
  auto it = brackets_.rbegin();
  // Do not assign the script to the last one if |exclude_last|. That entry
  // stays pending, hence the remaining fixup depth of one.
  if (exclude_last) {
    ++it;
    --count;
    brackets_fixup_depth_ = 1;
  } else {
    brackets_fixup_depth_ = 0;
  }
  // Walk down from the top of the stack, stamping |count| entries.
  for (; count; ++it, --count) {
    it->script = resolved_script;
  }
}

// Hands out the look-ahead character through |*pos| and |*ch|, turns its
// script list into the "next" set, and then decodes a fresh look-ahead
// character together with its script list. Returns false when all characters
// have already been handed out, or when no scripts could be obtained for the
// newly decoded character.
bool ScriptRunIterator::Fetch(wtf_size_t* pos, UChar32* ch) {
  // |ahead_pos_| only exceeds |length_| after the last character was served.
  if (ahead_pos_ > length_) {
    return false;
  }
  // |ahead_pos_| lies just past the look-ahead character, which takes up two
  // code units when it is outside the BMP and one otherwise.
  const int code_units = ahead_character_ >= 0x10000 ? 2 : 1;
  *pos = ahead_pos_ - code_units;
  *ch = ahead_character_;

  std::swap(next_set_, ahead_set_);
  if (ahead_pos_ == length_) {
    // Nothing left to decode, but the last character still has to be
    // processed. Step |ahead_pos_| past the end so that the next call knows
    // this has happened.
    ++ahead_pos_;
    return true;
  }

  UNSAFE_TODO(U16_NEXT(text_, ahead_pos_, length_, ahead_character_));

  if (!next_set_->empty() && next_set_->front() != USCRIPT_COMMON) {
    const bool ahead_is_mark =
        (U_GET_GC_MASK(ahead_character_) & U_GC_M_MASK) != 0;
    if (ahead_is_mark &&
        RuntimeEnabledFeatures::ScriptRunIteratorCombiningMarksEnabled())
        [[unlikely]] {
      // A combining mark--whatever its Script property value--should inherit
      // the script property value of its base character.
      // https://www.unicode.org/reports/tr24/#Nonspacing_Marks
      // For `USCRIPT_COMMON` one could look for more context, but the script
      // of the combining mark may still be useful, and keeping it is backward
      // compatible.
      // https://www.unicode.org/reports/tr24/#Common
      *ahead_set_ = *next_set_;
      return true;
    }
  }

  script_data_->GetScripts(ahead_character_, *ahead_set_);
  if (ahead_set_->empty()) {
    // No scripts for this character. This has already been logged, so
    // processing of this text simply stops.
    return false;
  }

  const bool inherited_with_extensions =
      (*ahead_set_)[0] == USCRIPT_INHERITED && ahead_set_->size() > 1;
  if (!inherited_with_extensions) {
    return true;
  }
  if ((*next_set_)[0] == USCRIPT_COMMON) {
    // Replace the next set with the non-inherited portion of the ahead set.
    *next_set_ = *ahead_set_;
    next_set_->EraseAt(0);
  }
  // In both cases only the inherited entry is kept in the ahead set; the
  // remaining values are discarded because the character will inherit.
  ahead_set_->resize(1);
  return true;
}

// Returns the head of |current_set_|, except that a head of USCRIPT_COMMON is
// replaced by |common_preferred_|.
UScriptCode ScriptRunIterator::ResolveCurrentScript() const {
  const UScriptCode head = current_set_.at(0);
  if (head == USCRIPT_COMMON) {
    return common_preferred_;
  }
  return head;
}

}  // namespace blink