File: sequence_matcher.h

package info (click to toggle)
chromium 139.0.7258.127-1
  • links: PTS, VCS
  • area: main
  • in suites:
  • size: 6,122,068 kB
  • sloc: cpp: 35,100,771; ansic: 7,163,530; javascript: 4,103,002; python: 1,436,920; asm: 946,517; xml: 746,709; pascal: 187,653; perl: 88,691; sh: 88,436; objc: 79,953; sql: 51,488; cs: 44,583; fortran: 24,137; makefile: 22,147; tcl: 15,277; php: 13,980; yacc: 8,984; ruby: 7,485; awk: 3,720; lisp: 3,096; lex: 1,327; ada: 727; jsp: 228; sed: 36
file content (109 lines) | stat: -rw-r--r-- 4,255 bytes parent folder | download | duplicates (6)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
// Copyright 2019 The Chromium Authors
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.

#ifndef CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_
#define CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_

#include <string>
#include <unordered_map>
#include <vector>

namespace ash::string_matching {

namespace {

constexpr double kNumMatchingBlocksPenalty = 0.1;
constexpr bool kUseTextLengthAgnosticism = false;

}  // namespace

// Performs the calculation of similarity level between 2 strings. This class's
// functionality is inspired by python's difflib.SequenceMatcher library.
// (https://docs.python.org/2/library/difflib.html#difflib.SequenceMatcher)
//
// TODO(crbug.com/1336160): The unused edit distance pathway has been removed.
// Consider coming up with a new edit-distance method having the awareness of
// string structures. (e.g., awareness of block matching and token break
// locations).
class SequenceMatcher {
 public:
  // Representing a common substring between `first_string_` and
  // `second_string_`.
  struct Match {
    Match();
    Match(int pos_first, int pos_second, int len);
    // Starting position of the common substring in `first_string_`.
    int pos_first_string;
    // Starting position of the common substring in `second_string_`.
    int pos_second_string;
    // Length of the common substring.
    int length;
  };

  // `num_matching_blocks_penalty` is used to penalize too many small matching
  // blocks. For the same number of matching characters, we prefer fewer
  // matching blocks. Value equal to 0 means no penalty. Values greater than 0
  // means heavier penalty will be applied to larger number of blocks.
  SequenceMatcher(
      const std::u16string& first_string,
      const std::u16string& second_string,
      double num_matching_blocks_penalty = kNumMatchingBlocksPenalty);

  SequenceMatcher(const SequenceMatcher&) = delete;
  SequenceMatcher& operator=(const SequenceMatcher&) = delete;

  ~SequenceMatcher();

  // Calculates similarity ratio of `first_string_` and `second_string_`.
  //
  // In the actual string matching in launcher searches, we will input with the
  // query as `first_string_` and the text as `second_string_`. As the query is
  // likely to be shorter than the text in most cases, we would like to
  // ignore/lower the influence of the amounts of any remaining unmatched
  // portions of the `second_string_` onto the ratio (i.e., "text-length
  // agnosticism").
  //
  // Thus, We will trim the text length if it is too long and
  // `text_length_agnostic` is true, and it only works for the block
  // matching algorithm.
  double Ratio(bool text_length_agnostic = kUseTextLengthAgnosticism);
  // Finds the longest common substring between
  // `first_string_[first_start:first_end]` and
  // `second_string_[second_start:second_end]`. Used by
  // GetMatchingBlocks().
  Match FindLongestMatch(int first_start,
                         int first_end,
                         int second_start,
                         int second_end);
  // Get all matching blocks of `first_string_` and `second_string_`.
  // All blocks will be sorted by their starting position within
  // `first_string_`.
  //
  // The last matching block will always be:
  //
  //   Match(first_string_.size(), second_string_.size(), 0).
  //
  // This is to cover the case where two strings have no matching blocks,
  // so that we have something to store in `matching_blocks_` to indicate
  // that matching has taken place.
  std::vector<Match> GetMatchingBlocks();

 private:
  std::u16string first_string_;
  std::u16string second_string_;
  double num_matching_blocks_penalty_;
  double block_matching_ratio_ = -1.0;
  std::vector<Match> matching_blocks_;

  // For each character `c` in `second_string_`, this variable
  // `char_to_positions_` stores all positions where `c` occurs in
  // `second_string_`.
  std::unordered_map<char, std::vector<int>> char_to_positions_;
  // Memory for dynamic programming algorithm used in FindLongestMatch().
  std::vector<int> dp_common_string_;
};

}  // namespace ash::string_matching

#endif  // CHROMEOS_ASH_COMPONENTS_STRING_MATCHING_SEQUENCE_MATCHER_H_