File: Trigram.cpp

package info (click to toggle)
llvm-toolchain-15 1%3A15.0.6-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,554,644 kB
  • sloc: cpp: 5,922,452; ansic: 1,012,136; asm: 674,362; python: 191,568; objc: 73,855; f90: 42,327; lisp: 31,913; pascal: 11,973; javascript: 10,144; sh: 9,421; perl: 7,447; ml: 5,527; awk: 3,523; makefile: 2,520; xml: 885; cs: 573; fortran: 567
file content (162 lines) | stat: -rw-r--r-- 5,976 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "Trigram.h"
#include "FuzzyMatch.h"
#include "index/dex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <cctype>
#include <limits>
#include <queue>
#include <string>
#include <utility>

namespace clang {
namespace clangd {
namespace dex {

// Produce trigrams (including duplicates) and pass them to Out().
template <typename Func>
static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
  assert(!Identifier.empty());
  // Apply fuzzy matching text segmentation.
  llvm::SmallVector<CharRole> Roles(Identifier.size());
  calculateRoles(Identifier,
                 llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));

  std::string LowercaseIdentifier = Identifier.lower();

  // For each character, store indices of the characters to which fuzzy matching
  // algorithm can jump. There are 2 possible variants:
  //
  // * Next Tail - next character from the same segment
  // * Next Head - front character of the next segment
  //
  // Next stores tuples of three indices in the presented order, if a variant is
  // not available then 0 is stored.
  llvm::SmallVector<std::array<unsigned, 2>, 12> Next(
      LowercaseIdentifier.size());
  unsigned NextTail = 0, NextHead = 0;
  for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
    Next[I] = {{NextTail, NextHead}};
    NextTail = Roles[I] == Tail ? I : 0;
    if (Roles[I] == Head) {
      NextHead = I;
    }
  }

  // Iterate through valid sequences of three characters Fuzzy Matcher can
  // process.
  for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) {
    // Skip delimiters.
    if (Roles[I] != Head && Roles[I] != Tail)
      continue;
    for (unsigned J : Next[I]) {
      if (J == 0)
        continue;
      for (unsigned K : Next[J]) {
        if (K == 0)
          continue;
        Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
                    LowercaseIdentifier[K]));
      }
    }
  }
  // Short queries semantics are different. When the user dosn't type enough
  // symbols to form trigrams, we still want to serve meaningful results. To
  // achieve that, we form incomplete trigrams (bi- and unigrams) for the
  // identifiers and also generate short trigrams on the query side from what
  // is available. We allow a small number of short trigram types in order to
  // prevent excessive memory usage and increase the quality of the search.
  // Only the first few symbols are allowed to be used in incomplete trigrams.
  //
  // Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
  // "_", "_a", "a", "ab", "ad", "d", "de", "dg"
  for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) {
    // The first symbol might be a separator, so the loop condition should be
    // stopping as soon as there is no next head or we have seen two heads.
    if (Roles[Position] == Head)
      ++HeadsSeen;
    Out(Trigram(LowercaseIdentifier[Position]));
    for (unsigned I : Next[Position])
      if (I != 0)
        Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I]));
    Position = Next[Position][1];
    if (Position == 0)
      break;
  }
}

void generateIdentifierTrigrams(llvm::StringRef Identifier,
                                std::vector<Trigram> &Result) {
  // Empirically, scanning for duplicates is faster with fewer trigrams, and
  // a hashtable is faster with more. This is a hot path, so dispatch based on
  // expected number of trigrams. Longer identifiers produce more trigrams.
  // The magic number was tuned by running IndexBenchmark.DexBuild.
  constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
  Result.clear();
  if (Identifier.empty())
    return;

  if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
    identifierTrigrams(Identifier, [&](Trigram T) {
      if (!llvm::is_contained(Result, T))
        Result.push_back(T);
    });
  } else {
    identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
    llvm::sort(Result);
    Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
  }
}

std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
  if (Query.empty())
    return {};

  // Apply fuzzy matching text segmentation.
  llvm::SmallVector<CharRole> Roles(Query.size());
  calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));

  std::string LowercaseQuery = Query.lower();

  llvm::DenseSet<Token> UniqueTrigrams;
  std::string Chars;
  for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
    if (Roles[I] != Head && Roles[I] != Tail)
      continue; // Skip delimiters.
    Chars.push_back(LowercaseQuery[I]);
    if (Chars.size() > 3)
      Chars.erase(Chars.begin());
    if (Chars.size() == 3)
      UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
  }

  // For queries with very few letters, generateIdentifierTrigrams emulates
  // outputs of this function to match the semantics.
  if (UniqueTrigrams.empty()) {
    // If bigram can't be formed out of letters/numbers, we prepend separator.
    std::string Result(1, LowercaseQuery.front());
    for (unsigned I = 1; I < LowercaseQuery.size(); ++I)
      if (Roles[I] == Head || Roles[I] == Tail)
        Result += LowercaseQuery[I];
    UniqueTrigrams.insert(
        Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2)));
  }

  return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
}

} // namespace dex
} // namespace clangd
} // namespace clang