1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162
|
//===--- Trigram.cpp - Trigram generation for Fuzzy Matching ----*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "Trigram.h"
#include "FuzzyMatch.h"
#include "index/dex/Token.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include <cctype>
#include <limits>
#include <queue>
#include <string>
#include <utility>
namespace clang {
namespace clangd {
namespace dex {
// Produce trigrams (including duplicates) and pass them to Out().
template <typename Func>
static void identifierTrigrams(llvm::StringRef Identifier, Func Out) {
assert(!Identifier.empty());
// Apply fuzzy matching text segmentation.
llvm::SmallVector<CharRole> Roles(Identifier.size());
calculateRoles(Identifier,
llvm::makeMutableArrayRef(Roles.data(), Identifier.size()));
std::string LowercaseIdentifier = Identifier.lower();
// For each character, store indices of the characters to which fuzzy matching
// algorithm can jump. There are 2 possible variants:
//
// * Next Tail - next character from the same segment
// * Next Head - front character of the next segment
//
// Next stores tuples of three indices in the presented order, if a variant is
// not available then 0 is stored.
llvm::SmallVector<std::array<unsigned, 2>, 12> Next(
LowercaseIdentifier.size());
unsigned NextTail = 0, NextHead = 0;
for (int I = LowercaseIdentifier.size() - 1; I >= 0; --I) {
Next[I] = {{NextTail, NextHead}};
NextTail = Roles[I] == Tail ? I : 0;
if (Roles[I] == Head) {
NextHead = I;
}
}
// Iterate through valid sequences of three characters Fuzzy Matcher can
// process.
for (unsigned I = 0; I < LowercaseIdentifier.size(); ++I) {
// Skip delimiters.
if (Roles[I] != Head && Roles[I] != Tail)
continue;
for (unsigned J : Next[I]) {
if (J == 0)
continue;
for (unsigned K : Next[J]) {
if (K == 0)
continue;
Out(Trigram(LowercaseIdentifier[I], LowercaseIdentifier[J],
LowercaseIdentifier[K]));
}
}
}
// Short queries semantics are different. When the user dosn't type enough
// symbols to form trigrams, we still want to serve meaningful results. To
// achieve that, we form incomplete trigrams (bi- and unigrams) for the
// identifiers and also generate short trigrams on the query side from what
// is available. We allow a small number of short trigram types in order to
// prevent excessive memory usage and increase the quality of the search.
// Only the first few symbols are allowed to be used in incomplete trigrams.
//
// Example - for "_abc_def_ghi_jkl" we'll get following incomplete trigrams:
// "_", "_a", "a", "ab", "ad", "d", "de", "dg"
for (unsigned Position = 0, HeadsSeen = 0; HeadsSeen < 2;) {
// The first symbol might be a separator, so the loop condition should be
// stopping as soon as there is no next head or we have seen two heads.
if (Roles[Position] == Head)
++HeadsSeen;
Out(Trigram(LowercaseIdentifier[Position]));
for (unsigned I : Next[Position])
if (I != 0)
Out(Trigram(LowercaseIdentifier[Position], LowercaseIdentifier[I]));
Position = Next[Position][1];
if (Position == 0)
break;
}
}
void generateIdentifierTrigrams(llvm::StringRef Identifier,
std::vector<Trigram> &Result) {
// Empirically, scanning for duplicates is faster with fewer trigrams, and
// a hashtable is faster with more. This is a hot path, so dispatch based on
// expected number of trigrams. Longer identifiers produce more trigrams.
// The magic number was tuned by running IndexBenchmark.DexBuild.
constexpr unsigned ManyTrigramsIdentifierThreshold = 14;
Result.clear();
if (Identifier.empty())
return;
if (Identifier.size() < ManyTrigramsIdentifierThreshold) {
identifierTrigrams(Identifier, [&](Trigram T) {
if (!llvm::is_contained(Result, T))
Result.push_back(T);
});
} else {
identifierTrigrams(Identifier, [&](Trigram T) { Result.push_back(T); });
llvm::sort(Result);
Result.erase(std::unique(Result.begin(), Result.end()), Result.end());
}
}
std::vector<Token> generateQueryTrigrams(llvm::StringRef Query) {
if (Query.empty())
return {};
// Apply fuzzy matching text segmentation.
llvm::SmallVector<CharRole> Roles(Query.size());
calculateRoles(Query, llvm::makeMutableArrayRef(Roles.data(), Query.size()));
std::string LowercaseQuery = Query.lower();
llvm::DenseSet<Token> UniqueTrigrams;
std::string Chars;
for (unsigned I = 0; I < LowercaseQuery.size(); ++I) {
if (Roles[I] != Head && Roles[I] != Tail)
continue; // Skip delimiters.
Chars.push_back(LowercaseQuery[I]);
if (Chars.size() > 3)
Chars.erase(Chars.begin());
if (Chars.size() == 3)
UniqueTrigrams.insert(Token(Token::Kind::Trigram, Chars));
}
// For queries with very few letters, generateIdentifierTrigrams emulates
// outputs of this function to match the semantics.
if (UniqueTrigrams.empty()) {
// If bigram can't be formed out of letters/numbers, we prepend separator.
std::string Result(1, LowercaseQuery.front());
for (unsigned I = 1; I < LowercaseQuery.size(); ++I)
if (Roles[I] == Head || Roles[I] == Tail)
Result += LowercaseQuery[I];
UniqueTrigrams.insert(
Token(Token::Kind::Trigram, llvm::StringRef(Result).take_back(2)));
}
return {UniqueTrigrams.begin(), UniqueTrigrams.end()};
}
} // namespace dex
} // namespace clangd
} // namespace clang
|