File: Dex.cpp

package info (click to toggle)
llvm-toolchain-15 1%3A15.0.6-4
links: PTS, VCS
area: main
in suites: bookworm
size: 1,554,644 kB
sloc: cpp: 5,922,452; ansic: 1,012,136; asm: 674,362; python: 191,568; objc: 73,855; f90: 42,327; lisp: 31,913; pascal: 11,973; javascript: 10,144; sh: 9,421; perl: 7,447; ml: 5,527; awk: 3,523; makefile: 2,520; xml: 885; cs: 573; fortran: 567
file content (385 lines) | stat: -rw-r--r-- 15,507 bytes
//===--- Dex.cpp - Dex Symbol Index Implementation --------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#include "Dex.h"
#include "FileDistance.h"
#include "FuzzyMatch.h"
#include "Quality.h"
#include "URI.h"
#include "index/Index.h"
#include "index/dex/Iterator.h"
#include "index/dex/Token.h"
#include "index/dex/Trigram.h"
#include "support/Logger.h"
#include "support/Trace.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/ScopedPrinter.h"
#include <algorithm>
#include <queue>
#include <utility>
#include <vector>

namespace clang {
namespace clangd {
namespace dex {

std::unique_ptr<SymbolIndex> Dex::build(SymbolSlab Symbols, RefSlab Refs,
                                        RelationSlab Rels) {
  auto Size = Symbols.bytes() + Refs.bytes();
  // There is no need to include "Rels" in Data because the relations are self-
  // contained, without references into a backing store.
  auto Data = std::make_pair(std::move(Symbols), std::move(Refs));
  return std::make_unique<Dex>(Data.first, Data.second, Rels, std::move(Data),
                                Size);
}

namespace {

// Mark symbols which are can be used for code completion.
const Token RestrictedForCodeCompletion =
    Token(Token::Kind::Sentinel, "Restricted For Code Completion");

// Helper to efficiently assemble the inverse index (token -> matching docs).
// The output is a nice uniform structure keyed on Token, but constructing
// the Token object every time we want to insert into the map is wasteful.
// Instead we have various maps keyed on things that are cheap to compute,
// and produce the Token keys once at the end.
class IndexBuilder {
  llvm::DenseMap<Trigram, std::vector<DocID>> TrigramDocs;
  std::vector<DocID> RestrictedCCDocs;
  llvm::StringMap<std::vector<DocID>> TypeDocs;
  llvm::StringMap<std::vector<DocID>> ScopeDocs;
  llvm::StringMap<std::vector<DocID>> ProximityDocs;
  std::vector<Trigram> TrigramScratch;

public:
  // Add the tokens which are given symbol's characteristics.
  // This includes fuzzy matching trigrams, symbol's scope, etc.
  // FIXME(kbobyrev): Support more token types:
  // * Namespace proximity
  void add(const Symbol &Sym, DocID D) {
    generateIdentifierTrigrams(Sym.Name, TrigramScratch);
    for (Trigram T : TrigramScratch)
      TrigramDocs[T].push_back(D);
    ScopeDocs[Sym.Scope].push_back(D);
    if (!llvm::StringRef(Sym.CanonicalDeclaration.FileURI).empty())
      for (const auto &ProximityURI :
           generateProximityURIs(Sym.CanonicalDeclaration.FileURI))
        ProximityDocs[ProximityURI].push_back(D);
    if (Sym.Flags & Symbol::IndexedForCodeCompletion)
      RestrictedCCDocs.push_back(D);
    if (!Sym.Type.empty())
      TypeDocs[Sym.Type].push_back(D);
  }

  // Assemble the final compressed posting lists for the added symbols.
  llvm::DenseMap<Token, PostingList> build() && {
    llvm::DenseMap<Token, PostingList> Result(/*InitialReserve=*/
                                              TrigramDocs.size() +
                                              RestrictedCCDocs.size() +
                                              TypeDocs.size() +
                                              ScopeDocs.size() +
                                              ProximityDocs.size());
    // Tear down intermediate structs as we go to reduce memory usage.
    // Since we're trying to get rid of underlying allocations, clearing the
    // containers is not enough.
    auto CreatePostingList =
        [&Result](Token::Kind TK, llvm::StringMap<std::vector<DocID>> &Docs) {
          for (auto &E : Docs) {
            Result.try_emplace(Token(TK, E.first()), E.second);
            E.second = {};
          }
          Docs = {};
        };
    CreatePostingList(Token::Kind::Type, TypeDocs);
    CreatePostingList(Token::Kind::Scope, ScopeDocs);
    CreatePostingList(Token::Kind::ProximityURI, ProximityDocs);

    // TrigramDocs are stored in a DenseMap and RestrictedCCDocs is not even a
    // map, treat them specially.
    for (auto &E : TrigramDocs) {
      Result.try_emplace(Token(Token::Kind::Trigram, E.first.str()), E.second);
      E.second = {};
    }
    TrigramDocs = llvm::DenseMap<Trigram, std::vector<DocID>>{};
    if (!RestrictedCCDocs.empty())
      Result.try_emplace(RestrictedForCodeCompletion,
                         std::move(RestrictedCCDocs));
    return Result;
  }
};

} // namespace

void Dex::buildIndex() {
  this->Corpus = dex::Corpus(Symbols.size());
  std::vector<std::pair<float, const Symbol *>> ScoredSymbols(Symbols.size());

  for (size_t I = 0; I < Symbols.size(); ++I) {
    const Symbol *Sym = Symbols[I];
    LookupTable[Sym->ID] = Sym;
    ScoredSymbols[I] = {quality(*Sym), Sym};
  }

  // Symbols are sorted by symbol qualities so that items in the posting lists
  // are stored in the descending order of symbol quality.
  llvm::sort(ScoredSymbols, std::greater<std::pair<float, const Symbol *>>());

  // SymbolQuality was empty up until now.
  SymbolQuality.resize(Symbols.size());
  // Populate internal storage using Symbol + Score pairs.
  for (size_t I = 0; I < ScoredSymbols.size(); ++I) {
    SymbolQuality[I] = ScoredSymbols[I].first;
    Symbols[I] = ScoredSymbols[I].second;
  }

  // Build posting lists for symbols.
  IndexBuilder Builder;
  for (DocID SymbolRank = 0; SymbolRank < Symbols.size(); ++SymbolRank)
    Builder.add(*Symbols[SymbolRank], SymbolRank);
  InvertedIndex = std::move(Builder).build();
}

std::unique_ptr<Iterator> Dex::iterator(const Token &Tok) const {
  auto It = InvertedIndex.find(Tok);
  return It == InvertedIndex.end() ? Corpus.none()
                                   : It->second.iterator(&It->first);
}

// Constructs BOOST iterators for Path Proximities.
std::unique_ptr<Iterator> Dex::createFileProximityIterator(
    llvm::ArrayRef<std::string> ProximityPaths) const {
  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
  // Deduplicate parent URIs extracted from the ProximityPaths.
  llvm::StringSet<> ParentURIs;
  llvm::StringMap<SourceParams> Sources;
  for (const auto &Path : ProximityPaths) {
    Sources[Path] = SourceParams();
    auto PathURI = URI::create(Path);
    const auto PathProximityURIs = generateProximityURIs(PathURI.toString());
    for (const auto &ProximityURI : PathProximityURIs)
      ParentURIs.insert(ProximityURI);
  }
  // Use SymbolRelevanceSignals for symbol relevance evaluation: use defaults
  // for all parameters except for Proximity Path distance signal.
  SymbolRelevanceSignals PathProximitySignals;
  // DistanceCalculator will find the shortest distance from ProximityPaths to
  // any URI extracted from the ProximityPaths.
  URIDistance DistanceCalculator(Sources);
  PathProximitySignals.FileProximityMatch = &DistanceCalculator;
  // Try to build BOOST iterator for each Proximity Path provided by
  // ProximityPaths. Boosting factor should depend on the distance to the
  // Proximity Path: the closer processed path is, the higher boosting factor.
  for (const auto &ParentURI : ParentURIs.keys()) {
    // FIXME(kbobyrev): Append LIMIT on top of every BOOST iterator.
    auto It = iterator(Token(Token::Kind::ProximityURI, ParentURI));
    if (It->kind() != Iterator::Kind::False) {
      PathProximitySignals.SymbolURI = ParentURI;
      BoostingIterators.push_back(Corpus.boost(
          std::move(It), PathProximitySignals.evaluateHeuristics()));
    }
  }
  BoostingIterators.push_back(Corpus.all());
  return Corpus.unionOf(std::move(BoostingIterators));
}

// Constructs BOOST iterators for preferred types.
std::unique_ptr<Iterator>
Dex::createTypeBoostingIterator(llvm::ArrayRef<std::string> Types) const {
  std::vector<std::unique_ptr<Iterator>> BoostingIterators;
  SymbolRelevanceSignals PreferredTypeSignals;
  PreferredTypeSignals.TypeMatchesPreferred = true;
  auto Boost = PreferredTypeSignals.evaluateHeuristics();
  for (const auto &T : Types)
    BoostingIterators.push_back(
        Corpus.boost(iterator(Token(Token::Kind::Type, T)), Boost));
  BoostingIterators.push_back(Corpus.all());
  return Corpus.unionOf(std::move(BoostingIterators));
}

/// Constructs iterators over tokens extracted from the query and exhausts it
/// while applying Callback to each symbol in the order of decreasing quality
/// of the matched symbols.
bool Dex::fuzzyFind(const FuzzyFindRequest &Req,
                    llvm::function_ref<void(const Symbol &)> Callback) const {
  assert(!StringRef(Req.Query).contains("::") &&
         "There must be no :: in query.");
  trace::Span Tracer("Dex fuzzyFind");
  FuzzyMatcher Filter(Req.Query);
  // For short queries we use specialized trigrams that don't yield all results.
  // Prevent clients from postfiltering them for longer queries.
  bool More = !Req.Query.empty() && Req.Query.size() < 3;

  std::vector<std::unique_ptr<Iterator>> Criteria;
  const auto TrigramTokens = generateQueryTrigrams(Req.Query);

  // Generate query trigrams and construct AND iterator over all query
  // trigrams.
  std::vector<std::unique_ptr<Iterator>> TrigramIterators;
  for (const auto &Trigram : TrigramTokens)
    TrigramIterators.push_back(iterator(Trigram));
  Criteria.push_back(Corpus.intersect(std::move(TrigramIterators)));

  // Generate scope tokens for search query.
  std::vector<std::unique_ptr<Iterator>> ScopeIterators;
  for (const auto &Scope : Req.Scopes)
    ScopeIterators.push_back(iterator(Token(Token::Kind::Scope, Scope)));
  if (Req.AnyScope)
    ScopeIterators.push_back(
        Corpus.boost(Corpus.all(), ScopeIterators.empty() ? 1.0 : 0.2));
  Criteria.push_back(Corpus.unionOf(std::move(ScopeIterators)));

  // Add proximity paths boosting (all symbols, some boosted).
  Criteria.push_back(createFileProximityIterator(Req.ProximityPaths));
  // Add boosting for preferred types.
  Criteria.push_back(createTypeBoostingIterator(Req.PreferredTypes));

  if (Req.RestrictForCodeCompletion)
    Criteria.push_back(iterator(RestrictedForCodeCompletion));

  // Use TRUE iterator if both trigrams and scopes from the query are not
  // present in the symbol index.
  auto Root = Corpus.intersect(std::move(Criteria));
  // Retrieve more items than it was requested: some of  the items with high
  // final score might not be retrieved otherwise.
  // FIXME(kbobyrev): Tune this ratio.
  if (Req.Limit)
    Root = Corpus.limit(std::move(Root), *Req.Limit * 100);
  SPAN_ATTACH(Tracer, "query", llvm::to_string(*Root));
  vlog("Dex query tree: {0}", *Root);

  using IDAndScore = std::pair<DocID, float>;
  std::vector<IDAndScore> IDAndScores = consume(*Root);

  auto Compare = [](const IDAndScore &LHS, const IDAndScore &RHS) {
    return LHS.second > RHS.second;
  };
  TopN<IDAndScore, decltype(Compare)> Top(
      Req.Limit ? *Req.Limit : std::numeric_limits<size_t>::max(), Compare);
  for (const auto &IDAndScore : IDAndScores) {
    const DocID SymbolDocID = IDAndScore.first;
    const auto *Sym = Symbols[SymbolDocID];
    const llvm::Optional<float> Score = Filter.match(Sym->Name);
    if (!Score)
      continue;
    // Combine Fuzzy Matching score, precomputed symbol quality and boosting
    // score for a cumulative final symbol score.
    const float FinalScore =
        (*Score) * SymbolQuality[SymbolDocID] * IDAndScore.second;
    // If Top.push(...) returns true, it means that it had to pop an item. In
    // this case, it is possible to retrieve more symbols.
    if (Top.push({SymbolDocID, FinalScore}))
      More = true;
  }

  // Apply callback to the top Req.Limit items in the descending
  // order of cumulative score.
  for (const auto &Item : std::move(Top).items())
    Callback(*Symbols[Item.first]);
  return More;
}

void Dex::lookup(const LookupRequest &Req,
                 llvm::function_ref<void(const Symbol &)> Callback) const {
  trace::Span Tracer("Dex lookup");
  for (const auto &ID : Req.IDs) {
    auto I = LookupTable.find(ID);
    if (I != LookupTable.end())
      Callback(*I->second);
  }
}

bool Dex::refs(const RefsRequest &Req,
               llvm::function_ref<void(const Ref &)> Callback) const {
  trace::Span Tracer("Dex refs");
  uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max());
  for (const auto &ID : Req.IDs)
    for (const auto &Ref : Refs.lookup(ID)) {
      if (!static_cast<int>(Req.Filter & Ref.Kind))
        continue;
      if (Remaining == 0)
        return true; // More refs were available.
      --Remaining;
      Callback(Ref);
    }
  return false; // We reported all refs.
}

void Dex::relations(
    const RelationsRequest &Req,
    llvm::function_ref<void(const SymbolID &, const Symbol &)> Callback) const {
  trace::Span Tracer("Dex relations");
  uint32_t Remaining = Req.Limit.value_or(std::numeric_limits<uint32_t>::max());
  for (const SymbolID &Subject : Req.Subjects) {
    LookupRequest LookupReq;
    auto It = Relations.find(
        std::make_pair(Subject, static_cast<uint8_t>(Req.Predicate)));
    if (It != Relations.end()) {
      for (const auto &Object : It->second) {
        if (Remaining > 0) {
          --Remaining;
          LookupReq.IDs.insert(Object);
        }
      }
    }
    lookup(LookupReq, [&](const Symbol &Object) { Callback(Subject, Object); });
  }
}

llvm::unique_function<IndexContents(llvm::StringRef) const>
Dex::indexedFiles() const {
  return [this](llvm::StringRef FileURI) {
    return Files.contains(FileURI) ? IdxContents : IndexContents::None;
  };
}

size_t Dex::estimateMemoryUsage() const {
  size_t Bytes = Symbols.size() * sizeof(const Symbol *);
  Bytes += SymbolQuality.size() * sizeof(float);
  Bytes += LookupTable.getMemorySize();
  Bytes += InvertedIndex.getMemorySize();
  for (const auto &TokenToPostingList : InvertedIndex)
    Bytes += TokenToPostingList.second.bytes();
  Bytes += Refs.getMemorySize();
  Bytes += Relations.getMemorySize();
  return Bytes + BackingDataSize;
}

std::vector<std::string> generateProximityURIs(llvm::StringRef URIPath) {
  std::vector<std::string> Result;
  auto ParsedURI = URI::parse(URIPath);
  assert(ParsedURI &&
         "Non-empty argument of generateProximityURIs() should be a valid "
         "URI.");
  llvm::StringRef Body = ParsedURI->body();
  // FIXME(kbobyrev): Currently, this is a heuristic which defines the maximum
  // size of resulting vector. Some projects might want to have higher limit if
  // the file hierarchy is deeper. For the generic case, it would be useful to
  // calculate Limit in the index build stage by calculating the maximum depth
  // of the project source tree at runtime.
  size_t Limit = 5;
  // Insert original URI before the loop: this would save a redundant iteration
  // with a URI parse.
  Result.emplace_back(ParsedURI->toString());
  while (!Body.empty() && --Limit > 0) {
    // FIXME(kbobyrev): Parsing and encoding path to URIs is not necessary and
    // could be optimized.
    Body = llvm::sys::path::parent_path(Body, llvm::sys::path::Style::posix);
    if (!Body.empty())
      Result.emplace_back(
          URI(ParsedURI->scheme(), ParsedURI->authority(), Body).toString());
  }
  return Result;
}

} // namespace dex
} // namespace clangd
} // namespace clang