1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211
|
//===--- Grammar.cpp - Grammar for clang pseudoparser -----------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
#include "clang-pseudo/grammar/Grammar.h"
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
namespace clang {
namespace pseudo {
Rule::Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Sequence)
: Target(Target), Size(static_cast<uint8_t>(Sequence.size())) {
assert(Sequence.size() <= Rule::MaxElements);
llvm::copy(Sequence, this->Sequence);
}
Grammar::Grammar(std::unique_ptr<GrammarTable> Table) : T(std::move(Table)) {
Underscore = *findNonterminal("_");
}
llvm::ArrayRef<Rule> Grammar::rulesFor(SymbolID SID) const {
assert(isNonterminal(SID));
const auto &R = T->Nonterminals[SID].RuleRange;
assert(R.End <= T->Rules.size());
return llvm::makeArrayRef(&T->Rules[R.Start], R.End - R.Start);
}
const Rule &Grammar::lookupRule(RuleID RID) const {
assert(RID < T->Rules.size());
return T->Rules[RID];
}
llvm::StringRef Grammar::symbolName(SymbolID SID) const {
if (isToken(SID))
return T->Terminals[symbolToToken(SID)];
return T->Nonterminals[SID].Name;
}
std::string Grammar::mangleSymbol(SymbolID SID) const {
static const char *const TokNames[] = {
#define TOK(X) #X,
#define KEYWORD(X, Y) #X,
#include "clang/Basic/TokenKinds.def"
nullptr};
if (clang::pseudo::isToken(SID))
return TokNames[clang::pseudo::symbolToToken(SID)];
std::string Name = symbolName(SID).str();
// translation-unit -> translation_unit
std::replace(Name.begin(), Name.end(), '-', '_');
return Name;
}
std::string Grammar::mangleRule(RuleID RID) const {
const auto &R = lookupRule(RID);
std::string MangleName = mangleSymbol(R.Target);
for (size_t I = 0; I < R.seq().size(); ++I)
MangleName += llvm::formatv("_{0}{1}", I, mangleSymbol(R.seq()[I]));
return MangleName;
}
llvm::Optional<SymbolID> Grammar::findNonterminal(llvm::StringRef Name) const {
auto It = llvm::partition_point(
T->Nonterminals,
[&](const GrammarTable::Nonterminal &X) { return X.Name < Name; });
if (It != T->Nonterminals.end() && It->Name == Name)
return It - T->Nonterminals.begin();
return llvm::None;
}
std::string Grammar::dumpRule(RuleID RID) const {
std::string Result;
llvm::raw_string_ostream OS(Result);
const Rule &R = T->Rules[RID];
OS << symbolName(R.Target) << " :=";
for (unsigned I = 0; I < R.Size; ++I) {
OS << " " << symbolName(R.Sequence[I]);
if (R.RecoveryIndex == I)
OS << " [recover=" << T->AttributeValues[R.Recovery] << "]";
}
if (R.Guarded)
OS << " [guard]";
return Result;
}
std::string Grammar::dumpRules(SymbolID SID) const {
assert(isNonterminal(SID));
std::string Result;
const auto &Range = T->Nonterminals[SID].RuleRange;
for (RuleID RID = Range.Start; RID < Range.End; ++RID)
Result.append(dumpRule(RID)).push_back('\n');
return Result;
}
std::string Grammar::dump() const {
std::string Result;
llvm::raw_string_ostream OS(Result);
OS << "Nonterminals:\n";
for (SymbolID SID = 0; SID < T->Nonterminals.size(); ++SID)
OS << llvm::formatv(" {0} {1}\n", SID, symbolName(SID));
OS << "Rules:\n";
for (RuleID RID = 0; RID < T->Rules.size(); ++RID)
OS << llvm::formatv(" {0} {1}\n", RID, dumpRule(RID));
return OS.str();
}
std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &G) {
std::vector<llvm::DenseSet<SymbolID>> FirstSets(
G.table().Nonterminals.size());
auto ExpandFirstSet = [&FirstSets](SymbolID Target, SymbolID First) {
assert(isNonterminal(Target));
if (isToken(First))
return FirstSets[Target].insert(First).second;
bool Changed = false;
for (SymbolID SID : FirstSets[First])
Changed |= FirstSets[Target].insert(SID).second;
return Changed;
};
// A rule S := T ... implies elements in FIRST(S):
// - if T is a terminal, FIRST(S) contains T
// - if T is a nonterminal, FIRST(S) contains FIRST(T)
// Since FIRST(T) may not have been fully computed yet, FIRST(S) itself may
// end up being incomplete.
// We iterate until we hit a fixed point.
// (This isn't particularly efficient, but table building isn't on the
// critical path).
bool Changed = true;
while (Changed) {
Changed = false;
for (const auto &R : G.table().Rules)
// We only need to consider the first element because symbols are
// non-nullable.
Changed |= ExpandFirstSet(R.Target, R.seq().front());
}
return FirstSets;
}
std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &G) {
auto FirstSets = firstSets(G);
std::vector<llvm::DenseSet<SymbolID>> FollowSets(
G.table().Nonterminals.size());
// Expand the follow set of a nonterminal symbol Y by adding all from the
// given symbol set.
auto ExpandFollowSet = [&FollowSets](SymbolID Y,
const llvm::DenseSet<SymbolID> &ToAdd) {
assert(isNonterminal(Y));
bool Changed = false;
for (SymbolID F : ToAdd)
Changed |= FollowSets[Y].insert(F).second;
return Changed;
};
// Follow sets is computed based on the following 3 rules, the computation
// is completed at a fixed point where there is no more new symbols can be
// added to any of the follow sets.
//
// Rule 1: add endmarker to the FOLLOW(S), where S is the start symbol of the
// augmented grammar, in our case it is '_'.
FollowSets[G.underscore()].insert(tokenSymbol(tok::eof));
bool Changed = true;
while (Changed) {
Changed = false;
for (const auto &R : G.table().Rules) {
// Rule 2: for a rule X := ... Y Z, we add all symbols from FIRST(Z) to
// FOLLOW(Y).
for (size_t I = 0; I + 1 < R.seq().size(); ++I) {
if (isToken(R.seq()[I]))
continue;
// We only need to consider the next symbol because symbols are
// non-nullable.
SymbolID Next = R.seq()[I + 1];
if (isToken(Next))
// First set for a terminal is itself.
Changed |= ExpandFollowSet(R.seq()[I], {Next});
else
Changed |= ExpandFollowSet(R.seq()[I], FirstSets[Next]);
}
// Rule 3: for a rule X := ... Z, we add all symbols from FOLLOW(X) to
// FOLLOW(Z).
SymbolID Z = R.seq().back();
if (isNonterminal(Z))
Changed |= ExpandFollowSet(Z, FollowSets[R.Target]);
}
}
return FollowSets;
}
static llvm::ArrayRef<std::string> getTerminalNames() {
static const auto &TerminalNames = []() {
auto TerminalNames = new std::string[NumTerminals];
#define PUNCTUATOR(Tok, Spelling) TerminalNames[tok::Tok] = Spelling;
#define KEYWORD(Keyword, Condition) \
TerminalNames[tok::kw_##Keyword] = llvm::StringRef(#Keyword).upper();
#define TOK(Tok) TerminalNames[tok::Tok] = llvm::StringRef(#Tok).upper();
#include "clang/Basic/TokenKinds.def"
return llvm::makeArrayRef(TerminalNames, NumTerminals);
}();
return TerminalNames;
}
GrammarTable::GrammarTable() : Terminals(getTerminalNames()) {}
} // namespace pseudo
} // namespace clang
|