1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
|
//===--- Grammar.h - grammar used by clang pseudoparser ---------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file defines base structures for parsing & modeling a grammar for a
// programming language:
//
// # This is a fake C++ BNF grammar
// _ := translation-unit
// translation-unit := declaration-seq_opt
// declaration-seq := declaration
// declaration-seq := declaration-seq declaration
//
// A grammar formally describes a language, and it is constructed by a set of
// production rules. A rule is of BNF form (AAA := BBB CCC). A symbol is either
// nonterminal or terminal, identified by a SymbolID.
//
// Annotations are supported in a syntax form of [key=value]. They specify
// attributes which are associated with either a grammar symbol (on the
// right-hand side of the symbol) or a grammar rule (at the end of the rule
// body).
// Attributes provide a way to inject custom code into the GLR parser. Each
// unique attribute value creates an extension point (identified by ExtensionID
// ), and an extension point corresponds to a piece of native code. For
// example, C++ grammar has a rule:
//
// compound_statement := { statement-seq [recover=Brackets] }
//
// The `recover` attribute instructs the parser that we should perform error
// recovery if parsing the statement-seq fails. The `Brackets` recovery
// heuristic is implemented in CXX.cpp by binding the ExtensionID for the
// `Recovery` value to a specific C++ function that finds the recovery point.
//
// Notions about the BNF grammar:
// - "_" is the start symbol of the augmented grammar;
// - single-line comment is supported, starting with a #
// - A rule describes how a nonterminal (left side of :=) is constructed, and
// it is *per line* in the grammar file
// - Terminals (also called tokens) correspond to the clang::TokenKind; they
// are written in the grammar like "IDENTIFIER", "USING", "+"
// - Nonterminals are specified with "lower-case" names in the grammar; they
// shouldn't be nullable (has an empty sequence)
// - optional symbols are supported (specified with a _opt suffix), and they
// will be eliminated during the grammar parsing stage
//
//===----------------------------------------------------------------------===//
#ifndef CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
#define CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
#include "clang/Basic/TokenKinds.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
#include <vector>
namespace clang {
namespace pseudo {
// A SymbolID uniquely identifies a terminal/nonterminal symbol in a grammar.
// nonterminal IDs are indexes into a table of nonterminal symbols.
// Terminal IDs correspond to the clang TokenKind enum.
using SymbolID = uint16_t;
// SymbolID is only 12 bits wide.
// There are maximum 2^11 terminals (aka tokens) and 2^11 nonterminals.
static constexpr uint16_t SymbolBits = 12;
static constexpr uint16_t NumTerminals = tok::NUM_TOKENS;
// SymbolIDs with the top bit set are tokens/terminals.
static constexpr SymbolID TokenFlag = 1 << (SymbolBits - 1);
inline bool isToken(SymbolID ID) { return ID & TokenFlag; }
inline bool isNonterminal(SymbolID ID) { return !isToken(ID); }
// The terminals are always the clang tok::TokenKind (not all are used).
inline tok::TokenKind symbolToToken(SymbolID SID) {
assert(isToken(SID));
SID &= ~TokenFlag;
assert(SID < NumTerminals);
return static_cast<tok::TokenKind>(SID);
}
inline constexpr SymbolID tokenSymbol(tok::TokenKind TK) {
return TokenFlag | static_cast<SymbolID>(TK);
}
// An extension is a piece of native code specific to a grammar that modifies
// the behavior of annotated rules. One ExtensionID is assigned for each unique
// attribute value (all attributes share a namespace).
using ExtensionID = uint16_t;
// A RuleID uniquely identifies a production rule in a grammar.
// It is an index into a table of rules.
using RuleID = uint16_t;
// There are maximum 2^12 rules.
static constexpr unsigned RuleBits = 12;
// Represent a production rule in the grammar, e.g.
// expression := a b c
// ^Target ^Sequence
struct Rule {
Rule(SymbolID Target, llvm::ArrayRef<SymbolID> Seq);
// We occupy 4 bits for the sequence, in theory, it can be at most 2^4 tokens
// long, however, we're stricter in order to reduce the size, we limit the max
// length to 9 (this is the longest sequence in cxx grammar).
static constexpr unsigned SizeBits = 4;
static constexpr unsigned MaxElements = 9;
static_assert(MaxElements < (1 << SizeBits), "Exceeds the maximum limit");
static_assert(SizeBits + SymbolBits <= 16,
"Must be able to store symbol ID + size efficiently");
// 16 bits for target symbol and size of sequence:
// SymbolID : 12 | Size : 4
SymbolID Target : SymbolBits;
uint8_t Size : SizeBits; // Size of the Sequence
SymbolID Sequence[MaxElements];
// A guarded rule has extra logic to determine whether the RHS is eligible.
bool Guarded = false;
// Specifies the index within Sequence eligible for error recovery.
// Given stmt := { stmt-seq_opt }, if we fail to parse the stmt-seq then we
// should recover by finding the matching brace, and forcing stmt-seq to match
// everything between braces.
// For now, only a single strategy at a single point is possible.
uint8_t RecoveryIndex = -1;
ExtensionID Recovery = 0;
llvm::ArrayRef<SymbolID> seq() const {
return llvm::ArrayRef<SymbolID>(Sequence, Size);
}
friend bool operator==(const Rule &L, const Rule &R) {
return L.Target == R.Target && L.seq() == R.seq() && L.Guarded == R.Guarded;
}
};
struct GrammarTable;
// Grammar that describes a programming language, e.g. C++. It represents the
// contents of the specified grammar.
// It is a building block for constructing a table-based parser.
class Grammar {
public:
Grammar() = default; // Creates an invalid dummy grammar.
explicit Grammar(std::unique_ptr<GrammarTable>);
// Parses grammar from a BNF file.
// Diagnostics emitted during parsing are stored in Diags.
static Grammar parseBNF(llvm::StringRef BNF, std::vector<std::string> &Diags);
// Returns the SymbolID of the symbol '_'.
SymbolID underscore() const { return Underscore; };
// Returns all rules of the given nonterminal symbol.
llvm::ArrayRef<Rule> rulesFor(SymbolID SID) const;
const Rule &lookupRule(RuleID RID) const;
// Gets symbol (terminal or nonterminal) name.
// Terminals have names like "," (kw_comma) or "OPERATOR" (kw_operator).
llvm::StringRef symbolName(SymbolID) const;
// Gets the mangled name for a terminal/nonterminal.
// Compared to names in the grammar,
// nonterminals `ptr-declartor` becomes `ptr_declarator`;
// terminal `,` becomes `comma`;
// terminal `IDENTIFIER` becomes `identifier`;
// terminal `INT` becomes `int`;
// NOTE: for nonterminals, the mangled name is the same as the cxx::Symbol
// enum class; for terminals, we deliberately stripped the `kw_` prefix in
// favor of the simplicity.
std::string mangleSymbol(SymbolID) const;
// Gets the mangled name for the rule.
// E.g. for the grammar rule `ptr-declarator := ptr-operator ptr-declarator`,
// it is `ptr_declarator_0ptr_operator_1ptr_declarator`.
std::string mangleRule(RuleID) const;
// Lookup the SymbolID of the nonterminal symbol by Name.
llvm::Optional<SymbolID> findNonterminal(llvm::StringRef Name) const;
// Dumps the whole grammar.
std::string dump() const;
// Dumps a particular rule.
std::string dumpRule(RuleID) const;
// Dumps all rules of the given nonterminal symbol.
std::string dumpRules(SymbolID) const;
const GrammarTable &table() const { return *T; }
private:
std::unique_ptr<GrammarTable> T;
// The symbol ID of '_'. (In the LR literature, this is the start symbol of
// the augmented grammar.)
SymbolID Underscore;
};
// For each nonterminal X, computes the set of terminals that begin strings
// derived from X. (Known as FIRST sets in grammar-based parsers).
std::vector<llvm::DenseSet<SymbolID>> firstSets(const Grammar &);
// For each nonterminal X, computes the set of terminals that could immediately
// follow X. (Known as FOLLOW sets in grammar-based parsers).
std::vector<llvm::DenseSet<SymbolID>> followSets(const Grammar &);
// Storage for the underlying data of the Grammar.
// It can be constructed dynamically (from compiling BNF file) or statically
// (a compiled data-source).
struct GrammarTable {
GrammarTable();
struct Nonterminal {
std::string Name;
// Corresponding rules that construct the nonterminal, it is a [Start, End)
// index range of the Rules table.
struct {
RuleID Start;
RuleID End;
} RuleRange;
};
// RuleID is an index into this table of rule definitions.
//
// Rules with the same target symbol (LHS) are grouped into a single range.
// The relative order of different target symbols is *not* by SymbolID, but
// rather a topological sort: if S := T then the rules producing T have lower
// RuleIDs than rules producing S.
// (This strange order simplifies the GLR parser: for a given token range, if
// we reduce in increasing RuleID order then we need never backtrack --
// prerequisite reductions are reached before dependent ones).
std::vector<Rule> Rules;
// A table of terminals (aka tokens). It corresponds to the clang::Token.
// clang::tok::TokenKind is the index of the table.
llvm::ArrayRef<std::string> Terminals;
// A table of nonterminals, sorted by name.
// SymbolID is the index of the table.
std::vector<Nonterminal> Nonterminals;
// A table of attribute values, sorted by name.
// ExtensionID is the index of the table.
std::vector<std::string> AttributeValues;
};
} // namespace pseudo
} // namespace clang
#endif // CLANG_PSEUDO_GRAMMAR_GRAMMAR_H
|