1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223
|
//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A parse forest represents a set of possible parse trees efficiently, it is
// produced by the GLR parser.
//
// Despite the name, its data structure is a tree-like DAG with a single root.
// Multiple ways to parse the same tokens are presented as an ambiguous node
// with all possible interpretations as children.
// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
// "expr := expr + expr", they will share a Sequence node representing the expr.
//
//===----------------------------------------------------------------------===//
#ifndef CLANG_PSEUDO_FOREST_H
#define CLANG_PSEUDO_FOREST_H
#include "clang-pseudo/Token.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Allocator.h"
#include <cstdint>
namespace clang {
namespace pseudo {
// A node represents ways to parse a sequence of tokens, it interprets a fixed
// range of tokens as a fixed grammar symbol.
//
// There are different kinds of nodes, some nodes have "children" (stored in a
// trailing array) and have pointers to them. "Children" has different semantics
// depending on the node kinds. For an Ambiguous node, it means all
// possible interpretations; for a Sequence node, it means each symbol on the
// right hand side of the production rule.
//
// Since this is a node in a DAG, a node may have multiple parents. And a node
// doesn't have parent pointers.
class alignas(class ForestNode *) ForestNode {
public:
class RecursiveIterator;
enum Kind {
// A Terminal node is a single terminal symbol bound to a token.
Terminal,
// A Sequence node is a nonterminal symbol parsed from a grammar rule,
// elements() are the parses of each symbol on the RHS of the rule.
// If the rule is A := X Y Z, the node is for nonterminal A, and elements()
// are [X, Y, Z].
Sequence,
// An Ambiguous node exposes multiple ways to interpret the code as the
// same symbol, alternatives() are all possible parses.
Ambiguous,
// An Opaque node is a placeholder. It asserts that tokens match a symbol,
// without saying how.
// It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
// code).
Opaque,
};
Kind kind() const { return K; }
SymbolID symbol() const { return Symbol; }
// The start of the token range, it is a poistion within a token stream.
Token::Index startTokenIndex() const { return StartIndex; }
// Returns the corresponding grammar rule.
// REQUIRES: this is a Sequence node.
RuleID rule() const {
assert(kind() == Sequence);
return Data & ((1 << RuleBits) - 1);
}
// Returns the parses of each element on the RHS of the rule.
// REQUIRES: this is a Sequence node;
llvm::ArrayRef<const ForestNode *> elements() const {
assert(kind() == Sequence);
return children(Data >> RuleBits);
};
// Returns all possible interpretations of the code.
// REQUIRES: this is an Ambiguous node.
llvm::ArrayRef<const ForestNode *> alternatives() const {
assert(kind() == Ambiguous);
return children(Data);
}
llvm::ArrayRef<const ForestNode *> children() const {
switch (kind()) {
case Sequence:
return elements();
case Ambiguous:
return alternatives();
case Terminal:
case Opaque:
return {};
}
llvm_unreachable("Bad kind");
}
// Iteration over all nodes in the forest, including this.
llvm::iterator_range<RecursiveIterator> descendants() const;
std::string dump(const Grammar &) const;
std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;
private:
friend class ForestArena;
ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
: StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}
ForestNode(const ForestNode &) = delete;
ForestNode &operator=(const ForestNode &) = delete;
ForestNode(ForestNode &&) = delete;
ForestNode &operator=(ForestNode &&) = delete;
static uint16_t sequenceData(RuleID Rule,
llvm::ArrayRef<const ForestNode *> Elements) {
assert(Rule < (1 << RuleBits));
assert(Elements.size() < (1 << (16 - RuleBits)));
return Rule | Elements.size() << RuleBits;
}
static uint16_t
ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
return Alternatives.size();
}
// Retrieves the trailing array.
llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
Num);
}
Token::Index StartIndex;
Kind K : 4;
SymbolID Symbol : SymbolBits;
// Sequence - child count : 4 | RuleID : RuleBits (12)
// Ambiguous - child count : 16
// Terminal, Opaque - unused
uint16_t Data;
// An array of ForestNode* following the object.
};
// ForestNode may not be destroyed (for BumpPtrAllocator).
static_assert(std::is_trivially_destructible<ForestNode>(), "");
// A memory arena for the parse forest.
class ForestArena {
public:
llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
ForestNode &createSequence(SymbolID SID, RuleID RID,
llvm::ArrayRef<const ForestNode *> Elements) {
assert(!Elements.empty());
return create(ForestNode::Sequence, SID,
Elements.front()->startTokenIndex(),
ForestNode::sequenceData(RID, Elements), Elements);
}
ForestNode &createAmbiguous(SymbolID SID,
llvm::ArrayRef<const ForestNode *> Alternatives) {
assert(!Alternatives.empty());
assert(llvm::all_of(Alternatives,
[SID](const ForestNode *Alternative) {
return SID == Alternative->symbol();
}) &&
"Ambiguous alternatives must represent the same symbol!");
return create(ForestNode::Ambiguous, SID,
Alternatives.front()->startTokenIndex(),
ForestNode::ambiguousData(Alternatives), Alternatives);
}
ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
return create(ForestNode::Opaque, SID, Start, 0, {});
}
ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {});
}
size_t nodeCount() const { return NodeCount; }
size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }
private:
ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
uint16_t Data,
llvm::ArrayRef<const ForestNode *> Elements) {
++NodeCount;
ForestNode *New = new (Arena.Allocate(
sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
alignof(ForestNode))) ForestNode(K, SID, Start, Data);
if (!Elements.empty())
llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
return *New;
}
llvm::BumpPtrAllocator Arena;
uint32_t NodeCount = 0;
};
class ForestNode::RecursiveIterator
: public std::iterator<std::input_iterator_tag, const ForestNode> {
llvm::DenseSet<const ForestNode *> Seen;
struct StackFrame {
const ForestNode *Parent;
unsigned ChildIndex;
};
std::vector<StackFrame> Stack;
const ForestNode *Cur;
public:
RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {}
const ForestNode &operator*() const { return *Cur; };
void operator++();
bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; }
bool operator!=(const RecursiveIterator &I) const { return !(*this == I); }
};
} // namespace pseudo
} // namespace clang
#endif // CLANG_PSEUDO_FOREST_H
|