File: Forest.h

package info (click to toggle)
llvm-toolchain-15 1%3A15.0.6-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,554,644 kB
  • sloc: cpp: 5,922,452; ansic: 1,012,136; asm: 674,362; python: 191,568; objc: 73,855; f90: 42,327; lisp: 31,913; pascal: 11,973; javascript: 10,144; sh: 9,421; perl: 7,447; ml: 5,527; awk: 3,523; makefile: 2,520; xml: 885; cs: 573; fortran: 567
file content (223 lines) | stat: -rw-r--r-- 8,101 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
//===--- Forest.h - Parse forest, the output of the GLR parser ---*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// A parse forest represents a set of possible parse trees efficiently, it is
// produced by the GLR parser.
//
// Despite the name, its data structure is a tree-like DAG with a single root.
// Multiple ways to parse the same tokens are presented as an ambiguous node
// with all possible interpretations as children.
// Common sub-parses are shared: if two interpretations both parse "1 + 1" as
// "expr := expr + expr", they will share a Sequence node representing the expr.
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_FOREST_H
#define CLANG_PSEUDO_FOREST_H

#include "clang-pseudo/Token.h"
#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/Allocator.h"
#include <cstdint>

namespace clang {
namespace pseudo {

// A node represents ways to parse a sequence of tokens, it interprets a fixed
// range of tokens as a fixed grammar symbol.
//
// There are different kinds of nodes, some nodes have "children" (stored in a
// trailing array) and have pointers to them. "Children" has different semantics
// depending on the node kinds. For an Ambiguous node, it means all
// possible interpretations; for a Sequence node, it means each symbol on the
// right hand side of the production rule.
//
// Since this is a node in a DAG, a node may have multiple parents. And a node
// doesn't have parent pointers.
class alignas(class ForestNode *) ForestNode {
public:
  class RecursiveIterator;
  enum Kind {
    // A Terminal node is a single terminal symbol bound to a token.
    Terminal,
    // A Sequence node is a nonterminal symbol parsed from a grammar rule,
    // elements() are the parses of each symbol on the RHS of the rule.
    // If the rule is A := X Y Z, the node is for nonterminal A, and elements()
    // are [X, Y, Z].
    Sequence,
    // An Ambiguous node exposes multiple ways to interpret the code as the
    // same symbol, alternatives() are all possible parses.
    Ambiguous,
    // An Opaque node is a placeholder. It asserts that tokens match a symbol,
    // without saying how.
    // It is used for lazy-parsing (not parsed yet), or error-recovery (invalid
    // code).
    Opaque,
  };
  Kind kind() const { return K; }

  SymbolID symbol() const { return Symbol; }

  // The start of the token range, it is a poistion within a token stream.
  Token::Index startTokenIndex() const { return StartIndex; }

  // Returns the corresponding grammar rule.
  // REQUIRES: this is a Sequence node.
  RuleID rule() const {
    assert(kind() == Sequence);
    return Data & ((1 << RuleBits) - 1);
  }
  // Returns the parses of each element on the RHS of the rule.
  // REQUIRES: this is a Sequence node;
  llvm::ArrayRef<const ForestNode *> elements() const {
    assert(kind() == Sequence);
    return children(Data >> RuleBits);
  };

  // Returns all possible interpretations of the code.
  // REQUIRES: this is an Ambiguous node.
  llvm::ArrayRef<const ForestNode *> alternatives() const {
    assert(kind() == Ambiguous);
    return children(Data);
  }

  llvm::ArrayRef<const ForestNode *> children() const {
    switch (kind()) {
    case Sequence:
      return elements();
    case Ambiguous:
      return alternatives();
    case Terminal:
    case Opaque:
      return {};
    }
    llvm_unreachable("Bad kind");
  }

  // Iteration over all nodes in the forest, including this.
  llvm::iterator_range<RecursiveIterator> descendants() const;

  std::string dump(const Grammar &) const;
  std::string dumpRecursive(const Grammar &, bool Abbreviated = false) const;

private:
  friend class ForestArena;

  ForestNode(Kind K, SymbolID Symbol, Token::Index StartIndex, uint16_t Data)
      : StartIndex(StartIndex), K(K), Symbol(Symbol), Data(Data) {}

  ForestNode(const ForestNode &) = delete;
  ForestNode &operator=(const ForestNode &) = delete;
  ForestNode(ForestNode &&) = delete;
  ForestNode &operator=(ForestNode &&) = delete;

  static uint16_t sequenceData(RuleID Rule,
                               llvm::ArrayRef<const ForestNode *> Elements) {
    assert(Rule < (1 << RuleBits));
    assert(Elements.size() < (1 << (16 - RuleBits)));
    return Rule | Elements.size() << RuleBits;
  }
  static uint16_t
  ambiguousData(llvm::ArrayRef<const ForestNode *> Alternatives) {
    return Alternatives.size();
  }

  // Retrieves the trailing array.
  llvm::ArrayRef<const ForestNode *> children(uint16_t Num) const {
    return llvm::makeArrayRef(reinterpret_cast<ForestNode *const *>(this + 1),
                              Num);
  }

  Token::Index StartIndex;
  Kind K : 4;
  SymbolID Symbol : SymbolBits;
  // Sequence - child count : 4 | RuleID : RuleBits (12)
  // Ambiguous - child count : 16
  // Terminal, Opaque - unused
  uint16_t Data;
  // An array of ForestNode* following the object.
};
// ForestNode may not be destroyed (for BumpPtrAllocator).
static_assert(std::is_trivially_destructible<ForestNode>(), "");

// A memory arena for the parse forest.
class ForestArena {
public:
  llvm::ArrayRef<ForestNode> createTerminals(const TokenStream &Code);
  ForestNode &createSequence(SymbolID SID, RuleID RID,
                             llvm::ArrayRef<const ForestNode *> Elements) {
    assert(!Elements.empty());
    return create(ForestNode::Sequence, SID,
                  Elements.front()->startTokenIndex(),
                  ForestNode::sequenceData(RID, Elements), Elements);
  }
  ForestNode &createAmbiguous(SymbolID SID,
                              llvm::ArrayRef<const ForestNode *> Alternatives) {
    assert(!Alternatives.empty());
    assert(llvm::all_of(Alternatives,
                        [SID](const ForestNode *Alternative) {
                          return SID == Alternative->symbol();
                        }) &&
           "Ambiguous alternatives must represent the same symbol!");
    return create(ForestNode::Ambiguous, SID,
                  Alternatives.front()->startTokenIndex(),
                  ForestNode::ambiguousData(Alternatives), Alternatives);
  }
  ForestNode &createOpaque(SymbolID SID, Token::Index Start) {
    return create(ForestNode::Opaque, SID, Start, 0, {});
  }

  ForestNode &createTerminal(tok::TokenKind TK, Token::Index Start) {
    return create(ForestNode::Terminal, tokenSymbol(TK), Start, 0, {});
  }

  size_t nodeCount() const { return NodeCount; }
  size_t bytes() const { return Arena.getBytesAllocated() + sizeof(this); }

private:
  ForestNode &create(ForestNode::Kind K, SymbolID SID, Token::Index Start,
                     uint16_t Data,
                     llvm::ArrayRef<const ForestNode *> Elements) {
    ++NodeCount;
    ForestNode *New = new (Arena.Allocate(
        sizeof(ForestNode) + Elements.size() * sizeof(ForestNode *),
        alignof(ForestNode))) ForestNode(K, SID, Start, Data);
    if (!Elements.empty())
      llvm::copy(Elements, reinterpret_cast<const ForestNode **>(New + 1));
    return *New;
  }

  llvm::BumpPtrAllocator Arena;
  uint32_t NodeCount = 0;
};

class ForestNode::RecursiveIterator
    : public std::iterator<std::input_iterator_tag, const ForestNode> {
  llvm::DenseSet<const ForestNode *> Seen;
  struct StackFrame {
    const ForestNode *Parent;
    unsigned ChildIndex;
  };
  std::vector<StackFrame> Stack;
  const ForestNode *Cur;

public:
  RecursiveIterator(const ForestNode *N = nullptr) : Cur(N) {}

  const ForestNode &operator*() const { return *Cur; };
  void operator++();
  bool operator==(const RecursiveIterator &I) const { return Cur == I.Cur; }
  bool operator!=(const RecursiveIterator &I) const { return !(*this == I); }
};

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_FOREST_H