File: LRTable.h

package info (click to toggle)
llvm-toolchain-15 1%3A15.0.6-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,554,644 kB
  • sloc: cpp: 5,922,452; ansic: 1,012,136; asm: 674,362; python: 191,568; objc: 73,855; f90: 42,327; lisp: 31,913; pascal: 11,973; javascript: 10,144; sh: 9,421; perl: 7,447; ml: 5,527; awk: 3,523; makefile: 2,520; xml: 885; cs: 573; fortran: 567
file content (276 lines) | stat: -rw-r--r-- 11,168 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
//===--- LRTable.h - Define LR Parsing Table ---------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  The LRTable (referred as LR parsing table in the LR literature) is the core
//  component in LR parsers, it drives the LR parsers by specifying an action to
//  take given the current state on the top of the stack and the current
//  lookahead token.
//
//  The LRTable can be described as a matrix where the rows represent
//  the states of the LR graph, the columns represent the symbols of the
//  grammar, and each entry of the matrix (called action) represents a
//  state transition in the graph.
//
//  Typically, based on the category of the grammar symbol, the LRTable is
//  broken into two logically separate tables:
//    - ACTION table with terminals as columns -- e.g. ACTION[S, a] specifies
//      next action (shift/reduce) on state S under a lookahead terminal a
//    - GOTO table with nonterminals as columns -- e.g. GOTO[S, X] specifies
//      the state which we transist to from the state S with the nonterminal X
//
//  LRTable is *performance-critial* as it is consulted frequently during a
//  parse. In general, LRTable is very sparse (most of the entries are empty).
//  For example, for the C++ language, the SLR table has ~1500 states and 650
//  symbols which results in a matrix having 975K entries, ~90% of entries are
//  empty.
//
//  This file implements a speed-and-space-efficient LRTable.
//
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_GRAMMAR_LRTABLE_H
#define CLANG_PSEUDO_GRAMMAR_LRTABLE_H

#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Support/Capacity.h"
#include "llvm/Support/MathExtras.h"
#include <cstdint>
#include <vector>

namespace clang {
namespace pseudo {

// Represents the LR parsing table, which can efficiently the question "what is
// the next step given the lookahead token and current state on top of the
// stack?".
//
// This is a dense implementation, which only takes an amount of space that is
// proportional to the number of non-empty entries in the table.
//
// Unlike the typical LR parsing table which allows at most one available action
// per entry, conflicted actions are allowed in LRTable. The LRTable is designed
// to be used in nondeterministic LR parsers (e.g. GLR).
//
// There are no "accept" actions in the LRTable, instead the stack is inspected
// after parsing completes: is the state goto(StartState, StartSymbol)?
class LRTable {
public:
  // StateID is only 13 bits wide.
  using StateID = uint16_t;
  static constexpr unsigned StateBits = 13;

  struct Recovery {
    ExtensionID Strategy;
    SymbolID Result;
  };

  // Returns the state after we reduce a nonterminal.
  // Expected to be called by LR parsers.
  // If the nonterminal is invalid here, returns None.
  llvm::Optional<StateID> getGoToState(StateID State,
                                       SymbolID Nonterminal) const {
    return Gotos.get(gotoIndex(State, Nonterminal, numStates()));
  }
  // Returns the state after we shift a terminal.
  // Expected to be called by LR parsers.
  // If the terminal is invalid here, returns None.
  llvm::Optional<StateID> getShiftState(StateID State,
                                        SymbolID Terminal) const {
    return Shifts.get(shiftIndex(State, Terminal, numStates()));
  }

  // Returns the possible reductions from a state.
  //
  // These are not keyed by a lookahead token. Instead, call canFollow() to
  // check whether a reduction should apply in the current context:
  //   for (RuleID R : LR.getReduceRules(S)) {
  //     if (!LR.canFollow(G.lookupRule(R).Target, NextToken))
  //       continue;
  //     // ...apply reduce...
  //   }
  llvm::ArrayRef<RuleID> getReduceRules(StateID State) const {
    assert(State + 1u < ReduceOffset.size());
    return llvm::makeArrayRef(Reduces.data() + ReduceOffset[State],
                              Reduces.data() + ReduceOffset[State+1]);
  }
  // Returns whether Terminal can follow Nonterminal in a valid source file.
  bool canFollow(SymbolID Nonterminal, SymbolID Terminal) const {
    assert(isToken(Terminal));
    assert(isNonterminal(Nonterminal));
    return FollowSets.test(tok::NUM_TOKENS * Nonterminal +
                           symbolToToken(Terminal));
  }

  // Looks up available recovery actions if we stopped parsing in this state.
  llvm::ArrayRef<Recovery> getRecovery(StateID State) const {
    return llvm::makeArrayRef(Recoveries.data() + RecoveryOffset[State],
                              Recoveries.data() + RecoveryOffset[State + 1]);
  }

  // Returns the state from which the LR parser should start to parse the input
  // tokens as the given StartSymbol.
  //
  // In LR parsing, the start state of `translation-unit` corresponds to
  // `_ := • translation-unit`.
  //
  // Each start state responds to **a** single grammar rule like `_ := start`.
  // REQUIRE: The given StartSymbol must exist in the grammar (in a form of
  //          `_ := start`).
  StateID getStartState(SymbolID StartSymbol) const;

  size_t bytes() const {
    return sizeof(*this) + Gotos.bytes() + Shifts.bytes() +
           llvm::capacity_in_bytes(Reduces) +
           llvm::capacity_in_bytes(ReduceOffset) +
           llvm::capacity_in_bytes(FollowSets);
  }

  std::string dumpStatistics() const;
  std::string dumpForTests(const Grammar &G) const;

  // Build a SLR(1) parsing table.
  static LRTable buildSLR(const Grammar &G);

  // Helper for building a table with specified actions/states.
  struct Builder {
    Builder() = default;
    Builder(const Grammar &G) {
      NumNonterminals = G.table().Nonterminals.size();
      FollowSets = followSets(G);
    }

    unsigned int NumNonterminals = 0;
    // States representing `_ := . start` for various start symbols.
    std::vector<std::pair<SymbolID, StateID>> StartStates;
    // State transitions `X := ABC . D EFG` => `X := ABC D . EFG`.
    // Key is (initial state, D), value is final state.
    llvm::DenseMap<std::pair<StateID, SymbolID>, StateID> Transition;
    // Reductions available in a given state.
    llvm::DenseMap<StateID, llvm::SmallSet<RuleID, 4>> Reduce;
    // FollowSets[NT] is the set of terminals that can follow the nonterminal.
    std::vector<llvm::DenseSet<SymbolID>> FollowSets;
    // Recovery options available at each state.
    std::vector<std::pair<StateID, Recovery>> Recoveries;

    LRTable build() &&;
  };

private:
  unsigned numStates() const { return ReduceOffset.size() - 1; }

  // A map from unsigned key => StateID, used to store actions.
  // The keys should be sequential but the values are somewhat sparse.
  //
  // In practice, the keys encode (origin state, symbol) pairs, and the values
  // are the state we should move to after seeing that symbol.
  //
  // We store one bit for presence/absence of the value for each key.
  // At every 64th key, we store the offset into the table of values.
  //   e.g. key 0x500 is checkpoint 0x500/64 = 20
  //                     Checkpoints[20] = 34
  //        get(0x500) = Values[34]                (assuming it has a value)
  // To look up values in between, we count the set bits:
  //        get(0x509) has a value if HasValue[20] & (1<<9)
  //        #values between 0x500 and 0x509: popcnt(HasValue[20] & (1<<9 - 1))
  //        get(0x509) = Values[34 + popcnt(...)]
  //
  // Overall size is 1.25 bits/key + 16 bits/value.
  // Lookup is constant time with a low factor (no hashing).
  class TransitionTable {
    using Word = uint64_t;
    constexpr static unsigned WordBits = CHAR_BIT * sizeof(Word);

    std::vector<StateID> Values;
    std::vector<Word> HasValue;
    std::vector<uint16_t> Checkpoints;

  public:
    TransitionTable() = default;
    TransitionTable(const llvm::DenseMap<unsigned, StateID> &Entries,
                    unsigned NumKeys) {
      assert(
          Entries.size() <
              std::numeric_limits<decltype(Checkpoints)::value_type>::max() &&
          "16 bits too small for value offsets!");
      unsigned NumWords = (NumKeys + WordBits - 1) / WordBits;
      HasValue.resize(NumWords, 0);
      Checkpoints.reserve(NumWords);
      Values.reserve(Entries.size());
      for (unsigned I = 0; I < NumKeys; ++I) {
        if ((I % WordBits) == 0)
          Checkpoints.push_back(Values.size());
        auto It = Entries.find(I);
        if (It != Entries.end()) {
          HasValue[I / WordBits] |= (Word(1) << (I % WordBits));
          Values.push_back(It->second);
        }
      }
    }

    llvm::Optional<StateID> get(unsigned Key) const {
      // Do we have a value for this key?
      Word KeyMask = Word(1) << (Key % WordBits);
      unsigned KeyWord = Key / WordBits;
      if ((HasValue[KeyWord] & KeyMask) == 0)
        return llvm::None;
      // Count the number of values since the checkpoint.
      Word BelowKeyMask = KeyMask - 1;
      unsigned CountSinceCheckpoint =
          llvm::countPopulation(HasValue[KeyWord] & BelowKeyMask);
      // Find the value relative to the last checkpoint.
      return Values[Checkpoints[KeyWord] + CountSinceCheckpoint];
    }

    unsigned size() const { return Values.size(); }

    size_t bytes() const {
      return llvm::capacity_in_bytes(HasValue) +
             llvm::capacity_in_bytes(Values) +
             llvm::capacity_in_bytes(Checkpoints);
    }
  };
  // Shift and Goto tables are keyed by encoded (State, Symbol).
  static unsigned shiftIndex(StateID State, SymbolID Terminal,
                             unsigned NumStates) {
    return NumStates * symbolToToken(Terminal) + State;
  }
  static unsigned gotoIndex(StateID State, SymbolID Nonterminal,
                            unsigned NumStates) {
    assert(isNonterminal(Nonterminal));
    return NumStates * Nonterminal + State;
  }
  TransitionTable Shifts;
  TransitionTable Gotos;

  // A sorted table, storing the start state for each target parsing symbol.
  std::vector<std::pair<SymbolID, StateID>> StartStates;

  // Given a state ID S, the half-open range of Reduces is
  // [ReduceOffset[S], ReduceOffset[S+1])
  std::vector<uint32_t> ReduceOffset;
  std::vector<RuleID> Reduces;
  // Conceptually this is a bool[SymbolID][Token], each entry describing whether
  // the grammar allows the (nonterminal) symbol to be followed by the token.
  //
  // This is flattened by encoding the (SymbolID Nonterminal, tok::Kind Token)
  // as an index: Nonterminal * NUM_TOKENS + Token.
  llvm::BitVector FollowSets;

  // Recovery stores all recovery actions from all states.
  // A given state has [RecoveryOffset[S], RecoveryOffset[S+1]).
  std::vector<uint32_t> RecoveryOffset;
  std::vector<Recovery> Recoveries;
};

} // namespace pseudo
} // namespace clang

#endif // CLANG_PSEUDO_GRAMMAR_LRTABLE_H