File: LRGraph.h

package info (click to toggle)
llvm-toolchain-15 1%3A15.0.6-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,554,644 kB
  • sloc: cpp: 5,922,452; ansic: 1,012,136; asm: 674,362; python: 191,568; objc: 73,855; f90: 42,327; lisp: 31,913; pascal: 11,973; javascript: 10,144; sh: 9,421; perl: 7,447; ml: 5,527; awk: 3,523; makefile: 2,520; xml: 885; cs: 573; fortran: 567
file content (196 lines) | stat: -rw-r--r-- 6,954 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
//===--- LRGraph.h - Build an LR automaton  ------------------*- C++-*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  LR parsers are bottom-up parsers -- they scan the input from left to right,
//  and collect the right-hand side of a production rule (called handle) on top
//  of the stack, then replace (reduce) the handle with the nonterminal defined
//  by the production rule.
//
//  This file defines LRGraph, a deterministic handle-finding finite-state
//  automaton, which is a key component in LR parsers to recognize any of
//  handles in the grammar efficiently. We build the LR table (ACTION and GOTO
//  Table) based on the LRGraph.
//
//  LRGraph can be constructed for any context-free grammars.
//  Even for a LR-ambiguous grammar, we can construct a deterministic FSA, but
//  interpretation of the FSA is nondeterministic -- we might in a state where
//  we can continue searching an handle and identify a handle (called
//  shift/reduce conflicts), or identify more than one handle (callled
//  reduce/reduce conflicts).
//
//  LRGraph is a common model for all variants of LR automatons, from the most
//  basic one LR(0), the powerful SLR(1), LR(1) which uses a one-token lookahead
//  in making decisions.
//===----------------------------------------------------------------------===//

#ifndef CLANG_PSEUDO_GRAMMAR_LRGRAPH_H
#define CLANG_PSEUDO_GRAMMAR_LRGRAPH_H

#include "clang-pseudo/grammar/Grammar.h"
#include "llvm/ADT/Hashing.h"
#include <vector>

namespace clang {
namespace pseudo {

// An LR item -- a grammar rule with a dot at some position of the body.
// e.g. a production rule A := X Y yields 3 items:
//   A := . X Y
//   A := X . Y
//   A := X Y .
// An item indicates how much of a production rule has been recognized at a
// position (described by dot), for example, A := X . Y indicates that we have
// recognized the X part from the input, and we hope next to see the input
// derivable from Y.
class Item {
public:
  static Item start(RuleID ID, const Grammar &G) {
    Item I;
    I.RID = ID;
    I.RuleLength = G.lookupRule(ID).Size;
    return I;
  }
  static Item sentinel(RuleID ID) {
    Item I;
    I.RID = ID;
    return I;
  }

  RuleID rule() const { return RID; }
  uint8_t dot() const { return DotPos; }

  bool hasNext() const { return DotPos < RuleLength; }
  SymbolID next(const Grammar &G) const {
    assert(hasNext());
    return G.lookupRule(RID).Sequence[DotPos];
  }

  Item advance() const {
    assert(hasNext());
    Item I = *this;
    ++I.DotPos;
    return I;
  }

  std::string dump(const Grammar &G) const;

  bool operator==(const Item &I) const {
    return DotPos == I.DotPos && RID == I.RID;
  }
  bool operator<(const Item &I) const {
    return std::tie(RID, DotPos) < std::tie(I.RID, I.DotPos);
  }
  friend llvm::hash_code hash_value(const Item &I) {
    return llvm::hash_combine(I.RID, I.DotPos);
  }

private:
  RuleID RID = 0;
  uint8_t DotPos = 0;
  uint8_t RuleLength = 0; // the length of rule body.
};

// A state represents a node in the LR automaton graph. It is an item set, which
// contains all possible rules that the LR parser may be parsing in that state.
//
// Conceptually, If we knew in advance what we're parsing, at any point we're
// partway through parsing a production, sitting on a stack of partially parsed
// productions. But because we don't know, there could be *several* productions
// we're partway through. The set of possibilities is the parser state, and we
// precompute all the transitions between these states.
struct State {
  // A full set of items (including non-kernel items) representing the state,
  // in a canonical order (see SortByNextSymbol in the cpp file).
  std::vector<Item> Items;

  std::string dump(const Grammar &G, unsigned Indent = 0) const;
};

// LRGraph is a deterministic finite state automaton for LR parsing.
//
// Intuitively, an LR automaton is a transition graph. The graph has a
// collection of nodes, called States. Each state corresponds to a particular
// item set, which represents a condition that could occur during the process of
// parsing a production. Edges are directed from one state to another. Each edge
// is labeled by a grammar symbol (terminal or nonterminal).
//
// LRGraph is used to construct the LR parsing table which is a core
// data-structure driving the LR parser.
class LRGraph {
public:
  // StateID is the index in States table.
  using StateID = uint16_t;

  // Constructs an LR(0) automaton.
  static LRGraph buildLR0(const Grammar &);

  // An edge in the LR graph, it represents a transition in the LR automaton.
  // If the parser is at state Src, with a lookahead Label, then it
  // transits to state Dst.
  struct Edge {
    StateID Src, Dst;
    SymbolID Label;
  };

  // A possible error recovery: choose to match some tokens against a symbol.
  //
  // e.g. a state that contains
  //   stmt := { . stmt-seq [recover=braces] }
  // has a Recovery { Src = S, Strategy=braces, Result=stmt-seq }.
  struct Recovery {
    StateID Src; // The state we are in when encountering the error.
    ExtensionID Strategy;      // Heuristic choosing the tokens to match.
    SymbolID Result;           // The symbol that is produced.
  };

  llvm::ArrayRef<State> states() const { return States; }
  llvm::ArrayRef<Edge> edges() const { return Edges; }
  llvm::ArrayRef<Recovery> recoveries() const { return Recoveries; }
  llvm::ArrayRef<std::pair<SymbolID, StateID>> startStates() const {
    return StartStates;
  }

  std::string dumpForTests(const Grammar &) const;

private:
  LRGraph(std::vector<State> States, std::vector<Edge> Edges,
          std::vector<Recovery> Recoveries,
          std::vector<std::pair<SymbolID, StateID>> StartStates)
      : States(std::move(States)), Edges(std::move(Edges)),
        Recoveries(std::move(Recoveries)), StartStates(std::move(StartStates)) {
  }

  std::vector<State> States;
  std::vector<Edge> Edges;
  std::vector<Recovery> Recoveries;
  std::vector<std::pair<SymbolID, StateID>> StartStates;
};

} // namespace pseudo
} // namespace clang

namespace llvm {
// Support clang::pseudo::Item as DenseMap keys.
template <> struct DenseMapInfo<clang::pseudo::Item> {
  static inline clang::pseudo::Item getEmptyKey() {
    return clang::pseudo::Item::sentinel(-1);
  }
  static inline clang::pseudo::Item getTombstoneKey() {
    return clang::pseudo::Item::sentinel(-2);
  }
  static unsigned getHashValue(const clang::pseudo::Item &I) {
    return hash_value(I);
  }
  static bool isEqual(const clang::pseudo::Item &LHS,
                      const clang::pseudo::Item &RHS) {
    return LHS == RHS;
  }
};
} // namespace llvm

#endif // CLANG_PSEUDO_GRAMMAR_LRGRAPH_H