File: NonDependent.agda

package info (click to toggle)
agda-stdlib 2.1-4
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 9,196 kB
  • sloc: haskell: 375; makefile: 32; sh: 28; lisp: 1
file content (221 lines) | stat: -rw-r--r-- 7,522 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
------------------------------------------------------------------------
-- The Agda standard library
--
-- Example use case for a trie: a wee generic lexer
------------------------------------------------------------------------

{-# OPTIONS --cubical-compatible --sized-types #-}

module README.Data.Trie.NonDependent where

------------------------------------------------------------------------
-- Introduction

-- A Trie is a tree of values indexed by words in a finite language. It
-- allows users to quickly compute the Brzozowski derivative of that
-- little mapping from words to values.

-- In the most general case, values can depend upon the list of characters
-- that constitutes the path leading to them. Here however we consider a
-- non-dependent case (cf. README.Trie.Dependent for a dependent use case).

-- We can recognize keywords by storing the list of characters they
-- correspond to as paths in a Trie and the constructor they are decoded
-- to as the tree's values.

-- E.g.
-- [     .      ] is a root
-- [  -- m -->  ] is an m-labeled edge and is followed when reading 'm'
-- [    (X)     ] is a value leaf storing constructor X

--                     --> -- m --> -- m --> -- a --> (LEMMA)
--                    /
--       -- l --> -- e --> -- t --> (LET)
--      /
--     /    -- u --> -- t --> -- u --> -- a --> -- l --> (MUTUAL)
--    /    /
--  .< -- m --> -- o --> -- d --> -- u --> -- l --> -- e --> (MODULE)
--    \
--     -- w --> -- h --> -- e --> -- r --> -- e --> (WHERE)
--                           \
--                            --> -- n --> (WHEN)


-- after reading 'w', we get the derivative:

--  . -- h --> -- e --> -- r --> -- e --> (WHERE)
--                 \
--                  --> -- n --> (WHEN)


open import Level
open import Data.Unit
open import Data.Bool
open import Data.Char              as Char
import Data.Char.Properties        as Char
open import Data.List.Base         as List using (List; []; _∷_)
open import Data.List.Fresh        as List# using (List#; []; _∷#_)
open import Data.Maybe             as Maybe
open import Data.Product.Base      as Product using (_×_; ∃; proj₁; _,_)
open import Data.String.Base       as String using (String)
open import Data.String.Properties as String using (_≟_)
open import Data.These             as These

open import Function.Base using (case_of_; _$_; _∘′_; id; _on_)
open import Relation.Nary
open import Relation.Binary.Core using (Rel)
open import Relation.Nullary.Decidable using (¬?)

open import Data.Trie Char.<-strictTotalOrder
open import Data.Tree.AVL.Value

------------------------------------------------------------------------
-- Generic lexer

record Lexer t : Set (suc t) where
  field
  -- Our lexer is parametrised over the type of tokens
    Tok : Set t

  -- Keywords are distinguished strings associated to tokens
  Keyword : Set t
  Keyword = String × Tok

  -- Two keywords are considered distinct if the strings are not equal
  Distinct : Rel Keyword 0ℓ
  Distinct a b = ⌊ ¬? ((proj₁ a) String.≟ (proj₁ b)) ⌋

  field

  -- We ask users to provide us with a fresh list of keywords to guarantee
  -- that no two keywords share the same string representation
    keywords : List# Keyword Distinct

  -- Some characters are special: they are separators, breaking a string
  -- into a list of tokens. Some are associated to a token value
  -- (e.g. parentheses) others are not (e.g. space)
    breaking : Char → ∃ λ b → if b then Maybe Tok else Lift _ ⊤

  -- Finally, strings which are not decoded as keywords are coerced
  -- using a function to token values.
    default  : String → Tok


module _ {t} (L : Lexer t) where
  open Lexer L

  tokenize : String → List Tok
  tokenize = start ∘′ String.toList where

   mutual

    -- A Trie is defined for an alphabet of strictly ordered letters (here
    -- we have picked Char for letters and decided to use the strict total
    -- order induced by their injection into ℕ as witnessed by the statement
    -- open import Data.Trie Char.strictTotalOrder earlier in this file).

    -- It is parametrised by a set of Values indexed over list of letters.
    -- Because we focus on the non-dependent case, we pick the constant
    -- family of Value uniformly equal to Tok. It is trivially compatible
    -- with the notion of equality underlying the strict total order on Chars.

    Keywords : Set _
    Keywords = Trie (const _ Tok) _

    -- We build a trie from the association list so that we may easily
    -- compute the successive derivatives obtained by eating the
    -- characters one by one

    init : Keywords
    init = fromList $ List.map (Product.map₁ String.toList) $ proj₁ $ List#.toList keywords

    -- Kickstart the tokeniser with an empty accumulator and the initial
    -- trie.
    start : List Char → List Tok
    start = loop [] init

    -- The main loop
    loop : (acc  : List Char)  → -- chars read so far in this token
           (toks : Keywords)   → -- keyword candidates left at this point
           (input : List Char) → -- list of chars to tokenize
           List Tok
    -- Empty input: finish up, check whether we have a non-empty accumulator
    loop acc toks []         = push acc []
    -- At least one character
    loop acc toks (c ∷ cs)   = case breaking c of λ where
      -- if we are supposed to break on this character, we do
      (true , m)  → push acc $ maybe′ _∷_ id m $ start cs
      -- otherwise we see whether it leads to a recognized keyword
      (false , _) → case lookupValue toks (c ∷ []) of λ where
        -- if so we can forget about the current accumulator and
        -- restart the tokenizer on the rest of the input
        (just tok) → tok ∷ start cs
        -- otherwise we record the character we read in the accumulator,
        -- compute the derivative of the map of keyword candidates and
        -- keep going with the rest of the input
        nothing    → loop (c ∷ acc) (lookupTrie toks c) cs

    -- Grab the accumulator and, unless it is empty, push it on top of
    -- the decoded list of tokens
    push : List Char → List Tok → List Tok
    push [] ts = ts
    push cs ts = default (String.fromList (List.reverse cs)) ∷ ts


------------------------------------------------------------------------
-- Concrete instance

-- A small set of keywords for a language with expressions of the form
-- `let x = e in b`.

module LetIn where

  data TOK : Set where
    LET EQ IN : TOK
    LPAR RPAR : TOK
    ID : String → TOK

  keywords : List# (String × TOK) (λ a b → ⌊ ¬? ((proj₁ a) String.≟ (proj₁ b)) ⌋)
  keywords =  ("let" , LET)
           ∷# ("="   , EQ)
           ∷# ("in"  , IN)
           ∷# []

  -- Breaking characters: spaces (thrown away) and parentheses (kept)
  breaking : Char → ∃ (λ b → if b then Maybe TOK else Lift 0ℓ ⊤)
  breaking c = if isSpace c then true , nothing else parens c where

    parens : Char → _
    parens '(' = true , just LPAR
    parens ')' = true , just RPAR
    parens _   = false , _

  default : String → TOK
  default = ID

letIn : Lexer 0ℓ
letIn = record { LetIn }

open import Agda.Builtin.Equality

-- A test case:

open LetIn
_ : tokenize letIn "fix f x = let b = fix f in (f b) x"
  ≡ ID "fix"
  ∷ ID "f"
  ∷ ID "x"
  ∷ EQ
  ∷ LET
  ∷ ID "b"
  ∷ EQ
  ∷ ID "fix"
  ∷ ID "f"
  ∷ IN
  ∷ LPAR
  ∷ ID "f"
  ∷ ID "b"
  ∷ RPAR
  ∷ ID "x"
  ∷ []
_ = refl