File: pythonic.rs

use chumsky::{prelude::*, BoxStream, Flat};
use std::ops::Range;

// Represents the different kinds of delimiters we care about
#[derive(Copy, Clone, Debug)]
enum Delim {
    Paren,
    Block,
}

// An 'atomic' token (i.e. it has no child tokens)
#[derive(Clone, Debug)]
enum Token {
    Int(u64),
    Ident(String),
    Op(String),
    Open(Delim),
    Close(Delim),
}

// The output of the lexer: a recursive tree of nested tokens
#[derive(Debug)]
enum TokenTree {
    Token(Token),
    Tree(Delim, Vec<Spanned<TokenTree>>),
}

type Span = Range<usize>;

type Spanned<T> = (T, Span);
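
// A spanned value pairs data with the source range it came from, e.g. an
// identifier at offsets 4..7 would lex to `(Token::Ident("foo".into()), 4..7)`
// (offsets illustrative).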

// A parser that turns pythonic code with semantic whitespace into a token tree
fn lexer() -> impl Parser<char, Vec<Spanned<TokenTree>>, Error = Simple<char>> {
    let tt = recursive(|tt| {
        // Define some atomic tokens
        let int = text::int(10).from_str().unwrapped().map(Token::Int);
        let ident = text::ident().map(Token::Ident);
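        // Operators are runs of one or more of the symbol characters `=.:%,`,
        // collected into a `String`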
        let op = one_of("=.:%,")
            .repeated()
            .at_least(1)
            .collect()
            .map(Token::Op);

        let single_token = int.or(op).or(ident).map(TokenTree::Token);

        // Tokens surrounded by parentheses get turned into parenthesised token trees
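        // (because `tt` is recursive, parenthesised trees can nest arbitrarily deep)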
        let token_tree = tt
            .padded()
            .repeated()
            .delimited_by(just('('), just(')'))
            .map(|tts| TokenTree::Tree(Delim::Paren, tts));

        single_token
            .or(token_tree)
            .map_with_span(|tt, span| (tt, span))
    });

    // Whitespace indentation creates code block token trees
    text::semantic_indentation(tt, |tts, span| (TokenTree::Tree(Delim::Block, tts), span))
        .then_ignore(end())
}
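
// As a rough sketch of the lexer's output (spans elided), an input like:
//
//     if x:
//         y = 1
//
// lexes to something like:
//
//     [Token(Ident("if")), Token(Ident("x")), Token(Op(":")),
//      Tree(Block, [Token(Ident("y")), Token(Op("=")), Token(Int(1))])]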

/// Flatten a series of token trees into a single token stream, ready for feeding into the main parser
fn tts_to_stream(
    eoi: Span,
    token_trees: Vec<Spanned<TokenTree>>,
) -> BoxStream<'static, Token, Span> {
    use std::iter::once;

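    // `from_nested` walks each tree lazily: this closure tells it whether a node
    // is a single token or a sequence of child trees to recurse into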
    BoxStream::from_nested(eoi, token_trees.into_iter(), |(tt, span)| match tt {
        // Single tokens remain unchanged
        TokenTree::Token(token) => Flat::Single((token, span)),
        // Nested token trees get flattened into their inner contents, surrounded by `Open` and `Close` tokens
        TokenTree::Tree(delim, tree) => Flat::Many(
            once((TokenTree::Token(Token::Open(delim)), span.clone()))
                .chain(tree.into_iter())
                .chain(once((TokenTree::Token(Token::Close(delim)), span))),
        ),
    })
}
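
// For instance, the token tree for `(1, 2)` flattens to `Open(Paren), Int(1),
// Op(","), Int(2), Close(Paren)`, with the tree's overall span reused for both
// the `Open` and `Close` tokens.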

fn main() {
    let code = include_str!("sample.py");

    // First, lex the code into some nested token trees
    let tts = lexer().parse(code).unwrap();

    println!("--- Token Trees ---\n{:#?}", tts);

    // Next, flatten the token trees into a single token stream. The `eoi` span
    // covers the whole input and is the span reported for errors at the end of input.
    let eoi = 0..code.chars().count();
    let mut token_stream = tts_to_stream(eoi, tts);

    // At this point, we have a token stream that could be fed into the main parser. Because this is just an
    // example, we instead collect the token stream into a vector and print it.

    let flattened_trees = token_stream.fetch_tokens().collect::<Vec<_>>();

    println!("--- Flattened Token Trees ---\n{:?}", flattened_trees);
}