File: word_count.cpp

//  Copyright (c) 2001-2010 Hartmut Kaiser
// 
//  Distributed under the Boost Software License, Version 1.0. (See accompanying 
//  file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)

//  This example is equivalent to the following flex program:
/*
//[wcp_flex_version
    %{
        int c = 0, w = 0, l = 0;
    %}
    word   [^ \t\n]+
    eol    \n
    %%
    {word} { ++w; c += yyleng; }
    {eol}  { ++c; ++l; }
    .      { ++c; }
    %%
    main()
    {
        yylex();
        printf("%d %d %d\n", l, w, c);
    }
//]
*/
//  Its purpose is to replicate the word count functionality of the UNIX 'wc'
//  command: it prints the number of lines, words, and characters in a file.
//
//  The example additionally demonstrates how to use the add_pattern(...)(...)
//  syntax to define lexer patterns. These patterns are essentially
//  parameterless 'macros' for regular expressions, which simplify their
//  definition.
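//
//  For illustration, a hypothetical lexer might define and reuse a pattern
//  like this (a sketch only; 'IDENT' and both token names are assumptions,
//  not part of this example):
//
//      this->self.add_pattern("IDENT", "[a-zA-Z_][a-zA-Z0-9_]*");
//      function_call = "{IDENT}\\(";   // '{IDENT}' expands to the pattern
//      identifier    = "{IDENT}";      // the same 'macro', reused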

// #define BOOST_SPIRIT_LEXERTL_DEBUG
#define BOOST_VARIANT_MINIMIZE_SIZE

#include <boost/config/warning_disable.hpp>
//[wcp_includes
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/lex_lexertl.hpp>
#include <boost/spirit/include/phoenix_operator.hpp>
#include <boost/spirit/include/phoenix_statement.hpp>
#include <boost/spirit/include/phoenix_container.hpp>
//]

#include <iostream>
#include <string>

#include "example.hpp"

//[wcp_namespaces
using namespace boost::spirit;
using namespace boost::spirit::ascii;
//]

///////////////////////////////////////////////////////////////////////////////
//  Token definition: We use the lexertl based lexer engine as the underlying 
//                    lexer type.
///////////////////////////////////////////////////////////////////////////////
//[wcp_token_ids
enum tokenids 
{
    IDANY = lex::min_token_id + 10
};
//]
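// Note: ids below lex::min_token_id are reserved by the library, and tokens
// added without an explicit id (like 'word' and '\n' below) are assigned ids
// automatically starting at lex::min_token_id, so the offset of +10 keeps
// IDANY clear of those automatic assignments.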

//[wcp_token_definition
template <typename Lexer>
struct word_count_tokens : lex::lexer<Lexer>
{
    word_count_tokens()
    {
        // define patterns (lexer macros) to be used during token definition 
        // below
        this->self.add_pattern
            ("WORD", "[^ \t\n]+")
        ;

        // define tokens and associate them with the lexer
        word = "{WORD}";    // reference the pattern 'WORD' as defined above

        // this lexer will recognize 3 token types: words, newlines, and 
        // everything else
        this->self.add
            (word)          // no token id is needed here
            ('\n')          // characters are usable as tokens as well
            (".", IDANY)    // string literals will not be escaped by the library
        ;
    }

    // the token 'word' exposes the matched string as its parser attribute
    lex::token_def<std::string> word;
};
//]
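// As a sketch of what this lexer produces: on the input "one two\n" it would
// emit word("one"), IDANY(" "), word("two"), and the '\n' token; every
// character of the input is consumed by exactly one of the three token types.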

///////////////////////////////////////////////////////////////////////////////
//  Grammar definition
///////////////////////////////////////////////////////////////////////////////
//[wcp_grammar_definition
template <typename Iterator>
struct word_count_grammar : qi::grammar<Iterator>
{
    template <typename TokenDef>
    word_count_grammar(TokenDef const& tok)
      : word_count_grammar::base_type(start)
      , c(0), w(0), l(0)
    {
        using boost::phoenix::ref;
        using boost::phoenix::size;

        start =  *(   tok.word          [++ref(w), ref(c) += size(_1)]
                  |   lit('\n')         [++ref(c), ++ref(l)] 
                  |   qi::token(IDANY)  [++ref(c)]
                  )
              ;
    }

    std::size_t c, w, l;
    qi::rule<Iterator> start;
};
//]
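// The semantic actions above use Phoenix: ref(w) binds the member counter by
// reference so it can be incremented during the parse, and size(_1) yields the
// length of the matched token's string attribute. The net effect is the same
// three counters (c, w, l) the flex version maintains.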

///////////////////////////////////////////////////////////////////////////////
//[wcp_main
int main(int argc, char* argv[])
{
/*<  Define the token type to be used: `std::string` is available as the 
     type of the token attribute 
>*/  typedef lex::lexertl::token<
        char const*, boost::mpl::vector<std::string>
    > token_type;

/*<  Define the lexer type to use implementing the state machine
>*/  typedef lex::lexertl::lexer<token_type> lexer_type;

/*<  Define the iterator type exposed by the lexer type
>*/  typedef word_count_tokens<lexer_type>::iterator_type iterator_type;

    // now we use the types defined above to create the lexer and grammar
    // object instances needed to invoke the parsing process
    word_count_tokens<lexer_type> word_count;          // Our lexer
    word_count_grammar<iterator_type> g (word_count);  // Our parser 

    // read the file into memory
    std::string str (read_from_file(1 == argc ? "word_count.input" : argv[1]));
    char const* first = str.c_str();
    char const* last = &first[str.size()];

/*<  Parsing is done based on the token stream, not the character 
     stream read from the input. The function `tokenize_and_parse()` wraps
     the passed iterator range `[first, last)` by the lexical analyzer and 
     uses its exposed iterators to parse the token stream.
>*/  bool r = lex::tokenize_and_parse(first, last, word_count, g);

    if (r) {
        std::cout << "lines: " << g.l << ", words: " << g.w 
                  << ", characters: " << g.c << "\n";
    }
    else {
        std::string rest(first, last);
        std::cerr << "Parsing failed\n" << "stopped at: \"" 
                  << rest << "\"\n";
    }
    return 0;
}
//]
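
// A possible way to build and run this example (the Boost path is an
// assumption, adjust for your setup; example.hpp must sit alongside this
// file, as it does in the Boost.Spirit examples directory):
//
//      g++ -I /path/to/boost word_count.cpp -o word_count
//      ./word_count word_count.input
//
// On success it prints the line, word, and character counts in the format
// shown in main() above.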