File: mt_word_counter.cc

package info (click to toggle)
parallel-hashmap 1.4.1%2Bds-2
  • links: PTS, VCS
  • area: main
  • in suites: forky, sid, trixie
  • size: 3,872 kB
  • sloc: cpp: 20,492; ansic: 1,114; python: 492; makefile: 85; haskell: 56; perl: 43; sh: 23
file content (84 lines) | stat: -rw-r--r-- 3,076 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#include <iostream>
#include <fstream>
#include <sstream>
#include <parallel_hashmap/phmap.h>
#include <parallel_hashmap/btree.h>
#include <thread>
#include <array>
#include <vector>
#include <algorithm>
#include <cstdlib>


/*
 * count the number of occurrences of each word in a large text file using multiple threads
 */

int main() {
    // download Jane Austin "Pride and Prejudice"
    // ------------------------------------------
    if (system("curl https://www.gutenberg.org/files/1342/1342-0.txt -o 1342-0.txt") != 0) {
        std::cout << "Error: could not retrieve test file https://www.gutenberg.org/files/1342/1342-0.txt\n";
        return 1;
    }
    
    const std::string filename = "1342-0.txt";
    
    constexpr int num_threads = 4;
    std::vector<std::thread> threads;
    std::array<std::vector<std::string>, num_threads> lines_array;

    {
        // populate 4 vectors with lines from the book
        std::ifstream file(filename);
        if (!file.is_open()) {
            std::cout << "Error: could not open file " << filename << std::endl;
            return 1;
        }
        int line_idx = 0;
        std::string line;
        while (std::getline(file, line)) {
            lines_array[line_idx % num_threads].push_back(std::move(line));
            ++line_idx;
        }
    }

    using Map = phmap::parallel_flat_hash_map_m<std::string, int>; // parallel_flat_hash_map_m has default internal mutex
    Map word_counts;

    // run 4 threads, each thread processing lines from one of the vectors
    // -------------------------------------------------------------------
    threads.reserve(num_threads);
    for (int i = 0; i < num_threads; ++i) {
        threads.emplace_back(
            [&word_counts](std::vector<std::string>&& lines) {
                for (auto& line : lines) {
                    std::replace_if(line.begin(), line.end(), [](char c) -> bool { return !std::isalnum(c); }, ' ');
                    std::istringstream iss(line);
                    std::string word;
                    while (iss >> word) {
                        // use lazy_emplace to modify the map while the mutex is locked
                        word_counts.lazy_emplace_l(word,
                                                   [&](Map::value_type& p) { ++p.second; },  // called only when key was already present
                                                   [&](const Map::constructor& ctor) // construct value_type in place when key not present
                                                   { ctor(std::move(word), 1); } );
                    }
                }
            },
            std::move(lines_array[i]));
    }
    
    for (auto& thread : threads)
        thread.join();

    // print one word used at each frequency
    // -------------------------------------
    phmap::btree_map<int, std::string> result;
    for (const auto& pair : word_counts)
        result[pair.second] = pair.first;

    for (const auto& p : result) 
        std::cout << p.first << ": " << p.second << std::endl;

    return 0;
}