/* Boost.Flyweight example of parallel tokenization.
 *
 * Copyright 2024 Joaquin M Lopez Munoz.
 * Distributed under the Boost Software License, Version 1.0.
 * (See accompanying file LICENSE_1_0.txt or copy at
 * http://www.boost.org/LICENSE_1_0.txt)
 *
 * See http://www.boost.org/libs/flyweight for library home page.
 */
#include <boost/flyweight.hpp>
#include <boost/flyweight/concurrent_factory.hpp>
#include <boost/flyweight/no_locking.hpp>
#include <boost/flyweight/no_tracking.hpp>
#include <chrono>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <string>
#include <thread>
#include <vector>
/* Handcrafted tokenizer for sequences of alphabetic characters */
/* Returns true iff ch is one of the ASCII letters a-z or A-Z. */
inline bool match(char ch)
{
  const bool is_lower='a'<=ch&&ch<='z';
  const bool is_upper='A'<=ch&&ch<='Z';
  return is_lower||is_upper;
}
/* Scans [first,last) and calls f(word_begin,word_end) once, in order of
 * appearance, for every maximal run of consecutive characters accepted by
 * match(). Characters rejected by match() act as separators and are skipped.
 * Nothing is called for an empty range or a range without any word.
 */
template<typename ForwardIterator,typename F>
void tokenize(ForwardIterator first,ForwardIterator last,F f)
{
  while(first!=last){
    if(!match(*first)){
      /* separator: step over it */
      ++first;
      continue;
    }

    /* first sits on the initial character of a word: look for its end */
    auto word_begin=first;
    do{
      ++first;
    }while(first!=last&&match(*first));

    f(word_begin,first);
  }
}
/* Splits in into words using num_threads threads, stores them into a
 * std::vector<String> (String being std::string or a flyweight type) and
 * prints type_name, the thread count, the number of words found and the
 * elapsed time to std::cout.
 */
template<typename String>
void parse(const std::string& in,const char* type_name,std::size_t num_threads)
{
  using clock=std::chrono::steady_clock;
  using seconds=std::chrono::duration<double>;
  using char_iterator=std::string::const_iterator;

  const auto start_time=clock::now();

  /* Cut the input into num_threads consecutive chunks of roughly equal
   * size. Each cut is pushed forward past any word it would otherwise split,
   * so that no token straddles two chunks.
   */
  std::vector<char_iterator> cuts;
  cuts.reserve(num_threads+1);
  cuts.push_back(in.begin());
  for(std::size_t remaining=num_threads;remaining>0;--remaining){
    char_iterator cut=cuts.back()+(in.end()-cuts.back())/remaining;
    while(cut!=in.end()&&match(*cut))++cut;
    cuts.push_back(cut);
  }

  /* Pass 1: every thread counts the words in its own chunk */
  std::vector<std::size_t> counts(num_threads);
  std::vector<std::thread> workers;
  workers.reserve(num_threads);
  for(std::size_t k=0;k<num_threads;++k){
    workers.emplace_back([&,k]{
      std::size_t n=0;
      tokenize(
        cuts[k],cuts[k+1],
        [&](char_iterator,char_iterator){++n;});
      counts[k]=n;
    });
  }
  for(auto& worker:workers)worker.join();
  workers.clear();

  /* Running sum of the per-chunk counts: offsets[k] is the position in the
   * result vector where thread k starts writing.
   */
  std::vector<std::size_t> offsets(num_threads);
  std::size_t num_words=0;
  for(std::size_t k=0;k<num_threads;++k){
    offsets[k]=num_words;
    num_words+=counts[k];
  }

  /* Pass 2: every thread writes its words into its own disjoint slice of
   * the preallocated result vector.
   */
  std::vector<String> words(num_words,String());
  for(std::size_t k=0;k<num_threads;++k){
    workers.emplace_back([&,k]{
      auto dest=words.begin()+offsets[k];
      tokenize(
        cuts[k],cuts[k+1],
        [&](char_iterator word_begin,char_iterator word_end){
          *dest=String(word_begin,word_end);
          ++dest;
        });
    });
  }
  for(auto& worker:workers)worker.join();

  const auto end_time=clock::now();
  const double elapsed=
    std::chrono::duration_cast<seconds>(end_time-start_time).count();

  std::cout
    <<std::setw(20)<<type_name<<", "<<num_threads<<" thread(s): "
    <<num_words<<" words, "
    <<std::setw(9)<<elapsed<< " s\n";
}
/* accept a file and parse it with std::string and various flyweight types */
int main(int argc, char** argv)
{
using namespace boost::flyweights;
using regular_flyweight=flyweight<std::string>;
using concurrent_flyweight=flyweight<
std::string,
concurrent_factory<>,
no_locking,
no_tracking
>;
if(argc<2){
std::cout<<"specify a file\n";
std::exit(EXIT_FAILURE);
}
std::ifstream is(argv[1]);
if(!is)
{
std::cout<<"can't open "<<argv[1]<<"\n";
std::exit(EXIT_FAILURE);
}
std::string in(
std::istreambuf_iterator<char>(is),std::istreambuf_iterator<char>{});
parse<std::string>(in,"std::string",1);
parse<std::string>(in,"std::string",8);
parse<regular_flyweight>(in,"regular flyweight",1);
parse<regular_flyweight>(in,"regular flyweight",8);
parse<concurrent_flyweight>(in,"concurrent flyweight",1);
parse<concurrent_flyweight>(in,"concurrent flyweight",8);
}
|