File: wordtokenizer.cpp

package info (click to toggle)

source-highlight 3.1.7-1

links: PTS
area: main
in suites: jessie, jessie-kfreebsd
size: 10,332 kB
ctags: 5,233
sloc: sh: 11,270; cpp: 10,206; ansic: 9,515; makefile: 1,865; lex: 1,200; yacc: 1,021; php: 213; perl: 211; awk: 98; erlang: 94; lisp: 90; java: 75; ruby: 69; python: 61; asm: 43; ml: 38; ada: 36; haskell: 27; xml: 23; cs: 11; sql: 8; tcl: 6; sed: 4

file content (42 lines) | stat: -rw-r--r-- 975 bytes

parent folder | download | duplicates (6)

//
// Author: Lorenzo Bettini <http://www.lorenzobettini.it>, (C) 2004-2008
//
// Copyright: See COPYING file that comes with this distribution
//

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#include <boost/regex.hpp>

#include "wordtokenizer.h"

namespace srchilite {

/**
 * The regular expression for detecting spaces and words.
 *
 * It is an alternation of two capture groups: group 1 matches a run of
 * one or more non-blank characters, group 2 matches a run of one or more
 * blank characters. The object is never modified after construction,
 * hence it is declared const.
 */
static const boost::regex string_or_space_regex("([^[:blank:]]+)|([[:blank:]]+)");

using namespace std;

/**
 * Indices of the capture groups of string_or_space_regex:
 * group 2 matches a run of blank characters, group 1 a run of
 * non-blank characters.
 *
 * These are typed, file-local constants rather than preprocessor macros,
 * so they obey the enclosing namespace and cannot rewrite unrelated tokens.
 */
static const int SPACE = 2;
static const int NOT_SPACE = 1;

void WordTokenizer::tokenize(const std::string &s,
        WordTokenizerResults &results) {
    boost::sregex_iterator i(s.begin(), s.end(), string_or_space_regex);
    boost::sregex_iterator j;
    while (i != j) {
        if ((*i)[SPACE].matched) {
            results.push_back(make_pair(string((*i)[SPACE].first, (*i)[SPACE].second), ""));
        } else {
            results.push_back(make_pair("", string((*i)[NOT_SPACE].first, (*i)[NOT_SPACE].second)));
        }

        ++i;
    }
}

}