1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35
|
#include <re2/re2.h>
#include <torch/script.h>
#include <torchtext/csrc/export.h>
namespace torchtext {
// Serialized form of a RegexTokenizer: two string lists followed by a flag.
// NOTE(review): the slot types mirror RegexTokenizer's public members
// (patterns_, replacements_, to_lower_) in declaration order, so presumably
// that is what each slot carries; confirm against _serialize_regex_tokenizer.
using RegexTokenizerStates =
    std::tuple<std::vector<std::string>, std::vector<std::string>, bool>;
// Regex-based tokenizer derived from torch::CustomClassHolder. Only
// declarations live here: the constructor, forward() and split_() are defined
// out of line.
struct RegexTokenizer : torch::CustomClassHolder {
private:
// Compiled RE2 objects, stored as raw pointers (presumably one per entry of
// patterns_ -- TODO confirm in the constructor definition).
// NOTE(review): this struct declares no destructor, so if these pointers are
// heap-allocated and owned here they are never released; verify ownership in
// the implementation file.
std::vector<RE2*> compiled_patterns_;
// Internal helper taking `str` by mutable reference, an output vector
// `tokens`, and a `delimiter` that defaults to a single space. Presumably
// splits `str` on `delimiter` and stores the pieces in `tokens` -- confirm
// against the definition.
void split_(
std::string& str,
std::vector<std::string>& tokens,
const char& delimiter = ' ') const;
public:
// Configuration kept as public data members. Their types match, in order, the
// three slots of RegexTokenizerStates.
std::vector<std::string> patterns_;
std::vector<std::string> replacements_;
bool to_lower_;
// Builds a tokenizer from pattern strings, replacement strings and a
// lower-casing flag.
// NOTE(review): presumably `patterns` and `replacements` are expected to have
// the same length -- confirm in the constructor definition.
TORCHTEXT_API explicit RegexTokenizer(
const std::vector<std::string>& patterns,
const std::vector<std::string>& replacements,
const bool to_lower);
// Tokenizes `str` (taken by value) and returns the resulting tokens. Declared
// const, so it cannot modify the members above. The exact pipeline
// (replacement, lower-casing, splitting) is defined elsewhere -- TODO confirm.
TORCHTEXT_API std::vector<std::string> forward(std::string str) const;
};
// Produces a RegexTokenizerStates tuple from the tokenizer held by `self`.
// Defined elsewhere; presumably packs (patterns_, replacements_, to_lower_) in
// that order -- TODO confirm against the definition.
TORCHTEXT_API RegexTokenizerStates
_serialize_regex_tokenizer(const c10::intrusive_ptr<RegexTokenizer>& self);
// Builds a RegexTokenizer from a RegexTokenizerStates tuple and returns it in a
// c10::intrusive_ptr. `states` is taken by rvalue reference, so callers must
// pass a temporary or std::move their tuple. Defined elsewhere; presumably the
// inverse of _serialize_regex_tokenizer -- TODO confirm.
TORCHTEXT_API c10::intrusive_ptr<RegexTokenizer> _deserialize_regex_tokenizer(
RegexTokenizerStates&& states);
} // namespace torchtext
|