#include <gtest/gtest.h>

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <string>
#include <utility>
#include <vector>

#include <c10/util/irange.h>

#include "caffe2/core/blob.h"
#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/operators/text_file_reader_utils.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/math.h"
#include "caffe2/utils/string_utils.h"
namespace caffe2 {
// Tokenizes a string containing two delimiter kinds ('\n' and '\1') and an
// escape character ('\\'), then checks the produced (delimiter id, token)
// sequence in four modes:
//   1. one-shot over the whole buffer,
//   2. resumed tokenization split at every possible byte offset,
//   3. BufferedTokenizer fed from fixed-size in-memory chunks,
//   4. BufferedTokenizer fed from an on-disk file,
// the last two in both single-pass and two-pass configurations.
TEST(TextFileReaderUtilsTest, TokenizeTest) {
  TokenizedString tokenized;
  std::string ch =
      "label\1text\xc3\xbf\nlabel2\\\nTest\1tex\\\\t2\n"
      "Two\\\\Escapes\\\1\1Second\n";
  std::vector<char> seps = {'\n', '\1'};
  Tokenizer tokenizer(seps, '\\');
  tokenizer.next(&ch.front(), &ch.back() + 1, tokenized);
  // Expected (startDelimId, token text) pairs. Delim id 0 is '\n' and
  // id 1 is '\1' (their index in `seps`); escaped characters are unescaped.
  std::vector<std::pair<int, std::string>> expected = {{0, "label"},
                                                       {1, "text\xc3\xbf"},
                                                       {0, "label2\nTest"},
                                                       {1, "tex\\t2"},
                                                       {0, "Two\\Escapes\1"},
                                                       {1, "Second"}};
  EXPECT_EQ(expected.size(), tokenized.tokens().size());
  for (const auto i : c10::irange(expected.size())) {
    const auto& token = tokenized.tokens().at(i);
    EXPECT_EQ(expected.at(i).first, token.startDelimId);
    EXPECT_EQ(expected.at(i).second, std::string(token.start, token.end));
  }
  // Try each of the subsplits: tokenize [begin, mid) then resume with
  // [mid, end) and verify the concatenated token stream matches `expected`
  // regardless of where the split falls (including mid-escape, mid-token).
  for (const auto i : c10::irange(ch.size() - 1)) {
    tokenizer.reset();
    char* mid = &ch.front() + i;
    tokenizer.next(&ch.front(), mid, tokenized);
    EXPECT_GE(expected.size(), tokenized.tokens().size());
    for (const auto j : c10::irange(tokenized.tokens().size())) {
      const auto& token = tokenized.tokens().at(j);
      EXPECT_EQ(expected.at(j).first, token.startDelimId);
      EXPECT_EQ(expected.at(j).second, std::string(token.start, token.end));
    }
    // size_t (not int) so the EXPECT_EQ below compares like-signed values.
    const size_t s1 = tokenized.tokens().size();
    tokenizer.next(mid, &ch.back() + 1, tokenized);
    EXPECT_EQ(expected.size(), s1 + tokenized.tokens().size());
    for (const auto j : c10::irange(tokenized.tokens().size())) {
      const auto& token = tokenized.tokens().at(j);
      EXPECT_EQ(expected.at(j + s1).first, token.startDelimId);
      EXPECT_EQ(
          expected.at(j + s1).second, std::string(token.start, token.end));
    }
    EXPECT_EQ(0, tokenized.lastDelim());
  }
  // Serves the string in fixed 10-byte chunks so BufferedTokenizer's
  // chunk-boundary handling gets exercised; an empty range signals EOF.
  struct ChunkProvider : public StringProvider {
    explicit ChunkProvider(std::string str) : ch(std::move(str)) {}
    std::string ch;
    size_t charIdx{0};
    void operator()(CharRange& range) override {
      if (charIdx >= ch.size()) {
        // Exhausted: report end-of-input.
        range.start = nullptr;
        range.end = nullptr;
      } else {
        size_t endIdx = std::min(charIdx + 10, ch.size());
        range.start = &ch.front() + charIdx;
        range.end = &ch.front() + endIdx;
        charIdx = endIdx;
      }
    }
    void reset() override {
      charIdx = 0;
    }
  };
  // In-memory chunked tokenization; with numPasses == 2 the whole token
  // sequence is expected to repeat, hence the `i % expected.size()` lookup.
  for (int numPasses = 1; numPasses <= 2; ++numPasses) {
    ChunkProvider chunkProvider(ch);
    BufferedTokenizer bt(tokenizer, &chunkProvider, numPasses);
    Token token{};
    size_t i = 0;
    for (; bt.next(token); ++i) {
      EXPECT_GT(expected.size() * numPasses, i);
      const auto& expectedToken = expected.at(i % expected.size());
      EXPECT_EQ(expectedToken.first, token.startDelimId);
      EXPECT_EQ(expectedToken.second, std::string(token.start, token.end));
    }
    EXPECT_EQ(expected.size() * numPasses, i);
    EXPECT_EQ(0, bt.endDelim());
  }
  // Same token stream, but read back from a real file in 5-byte chunks.
  // NOTE(review): std::tmpnam is inherently racy (the name may be claimed
  // before we open it) — tolerable in a test, but fail loudly if it or the
  // file open fails instead of dereferencing null / writing nowhere.
  char* tmpname = std::tmpnam(nullptr);
  ASSERT_NE(nullptr, tmpname);
  std::ofstream outFile(tmpname);
  ASSERT_TRUE(outFile.is_open());
  outFile << ch;
  outFile.close();
  for (int numPasses = 1; numPasses <= 2; ++numPasses) {
    FileReader fr(tmpname, 5);
    BufferedTokenizer fileTokenizer(tokenizer, &fr, numPasses);
    Token token{};
    size_t i = 0;
    for (; fileTokenizer.next(token); ++i) {
      EXPECT_GT(expected.size() * numPasses, i);
      const auto& expectedToken = expected.at(i % expected.size());
      EXPECT_EQ(expectedToken.first, token.startDelimId);
      EXPECT_EQ(expectedToken.second, std::string(token.start, token.end));
    }
    EXPECT_EQ(expected.size() * numPasses, i);
    EXPECT_EQ(0, fileTokenizer.endDelim());
  }
  std::remove(tmpname);
}
} // namespace caffe2