File: text_file_reader_utils_test.cc

package info (click to toggle)
pytorch 1.13.1+dfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (128 lines) | stat: -rw-r--r-- 4,550 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
#include <fstream>
#include "caffe2/core/blob.h"
#include "caffe2/core/context.h"
#include "caffe2/core/tensor.h"
#include "caffe2/proto/caffe2_pb.h"
#include "caffe2/utils/math.h"
#include <gtest/gtest.h>

#include "caffe2/operators/text_file_reader_utils.h"
#include "caffe2/utils/string_utils.h"

#include <c10/util/irange.h>

#include <cstdio>
#include <cstdlib>

namespace caffe2 {

TEST(TextFileReaderUtilsTest, TokenizeTest) {
  // Exercises the tokenizer stack three ways on the same escaped input:
  //   1. Tokenizer over the whole buffer in one call.
  //   2. Tokenizer over every possible two-chunk split of the buffer.
  //   3. BufferedTokenizer fed by an in-memory chunk provider, and by a
  //      FileReader over a temp file, each for 1 and 2 passes.
  // All variants must yield the same token stream.
  TokenizedString tokenized;
  std::string ch =
      "label\1text\xc3\xbf\nlabel2\\\nTest\1tex\\\\t2\n"
      "Two\\\\Escapes\\\1\1Second\n";
  std::vector<char> seps = {'\n', '\1'};
  Tokenizer tokenizer(seps, '\\');
  tokenizer.next(&ch.front(), &ch.back() + 1, tokenized);

  // {startDelimId, token text} pairs expected after unescaping.
  // startDelimId is the index in `seps` of the delimiter preceding the token.
  std::vector<std::pair<int, std::string>> expected = {{0, "label"},
                                                       {1, "text\xc3\xbf"},
                                                       {0, "label2\nTest"},
                                                       {1, "tex\\t2"},
                                                       {0, "Two\\Escapes\1"},
                                                       {1, "Second"}};

  EXPECT_EQ(expected.size(), tokenized.tokens().size());
  for (const auto i : c10::irange(expected.size())) {
    const auto& token = tokenized.tokens().at(i);
    EXPECT_EQ(expected.at(i).first, token.startDelimId);
    EXPECT_EQ(expected.at(i).second, std::string(token.start, token.end));
  }

  // Split the input at every midpoint and tokenize the two halves in
  // sequence; the concatenated token streams must match the full parse,
  // proving the tokenizer handles tokens (and escapes) spanning a chunk
  // boundary.
  for (const auto i : c10::irange(ch.size() - 1)) {
    tokenizer.reset();
    char* mid = &ch.front() + i;

    tokenizer.next(&ch.front(), mid, tokenized);
    EXPECT_GE(expected.size(), tokenized.tokens().size());
    for (const auto j : c10::irange(tokenized.tokens().size())) {
      const auto& token = tokenized.tokens().at(j);
      EXPECT_EQ(expected.at(j).first, token.startDelimId);
      EXPECT_EQ(expected.at(j).second, std::string(token.start, token.end));
    }
    // Use size_t to avoid signed/unsigned mixing in the index math below.
    const size_t s1 = tokenized.tokens().size();

    tokenizer.next(mid, &ch.back() + 1, tokenized);
    EXPECT_EQ(expected.size(), s1 + tokenized.tokens().size());
    for (const auto j : c10::irange(tokenized.tokens().size())) {
      const auto& token = tokenized.tokens().at(j);
      EXPECT_EQ(expected.at(j + s1).first, token.startDelimId);
      EXPECT_EQ(
          expected.at(j + s1).second, std::string(token.start, token.end));
    }
    EXPECT_EQ(0, tokenized.lastDelim());
  }

  // StringProvider that serves the input in fixed 10-byte chunks, with an
  // empty (nullptr) range signalling end of input.
  struct ChunkProvider : public StringProvider {
    // NOLINTNEXTLINE(modernize-pass-by-value)
    ChunkProvider(const std::string& str) : ch(str) {}
    std::string ch;
    size_t charIdx{0};
    void operator()(CharRange& range) override {
      if (charIdx >= ch.size()) {
        range.start = nullptr;
        range.end = nullptr;
      } else {
        size_t endIdx = std::min(charIdx + 10, ch.size());
        range.start = &ch.front() + charIdx;
        range.end = &ch.front() + endIdx;
        charIdx = endIdx;
      }
    };
    void reset() override {
      charIdx = 0;
    }
  };

  // BufferedTokenizer over the chunk provider: with numPasses > 1 the same
  // token stream must repeat numPasses times.
  for (int numPasses = 1; numPasses <= 2; ++numPasses) {
    ChunkProvider chunkProvider(ch);
    BufferedTokenizer bt(tokenizer, &chunkProvider, numPasses);
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    Token token;
    int i = 0;
    for (i = 0; bt.next(token); ++i) {
      EXPECT_GT(expected.size() * numPasses, i);
      const auto& expectedToken = expected.at(i % expected.size());
      EXPECT_EQ(expectedToken.first, token.startDelimId);
      EXPECT_EQ(expectedToken.second, std::string(token.start, token.end));
    }
    EXPECT_EQ(expected.size() * numPasses, i);
    EXPECT_EQ(0, bt.endDelim());
  }

  // Same check through FileReader, reading the input back in 5-byte chunks.
  char* tmpname = std::tmpnam(nullptr);
  // std::tmpnam may fail and return nullptr; bail out before dereferencing.
  ASSERT_NE(nullptr, tmpname);
  std::ofstream outFile;
  outFile.open(tmpname);
  // A silently unwritten file would make the loop below fail confusingly.
  ASSERT_TRUE(outFile.is_open());
  outFile << ch;
  outFile.close();
  for (int numPasses = 1; numPasses <= 2; ++numPasses) {
    FileReader fr(tmpname, 5);
    BufferedTokenizer fileTokenizer(tokenizer, &fr, numPasses);
    // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init)
    Token token;
    int i = 0;
    for (; fileTokenizer.next(token); ++i) {
      EXPECT_GT(expected.size() * numPasses, i);
      const auto& expectedToken = expected.at(i % expected.size());
      EXPECT_EQ(expectedToken.first, token.startDelimId);
      EXPECT_EQ(expectedToken.second, std::string(token.start, token.end));
    }
    EXPECT_EQ(expected.size() * numPasses, i);
    EXPECT_EQ(0, fileTokenizer.endDelim());
  }
  std::remove(tmpname);
}

} // namespace caffe2