File: bench_parallel.cpp

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (71 lines) | stat: -rw-r--r-- 1,958 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#include <benchmark/benchmark.h>
#include <c10/util/irange.h>
#include <torch/csrc/jit/tensorexpr/analysis.h>
#include <torch/csrc/jit/tensorexpr/ir_simplifier.h>
#include <torch/csrc/jit/tensorexpr/llvm_codegen.h>
#include <torch/csrc/jit/tensorexpr/loopnest.h>
#include <torch/csrc/jit/tensorexpr/tensor.h>
#include <torch/torch.h>

#include <immintrin.h>

namespace torch {
namespace jit {
namespace tensorexpr {

/// Benchmark fixture that owns the operands for an element-wise add.
///
/// SetUp() takes the problem size from the benchmark's first range argument
/// and allocates two random input tensors plus a zero-filled output tensor of
/// that length. TearDown() publishes a "tasks" rate counter.
class ParallelAdd : public benchmark::Fixture {
 public:
  /// Prepares the thread count, RNG seed and tensors for one benchmark run.
  ///
  /// @param state Benchmark state; `state.range(0)` supplies the element count.
  void SetUp(const benchmark::State& state) override {
    at::set_num_threads(4);
    // Fixed seed so the randn() calls below see the same RNG state each run.
    torch::manual_seed(0x12345678);

    // range() is wider than `int`; make the narrowing explicit and refuse
    // sizes that would otherwise be silently truncated.
    const auto requested = state.range(0);
    M = static_cast<int>(requested);
    TORCH_CHECK(
        M == requested,
        "ParallelAdd: problem size ",
        requested,
        " does not fit in int");

    A = torch::randn({M});
    B = torch::randn({M});
    C = torch::zeros({M});
  }

  /// Records the "tasks" counter: iterations * M, flagged as a rate.
  ///
  /// @param state Benchmark state that receives the counter.
  void TearDown(benchmark::State& state) override {
    const uint64_t total_elements =
        static_cast<uint64_t>(state.iterations()) * static_cast<uint64_t>(M);
    state.counters["tasks"] = benchmark::Counter(
        static_cast<double>(total_elements), benchmark::Counter::kIsRate);
  }

  // Number of elements in each tensor; assigned in SetUp().
  int M{0};
  // First input operand (random values).
  at::Tensor A;
  // Second input operand (random values).
  at::Tensor B;
  // Output tensor, zero-initialised in SetUp().
  at::Tensor C;
};

// Benchmarks an LLVM-compiled element-wise add, c[i] = a[i] + b[i], over the
// fixture's M-element tensors, with the generated loop flagged as parallel.
BENCHMARK_DEFINE_F(ParallelAdd, Simple)(benchmark::State& state) {
  // Describe the computation: two M-element float inputs and their sum.
  BufHandle lhs_buf("a", {M}, kFloat);
  BufHandle rhs_buf("b", {M}, kFloat);
  Tensor sum_tensor = Compute("c", {M}, [&](const VarHandle& idx) {
    return lhs_buf.load(idx) + rhs_buf.load(idx);
  });

  // Build the loop nest and flag the first loop found for the output tensor
  // as parallel before lowering.
  LoopNest nest({sum_tensor});
  nest.getLoopStmtsFor(sum_tensor)[0]->set_parallel();
  nest.prepareForCodegen();

  // Kernel arguments are declared as (output, first input, second input).
  LLVMCodeGen cg(nest.root_stmt(), {sum_tensor, lhs_buf, rhs_buf});

  // Raw pointers into the fixture tensors, passed in the same order as the
  // buffer arguments declared above.
  float* const lhs = A.data_ptr<float>();
  float* const rhs = B.data_ptr<float>();
  float* const out = C.data_ptr<float>();
  std::vector<void*> args{out, lhs, rhs};

  // Run the kernel once outside the timed loop and compare every element
  // against a scalar reference sum.
  cg.value<int>(args);
  for (const auto i : c10::irange(M)) {
    const float diff = fabs(lhs[i] + rhs[i] - out[i]);
    TORCH_CHECK(diff < 1e-5);
  }

  // Timed region: one kernel invocation per benchmark iteration.
  for (auto _ : state) {
    cg.value<int>(args);
  }
}

// Register the benchmark with a single problem size of 65536 elements.
BENCHMARK_REGISTER_F(ParallelAdd, Simple)->Arg(1 << 16);

} // namespace tensorexpr
} // namespace jit
} // namespace torch