File: recurrent_network_executor_gpu.h

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (82 lines) | stat: -rw-r--r-- 2,434 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#ifndef CAFFE2_OPERATORS_RECURRENT_NETWORK_GPU_EXECUTOR_H_
#define CAFFE2_OPERATORS_RECURRENT_NETWORK_GPU_EXECUTOR_H_

#include "caffe2/core/context_gpu.h"
#include "caffe2/operators/rnn/recurrent_network_executor.h"

#include <algorithm>
#include <map>
#include <string>
#include <vector>

namespace caffe2 {

/**
 * RecurrentNetworkExecutor for CUDA contexts. Schedules the per-timestep
 * step-net ops and — when the dependency structure allows — overlaps
 * computation of consecutive timesteps (presumably via the CUDA events and
 * streams held below; the scheduling itself is implemented out-of-line in
 * Run/RunBackwards/_ExecRange).
 */
class CUDARecurrentNetworkExecutor : public RecurrentNetworkExecutorBase {
 public:
  CUDARecurrentNetworkExecutor(
      const NetDef& step_net_def,
      std::map<string, string>& recurrent_input_map,
      std::string timestep_blob)
  : RecurrentNetworkExecutorBase(step_net_def, recurrent_input_map, timestep_blob) {}

  ~CUDARecurrentNetworkExecutor();

 protected:
  // Execute T timesteps forward. Defined out-of-line.
  bool Run(int T) override;

  // Execute T timesteps of the backward pass. Defined out-of-line.
  bool RunBackwards(int T) override;

  // Cross-timestep link dependencies are handled by this executor itself,
  // so the base class is told to skip them.
  bool ignoreLinkDependencies() override {
    return true;
  }

  /**
   * Check if there is an op that only depends on ops from the previous
   * timestep, and that op is not the last op. Then we can start computation
   * in subsequent timesteps before the whole previous timestep has finished.
   * If there is no parallelism, we can avoid overhead of event-based
   * dependency management.
   */
  void AnalyzeOps() override {
    has_timestep_parallelism_ = false;
    // rnn_op.order and parent indices are ints; cast the container size once
    // so the comparisons below are not mixed signed/unsigned.
    const int num_ops = static_cast<int>(timestep_ops_template_.size());
    for (auto& rnn_op : timestep_ops_template_) {
      int i = rnn_op.order;
      if (!rnn_op.parents.empty() && i < num_ops - 1) {
        // A parent index greater than i refers to an op of the previous
        // timestep (a recurrent dependency).
        bool only_recurrent_deps = std::all_of(
            rnn_op.parents.begin(),
            rnn_op.parents.end(),
            [&](const int& parent) { return parent > i; });
        if (only_recurrent_deps) {
          VLOG(1) << "Timestep parallel op: "
                  << ProtoDebugString(step_net_def_.op(i));
          has_timestep_parallelism_ = true;

          for (int dep : rnn_op.parents) {
            if (dep == num_ops - 1) {
              // This op depends on the last op of the previous iteration,
              // so it will block any parallelism
              has_timestep_parallelism_ = false;
              break;
            }
          }
          break;
        }
      }
    }
    LOG(INFO) << "Analyzed ops for timestep parallelism: "
              << has_timestep_parallelism_;
  }

 public:
  // Cap the number of CUDA streams the executor may use.
  void setMaxStreams(int n) {
    max_cuda_streams_ = n;
  }

 private:
  void _ExecRange(int from, int to);

  // Events used by the out-of-line implementation for dependency tracking.
  std::vector<cudaEvent_t> events_;
  // Result of AnalyzeOps(): true iff consecutive timesteps may overlap.
  bool has_timestep_parallelism_ = false;
  int max_cuda_streams_ = 2;
};
}
#endif