File: recvrpc_backward.h

package info (click to toggle)
pytorch-cuda 2.6.0%2Bdfsg-7
  • links: PTS, VCS
  • area: contrib
  • in suites: forky, sid, trixie
  • size: 161,620 kB
  • sloc: python: 1,278,832; cpp: 900,322; ansic: 82,710; asm: 7,754; java: 3,363; sh: 2,811; javascript: 2,443; makefile: 597; ruby: 195; xml: 84; objc: 68
file content (45 lines) | stat: -rw-r--r-- 1,662 bytes parent folder | download | duplicates (3)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#pragma once

#include <torch/csrc/autograd/function.h>
#include <torch/csrc/distributed/autograd/context/context.h>
#include <torch/csrc/distributed/autograd/rpc_messages/autograd_metadata.h>
#include <torch/csrc/distributed/rpc/rpc_agent.h>

namespace torch::distributed::autograd {

// Forward declarations.
class DistAutogradContext;

// As part of our distributed autograd implementation, whenever we receive an
// RPC from a node, we add a 'RecvRpcBackward' autograd function to the
// autograd graph. This is more or less a placeholder function that is used to
// pass gradients to the remote host during the backward pass. The inputs to the
// RPC function are the inputs to this autograd function.
class TORCH_API RecvRpcBackward : public torch::autograd::Node {
 public:
  explicit RecvRpcBackward(
      const AutogradMetadata& autogradMetadata,
      const std::shared_ptr<DistAutogradContext>& autogradContext,
      rpc::worker_id_t fromWorkerId,
      rpc::DeviceMap deviceMap);

  torch::autograd::variable_list apply(
      torch::autograd::variable_list&& grads) override;

 private:
  const AutogradMetadata autogradMetadata_;

  // Hold a weak reference to the autograd context to avoid circular
  // dependencies with the context (since it holds a reference to
  // RecvRpcBackward).
  std::weak_ptr<DistAutogradContext> autogradContext_;

  // The worker id from which the RPC was received. During the backward pass,
  // we need to propagate the gradients to this workerId.
  rpc::worker_id_t fromWorkerId_;

  // Device mapping for tensors sent over RPC.
  const rpc::DeviceMap deviceMap_;
};

} // namespace torch::distributed::autograd