#pragma once
#include <c10/core/Device.h>
#include <c10/core/DispatchKeySet.h>
#include <c10/core/Layout.h>
#include <c10/core/MemoryFormat.h>
#include <c10/core/SymIntArrayRef.h>
#include <c10/macros/Macros.h>
#include <c10/util/ArrayRef.h>
#include <c10/util/intrusive_ptr.h>
#include <c10/util/python_stub.h>
#include <cstdint>
#include <string>
#include <vector>
// Forward declarations
namespace c10 {
struct IValue;
class OperatorHandle;
struct TensorImpl;
struct SafePyObject;
} // namespace c10
namespace torch {
namespace jit {
using Stack = std::vector<c10::IValue>;
}
} // namespace torch
// Actual implementation
namespace c10 {
namespace impl {
struct C10_API PyInterpreter;
// Note [Python interpreter tag]
// ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
// Traditionally, PyTorch is layered such that our Python library
// (libtorch_python) references our pure C++ library (libtorch) as the
// natural order of things. However, sometimes this natural order is
// subverted: C++ objects refer to Python objects (for example, we
// store a PyObject* pointer on TensorImpl so that converting from a
// C++ Tensor to a Python Tensor is just a memory dereference).
//
// These unusual orderings must be treated with care. To start, you need to
// virtualize the destructor so that the PyObject can be decref'ed on
// destruction (because the C++ object itself doesn't know anything about
// Python--remember, layering!). This process itself is fraught, since
// acquiring the GIL could lead to deadlocks if someone is blocking on you
// while holding the GIL. Furthermore, if the C++ objects outlive the
// interpreter (which can happen if you stash them in a static global
// variable defined in libtorch), you may attempt to decref the object when
// the Python interpreter has already been shutdown.
//
// BUT WAIT, IT GETS WORSE. With torchdeploy, there may be multiple Python
// interpreters in a single process. If a C++ object is accessible from
// multiple interpreters, we must take care not to accidentally use a
// PyObject from one interpreter with another interpreter.
//
// To prevent these mixups, we introduce a PyInterpreter "tag" (object with
// a vtable), which specifies a specific Python interpreter.
//
// - Any given object can be associated with AT MOST one Python interpreter.
// We represent the interpreter tag as a memory address to an instance of
// a virtual class that is allocated once per interpreter (this is so that
// we can request the interpreter to perform operations for us, if
// necessary).
//
// - It can be recorded with a PyObject (PyInterpreterObject) so that
// we know what interpreter the object is associated with, and we can
// raise an error if you try to use the PyObject from the wrong
// interpreter context.
//
// - It contains a vtable that can be used to perform various Python
// operations from ordinary C++ code that ordinarily wouldn't be accessible
// from libtorch.
//
// A simple use case is when a C++ object must be associated with a PyObject.
// However, for TensorImpl, we lazily allocate a PyObject the first time the
// object passes into Python. The invariants for this situation are more
// subtle:
//
// - A given TensorImpl's interpreter tag can only go from uninitialized to
// tagged; once tagged, this is a quiescent state (once tagged to an
// interpreter, ALWAYS tagged to that interpreter)
//
// - A thread may mutate the PyObject field of a TensorImpl if and only if it
// holds the GIL for the interpreter tagged on the TensorImpl. (If the
// TensorImpl is not tagged, it must first atomically claim its tag before it
// can validly write)
//
// WARNING: This class has to be written very carefully, because it may be
// possible for a Tensor to have a reference to an interpreter corresponding to
// a shared library that has ALREADY BEEN UNLOADED. This makes blindly calling
// virtual methods very dangerous, because the vtable may be garbage at that
// point (on a good day, you might get "pure virtual method called").
//
// The idea to solve this problem is we always leak PyInterpreters (so they
// always stay live even after dlclose), and make sure we can disarm their
// virtual methods by indirecting through a separate PyInterpreterVTable
// object. This can be replaced with a no-op vtable from libc10.so, which
// is guaranteed to stick around until the bitter end.
//
// NB: The downside with representing PyInterpreter tags as full objects is that
// it takes an extra word on TensorImpl. If tags were instead just integer
// indices, on 64-bit architectures we could pack the tag and PyObject together
// into a single atomic word. On 32-bit architectures we could simply say that
// only one Python interpreter is supported (raising an error if anyone
// attempts to set a nontrivial interpreter tag).
//
// The difficulty with this scheme is we need to maintain an out-of-line table
// to get at the PyInterpreters so that we can do virtual method calls on them,
// and registration/deregistration to this table must be done in a thread safe
// manner. This can be easily done if the number of possible PyInterpreters is
// small enough (e.g., 8-bit integer) by simply preallocating an array of
// sufficient size to hold all possible interpreters. Surely 128 interpreters
// is more than enough for anyone!
//
// I decided not to do this technique for now, because the extra word
// added by the PyInterpreter tag takes us to 24 words, which means that we
// still fit inside three eight-word cache lines. If you need to penny-pinch
// another word, consider doing this!
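//
// For illustration only (nothing below is declared in this header): a writer
// that wants to lazily attach a PyObject to a TensorImpl would first claim
// the interpreter tag atomically, roughly along these lines, assuming a
// hypothetical atomic pyobj_interpreter_ field on TensorImpl:
//
//   PyInterpreter* expected = nullptr;
//   if (self->pyobj_interpreter_.compare_exchange_strong(
//           expected, &my_interpreter, std::memory_order_acq_rel)) {
//     // We won the race: self is now permanently tagged to my_interpreter,
//     // and since we hold my_interpreter's GIL we may write its PyObject.
//   } else if (expected == &my_interpreter) {
//     // Already tagged to us; mutating the PyObject is OK under our GIL.
//   } else {
//     // Tagged to some other interpreter; we must not touch its PyObject.
//   }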
struct C10_API PyInterpreterVTable {
virtual ~PyInterpreterVTable() = default;
// Report the name of this interpreter
virtual std::string name() const = 0;
// Run Py_DECREF on a PyObject. We DO NOT assume the GIL is held on call
// See NOTE [PyInterpreter::decref takes an `is_tensor` arg]
virtual void decref(PyObject* pyobj, bool is_tensor) const = 0;
// Perform a detach by deferring to the __torch_dispatch__ implementation of
// detach, which will also arrange for the PyObject to get copied in this
// situation
virtual c10::intrusive_ptr<TensorImpl> detach(
const TensorImpl* self) const = 0;
// Invoke the Python boxed fallback dispatch to go back into Python
virtual void dispatch(const c10::OperatorHandle& op, torch::jit::Stack* stack)
const = 0;
// Invoke the Python dispatcher to handle this call
virtual void python_dispatcher(
const c10::OperatorHandle& op,
c10::DispatchKeySet,
torch::jit::Stack* stack) const = 0;
virtual bool is_contiguous(const TensorImpl* self, at::MemoryFormat)
const = 0;
virtual bool is_strides_like(const TensorImpl* self, at::MemoryFormat)
const = 0;
virtual bool is_non_overlapping_and_dense(const TensorImpl* self) const = 0;
virtual c10::Device device(const TensorImpl* self) const = 0;
virtual int64_t dim(const TensorImpl* self) const = 0;
virtual c10::IntArrayRef strides(const TensorImpl* self) const = 0;
virtual c10::IntArrayRef sizes(const TensorImpl* self) const = 0;
virtual c10::SymIntArrayRef sym_sizes(const TensorImpl* self) const = 0;
virtual c10::Layout layout(const TensorImpl* self) const = 0;
virtual c10::SymInt sym_numel(const TensorImpl* self) const = 0;
virtual c10::SymIntArrayRef sym_strides(const TensorImpl* self) const = 0;
virtual c10::SymInt sym_storage_offset(const TensorImpl* self) const = 0;
virtual void trace_gpu_event_creation(uintptr_t event) const = 0;
virtual void trace_gpu_event_deletion(uintptr_t event) const = 0;
virtual void trace_gpu_event_record(uintptr_t event, uintptr_t stream)
const = 0;
virtual void trace_gpu_event_wait(uintptr_t event, uintptr_t stream)
const = 0;
virtual void trace_gpu_memory_allocation(uintptr_t ptr) const = 0;
virtual void trace_gpu_memory_deallocation(uintptr_t ptr) const = 0;
virtual void trace_gpu_stream_creation(uintptr_t stream) const = 0;
virtual void trace_gpu_device_synchronization() const = 0;
virtual void trace_gpu_stream_synchronization(uintptr_t stream) const = 0;
virtual void trace_gpu_event_synchronization(uintptr_t event) const = 0;
};
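// For illustration only: because PyInterpreters are leaked rather than
// destroyed, a disarmed interpreter can point at a vtable whose methods are
// no-ops, so stale tags remain safe to call through after dlclose. A minimal
// sketch of such a vtable (the real one lives in libc10 and its exact
// behavior may differ):
//
//   struct NoopPyInterpreterVTable final : public PyInterpreterVTable {
//     std::string name() const override {
//       return "<unloaded interpreter>";
//     }
//     void decref(PyObject*, bool) const override {
//       // Deliberately leak the PyObject; its interpreter is gone, so there
//       // is nothing meaningful left to decref.
//     }
//     // ... remaining methods would assert or no-op if they are reached.
//   };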
struct C10_API PyInterpreter {
const PyInterpreterVTable* vtable_;
PyInterpreter(const PyInterpreterVTable* vtable) : vtable_(vtable) {}
const PyInterpreterVTable& operator*() const noexcept {
return *vtable_;
}
const PyInterpreterVTable* operator->() const noexcept {
return vtable_;
}
// Disarm this PyInterpreter, making all of its methods noops.
// The vtable pointer is not an atomic at the moment, which means
// a disarm() invocation that is concurrent with active destructors
// is not thread safe and will trigger TSAN. My hope is that this
// situation doesn't ever actually happen; tensor destruction should
// quiesce when a dlclose happens, and any long lived tensors whose
// destructors would be disarmed here only begin the destruction process
// on process shutdown (long after the dlclose has occurred).
void disarm() noexcept;
};
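// For illustration only: code holding an interpreter tag calls into Python by
// indirecting through the vtable, e.g. (assuming interp is a PyInterpreter*
// obtained from a tagged object):
//
//   (*interp)->decref(pyobj, /*is_tensor=*/true);
//   std::string who = (*interp)->name();
//
// A shared library about to be dlclose'd would call interp->disarm() first,
// so that any later calls of this form hit the no-op vtable rather than
// unloaded code.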
} // namespace impl
} // namespace c10