File: CPUCachingAllocator.h

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (108 lines) | stat: -rw-r--r-- 4,203 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
#pragma once

#include <algorithm>
#include <deque>
#include <memory>
#include <mutex>

#include <c10/util/Exception.h>
#include <c10/util/SmallVector.h>
#include <c10/util/flat_hash_map.h>

/*
 * CPUCachingAllocator:
 * DISCLAIMER:
 *    This is subject to change (beta) and only supported on mobile builds.
 *    If code snippet such as in 'Usage pattern' is used outside of mobile
 *    build you will not observe the intended behavior.
 *    See below for more information.
 * Why?
 *    It has been observed that some mobile platforms, such as Pixel 3, return
 *    memory aggressively to the system. This results in page faults in some
 *    cases and ends up hurting performance. This caching allocator aims to
 *    address that. Furthermore, it also allows users to specify their own
 *    allocator by implementing the allocate/free virtual interfaces.
 * What are the cons?
 *    There are some cons that were observed where use of the caching allocator
 *    led to worse performance on some platforms. The reason is that the
 *    caching mechanism used by this allocator left us worse off compared to
 *    the corresponding platform's tuned memory allocator. In that case it
 *    seemed better to not use this allocator. Note there are some ideas to fix
 *    this in the works.
 *
 * Usage pattern:
 *    Instantiate and own the caching allocator.
 *      std::unique_ptr<c10::CPUCachingAllocator> caching_allocator =
 *          std::make_unique<c10::CPUCachingAllocator>();
 *    Use the caching allocator with a scoped guard at inference time.
 *    The guard must be a named object: an unnamed temporary is destroyed at
 *    the end of its own statement, i.e. before model.forward() would run.
 *      {
 *        c10::WithCPUCachingAllocatorGuard guard(caching_allocator.get());
 *        ... model.forward(...);
 *      }
 */

namespace c10 {

// Caching allocator whose cache is keyed by allocation size.
// allocate() and free() are virtual so that users can supply their own
// allocator by overriding them (see the file-level comment).
// Only declarations are visible here; the member definitions are presumably
// in the matching .cpp file.
class C10_API CPUCachingAllocator {
  /*
   * What it does:
   * Caches all the allocations carried out by this allocator.
   * Cache key is the size of the allocation.
   * If requested size is found in the cache returns the cached pointer.
   * What it does not do:
   * No speculative allocation for any future allocations.
   */
 private:
  // Presumably the cache-miss path of allocate(): allocates `bytes` of memory
  // and records it for caching — confirm against the definition.
  // NOTE(review): declared `inline` but no definition is visible in this
  // header; this is only well-formed if every caller lives in the translation
  // unit that defines it — confirm.
  inline void* allocate_and_cache(const size_t bytes);
  // Returns cached memory to the OS (see invariant 1 below).
  void free_cached();

 protected:
  // Invariants.
  // 1. If memory is ever allocated via this allocator then
  //    the pointer will exist in allocation_map_, unless the allocator
  //    returned the memory to OS via free_cached.
  //  1.1. Therefore even when the said memory is "freed" via this
  //       allocator (and thus cached), it will continue to stay
  //       in allocation_map_. Furthermore it will also exist in
  //       available_map_. Thus an allocated memory pointer can be in both
  //       allocation_map_ and available_map_ simultaneously.
  // 2. Memory pointer may be removed from allocation_map_, when it
  //    is freed outside of the scope of this allocator, but was allocated
  //    by this allocator.
  // 3. Available map only contains that memory which was allocated
  //    by this allocator and subsequently freed by this allocator.
  // As a result of above invariants, allocated memory ptr cannot be in
  // available_map_ unless it is in allocation_map_ as well.

  // Per-instance cache: allocation size -> pointers currently available for
  // reuse at that size.
  ska::flat_hash_map<size_t, c10::SmallVector<void*, 16>> available_map_;
  // Shared by all instances (static): pointer -> size, presumably the size it
  // was allocated with — confirm against the definition.
  static ska::flat_hash_map<void*, size_t> allocation_map_;
  // Since allocation_map, which is a global instance, is mutated/read via
  // all public APIs we need a global mutex.
  static std::mutex mutex_;

 public:
  // Presumably tells the allocator that `ptr` was freed outside of it, so
  // that it can be dropped from allocation_map_ (cf. invariant 2) — confirm
  // against the definition.
  static void record_free(void* ptr);
  // Virtual because the class is meant to be subclassed.
  // NOTE(review): copy operations are implicitly generated; if the destructor
  // releases the cached pointers, copying an instance would be unsafe —
  // confirm and consider deleting them.
  virtual ~CPUCachingAllocator();
  // Checks the cache to see if allocation of size bytes can be found.
  // If so return cached memory, else
  // allocates memory, records it for caching and returns.
  virtual void* allocate(const size_t bytes);
  // Checks if the memory being freed was marked for allocation by
  // an earlier call to allocate. If so cache the allocation.
  // Otherwise free.
  virtual void free(void* ptr);
};

// Accessor for a default CPUCachingAllocator instance.
// NOTE(review): ownership and lifetime of the returned pointer are not
// visible in this header — confirm against the definition.
CPUCachingAllocator* GetDefaultCPUCachingAllocator();

// Presumably reports whether a caching allocator is currently active for the
// calling thread (e.g. inside a WithCPUCachingAllocatorGuard scope) — confirm.
bool ThreadLocalCachingAllocatorEnabled();
// Presumably returns the calling thread's current caching allocator.
// NOTE(review): the value returned when none is enabled is not visible here
// (possibly nullptr) — confirm before dereferencing.
CPUCachingAllocator* GetThreadLocalCachingAllocator();

class C10_API WithCPUCachingAllocatorGuard {
 public:
  WithCPUCachingAllocatorGuard(CPUCachingAllocator* allocator);
  ~WithCPUCachingAllocatorGuard();

 private:
  CPUCachingAllocator* prev_caching_allocator_ptr_{nullptr};
};

} // namespace c10