File: rmm_wrap.cpp

//------------------------------------------------------------------------------
// rmm_wrap.cpp: C-callable wrapper for an RMM memory resource
//------------------------------------------------------------------------------

// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// rmm_wrap.cpp contains a single global object, the RMM_Wrap_Handle, which
// holds an RMM (Rapids Memory Manager) memory resource and a hash map (a C++
// std::unordered_map).  This allows rmm_wrap to provide the following
// functions to a C application:

// Create/destroy an RMM resource:
//      rmm_wrap_initialize: create the RMM resource
//      rmm_wrap_finalize: destroy the RMM resource

// C-style malloc/calloc/realloc/free methods:
//      rmm_wrap_malloc:  malloc a block of memory using RMM
//      rmm_wrap_calloc:  calloc a block of memory using RMM
//      rmm_wrap_realloc: realloc a block allocated by this RMM wrapper
//      rmm_wrap_free:    free a block of memory allocated by this RMM wrapper

// PMR-based allocate/deallocate methods (C-callable):
//      rmm_wrap_allocate (std::size_t *size)
//      rmm_wrap_deallocate (void *p, std::size_t size)
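
// For example, a C application could use rmm_wrap as follows (a minimal
// sketch; the pool sizes and stream count are illustrative choices, not
// required values):
//
//      // 256 MiB initial pool, 1 GiB maximum, 4 streams in the stream pool
//      if (rmm_wrap_initialize (rmm_wrap_managed,
//          256 * 1048576, 1024 * 1048576, 4) != 0) { /* handle the error */ }
//      double *x = (double *) rmm_wrap_malloc (n * sizeof (double)) ;
//      x = (double *) rmm_wrap_realloc (x, 2 * n * sizeof (double)) ;
//      rmm_wrap_free (x) ;
//      rmm_wrap_finalize ( ) ;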

#include "rmm_wrap.hpp"
#include <iostream>

//------------------------------------------------------------------------------
// RMM_Wrap_Handle: a global object containing the RMM context
//------------------------------------------------------------------------------

// rmm_wrap_context is a pointer to a single, global RMM_Wrap_Handle object
// that all methods in this file can access.  The object and its pointer
// cannot be accessed outside this file.

typedef struct
{
    RMM_MODE mode;
    std::shared_ptr<rmm::mr::device_memory_resource>   resource;
    std::shared_ptr<std::pmr::memory_resource>         host_resource;
    std::shared_ptr<alloc_map>                         size_map ;
    std::shared_ptr<cuda_stream_pool>                  stream_pool;
    cudaStream_t                                       main_stream;
}
RMM_Wrap_Handle ;

// rmm_wrap_context: global pointer to the single RMM_Wrap_Handle object
static RMM_Wrap_Handle *rmm_wrap_context = NULL ;

//------------------------------------------------------------------------------
// make a resource pool
//------------------------------------------------------------------------------

#if 0
inline auto make_host()
{
    return std::make_shared<rmm::mr::new_delete_resource>() ;
}

inline auto make_host_pinned()
{
    return std::make_shared<rmm::mr::pinned_memory_resource>() ;
}
#endif

inline auto make_cuda()
{
    return std::make_shared<rmm::mr::cuda_memory_resource>() ;
}

inline auto make_managed()
{
    return std::make_shared<rmm::mr::managed_memory_resource>() ;
}

#if 0
inline auto make_and_set_host_pool
(
    std::size_t initial_size,
    std::size_t maximum_size
)
{
    auto resource = std::pmr::synchronized_pool_resource() ;
    rmm::mr::set_current_device_resource( resource ) ;
    return resource;
}

inline auto make_and_set_host_pinned_pool
(
    std::size_t initial_size,
    std::size_t maximum_size
)
{
    auto resource = rmm::mr::make_owning_wrapper<pool_mr>
        ( make_host_pinned(), initial_size, maximum_size ) ;
    rmm::mr::set_current_device_resource( resource.get()) ;
    return resource;
}
#endif

// size_map is an unordered alloc_map that maps allocation address to the size
// of each allocation
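//
// For example (a sketch of the bookkeeping done below): after
// p = rmm_wrap_allocate (&size), the map holds the entry
// ((std::size_t) p -> size), which rmm_wrap_realloc and rmm_wrap_deallocate
// later look up to recover the size of the allocation.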

inline auto make_and_set_device_pool
(
    std::size_t initial_size,
    std::size_t maximum_size
)
{
    auto resource = rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>
                    ( make_cuda(), initial_size, maximum_size ) ;
    rmm::mr::set_current_device_resource( resource.get()) ;
    return resource;
}

inline auto make_and_set_managed_pool
(
    std::size_t initial_size,
    std::size_t maximum_size
)
{
    //  std::cout<< " make_managed_pool called with  init_size"
    //  <<initial_size<<" max_size "<<maximum_size<<"\n";

    auto resource = rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>
                        ( make_managed(), initial_size, maximum_size ) ;
    rmm::mr::set_current_device_resource( resource.get()) ;
    return resource;
}

inline std::shared_ptr<rmm::cuda_stream_pool> make_and_set_cuda_stream_pool
(
    std::size_t num_streams
)
{
    return std::make_shared<rmm::cuda_stream_pool>(num_streams);
}

inline cudaStream_t make_cuda_stream()
{
    cudaStream_t user_stream;
    RMM_WRAP_CHECK_CUDA(cudaStreamCreate(&user_stream));
    return user_stream;
}

//------------------------------------------------------------------------------
// rmm_wrap_finalize: destroy the global rmm_wrap_context
//------------------------------------------------------------------------------

// Destroy the rmm_wrap_context.  This method destroys the contents of the
// rmm_wrap_context: the memory resource (host or device), the alloc_map, and
// the main CUDA stream.

void rmm_wrap_finalize (void)
{
    if (rmm_wrap_context != NULL)
    {
        RMM_WRAP_CHECK_CUDA(cudaStreamDestroy(rmm_wrap_context->main_stream));
        delete (rmm_wrap_context) ;
        rmm_wrap_context = NULL ;
    }
}

//------------------------------------------------------------------------------
// rmm_wrap_initialize: initialize the global rmm_wrap_context
//------------------------------------------------------------------------------

int rmm_wrap_initialize             // returns -1 on error, 0 on success
(
    RMM_MODE mode,                  // rmm_wrap_host, rmm_wrap_host_pinned,
                                    // rmm_wrap_device, or rmm_wrap_managed
    std::size_t init_pool_size,     // initial size of the RMM pool, in bytes
    std::size_t max_pool_size,      // maximum size of the RMM pool, in bytes
    std::size_t stream_pool_size    // # of streams in the CUDA stream pool
)
{

    //--------------------------------------------------------------------------
    // check inputs
    //--------------------------------------------------------------------------

    if (rmm_wrap_context != NULL)
    {
        // rmm_wrap_initialize cannot be called twice
        return (-1) ;
    }

    if (stream_pool_size <= 0)
    {
        std::cout << "Stream pool size must be > 0" << std::endl;
        // invalid stream pool size
        return (-1) ;
    }

    // create the RMM wrap handle and save it as a global pointer.
    rmm_wrap_context = new RMM_Wrap_Handle() ;

    //  std::cout<< " init called with mode "<<mode<<" init_size "
    // <<init_pool_size<<" max_size "<<max_pool_size<<"\n";

    //--------------------------------------------------------------------------
    // Construct a resource that uses a coalescing best-fit pool allocator
    //--------------------------------------------------------------------------

    // create the CUDA stream pool and the main stream
    rmm_wrap_context->stream_pool =
        make_and_set_cuda_stream_pool (stream_pool_size) ;
    rmm_wrap_context->main_stream = make_cuda_stream() ;

    if (mode == rmm_wrap_host )
    {
        // rmm_wrap_context->host_resource =
        //  std::pmr::synchronized_pool_resource() ;
        //  // (init_pool_size, max_pool_size) ;
        // rmm_wrap_context->host_resource =  make_and_set_host_pool() ;
        //  // (init_pool_size, max_pool_size) ;
    }
    else if (mode == rmm_wrap_host_pinned )
    {
        // rmm_wrap_context->host_resource =
        //  std::pmr::synchronized_pool_resource() ;
        //  // (init_pool_size, max_pool_size) ;
    }
    else if (mode == rmm_wrap_device )
    {
        rmm_wrap_context->resource =
            make_and_set_device_pool( init_pool_size, max_pool_size) ;
    }
    else if ( mode == rmm_wrap_managed )
    {
        rmm_wrap_context->resource =
            make_and_set_managed_pool( init_pool_size, max_pool_size) ;
    }
    else
    {
        // invalid mode
        return (-1) ;
    }

    // Mark down the mode for reference later
    rmm_wrap_context->mode = mode;

    //--------------------------------------------------------------------------
    // create size map to lookup size of each allocation
    //--------------------------------------------------------------------------

    rmm_wrap_context->size_map = std::make_shared<alloc_map> () ;
    if (rmm_wrap_context->size_map.get() == NULL)
    {
        // failed to create the alloc_map
        return (-1) ;
    }

    return (0) ;
}
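
//------------------------------------------------------------------------------
// stream accessors: get CUDA streams held by the rmm_wrap_context
//------------------------------------------------------------------------------

// These helpers return streams owned by the global rmm_wrap_context: the next
// stream from the pool, a specific stream selected by its id, or the main
// stream created by rmm_wrap_initialize.  They must only be called after
// rmm_wrap_initialize has succeeded.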

cudaStream_t get_next_stream_from_pool ()
{
    return rmm_wrap_context->stream_pool->get_stream() ;
}

cudaStream_t get_stream_from_pool (std::size_t stream_id)
{
    return rmm_wrap_context->stream_pool->get_stream(stream_id) ;
}

cudaStream_t get_main_stream ()
{
    return rmm_wrap_context->main_stream ;
}

//------------------------------------------------------------------------------
// rmm_wrap_malloc: malloc-equivalent method using RMM
//------------------------------------------------------------------------------

// rmm_wrap_malloc is identical to the ANSI C11 malloc function, except that
// it uses RMM underneath to allocate the space.

void *rmm_wrap_malloc (std::size_t size)
{
    return (rmm_wrap_allocate (&size)) ;
}

//------------------------------------------------------------------------------
// rmm_wrap_calloc: calloc-equivalent method using RMM
//------------------------------------------------------------------------------

// rmm_wrap_calloc is identical to the ANSI C11 calloc function, except that
// it uses RMM underneath to allocate the space.

void *rmm_wrap_calloc (std::size_t n, std::size_t size)
{
    std::size_t s = n * size ;
    void *p = rmm_wrap_allocate (&s) ;
    // NOTE: this is single-threaded on the CPU.  If you want a faster method,
    // malloc the space and use cudaMemset for the GPU or GB_memset on the CPU.
    // The GraphBLAS GB_calloc_memory method uses malloc and GB_memset.
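    // (A sketch of that faster alternative, not what is done here: for device
    // or managed memory, cudaMemset (p, 0, s) would zero the block on the GPU
    // instead of the single-threaded CPU memset below.)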
    if (p != NULL)
    {
        memset (p, 0, s) ;
    }
    return (p) ;
}

//------------------------------------------------------------------------------
// rmm_wrap_realloc: realloc-equivalent method using RMM
//------------------------------------------------------------------------------

// rmm_wrap_realloc is identical to the ANSI C11 realloc function, except that
// it uses RMM underneath to allocate the space.

void *rmm_wrap_realloc (void *p, std::size_t newsize)
{
    if (p == NULL)
    {
        // allocate a new block.  This is OK.
        return (rmm_wrap_allocate (&newsize)) ;
    }

    if (newsize == 0)
    {
        // free the block.  This is OK.
        rmm_wrap_deallocate (p, 0) ;
        return (NULL) ;
    }

    alloc_map *am = rmm_wrap_context->size_map.get() ;

    // look up the old size; use find rather than at, so that a pointer not in
    // the hashmap does not throw an exception
    std::size_t oldsize = 0 ;
    auto it = am->find ((std::size_t)(p)) ;
    if (it != am->end ()) oldsize = it->second ;

    if (oldsize == 0)
    {
        // the block is not in the hashmap; cannot realloc it.
        // This is a failure.
        return (NULL) ;
    }

    // check for quick return
    if (newsize >= oldsize/2 && newsize <= oldsize)
    {
        // Be lazy. If the block does not change, or is shrinking but only by a
        // small amount, then leave the block as-is.
        return (p) ;
    }
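    // For example (illustrative sizes): if oldsize is 1024 and newsize is 600,
    // the block is left as-is at 1024 bytes; if newsize is 400, a new block is
    // allocated below (padded to 512 by rmm_wrap_allocate) and the data copied.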

    // allocate the new space
    void *pnew = rmm_wrap_allocate (&newsize) ;
    if (pnew == NULL)
    {
        // old block is not modified.  This is a failure, but the old block is
        // still in the hashmap.
        return (NULL) ;
    }

    // copy the old space into the new space
    std::size_t s = (oldsize < newsize) ? oldsize : newsize ;
    // FIXME: query the pointer if it's on the GPU.
    memcpy (pnew, p, s) ; // NOTE: single-thread CPU, not GPU.  Slow!
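    // (One possible approach to the FIXME above, sketched only as a comment:
    // query p with cudaPointerGetAttributes and, if either block is device
    // memory, copy with cudaMemcpy (pnew, p, s, cudaMemcpyDefault) so that the
    // CUDA runtime performs the transfer.)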

    // free the old space
    rmm_wrap_deallocate (p, oldsize) ;

    // return the new space
    return (pnew) ;
}

//------------------------------------------------------------------------------
// rmm_wrap_free: free a block of memory, size not needed
//------------------------------------------------------------------------------

// rmm_wrap_free is identical to the ANSI C11 free function, except that
// it uses RMM underneath to free the space.

void rmm_wrap_free (void *p)
{
    rmm_wrap_deallocate (p, 0) ;
}

//------------------------------------------------------------------------------
// rmm_wrap_allocate: allocate a block of memory using RMM
//------------------------------------------------------------------------------

void *rmm_wrap_allocate( std::size_t *size)
{
    if (rmm_wrap_context == NULL) return (NULL) ;

    alloc_map *am = rmm_wrap_context->size_map.get() ;
    if (am == NULL)
    {
        // PANIC!
        // std::cout<< "Uh oh, can't allocate before initializing RMM"
        // << std::endl;
        return (NULL) ;
    }

    // ensure size is nonzero
    if (*size == 0) *size = 256 ;
    // round up the allocation to a multiple of 256
    std::size_t aligned = (*size) % 256 ;
    if (aligned > 0)
    {
        *size += (256 - aligned) ;
    }
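    // For example: a request of 100 bytes is padded to 256 bytes, and a
    // request of 300 bytes is padded to 512 bytes; *size is updated so the
    // caller sees the actual size allocated.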

//  printf(" rmm_wrap_alloc %ld bytes\n",*size) ;

    rmm::mr::device_memory_resource *memoryresource =
        rmm::mr::get_current_device_resource() ;
    void *p = memoryresource->allocate( *size ) ;
    if (p == NULL)
    {
        // out of memory
        *size = 0 ;
        return (NULL) ;
    }

    // insert p into the hashmap
    am->emplace ((std::size_t)p, (std::size_t)(*size)) ;

    // return the allocated block
    return (p) ;
}

//------------------------------------------------------------------------------
// rmm_wrap_deallocate: deallocate a block previously allocated by RMM
//------------------------------------------------------------------------------

void rmm_wrap_deallocate( void *p, std::size_t size)
{
    if (rmm_wrap_context == NULL) return ;

    // Note: there are 3 PANIC cases below.  The API of rmm_wrap_deallocate
    // does not allow an error condition to be returned.  These PANICs could be
    // logged, or they could terminate the program if debug mode enabled, etc.
    // In production, all we can do is ignore the PANIC.

    if (p == NULL)
    {
        // nothing to do; ignore a double-free
        if (size > 0)
        {
            // PANIC!  Why does a NULL pointer have a nonzero size??
        }
        return ;
    }

    // check the size given.  If the input size is zero, then the size is
    // unknown (as is the case for rmm_wrap_free (p)); in that case, just trust
    // the hashmap.  Otherwise, double-check to make sure the size is correct.
    alloc_map *am = rmm_wrap_context->size_map.get() ;
    size_t actual_size = 0 ;
    if (am == NULL)
    {
        // PANIC!
        // std::cout<< "Uh oh, can't deallocate before initializing RMM"
        // << std::endl;
        return ;
    }
    else
    {
        // actual_size = am->at ((std::size_t)(p)) ;
        auto iter = am->find ((std::size_t)(p)) ;
        if (iter != am->end ())
        {
            actual_size = iter->second ;
        }
        else
        {
            std::cout << "rmm_wrap: tried to free an unallocated pointer: "
                << p << std::endl ;
        }
    }

    if (actual_size == 0)
    {
        // PANIC!  oops, p is not in the hashmap.  Ignore it.  TODO: could add
        // a printf here, write to a log file, etc.  if debug mode, abort, etc.
        return ;
    }

    if (size > 0 && size != actual_size)
    {
        // PANIC!  oops, invalid old size.  Ignore the input size, and free p
        // anyway.  TODO: could add a printf here, write to a log file, etc.
        // if debug mode, abort, etc.
    }

    // remove p from the hashmap
    am->erase ( (std::size_t)(p) ) ;

    // deallocate the block of memory
    rmm::mr::device_memory_resource *memoryresource =
        rmm::mr::get_current_device_resource() ;
    memoryresource->deallocate( p, actual_size ) ;
}