File: GB_memcpy.c

package info (click to toggle)

suitesparse 1%3A7.10.1%2Bdfsg-1

links: PTS, VCS
area: main
in suites: forky, sid, trixie
size: 254,920 kB
sloc: ansic: 1,134,743; cpp: 46,133; makefile: 4,875; fortran: 2,087; java: 1,826; sh: 996; ruby: 725; python: 495; asm: 371; sed: 166; awk: 44

file content (68 lines) | stat: -rw-r--r-- 2,118 bytes

parent folder | download | duplicates (2)

//------------------------------------------------------------------------------
// GB_memcpy: parallel memcpy
//------------------------------------------------------------------------------

// SuiteSparse:GraphBLAS, Timothy A. Davis, (c) 2017-2025, All Rights Reserved.
// SPDX-License-Identifier: Apache-2.0

//------------------------------------------------------------------------------

// Note that this function uses its own hard-coded chunk size.

#include "GB.h"

// GB_MEM_CHUNK: size in bytes of one unit of parallel work.  Copies of at most
// this many bytes are always done by a single thread; larger copies are split
// into pieces of this size.
#define GB_MEM_CHUNK (1024*1024)

//------------------------------------------------------------------------------
// GB_memcpy: copy n bytes from src to dest, using up to nthreads threads
//------------------------------------------------------------------------------

// The copy is done with memcpy, so the source and destination regions must not
// overlap.  If n is zero, nothing is done and neither pointer is accessed (so
// NULL pointers are safe in that case).  If nthreads <= 1 or the copy is no
// larger than GB_MEM_CHUNK, a single memcpy is used.  Otherwise the bytes are
// split into chunks of GB_MEM_CHUNK bytes (the last one may be shorter), and
// the chunks are copied by an OpenMP parallel loop using at most one thread
// per chunk.

void GB_memcpy                  // parallel memcpy
(
    void *dest,                 // destination
    const void *src,            // source
    size_t n,                   // # of bytes to copy
    int nthreads                // max # of threads to use
)
{

    // Fixme for CUDA: do:
    //  cpu <- cpu (already done below)
    //  cpu <- gpu (effectively done below but could be better)
    //  gpu <- cpu (need this)
    //  gpu <- gpu (need this)

    //--------------------------------------------------------------------------
    // quick return if there is nothing to copy
    //--------------------------------------------------------------------------

    if (n == 0)
    {
        // memcpy requires valid pointers even for a zero-length copy, so do
        // not call it at all for an empty copy (dest and src may be NULL)
        return ;
    }

    //--------------------------------------------------------------------------
    // memcpy using a single thread
    //--------------------------------------------------------------------------

    if (nthreads <= 1 || n <= GB_MEM_CHUNK)
    {
        memcpy (dest, src, n) ;
        return ;
    }

    //--------------------------------------------------------------------------
    // memcpy using multiple threads
    //--------------------------------------------------------------------------

    const size_t chunk_size = (size_t) GB_MEM_CHUNK ;

    // nchunks = ceil (n / chunk_size), so every chunk holds at least one byte;
    // nchunks >= 2 here since n > chunk_size
    size_t nchunks = (n / chunk_size) + ((n % chunk_size) != 0 ? 1 : 0) ;

    // never use more threads than there are chunks
    if (((size_t) nthreads) > nchunks)
    {
        nthreads = (int) nchunks ;
    }

    // byte pointers: all offsets below are measured in bytes
    unsigned char *pdest = (unsigned char *) dest ;
    const unsigned char *psrc = (const unsigned char *) src ;

    // the loop index is signed, with a signed bound, so the loop condition
    // does not mix signed and unsigned operands
    const int64_t kend = (int64_t) nchunks ;
    int64_t k ;
    #pragma omp parallel for num_threads(nthreads) schedule(dynamic,1)
    for (k = 0 ; k < kend ; k++)
    {
        // chunk k covers bytes start to start+chunk-1; start < n always holds
        // because nchunks is the exact ceiling of n / chunk_size
        size_t start = ((size_t) k) * chunk_size ;
        size_t remaining = n - start ;
        size_t chunk = (remaining < chunk_size) ? remaining : chunk_size ;
        memcpy (pdest + start, psrc + start, chunk) ;
    }
}