File: kineto_playground.cu

package info (click to toggle)
pytorch 1.13.1%2Bdfsg-4
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 139,252 kB
  • sloc: cpp: 1,100,274; python: 706,454; ansic: 83,052; asm: 7,618; java: 3,273; sh: 2,841; javascript: 612; makefile: 323; xml: 269; ruby: 185; yacc: 144; objc: 68; lex: 44
file content (83 lines) | stat: -rw-r--r-- 1,877 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
// (c) Meta Platforms, Inc. and affiliates. Confidential and proprietary.

#include <stdio.h>

#include "kineto_playground.cuh"


namespace kineto {

// Performs a throwaway cudaMalloc/cudaFree pair as a CUDA warmup.
//
// Initializing CUDA can take a while, which we normally do not want to see in
// Kineto traces. This is done elsewhere in various ways that take Kineto as a
// dependency; this function is kineto_playground's own way of warming up.
//
// Takes no arguments and returns nothing. A failed allocation or free is
// reported on stdout (numeric code plus the runtime's description) and is
// otherwise non-fatal.
void warmup(void) {
  const size_t bytes = 1000;
  float* mem = NULL;

  cudaError_t error = cudaMalloc(&mem, bytes);
  if (error != cudaSuccess) {
    printf("cudaMalloc failed during kineto_playground warmup. error code: %d (%s)\n",
           static_cast<int>(error), cudaGetErrorString(error));
    return;
  }

  // The buffer is never used; release it immediately. The return code is
  // checked so a failing free is reported instead of being silently dropped.
  error = cudaFree(mem);
  if (error != cudaSuccess) {
    printf("cudaFree failed during kineto_playground warmup. error code: %d (%s)\n",
           static_cast<int>(error), cudaGetErrorString(error));
  }
}

// Buffers shared by the functions in this file. All three start out null.
float* hA = nullptr;    // host input buffer
float* dA = nullptr;    // device buffer
float* hOut = nullptr;  // host buffer receiving the device-to-host copy
// Number of float elements held by each of the buffers above.
int num = 50'000;

// Allocates the host buffers hA and hOut and the device buffer dA (each sized
// for `num` floats), fills hA, and copies it to dA with a host-to-device
// cudaMemcpy.
//
// Takes no arguments and returns nothing. On success the three buffers stay
// allocated; basicMemcpyFromDevice() releases them. On any failure a message
// is printed to stdout, everything allocated so far is released, and hA, hOut
// and dA are left null so nothing leaks and no stale pointer remains.
void basicMemcpyToDevice(void) {
  size_t size = num * sizeof(float);
  cudaError_t err;

  // Shared failure path: frees whatever exists and nulls the globals.
  // free(NULL) and cudaFree(NULL) are both harmless no-ops.
  auto releaseAll = []() {
    free(hA);
    free(hOut);
    (void)cudaFree(dA);  // best-effort: we are already on a failing path
    hA = NULL;
    hOut = NULL;
    dA = NULL;
  };

  hA = (float*)malloc(size);
  hOut = (float*)malloc(size);
  // Reset before cudaMalloc so releaseAll() never frees a stale device pointer
  // if the allocation below fails.
  dA = NULL;

  if (hA == NULL || hOut == NULL) {
    printf("malloc failed during %s\n", __func__);
    releaseAll();
    return;
  }

  err = cudaMalloc(&dA, size);
  if (err != cudaSuccess) {
    printf("cudaMalloc failed during %s: %s\n", __func__, cudaGetErrorString(err));
    dA = NULL;
    releaseAll();
    return;
  }

  // memset writes the byte value 1 into every byte, so each float holds the
  // bit pattern 0x01010101 rather than 1.0f.
  // NOTE(review): presumably the actual values do not matter for profiling -
  // confirm before relying on the contents.
  memset(hA, 1, size);
  err = cudaMemcpy(dA, hA, size, cudaMemcpyHostToDevice);
  if (err != cudaSuccess) {
    printf("cudaMemcpy failed during %s: %s\n", __func__, cudaGetErrorString(err));
    releaseAll();
    return;
  }
}

// Copies `num` floats from the device buffer dA into the host buffer hOut with
// a device-to-host cudaMemcpy, then releases hA, hOut and dA.
//
// Takes no arguments and returns nothing. The buffers are released even when
// the copy fails (the failure is reported on stdout), so an error no longer
// leaks them. The three pointers are reset to null afterwards, which makes a
// repeated call fail cleanly in cudaMemcpy instead of double-freeing.
void basicMemcpyFromDevice(void) {

  size_t size = num * sizeof(float);
  cudaError_t err;

  err = cudaMemcpy(hOut, dA, size, cudaMemcpyDeviceToHost);
  if (err != cudaSuccess) {
    // Report and fall through: cleanup below must run on this path too.
    printf("cudaMemcpy failed during %s: %s\n", __func__, cudaGetErrorString(err));
  }

  free(hA);
  free(hOut);
  err = cudaFree(dA);
  if (err != cudaSuccess) {
    printf("cudaFree failed during %s: %s\n", __func__, cudaGetErrorString(err));
  }

  // Do not leave dangling pointers in the file-scope globals.
  hA = NULL;
  hOut = NULL;
  dA = NULL;
}

// Squares the first N elements of A in place.
//
// Each thread derives one flat index from its block and thread x-coordinates
// and handles at most that single element, so the launch has to provide at
// least N threads along x for every element to be processed. Threads whose
// index is N or larger return without touching memory.
//
//   A: array of at least N floats, read and written by the kernel.
//   N: number of elements to square.
__global__ void square(float* A, int N) {
  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
  if (idx >= N) {
    return;
  }

  const float value = A[idx];
  A[idx] = value * value;
}

// Intentionally empty extension point: takes no arguments, returns nothing and
// currently performs no work.
void playground(void) {
  // Put experimental CUDA code here.
}

// Launches the `square` kernel ten times in a row over the `num` floats in dA,
// using 256 threads per block and enough blocks to cover every element.
//
// Takes no arguments and returns nothing. No synchronization is performed
// here; the launches are only enqueued. If there is nothing to launch on
// (num <= 0 or dA is null) the function reports that and returns. A rejected
// launch is reported on stdout.
void compute(void) {
  // A zero-block grid is an invalid launch configuration, and a null device
  // pointer would make the kernel fault, so bail out early in both cases.
  if (num <= 0 || dA == NULL) {
    printf("%s skipped: device buffer is not set up\n", __func__);
    return;
  }

  const int threadsPerBlock = 256;
  // Ceiling division so a partial final block still covers the tail.
  const int blocksPerGrid = (num + threadsPerBlock - 1) / threadsPerBlock;
  for (int i = 0; i < 10; i++) {
    square<<<blocksPerGrid, threadsPerBlock>>> (dA, num);
  }

  // Kernel launches return no status. A rejected launch is recorded as the
  // runtime's last error, which stays set until it is read, so one check
  // after the loop covers all ten launches without adding a synchronization.
  cudaError_t err = cudaGetLastError();
  if (err != cudaSuccess) {
    printf("kernel launch failed during %s: %s\n", __func__, cudaGetErrorString(err));
  }
}

} // namespace kineto