File: NsightEclipse.xml

package info (click to toggle)
nvidia-cuda-samples 12.4.1~dfsg-1
  • links: PTS, VCS
  • area: contrib
  • in suites: forky, sid, trixie
  • size: 313,216 kB
  • sloc: cpp: 82,042; makefile: 53,971; xml: 15,381; ansic: 8,630; sh: 91; python: 74
file content (99 lines) | stat: -rw-r--r-- 2,861 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
<?xml version="1.0" encoding="UTF-8"?> 
<!DOCTYPE entry SYSTEM "SamplesInfo.dtd">
<entry>
  <!-- Metadata entry for the globalToShmemAsyncCopy sample. The allowed
       elements and their order are governed by SamplesInfo.dtd, which is not
       part of this file; the notes below describe only what is written here. -->
  <name>globalToShmemAsyncCopy</name>
  <!-- Extra compiler flag for the sample: the single flag selects the C++11
       language standard. -->
  <cflags>
    <flag>--std=c++11</flag>
  </cflags>
  <!-- Toolkit API entry points associated with this sample. Presumably this
       mirrors the calls made in the primary file; TODO confirm against
       globalToShmemAsyncCopy.cu whenever the list is edited. -->
  <cuda_api_list>
    <toolkit>cudaStreamCreateWithFlags</toolkit>
    <toolkit>cudaMalloc</toolkit>
    <toolkit>cudaDeviceGetAttribute</toolkit>
    <toolkit>cudaFree</toolkit>
    <toolkit>cudaMallocHost</toolkit>
    <toolkit>cudaEventSynchronize</toolkit>
    <toolkit>cudaEventRecord</toolkit>
    <toolkit>cudaFreeHost</toolkit>
    <toolkit>cudaStreamSynchronize</toolkit>
    <toolkit>cudaEventDestroy</toolkit>
    <toolkit>cudaEventElapsedTime</toolkit>
    <toolkit>cudaMemsetAsync</toolkit>
    <toolkit>cudaMemcpyAsync</toolkit>
    <toolkit>cudaEventCreate</toolkit>
  </cuda_api_list>
  <description><![CDATA[This sample implements matrix multiplication which uses asynchronous copy of data from global to shared memory when on compute capability 8.0 or higher. Also demonstrates arrive-wait barrier for synchronization.]]></description>
  <devicecompilation>whole</devicecompilation>
  <!-- Include search paths, all relative. Presumably they are resolved from
       the sample's own directory; TODO confirm with the consuming tool. -->
  <includepaths>
    <path>./</path>
    <path>../</path>
    <path>../../../Common</path>
  </includepaths>
  <!-- Concepts (each tagged with a level attribute) and free-form keywords.
       Presumably used for catalog search or display; TODO confirm. -->
  <keyconcepts>
    <concept level="basic">CUDA Runtime API</concept>
    <concept level="basic">Linear Algebra</concept>
    <concept level="basic">CPP11 CUDA</concept>
  </keyconcepts>
  <keywords>
    <keyword>CUDA</keyword>
    <keyword>matrix multiply</keyword>
    <keyword>Async copy</keyword>
    <keyword>CPP11</keyword>
    <keyword>GCC 5.1.0</keyword>
  </keywords>
  <!-- No libraries and no library paths are listed for this sample; both
       containers below are intentionally left without children. -->
  <libraries>
  </libraries>
  <librarypaths>
  </librarypaths>
  <nsight_eclipse>true</nsight_eclipse>
  <primary_file>globalToShmemAsyncCopy.cu</primary_file>
  <!-- CPP11 is the only declared dependency, matching the C++11 flag listed
       under cflags. -->
  <required_dependencies>
    <dependency>CPP11</dependency>
  </required_dependencies>
  <!-- Each scope value has the form "number:label". The meaning of the
       number is not stated in this file; TODO confirm before adding scopes. -->
  <scopes>
    <scope>1:CUDA Basic Topics</scope>
    <scope>3:Linear Algebra</scope>
  </scopes>
  <!-- sm-arch entries run from sm70 through sm90. The description states that
       the asynchronous copy is used on compute capability 8.0 or higher, so
       sm70, sm72 and sm75 presumably take a different code path; TODO confirm
       in the primary file. -->
  <sm-arch>sm70</sm-arch>
  <sm-arch>sm72</sm-arch>
  <sm-arch>sm75</sm-arch>
  <sm-arch>sm80</sm-arch>
  <sm-arch>sm86</sm-arch>
  <sm-arch>sm87</sm-arch>
  <sm-arch>sm89</sm-arch>
  <sm-arch>sm90</sm-arch>
  <!-- Each env is an arch/platform combination. The arm and sbsa entries give
       no platform, and the windows7 entry gives no arch. Presumably an omitted
       field means "any"; TODO confirm against SamplesInfo.dtd or the tool
       that reads this file. -->
  <supported_envs>
    <env>
      <arch>x86_64</arch>
      <platform>linux</platform>
    </env>
    <env>
      <arch>x86_64</arch>
      <platform>macosx</platform>
    </env>
    <env>
      <arch>arm</arch>
    </env>
    <env>
      <arch>sbsa</arch>
    </env>
    <env>
      <arch>ppc64le</arch>
      <platform>linux</platform>
    </env>
    <env>
      <arch>aarch64</arch>
      <platform>linux</platform>
    </env>
    <env>
      <arch>aarch64</arch>
      <platform>qnx</platform>
    </env>
    <env>
      <platform>windows7</platform>
    </env>
  </supported_envs>
  <!-- Only a lower bound is given here; no upper-bound element is present.
       The 7.0 value lines up with the lowest sm-arch entry (sm70). -->
  <supported_sm_architectures>
    <from>7.0</from>
  </supported_sm_architectures>
  <title>Global Memory to Shared Memory Async Copy</title>
</entry>