File: Loader.h

package info (click to toggle)
llvm-toolchain-19 1%3A19.1.7-3~deb12u1
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 1,998,492 kB
  • sloc: cpp: 6,951,680; ansic: 1,486,157; asm: 913,598; python: 232,024; f90: 80,126; objc: 75,281; lisp: 37,276; pascal: 16,990; sh: 10,009; ml: 5,058; perl: 4,724; awk: 3,523; makefile: 3,167; javascript: 2,504; xml: 892; fortran: 664; cs: 573
file content (230 lines) | stat: -rw-r--r-- 7,303 bytes parent folder | download | duplicates (9)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
//===-- Generic device loader interface -----------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H
#define LLVM_LIBC_UTILS_GPU_LOADER_LOADER_H

#include "utils/gpu/server/llvmlibc_rpc_server.h"

#include "include/llvm-libc-types/test_rpc_opcodes_t.h"
#include "llvm-libc-types/rpc_opcodes_t.h"

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <cstdlib>
#include <cstring>

/// Generic launch parameters for configuring the number of blocks / threads.
///
/// NOTE(review): the x/y/z suffixes presumably select the dimension of the
/// launch each count applies to; how the platform loaders interpret them is
/// not visible from this header -- confirm against the loader implementations.
struct LaunchParameters {
  uint32_t num_threads_x; // Thread count, x component.
  uint32_t num_threads_y; // Thread count, y component.
  uint32_t num_threads_z; // Thread count, z component.
  uint32_t num_blocks_x;  // Block count, x component.
  uint32_t num_blocks_y;  // Block count, y component.
  uint32_t num_blocks_z;  // Block count, z component.
};

/// The arguments to the '_begin' kernel.
///
/// NOTE(review): `argv` and `envp` are presumably the buffers produced by
/// copy_argument_vector() and copy_environment() -- confirm against the
/// platform loaders, which are not part of this header.
struct begin_args_t {
  int argc;   // Argument count.
  void *argv; // Untyped pointer to the argument vector.
  void *envp; // Untyped pointer to the environment vector.
};

/// The arguments to the '_start' kernel.
///
/// NOTE(review): `argv` and `envp` are presumably the buffers produced by
/// copy_argument_vector() and copy_environment(), and `ret` presumably points
/// at storage for the value returned by `main` -- confirm against the platform
/// loaders, which are not part of this header.
struct start_args_t {
  int argc;   // Argument count.
  void *argv; // Untyped pointer to the argument vector.
  void *envp; // Untyped pointer to the environment vector.
  void *ret;  // Untyped pointer associated with the kernel's result.
};

/// The arguments to the '_end' kernel.
///
/// NOTE(review): the only field is named `argc`, but what value the loaders
/// pass through it for '_end' is not visible from this header -- confirm
/// against the platform loaders before relying on its meaning.
struct end_args_t {
  int argc; // Single integer argument handed to the '_end' kernel.
};

/// Generic interface to load the \p image and launch execution of the _start
/// kernel on the target device. Copies \p argc and \p argv to the device.
/// Returns the final value of the `main` function on the device.
///
/// Only the declaration lives in this header; the definition is expected to be
/// supplied elsewhere.
///
/// \param argc  Number of entries in \p argv.
/// \param argv  Host argument vector.
/// \param evnp  (spelled this way in the declaration) presumably the host
///              environment vector -- confirm against the definitions.
/// \param image Pointer to the image to load.
/// \param size  Presumably the size of \p image in bytes -- confirm.
/// \param params Block / thread counts to launch with.
/// \param print_resource_usage Presumably requests a resource usage report
///              from the loader -- confirm against the definitions.
int load(int argc, char **argv, char **evnp, void *image, size_t size,
         const LaunchParameters &params, bool print_resource_usage);

/// Round \p val up to the nearest multiple of \p align.
///
/// \p align is converted to `V` before use. The rounding is done with a
/// divide and multiply, so \p align does not need to be a power of two, but it
/// must not be zero.
template <typename V, typename A> inline V align_up(V val, A align) {
  const V step = V(align);
  // Keep the sum as a single expression so the usual arithmetic promotions
  // apply to the intermediate value exactly as written.
  return ((val + step - 1) / step) * step;
}

/// Copy the system's argument vector to GPU memory allocated using \p alloc.
///
/// A single allocation is requested and laid out as a table of `argc + 1`
/// pointers immediately followed by the string data those pointers refer to.
/// The final table entry is a null pointer so the result can be walked like a
/// regular `argv`.
///
/// \param argc  Number of strings in \p argv to copy.
/// \param argv  Array of at least \p argc null-terminated strings.
/// \param alloc Callable invoked once as `alloc(bytes)`. The returned memory
///              is filled in directly with `memcpy` and pointer stores, so it
///              must be writable from the calling code.
/// \returns The buffer returned by \p alloc, or nullptr if \p alloc failed.
template <typename Allocator>
void *copy_argument_vector(int argc, char **argv, Allocator alloc) {
  size_t argv_size = sizeof(char *) * (argc + 1);
  size_t str_size = 0;
  for (int i = 0; i < argc; ++i)
    str_size += std::strlen(argv[i]) + 1;

  // We allocate enough space for a null terminated array and all the strings.
  void *dev_argv = alloc(argv_size + str_size);
  if (!dev_argv)
    return nullptr;

  // Store the strings linearly in the same memory buffer, right after the
  // pointer table.
  char **dev_table = static_cast<char **>(dev_argv);
  char *dev_str = static_cast<char *>(dev_argv) + argv_size;
  for (int i = 0; i < argc; ++i) {
    size_t size = std::strlen(argv[i]) + 1;
    std::memcpy(dev_str, argv[i], size);
    dev_table[i] = dev_str;
    dev_str += size;
  }

  // Ensure the vector is null terminated. The terminator belongs in slot
  // `argc`; `argv_size` is a byte count and indexing with it would write far
  // outside the pointer table.
  dev_table[argc] = nullptr;
  return dev_argv;
}

/// Copy the system's environment to GPU memory allocated using \p alloc.
///
/// Counts the entries of the null-terminated \p envp array and forwards them
/// to copy_argument_vector(), whose result is returned unchanged.
template <typename Allocator>
void *copy_environment(char **envp, Allocator alloc) {
  int count = 0;
  while (envp[count] != nullptr)
    ++count;

  return copy_argument_vector(count, envp, alloc);
}

/// Print \p msg to stderr, prefixed with the \p file and \p line it was
/// reported from, then terminate the process with EXIT_FAILURE.
inline void handle_error_impl(const char *file, int32_t line, const char *msg) {
  std::fprintf(stderr, "%s:%d:0: Error: %s\n", file, line, msg);
  std::exit(EXIT_FAILURE);
}

/// Print the numeric value of the RPC status \p err to stderr, prefixed with
/// the \p file and \p line it was reported from, then terminate the process
/// with EXIT_FAILURE.
inline void handle_error_impl(const char *file, int32_t line,
                              rpc_status_t err) {
  std::fprintf(stderr, "%s:%d:0: Error: %d\n", file, line, err);
  std::exit(EXIT_FAILURE);
}
/// Report the error \p X from the current source location. Expands to a call
/// to handle_error_impl() with `__FILE__` and `__LINE__`, so the overload is
/// chosen by the type of \p X; every overload exits the process.
#define handle_error(X) handle_error_impl(__FILE__, __LINE__, X)

/// Register the host-side RPC handlers used by the `libc` tests on \p device.
///
/// Handlers are installed for three opcodes:
///  - RPC_TEST_INCREMENT: adds one to the first 64-bit word of the buffer.
///  - RPC_TEST_INTERFACE: runs a fixed sequence of receives and sends on one
///    port while threading a counter through them.
///  - RPC_TEST_STREAM: receives one payload per lane with rpc_recv_n() and
///    sends the same payloads back with rpc_send_n().
///
/// \tparam lane_size Number of per-lane slots used by the stream handler; only
///                   32 and 64 are accepted.
/// \param device     RPC device the callbacks are registered with.
template <uint32_t lane_size>
inline void register_rpc_callbacks(rpc_device_t device) {
  static_assert(lane_size == 32 || lane_size == 64, "Invalid Lane size");
  // Register the ping test for the `libc` tests.
  rpc_register_callback(
      device, static_cast<rpc_opcode_t>(RPC_TEST_INCREMENT),
      [](rpc_port_t port, void *data) {
        rpc_recv_and_send(
            port,
            [](rpc_buffer_t *buffer, void *) {
              reinterpret_cast<uint64_t *>(buffer->data)[0] += 1;
            },
            data);
      },
      nullptr);

  // Register the interface test callbacks.
  rpc_register_callback(
      device, static_cast<rpc_opcode_t>(RPC_TEST_INTERFACE),
      [](rpc_port_t port, void *) {
        // Stores the first word of the buffer into the counter behind `data`.
        auto read_counter = [](rpc_buffer_t *buffer, void *data) {
          *reinterpret_cast<uint64_t *>(data) = buffer->data[0];
        };
        // Bumps the counter behind `data` and writes the new value into the
        // first word of the buffer.
        auto write_counter = [](rpc_buffer_t *buffer, void *data) {
          uint64_t &cnt = *reinterpret_cast<uint64_t *>(data);
          buffer->data[0] = cnt = cnt + 1;
        };

        uint64_t cnt = 0;
        // Initialised so the flag has a defined value even before the first
        // receive callback has filled it in.
        bool end_with_recv = false;

        // The first packet carries the flag that selects the final step.
        rpc_recv(
            port,
            [](rpc_buffer_t *buffer, void *data) {
              *reinterpret_cast<bool *>(data) = buffer->data[0];
            },
            &end_with_recv);

        // The order of the calls below is the test itself; do not reorder.
        rpc_recv(port, read_counter, &cnt);
        rpc_send(port, write_counter, &cnt);
        rpc_recv(port, read_counter, &cnt);
        rpc_send(port, write_counter, &cnt);
        rpc_recv(port, read_counter, &cnt);
        rpc_recv(port, read_counter, &cnt);
        rpc_send(port, write_counter, &cnt);
        rpc_send(port, write_counter, &cnt);
        if (end_with_recv)
          rpc_recv(port, read_counter, &cnt);
        else
          rpc_send(port, write_counter, &cnt);
      },
      nullptr);

  // Register the stream test handler.
  rpc_register_callback(
      device, static_cast<rpc_opcode_t>(RPC_TEST_STREAM),
      [](rpc_port_t port, void *) {
        uint64_t sizes[lane_size] = {0};
        void *dst[lane_size] = {nullptr};
        rpc_recv_n(
            port, dst, sizes,
            [](uint64_t size, void *) -> void * { return new char[size]; },
            nullptr);
        rpc_send_n(port, dst, sizes);
        // The buffers were created with `new char[]`, so release them through
        // a `char *`. Slots that were never filled are still null, and
        // deleting a null pointer is a no-op.
        for (void *ptr : dst)
          delete[] static_cast<char *>(ptr);
      },
      nullptr);
}

#endif