#include "ggml.h"
#include "ggml-cpu.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#ifdef GGML_USE_CUDA
#include "ggml-cuda.h"
#endif
#ifdef GGML_USE_METAL
#include "ggml-metal.h"
#endif
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
#include <map>
#include <string>
#include <vector>
static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
(void) level;
(void) user_data;
fputs(text, stderr);
fflush(stderr);
}
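// installed below via ggml_log_set() so that all ggml and backend log
// output is routed to stderr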
// This is a simple model with two tensors a and b
struct simple_model {
struct ggml_tensor * a;
struct ggml_tensor * b;
// the backend to perform the computation (CPU, CUDA, METAL)
ggml_backend_t backend = NULL;
// the backend buffer to store the tensor data of a and b
ggml_backend_buffer_t buffer;
// the context to define the tensor information (dimensions, size, memory address)
struct ggml_context * ctx;
};
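// The usual ggml flow, which this example follows step by step:
//   1. ggml_context   - holds the tensor metadata (type, shape, op)
//   2. backend buffer - holds the actual tensor data on the device
//   3. ggml_cgraph    - records the operations to execute
//   4. ggml_gallocr   - allocates memory for the intermediate results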
// initialize the tensors of the model, in this case two matrices: a (rows_A x cols_A) and b (rows_B x cols_B)
void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
ggml_log_set(ggml_log_callback_default, nullptr);
// initialize the backend
#ifdef GGML_USE_CUDA
fprintf(stderr, "%s: using CUDA backend\n", __func__);
model.backend = ggml_backend_cuda_init(0); // init device 0
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
}
#endif
#ifdef GGML_USE_METAL
fprintf(stderr, "%s: using Metal backend\n", __func__);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
}
#endif
// if no GPU backend was initialized, fall back to the CPU backend
if (!model.backend) {
model.backend = ggml_backend_cpu_init();
}
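// note: the backend is selected at compile time via the GGML_USE_* macros;
// this example does no runtime device discovery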
int num_tensors = 2;
struct ggml_init_params params {
/*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
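// with no_alloc == true the context stores only tensor metadata
// (ggml_tensor_overhead() bytes per tensor); the tensor data itself
// is allocated later in the backend buffer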
// create context
model.ctx = ggml_init(params);
// create tensors
model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
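// note the argument order: ggml_new_tensor_2d takes ne[0] (the row size,
// i.e. the number of columns) first, so a rows x cols matrix is created
// as (cols, rows)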
// create a backend buffer (backend memory) and allocate the context's tensors in it
model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
// copy data from CPU memory into the backend buffer
ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
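// ggml_backend_tensor_set copies host memory into the backend buffer:
// a plain memcpy on the CPU backend, a host-to-device upload on GPU backends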
}
// build the compute graph to perform a matrix multiplication
struct ggml_cgraph * build_graph(const simple_model& model) {
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
// create a temporary context to build the graph
struct ggml_context * ctx0 = ggml_init(params0);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
// result = a*b^T
struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);
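// ggml_mul_mat requires a->ne[0] == b->ne[0] (the shared inner dimension)
// and produces a tensor with ne = [a->ne[1], b->ne[1]]; here a is [2, 4]
// and b is [2, 3], so the result is [4, 3]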
// build operations nodes
ggml_build_forward_expand(gf, result);
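// building the graph only records the operations; nothing is computed
// until ggml_backend_graph_compute() runs it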
// free the temporary context used to build the graph
ggml_free(ctx0);
return gf;
}
// compute with backend
struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
// allocating the graph below resets the allocator, freeing all the memory used during the previous inference
struct ggml_cgraph * gf = build_graph(model);
// allocate tensors
ggml_gallocr_alloc_graph(allocr, gf);
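// this assigns every tensor in the graph an offset inside the compute
// buffer reserved earlier with ggml_gallocr_reserve()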
int n_threads = 1; // number of threads used by the CPU backend for multi-threaded operations
if (ggml_backend_is_cpu(model.backend)) {
ggml_backend_cpu_set_n_threads(model.backend, n_threads);
}
ggml_backend_graph_compute(model.backend, gf);
// in this case, the output tensor is the last one in the graph
return ggml_graph_node(gf, -1);
}
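// a minimal sketch of how compute() could be reused for several runs
// (hypothetical usage, not exercised by this example): the allocator is
// created once and each call reuses the same reserved compute buffer:
//
//   for (int i = 0; i < n_runs; i++) {
//       struct ggml_tensor * out = compute(model, allocr);
//       // ... read out with ggml_backend_tensor_get() ...
//   }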
int main(void) {
ggml_time_init();
// initialize the data of the matrices used in the multiplication
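// matrix A (4 rows, 2 columns), stored row-major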
const int rows_A = 4, cols_A = 2;
float matrix_A[rows_A * cols_A] = {
2, 8,
5, 1,
4, 2,
8, 6
};
const int rows_B = 3, cols_B = 2;
/* matrix B (3 rows, 2 columns): the transpose of
   [ 10, 9, 5,
      5, 9, 4 ]  (2 rows, 3 cols) */
float matrix_B[rows_B * cols_B] = {
10, 5,
9, 9,
5, 4
};
simple_model model;
load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);
// calculate the temporary memory required for the computation
ggml_gallocr_t allocr = NULL;
{
allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
// create the worst-case graph for memory usage estimation
struct ggml_cgraph * gf = build_graph(model);
ggml_gallocr_reserve(allocr, gf);
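// ggml_gallocr_reserve() measures the graph and allocates a large enough
// compute buffer once; later ggml_gallocr_alloc_graph() calls reuse it
// without reallocating, as long as the graph still fits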
size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
}
// perform computation
struct ggml_tensor * result = compute(model, allocr);
// create an array to print the result
std::vector<float> out_data(ggml_nelements(result));
// copy the data back from the backend memory
ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));
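// ggml_backend_tensor_get mirrors ggml_backend_tensor_set: it copies the
// tensor data from the backend buffer back into host memory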
// expected result:
// [ 60.00 55.00 50.00 110.00
// 90.00 54.00 54.00 126.00
// 42.00 29.00 28.00 64.00 ]
printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
for (int j = 0; j < result->ne[1] /* rows */; j++) {
if (j > 0) {
printf("\n");
}
for (int i = 0; i < result->ne[0] /* cols */; i++) {
printf(" %.2f", out_data[j * result->ne[0] + i]);
}
}
printf(" ]\n");
// release backend memory used for computation
ggml_gallocr_free(allocr);
// free the context that holds the tensor metadata
ggml_free(model.ctx);
// release backend memory and free backend
ggml_backend_buffer_free(model.buffer);
ggml_backend_free(model.backend);
return 0;
}