File: transformers.c

package info (click to toggle)
gpaw 25.7.0-1
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 18,888 kB
  • sloc: python: 174,804; ansic: 17,564; cpp: 5,668; sh: 972; csh: 139; makefile: 45
file content (252 lines) | stat: -rw-r--r-- 7,321 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
/*  Copyright (C) 2003-2007  CAMP
 *  Copyright (C) 2007-2008  CAMd
 *  Copyright (C) 2005-2020  CSC - IT Center for Science Ltd.
 *  Please see the accompanying LICENSE file for further information. */

#include <Python.h>
#ifdef _OPENMP
#include <omp.h>
#endif
#define PY_ARRAY_UNIQUE_SYMBOL GPAW_ARRAY_API
#define NO_IMPORT_ARRAY
#include <numpy/arrayobject.h>
#include "extensions.h"
#include "bc.h"
#include "mympi.h"
#include "bmgs/bmgs.h"
#include "threading.h"

#define __TRANSFORMERS_C
#include "transformers.h"
#undef __TRANSFORMERS_C

#ifdef GPAW_GPU
#include "gpu/gpu.h"
#include "gpu/bmgs.h"

PyObject* Transformer_apply_gpu(TransformerObject *self, PyObject *args);
#endif

static void Transformer_dealloc(TransformerObject *self)
{
#ifdef GPAW_GPU
  if (self->use_gpu) {
    transformer_dealloc_gpu(0);
    bc_dealloc_gpu(0);
  }
#endif
  free(self->bc);
  PyObject_DEL(self);
}

// The actual computation routine for interpolation and restriction
// operations. The routine is used also in C-preconditioner
void transapply_worker(TransformerObject *self, int chunksize, int start,
		       int end, int thread_id, int nthreads, 
		       const double* in, double* out,
		       bool real, const double_complex* ph)
{
  boundary_conditions* bc = self->bc;
  const int* size1 = bc->size1;
  const int* size2 = bc->size2;
  int ng = bc->ndouble * size1[0] * size1[1] * size1[2];
  int ng2 = bc->ndouble * size2[0] * size2[1] * size2[2];

  double* sendbuf = GPAW_MALLOC(double, bc->maxsend * chunksize);
  double* recvbuf = GPAW_MALLOC(double, bc->maxrecv * chunksize);
  double* buf = GPAW_MALLOC(double, ng2 * chunksize);
  int buf2size = ng2;
  if (self->interpolate)
    buf2size *= 16;
  else
    buf2size /= 2;
  double* buf2 = GPAW_MALLOC(double, buf2size * chunksize);
  MPI_Request recvreq[2];
  MPI_Request sendreq[2];

  const double* my_in;
  double* my_out;
  
  int out_ng = bc->ndouble * self->size_out[0] * self->size_out[1]
               * self->size_out[2];

  for (int n = start; n < end; n += chunksize)
    {
      if (n + chunksize >= end && chunksize > 1)
        chunksize = end - n;
      my_in = in + n * ng;
      my_out = out + n * out_ng;
      for (int i = 0; i < 3; i++)
        {
          bc_unpack1(bc, my_in, buf, i,
                     recvreq, sendreq,
                     recvbuf, sendbuf, ph + 2 * i,
                     thread_id, 1);
          bc_unpack2(bc, buf, i,
                     recvreq, sendreq, recvbuf, 1);
        }
      for (int m = 0; m < chunksize; m++) {
        if (real)
          {
            if (self->interpolate)
              bmgs_interpolate(self->k, self->skip, buf + m * ng2, bc->size2,
                               my_out + m * out_ng, buf2 + m * buf2size);
            else
              bmgs_restrict(self->k, buf + m * ng2, bc->size2,
                            my_out + m * out_ng, buf2 + m * buf2size);
          }
        else
          {
            if (self->interpolate)
              bmgs_interpolatez(self->k, self->skip, (double_complex*)(buf + m * ng2),
                                bc->size2, (double_complex*)(my_out + m * out_ng),
                                (double_complex*)(buf2 + m * buf2size));
            else
              bmgs_restrictz(self->k, (double_complex*)(buf + m * ng2),
                             bc->size2, (double_complex*)(my_out + m * out_ng),
                             (double_complex*)(buf2 + m * buf2size));
          }
      }
    }
  free(buf2);
  free(buf);
  free(recvbuf);
  free(sendbuf);
}


static PyObject* Transformer_apply(TransformerObject *self, PyObject *args)
{
  PyArrayObject* input;
  PyArrayObject* output;
  PyArrayObject* phases = 0;
  if (!PyArg_ParseTuple(args, "OO|O", &input, &output, &phases))
    return NULL;

  int nin = 1;
  if (PyArray_NDIM(input) == 4)
    nin = PyArray_DIMS(input)[0];

  boundary_conditions* bc = self->bc;
  
  const double* in = DOUBLEP(input);
  double* out = DOUBLEP(output);
  bool real = (PyArray_DESCR(input)->type_num == NPY_DOUBLE);
  const double_complex* ph = (real ? 0 : COMPLEXP(phases));

  int chunksize = 1;
  if (getenv("GPAW_MPI_OPTIMAL_MSG_SIZE") != NULL)
    {
      int opt_msg_size = atoi(getenv("GPAW_MPI_OPTIMAL_MSG_SIZE"));
      if (bc->maxsend > 0 )
          chunksize = opt_msg_size * 1024 / (bc->maxsend / 2 * 
                                             (2 - (int)real) * sizeof(double));
      chunksize = (chunksize > 0) ? chunksize : 1;
      chunksize = (chunksize < nin) ? chunksize : nin;
    }

#ifdef _OPENMP
#pragma omp parallel
#endif
  {
  int thread_id = 0;
  int nthreads = 1;
  int start, end;
#ifdef _OPENMP
  thread_id = omp_get_thread_num();
  nthreads = omp_get_num_threads();
#endif
  SHARE_WORK(nin, nthreads, thread_id, &start, &end);
  transapply_worker(self, chunksize, start, end, thread_id, nthreads, 
		    in, out, real, ph);
  } // omp parallel for 
  Py_RETURN_NONE;
}

static PyObject * Transformer_get_async_sizes(TransformerObject *self, PyObject *args)
{
  if (!PyArg_ParseTuple(args, ""))
    return NULL;

#ifdef GPAW_ASYNC
  return Py_BuildValue("(ii)", 1, GPAW_ASYNC_D);
#else
  return Py_BuildValue("(ii)", 0, GPAW_ASYNC_D);
#endif
}

static PyMethodDef Transformer_Methods[] = {
    {"apply", (PyCFunction)Transformer_apply, METH_VARARGS, NULL},
#ifdef GPAW_GPU
    {"apply_gpu", (PyCFunction)Transformer_apply_gpu,
        METH_VARARGS, NULL},
#endif
    {"get_async_sizes",
     (PyCFunction)Transformer_get_async_sizes, METH_VARARGS, NULL},
    {NULL, NULL, 0, NULL}
};

PyTypeObject TransformerType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "Transformer",
    sizeof(TransformerObject),
    0,
    (destructor)Transformer_dealloc,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE,
    "Transformer object",
    0, 0, 0, 0, 0, 0,
    Transformer_Methods
};

PyObject * NewTransformerObject(PyObject *obj, PyObject *args)
{
  PyArrayObject* size_in;
  PyArrayObject* size_out;
  int k;
  PyArrayObject* paddings;
  PyArrayObject* npaddings;
  PyArrayObject* skip;
  PyArrayObject* neighbors;
  int real;
  PyObject* comm_obj;
  int interpolate;
  int use_gpu = 0;
  if (!PyArg_ParseTuple(args, "OOiOOOOiOi|i",
                        &size_in, &size_out, &k, &paddings, &npaddings, &skip,
                        &neighbors, &real, &comm_obj,
                        &interpolate, &use_gpu))
    return NULL;

  TransformerObject* self = PyObject_NEW(TransformerObject, &TransformerType);
  if (self == NULL)
    return NULL;

  self->k = k;
  self->interpolate = interpolate;

  MPI_Comm comm = MPI_COMM_NULL;
  if (comm_obj != Py_None)
    comm = ((MPIObject*)comm_obj)->comm;

  const long (*nb)[2] = (const long (*)[2])LONGP(neighbors);
  const long (*pad)[2] = (const long (*)[2])LONGP(paddings);
  const long (*npad)[2] = (const long (*)[2])LONGP(npaddings);
  const long (*skp)[2] = (const long (*)[2])LONGP(skip);
  self->bc = bc_init(LONGP(size_in), pad, npad, nb, comm, real, 0);

  for (int c = 0; c < 3; c++)
      self->size_out[c] = LONGP(size_out)[c];

  for (int c = 0; c < 3; c++)
    for (int d = 0; d < 2; d++)
      self->skip[c][d] = (int)skp[c][d];

#ifdef GPAW_GPU
  self->use_gpu = use_gpu;
  if (self->use_gpu) {
    transformer_init_gpu(self);
  }
#endif
  return (PyObject*)self;
}