File: cuda.py

package info (click to toggle)
python-mne 0.17%2Bdfsg-1
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 95,104 kB
  • sloc: python: 110,639; makefile: 222; sh: 15
file content (331 lines) | stat: -rw-r--r-- 11,318 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# Authors: Eric Larson <larson.eric.d@gmail.com>
#
# License: BSD (3-clause)

import numpy as np

from .utils import (sizeof_fmt, logger, get_config, warn, _explain_exception,
                    verbose)


# Module-level flag: False until init_cuda() successfully imports cupy and
# initializes a CUDA device; checked by the FFT setup helpers below.
_cuda_capable = False


def get_cuda_memory(kind='available'):
    """Get the amount of free memory for CUDA operations.

    Parameters
    ----------
    kind : str
        Can be "available" or "total".

    Returns
    -------
    memory : str
        The amount of available or total memory as a human-readable string.

    Raises
    ------
    ValueError
        If ``kind`` is not "available" or "total".
    """
    # Validate up front so a bad ``kind`` fails the same way whether or not
    # CUDA is enabled (previously it raised an opaque KeyError, and only on
    # the CUDA path).
    if kind not in ('available', 'total'):
        raise ValueError('kind must be "available" or "total", got %r'
                         % (kind,))
    if not _cuda_capable:
        warn('CUDA not enabled, returning zero for memory')
        mem = 0
    else:
        import cupy
        # memGetInfo() returns (free_bytes, total_bytes)
        mem = cupy.cuda.runtime.memGetInfo()[0 if kind == 'available' else 1]
    return sizeof_fmt(mem)


@verbose
def init_cuda(ignore_config=False, verbose=None):
    """Initialize CUDA functionality.

    This function attempts to load the necessary interfaces
    (hardware connectivity) to run CUDA-based filtering. This
    function should only need to be run once per session.

    If the config var (set via mne.set_config or in ENV)
    MNE_USE_CUDA == 'true', this function will be executed when
    the first CUDA setup is performed. If this variable is not
    set, this function can be manually executed.

    Parameters
    ----------
    ignore_config : bool
        If True, ignore the config value MNE_USE_CUDA and force init.
    verbose : bool, str, int, or None
        If not None, override default verbose level (see :func:`mne.verbose`
        and :ref:`Logging documentation <tut_logging>` for more). Defaults to
        self.verbose.
    """
    global _cuda_capable
    if _cuda_capable:
        return  # already initialized in this session
    if not ignore_config and (get_config('MNE_USE_CUDA', 'false').lower() !=
                              'true'):
        logger.info('CUDA not enabled in config, skipping initialization')
        return
    # Triage possible errors for informative messaging
    _cuda_capable = False
    try:
        import cupy
    except ImportError:
        warn('module cupy not found, CUDA not enabled')
        return
    try:
        # Initialize CUDA
        cupy.cuda.Device()
    except Exception:
        warn('no CUDA device could be initialized, likely a hardware error, '
             'CUDA not enabled%s' % _explain_exception())
        return

    _cuda_capable = True
    # Figure out limit for CUDA FFT calculations
    logger.info('Enabling CUDA with %s available memory' % get_cuda_memory())


###############################################################################
# Repeated FFT multiplication

def _setup_cuda_fft_multiply_repeated(n_jobs, h, n_fft):
    """Set up repeated CUDA FFT multiplication with a given filter.

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT multiplication.
    h : array
        The filtering function that will be used repeatedly.
    n_fft : int
        The number of points in the FFT.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Empty allocated GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to filter.
    h_fft : array | instance of gpuarray
        This will either be a gpuarray (if CUDA enabled) or ndarray.

    Notes
    -----
    This function is designed to be used with fft_multiply_repeated().
    """
    cuda_dict = dict(n_fft=n_fft, rfft=np.fft.rfft, irfft=np.fft.irfft,
                     h_fft=np.fft.rfft(h, n=n_fft))
    if n_jobs == 'cuda':
        n_jobs = 1
        init_cuda()
        if _cuda_capable:
            import cupy
            try:
                # do the IFFT normalization now so we don't have to later
                h_fft = cupy.array(cuda_dict['h_fft'])
                logger.info('Using CUDA for FFT FIR filtering')
            except Exception as exp:
                logger.info('CUDA not used, could not instantiate memory '
                            '(arrays may be too large: "%s"), falling back to '
                            'n_jobs=1' % str(exp))
            cuda_dict.update(h_fft=h_fft,
                             rfft=_cuda_upload_rfft,
                             irfft=_cuda_irfft_get)
        else:
            logger.info('CUDA not used, CUDA could not be initialized, '
                        'falling back to n_jobs=1')
    return n_jobs, cuda_dict


def _fft_multiply_repeated(x, cuda_dict):
    """Do FFT multiplication by a filter function (possibly using CUDA).

    Parameters
    ----------
    h_fft : 1-d array or gpuarray
        The filtering array to apply.
    x : 1-d array
        The array to filter.
    n_fft : int
        The number of points in the FFT.
    cuda_dict : dict
        Dictionary constructed using setup_cuda_multiply_repeated().

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    # do the fourier-domain operations
    x_fft = cuda_dict['rfft'](x, cuda_dict['n_fft'])
    x_fft *= cuda_dict['h_fft']
    x = cuda_dict['irfft'](x_fft, cuda_dict['n_fft'])
    return x


###############################################################################
# FFT Resampling

def _setup_cuda_fft_resample(n_jobs, W, new_len):
    """Set up CUDA FFT resampling.

    Parameters
    ----------
    n_jobs : int | str
        If n_jobs == 'cuda', the function will attempt to set up for CUDA
        FFT resampling.
    W : array
        The filtering function to be used during resampling.
        If n_jobs='cuda', this function will be shortened (since CUDA
        assumes FFTs of real signals are half the length of the signal)
        and turned into a gpuarray.
    new_len : int
        The size of the array following resampling.

    Returns
    -------
    n_jobs : int
        Sets n_jobs = 1 if n_jobs == 'cuda' was passed in, otherwise
        original n_jobs is passed.
    cuda_dict : dict
        Dictionary with the following CUDA-related variables:
            use_cuda : bool
                Whether CUDA should be used.
            fft_plan : instance of FFTPlan
                FFT plan to use in calculating the FFT.
            ifft_plan : instance of FFTPlan
                FFT plan to use in calculating the IFFT.
            x_fft : instance of gpuarray
                Empty allocated GPU space for storing the result of the
                frequency-domain multiplication.
            x : instance of gpuarray
                Empty allocated GPU space for the data to resample.

    Notes
    -----
    This function is designed to be used with fft_resample().
    """
    cuda_dict = dict(use_cuda=False, rfft=np.fft.rfft, irfft=np.fft.irfft)
    rfft_len_x = len(W) // 2 + 1
    # fold the window onto inself (should be symmetric) and truncate
    W = W.copy()
    W[1:rfft_len_x] = (W[1:rfft_len_x] + W[::-1][:rfft_len_x - 1]) / 2.
    W = W[:rfft_len_x]
    if n_jobs == 'cuda':
        n_jobs = 1
        init_cuda()
        if _cuda_capable:
            try:
                import cupy
                # do the IFFT normalization now so we don't have to later
                W = cupy.array(W)
                logger.info('Using CUDA for FFT resampling')
            except Exception:
                logger.info('CUDA not used, could not instantiate memory '
                            '(arrays may be too large), falling back to '
                            'n_jobs=1')
            else:
                cuda_dict.update(use_cuda=True,
                                 rfft=_cuda_upload_rfft,
                                 irfft=_cuda_irfft_get)
        else:
            logger.info('CUDA not used, CUDA could not be initialized, '
                        'falling back to n_jobs=1')
    cuda_dict['W'] = W
    return n_jobs, cuda_dict


def _cuda_upload_rfft(x, n):
    """Move ``x`` onto the GPU and compute its real FFT there."""
    import cupy
    x_gpu = cupy.array(x)
    return cupy.fft.rfft(x_gpu, n)


def _cuda_irfft_get(x, n):
    """Compute the inverse real FFT on the GPU and download the result."""
    import cupy
    y_gpu = cupy.fft.irfft(x, n)
    return y_gpu.get()


def _fft_resample(x, new_len, npads, to_removes, cuda_dict=None,
                  pad='reflect_limited'):
    """Do FFT resampling with a filter function (possibly using CUDA).

    Parameters
    ----------
    x : 1-d array
        The array to resample. Will be converted to float64 if necessary.
    new_len : int
        The size of the output array (before removing padding).
    npads : tuple of int
        Amount of padding to apply to the start and end of the
        signal before resampling.
    to_removes : tuple of int
        Number of samples to remove after resampling.
    cuda_dict : dict
        Dictionary constructed using _setup_cuda_fft_resample().
    pad : str
        The type of padding to use. Supports all :func:`np.pad` ``mode``
        options. Can also be "reflect_limited" (default), which pads with a
        reflected version of each vector mirrored on the first and last values
        of the vector, followed by zeros.

        .. versionadded:: 0.15

    Returns
    -------
    x : 1-d array
        Filtered version of x.
    """
    if cuda_dict is None:
        cuda_dict = dict(use_cuda=False)
    # Work in double precision and pad the edges before transforming.
    if x.dtype != np.float64:
        x = x.astype(np.float64)
    x = _smart_pad(x, npads, pad)
    old_len = len(x)
    shorter = new_len < old_len
    # The shorter of the two lengths determines where the Nyquist bin sits.
    use_len = min(new_len, old_len)
    x_fft = cuda_dict['rfft'](x, None)
    if use_len % 2 == 0:
        # Adjust the shared Nyquist component when the shorter length is
        # even: doubled when shortening, halved when lengthening.
        nyq = use_len // 2
        x_fft[nyq:nyq + 1] *= 2 if shorter else 0.5
    x_fft *= cuda_dict['W']
    y = cuda_dict['irfft'](x_fft, new_len)

    # Trim the padding (scaled through the resampling) back off.
    if (to_removes > 0).any():
        start, stop = to_removes[0], y.shape[0] - to_removes[1]
        y = y[start:stop]

    return y


###############################################################################
# Misc

# this has to go in mne.cuda instead of mne.filter to avoid import errors
def _smart_pad(x, n_pad, pad='reflect_limited'):
    """Pad vector x."""
    n_pad = np.asarray(n_pad)
    assert n_pad.shape == (2,)
    if (n_pad == 0).all():
        return x
    elif (n_pad < 0).any():
        raise RuntimeError('n_pad must be non-negative')
    if pad == 'reflect_limited':
        # need to pad with zeros if len(x) <= npad
        l_z_pad = np.zeros(max(n_pad[0] - len(x) + 1, 0), dtype=x.dtype)
        r_z_pad = np.zeros(max(n_pad[0] - len(x) + 1, 0), dtype=x.dtype)
        return np.concatenate([l_z_pad, 2 * x[0] - x[n_pad[0]:0:-1], x,
                               2 * x[-1] - x[-2:-n_pad[1] - 2:-1], r_z_pad])
    else:
        return np.pad(x, (tuple(n_pad),), pad)