File: ocl_device.h

package info (click to toggle)
lammps 20220106.git7586adbb6a%2Bds1-2
  • links: PTS, VCS
  • area: main
  • in suites: bookworm
  • size: 348,064 kB
  • sloc: cpp: 831,421; python: 24,896; xml: 14,949; f90: 10,845; ansic: 7,967; sh: 4,226; perl: 4,064; fortran: 2,424; makefile: 1,501; objc: 238; lisp: 163; csh: 16; awk: 14; tcl: 6
file content (903 lines) | stat: -rw-r--r-- 33,432 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
/***************************************************************************
                                ocl_device.h
                             -------------------
                               W. Michael Brown

  Utilities for dealing with OpenCL devices

 __________________________________________________________________________
    This file is part of the Geryon Unified Coprocessor Library (UCL)
 __________________________________________________________________________

    begin                : Mon Dec 23 2009
    copyright            : (C) 2009 by W. Michael Brown
    email                : brownw@ornl.gov
 ***************************************************************************/

/* -----------------------------------------------------------------------
   Copyright (2009) Sandia Corporation.  Under the terms of Contract
   DE-AC04-94AL85000 with Sandia Corporation, the U.S. Government retains
   certain rights in this software.  This software is distributed under
   the Simplified BSD License.
   ----------------------------------------------------------------------- */

#ifndef OCL_DEVICE
#define OCL_DEVICE

#include <string>
#include <vector>
#include <iostream>

#ifndef CL_TARGET_OPENCL_VERSION
#define CL_TARGET_OPENCL_VERSION 300
#endif

#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <OpenCL/cl_platform.h>
#else
#include <CL/cl.h>
#include <CL/cl_platform.h>
#endif

#include "ocl_macros.h"
#include "ucl_types.h"

namespace ucl_opencl {

// --------------------------------------------------------------------------
// - COMMAND QUEUE STUFF
// --------------------------------------------------------------------------
typedef cl_command_queue command_queue;
typedef cl_context context_type;

/// Flush the command queue with clFlush(); the result is checked by CL_SAFE_CALL
inline void ucl_flush(command_queue &cq) {
  CL_SAFE_CALL(clFlush(cq));
}

inline void ucl_sync(cl_command_queue &cq) {
  CL_SAFE_CALL(clFinish(cq));
}

/// Tell whether the device is treated as sharing main memory with the host
/** The answer is selected at compile time:
  *  - GERYON_FORCE_SHARED_MAIN_MEM_ON:  always true
  *  - GERYON_FORCE_SHARED_MAIN_MEM_OFF: always false
  *  - otherwise, with CL_VERSION_1_2 defined, the device is asked for
  *    CL_DEVICE_HOST_UNIFIED_MEMORY
  *  - otherwise the device counts as shared only if it is a CPU device **/
inline bool _shared_mem_device(cl_device_id &device) {
  #if defined(GERYON_FORCE_SHARED_MAIN_MEM_ON)
  return true;
  #elif defined(GERYON_FORCE_SHARED_MAIN_MEM_OFF)
  return false;
  #elif defined(CL_VERSION_1_2)
  cl_bool unified;
  CL_SAFE_CALL(clGetDeviceInfo(device, CL_DEVICE_HOST_UNIFIED_MEMORY,
                               sizeof(cl_bool), &unified, NULL));
  return (unified == CL_TRUE);
  #else
  cl_device_type kind;
  CL_SAFE_CALL(clGetDeviceInfo(device, CL_DEVICE_TYPE,
                               sizeof(kind), &kind, NULL));
  return (kind == CL_DEVICE_TYPE_CPU);
  #endif
}

/// Per-device properties cached by UCL_Device::add_properties()
struct OCLProperties {
  /// Device name (CL_DEVICE_NAME)
  std::string name;
  /// Device type bitfield (CL_DEVICE_TYPE)
  cl_device_type device_type;
  /// True if CL_DEVICE_PARTITION_TYPE reports a partition for this device
  bool is_subdevice;
  /// Global memory size (CL_DEVICE_GLOBAL_MEM_SIZE)
  cl_ulong global_mem;
  /// Local memory size (CL_DEVICE_LOCAL_MEM_SIZE)
  cl_ulong shared_mem;
  /// Maximum constant buffer size (CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE)
  cl_ulong const_mem;
  /// Number of compute units (CL_DEVICE_MAX_COMPUTE_UNITS)
  cl_uint compute_units;
  /// CL_DEVICE_MAX_CLOCK_FREQUENCY; UCL_Device::clock_rate() scales it by
  /// 1e-3 to report GHz
  cl_uint clock;
  /// Maximum work-group size (CL_DEVICE_MAX_WORK_GROUP_SIZE)
  size_t work_group_size;
  /// Maximum work-item count per dimension (CL_DEVICE_MAX_WORK_ITEM_SIZES)
  size_t work_item_size[3];
  /// True if all double precision FP capabilities checked by
  /// add_properties() are reported by the device
  bool double_precision;
  /// Preferred vector width for float
  int preferred_vector_width32;
  /// Preferred vector width for double
  int preferred_vector_width64;
  /// CL_DEVICE_MEM_BASE_ADDR_ALIGN divided by 8 (reported as bytes)
  int alignment;
  /// CL_DEVICE_PROFILING_TIMER_RESOLUTION
  size_t timer_resolution;
  /// True if CL_DEVICE_ERROR_CORRECTION_SUPPORT is CL_TRUE
  bool ecc_support;
  /// CL_DEVICE_OPENCL_C_VERSION string (empty without CL_VERSION_1_2)
  std::string c_version;
  /// Device can be partitioned equally
  bool partition_equal;
  /// Device can be partitioned by counts
  bool partition_counts;
  /// Device can be partitioned by affinity domain
  bool partition_affinity;
  /// CL_DEVICE_PARTITION_MAX_SUB_DEVICES (1 without CL_VERSION_1_2)
  cl_uint max_sub_devices;
  /// Device OpenCL version encoded as major*100 + minor*10 (0 if unknown)
  int cl_device_version;
  /// True if a subgroup extension was found for an OpenCL >= 2.1 device
  bool has_subgroup_support;
  /// True if cl_intel_subgroups is present or an NVIDIA device reports
  /// compute capability >= 3.0
  bool has_shuffle_support;
};

/// Class for looking at data parallel device properties
/** \note Calls to change the device outside of the class results in incorrect
  *       behavior
  * \note The per-device accessors taking an index do not check that the
  *       index is within the number of devices **/
class UCL_Device {
 public:
  /// Collect properties for every device on the node
   /** \note You must set the active GPU with set() before using the device **/
  inline UCL_Device();

  inline ~UCL_Device();

  /// Return the number of platforms (0 if error or no platforms)
  inline int num_platforms() { return _num_platforms; }

  /// Return a string with name and info of the current platform
  inline std::string platform_name();

  /// Delete any contexts/data and set the platform number to be used
  inline int set_platform(const int pid);

  /// Return the number of devices that support OpenCL
  inline int num_devices() { return _num_devices; }

  /// Set the OpenCL device to the specified device number
  /** A context and default command queue will be created for the device.
    * Returns UCL_SUCCESS if successful or UCL_ERROR if the device could not
    * be allocated for use. Unlike set_platform(), this does not call
    * clear(), so the device list collected for the platform is kept. **/
  inline int set(int num);

  /// Delete any context and associated data stored from a call to set()
  /** This also empties the device and property lists, so set_platform()
    * must be called again before devices can be queried or selected. **/
  inline void clear();

  /// Get the current device number
  inline int device_num() { return _device; }

  /// Returns the context for the current device
  inline cl_context & context() { return _context; }

  /// Returns the default stream for the current device
  inline command_queue & cq() { return cq(_default_cq); }

  /// Returns the stream indexed by i (no range check is performed)
  inline command_queue & cq(const int i) { return _cq[i]; }

  /// Set the default command queue
  /** \param i index of the command queue (as added by push_command_queue())
      If i is 0, the command queue created with device initialization is
      used **/
  inline void set_command_queue(const int i) { _default_cq=i; }

  /// Block until all commands in the default stream have completed
  inline void sync() { sync(_default_cq); }

  /// Block until all commands in the specified stream have completed
  inline void sync(const int i) { ucl_sync(cq(i)); }

  /// Get the number of command queues currently available on device
  inline int num_queues()
    { return static_cast<int>(_cq.size()); }

  /// Add a command queue for device computations (with profiling enabled)
  inline void push_command_queue() {
    cl_int errorv;
    _cq.push_back(cl_command_queue());

#ifdef CL_VERSION_2_0
    cl_queue_properties props[] = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    _cq.back()=clCreateCommandQueueWithProperties(_context, _cl_device, props, &errorv);
#else
    _cq.back()=clCreateCommandQueue(_context, _cl_device, CL_QUEUE_PROFILING_ENABLE, &errorv);
#endif
    if (errorv!=CL_SUCCESS) {
      std::cerr << "Could not create command queue on device: " << name()
                << std::endl;
      UCL_GERYON_EXIT;
    }
  }

  /// Remove a stream for device computations
  /** \note You cannot delete the default stream (the call is a no-op when
    *       fewer than two queues exist) **/
  inline void pop_command_queue() {
    if (_cq.size()<2) return;
    CL_SAFE_CALL(clReleaseCommandQueue(_cq.back()));
    _cq.pop_back();
    // Do not leave the default index pointing at the queue just removed;
    // fall back to the queue created with the device.
    if (_default_cq>=static_cast<int>(_cq.size())) _default_cq=0;
  }

  /// Get the current OpenCL device name
  inline std::string name() { return name(_device); }
  /// Get the OpenCL device name
  inline std::string name(const int i) {
    return std::string(_properties[i].name); }

  /// Get a string telling the type of the current device
  inline std::string device_type_name() { return device_type_name(_device); }
  /// Get a string telling the type of the device
  inline std::string device_type_name(const int i);

  /// Get current device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline enum UCL_DEVICE_TYPE device_type() { return device_type(_device); }
  /// Get device type (UCL_CPU, UCL_GPU, UCL_ACCELERATOR, UCL_DEFAULT)
  inline enum UCL_DEVICE_TYPE device_type(const int i);

  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory() { return shared_memory(_device); }
  /// Returns true if host memory is efficiently addressable from device
  inline bool shared_memory(const int i)
    { return _shared_mem_device(_cl_devices[i]); }

  /// Returns preferred vector width for single precision on current device
  inline int preferred_fp32_width() { return preferred_fp32_width(_device); }
  /// Returns preferred vector width for single precision
  inline int preferred_fp32_width(const int i)
    {return _properties[i].preferred_vector_width32;}
  /// Returns preferred vector width for double precision on current device
  inline int preferred_fp64_width() { return preferred_fp64_width(_device); }
  /// Returns preferred vector width for double precision
  inline int preferred_fp64_width(const int i)
    {return _properties[i].preferred_vector_width64;}

  /// Returns true if double precision is supported for the current device
  inline bool double_precision() { return double_precision(_device); }
  /// Returns true if double precision is supported for the device
  inline bool double_precision(const int i)
    {return _properties[i].double_precision;}

  /// Get the number of compute units on the current device
  inline unsigned cus() { return cus(_device); }
  /// Get the number of compute units
  inline unsigned cus(const int i)
    { return _properties[i].compute_units; }

  /// Get the gigabytes of global memory in the current device
  inline double gigabytes() { return gigabytes(_device); }
  /// Get the gigabytes of global memory
  inline double gigabytes(const int i)
    { return static_cast<double>(_properties[i].global_mem)/1073741824; }

  /// Get the bytes of global memory in the current device
  inline size_t bytes() { return bytes(_device); }
  /// Get the bytes of global memory
  inline size_t bytes(const int i) { return _properties[i].global_mem; }

  /// Return the GPGPU revision number for current device
  //inline double revision() { return revision(_device); }
  /// Return the GPGPU revision number
  //inline double revision(const int i)
  //  { return //static_cast<double>(_properties[i].minor)/10+_properties[i].major;}

  /// Clock rate in GHz for current device
  inline double clock_rate() { return clock_rate(_device); }
  /// Clock rate in GHz
  inline double clock_rate(const int i) { return _properties[i].clock*1e-3;}

  /// Return the address alignment in bytes
  inline int alignment() { return alignment(_device); }
  /// Return the address alignment in bytes
  inline int alignment(const int i) { return _properties[i].alignment; }

  /// Return the timer resolution
  inline size_t timer_resolution() { return timer_resolution(_device); }
  /// Return the timer resolution
  inline size_t timer_resolution(const int i)
    { return _properties[i].timer_resolution; }

  /// Get the maximum number of threads per block
  inline size_t group_size() { return group_size(_device); }
  /// Get the maximum number of threads per block
  inline size_t group_size(const int i)
    { return _properties[i].work_group_size; }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int dim)
    { return group_size_dim(_device, dim); }
  /// Get the maximum number of threads per block in dimension 'dim'
  inline size_t group_size_dim(const int i, const int dim)
    { return _properties[i].work_item_size[dim]; }

  /// Get the shared local memory size in bytes
  inline size_t slm_size() { return slm_size(_device); }
  /// Get the shared local memory size in bytes
  inline size_t slm_size(const int i)
    { return _properties[i].shared_mem; }

  /// Return the maximum memory pitch in bytes for current device
  inline size_t max_pitch() { return max_pitch(_device); }
  /// Return the maximum memory pitch in bytes (always 0 in this backend)
  inline size_t max_pitch(const int i) { return 0; }

  /// Returns false if accelerator cannot be shared by multiple processes
  /** If it cannot be determined, true is returned **/
  inline bool sharing_supported() { return sharing_supported(_device); }
  /// Returns false if accelerator cannot be shared by multiple processes
  /** If it cannot be determined, true is returned (this backend always
    * returns true) **/
  inline bool sharing_supported(const int i)
    { return true; }

  /// True if the device is a sub-device
  inline bool is_subdevice()
    { return is_subdevice(_device); }
  /// True if the device is a sub-device
  inline bool is_subdevice(const int i)
    { return _properties[i].is_subdevice; }
  /// True if splitting device into equal subdevices supported
  inline bool fission_equal()
    { return fission_equal(_device); }
  /// True if splitting device into equal subdevices supported
  inline bool fission_equal(const int i)
    { return _properties[i].partition_equal; }
  /// True if splitting device into subdevices by specified counts supported
  inline bool fission_by_counts()
    { return fission_by_counts(_device); }
  /// True if splitting device into subdevices by specified counts supported
  inline bool fission_by_counts(const int i)
    { return _properties[i].partition_counts; }
  /// True if splitting device into subdevices by affinity domains supported
  inline bool fission_by_affinity()
    { return fission_by_affinity(_device); }
  /// True if splitting device into subdevices by affinity domains supported
  inline bool fission_by_affinity(const int i)
    { return _properties[i].partition_affinity; }
  /// True if the device has subgroup support
  inline bool has_subgroup_support()
    { return has_subgroup_support(_device); }
  /// True if the device has subgroup support
  inline bool has_subgroup_support(const int i)
    { return _properties[i].has_subgroup_support; }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support()
    { return has_shuffle_support(_device); }
  /// True if the device supports shuffle intrinsics
  inline bool has_shuffle_support(const int i)
    { return _properties[i].has_shuffle_support; }

  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices()
    { return max_sub_devices(_device); }
  /// Maximum number of subdevices allowed from device fission
  inline int max_sub_devices(const int i)
    { return _properties[i].max_sub_devices; }
  /// OpenCL version supported by the device (major*100 + minor*10)
  inline int cl_device_version()
    { return cl_device_version(_device); }
  /// OpenCL version supported by the device (major*100 + minor*10)
  inline int cl_device_version(const int i)
    { return _properties[i].cl_device_version; }

  /// List all devices along with all properties
  inline void print_all(std::ostream &out);

  /// Return the OpenCL type for the device
  inline cl_device_id & cl_device() { return _cl_device; }

  /// Automatically set the platform by type, vendor, and/or CU count
  /** If first_device is positive, the search is restricted to platforms
    * containing this device ID. If ndevices is positive, the search is
    * restricted to platforms with at least that many devices  **/
  inline int auto_set_platform(const enum UCL_DEVICE_TYPE type=UCL_GPU,
                               const std::string vendor="",
                               const int ndevices=-1,
                               const int first_device=-1);

 private:
  int _num_platforms;          // Number of platforms
  int _platform;               // UCL_Device ID for current platform
  cl_platform_id _cl_platform; // OpenCL ID for current platform
  cl_platform_id _cl_platforms[20]; // OpenCL IDs for all platforms
  cl_context _context;              // Context used for accessing the device
  std::vector<cl_command_queue> _cq;// Command queues for the current device
  int _device;                            // UCL_Device ID for current device
  cl_device_id _cl_device;                // OpenCL ID for current device
  std::vector<cl_device_id> _cl_devices;  // OpenCL IDs for all devices
  int _num_devices;                       // Number of devices
  std::vector<OCLProperties> _properties; // Properties for each device

  inline void add_properties(cl_device_id);
  inline int create_context();
  int _default_cq;                        // Index in _cq of the default queue
};

// Enumerates the platforms and grabs the properties for all devices on the
// first one. On error, or when no platform exists, the object is left with
// zero platforms and zero devices.
UCL_Device::UCL_Device() {
  // Give every scalar member a defined value so the accessors and clear()
  // are safe even when platform enumeration fails below.
  _device=-1;
  _num_devices=0;
  _num_platforms=0;
  _platform=0;
  _cl_platform=nullptr;
  _cl_device=nullptr;
  _context=nullptr;
  _default_cq=0;

  // --- Get Number of Platforms
  const cl_uint max_platforms=
    static_cast<cl_uint>(sizeof(_cl_platforms)/sizeof(_cl_platforms[0]));
  cl_uint nplatforms=0;
  cl_int errorv=clGetPlatformIDs(max_platforms,_cl_platforms,&nplatforms);

  if (errorv!=CL_SUCCESS || nplatforms==0)
    return;

  // clGetPlatformIDs reports the number of platforms available, which can be
  // larger than the number of IDs actually written to _cl_platforms.
  if (nplatforms>max_platforms)
    nplatforms=max_platforms;
  _num_platforms=static_cast<int>(nplatforms);
  set_platform(0);
}

// Releases everything held by the object by delegating to clear()
UCL_Device::~UCL_Device() {
  this->clear();
}

void UCL_Device::clear() {
  _properties.clear();

  #ifdef GERYON_NUMA_FISSION
  #ifdef CL_VERSION_1_2
  for (int i=0; i<_cl_devices.size(); i++)
    CL_DESTRUCT_CALL(clReleaseDevice(_cl_devices[i]));
  #endif
  #endif

  _cl_devices.clear();
  if (_device>-1) {
    for (size_t i=0; i<_cq.size(); i++) {
      CL_DESTRUCT_CALL(clReleaseCommandQueue(_cq.back()));
      _cq.pop_back();
    }
    CL_DESTRUCT_CALL(clReleaseContext(_context));
  }
  _device=-1;
  _num_devices=0;
}

int UCL_Device::set_platform(int pid) {
  clear();
  cl_int errorv;

  _cl_device=0;
  _device=-1;
  _num_devices=0;
  _default_cq=0;

  #ifdef UCL_DEBUG
  assert(pid<num_platforms());
  #endif
  _platform=pid;
  _cl_platform=_cl_platforms[_platform];

  // --- Get Number of Devices
  cl_uint n;
  errorv=clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,0,nullptr,&n);
  _num_devices=n;
  if (errorv!=CL_SUCCESS || _num_devices==0) {
    _num_devices=0;
    return UCL_ERROR;
  }
  cl_device_id *device_list = new cl_device_id[_num_devices];
  CL_SAFE_CALL(clGetDeviceIDs(_cl_platform,CL_DEVICE_TYPE_ALL,n,device_list,
                              &n));

  #ifndef GERYON_NUMA_FISSION
  // --- Store properties for each device
  for (int i=0; i<_num_devices; i++) {
    _cl_devices.push_back(device_list[i]);
    add_properties(device_list[i]);
  }
  #else
  // --- Create sub-devices for anything partitionable by NUMA and store props
  int num_unpart = _num_devices;
  _num_devices = 0;
  for (int i=0; i<num_unpart; i++) {
    cl_uint num_subdevices = 1;

    #ifdef CL_VERSION_1_2
    cl_device_affinity_domain adomain;
    CL_SAFE_CALL(clGetDeviceInfo(device_list[i],
                                 CL_DEVICE_PARTITION_AFFINITY_DOMAIN,
                                 sizeof(cl_device_affinity_domain),
                                 &adomain,NULL));

    cl_device_partition_property props[3];
    props[0]=CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN;
    props[1]=CL_DEVICE_AFFINITY_DOMAIN_NUMA;
    props[2]=0;
    if (adomain & CL_DEVICE_AFFINITY_DOMAIN_NUMA)
      CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, 0, NULL,
                                      &num_subdevices));
    if (num_subdevices > 1) {
      cl_device_id *subdevice_list = new cl_device_id[num_subdevices];
      CL_SAFE_CALL(clCreateSubDevices(device_list[i], props, num_subdevices,
                                      subdevice_list, &num_subdevices));
      for (int j=0; j<num_subdevices; j++) {
        _cl_devices.push_back(device_list[i]);
        add_properties(device_list[i]);
        _num_devices++;
      }
      delete[] subdevice_list;
    } else {
      _cl_devices.push_back(device_list[i]);
      add_properties(device_list[i]);
      _num_devices++;
    }
    #endif
  } // for i
  #endif

  delete[] device_list;
  return UCL_SUCCESS;
}

// Creates the context for _cl_device and its first command queue, which
// becomes the default queue. Returns UCL_SUCCESS or UCL_ERROR.
int UCL_Device::create_context() {
  cl_int errorv;
  // The context is created without a property list; the only device it
  // contains is _cl_device.
  _context=clCreateContext(nullptr,1,&_cl_device,nullptr,nullptr,&errorv);
  if (errorv!=CL_SUCCESS) {
    #ifndef UCL_NO_EXIT
    std::cerr << "UCL Error: Could not access accelerator number " << _device
              << " for use.\n";
    UCL_GERYON_EXIT;
    #endif
    return UCL_ERROR;
  }
  push_command_queue();
  _default_cq=0;
  return UCL_SUCCESS;
}

// Queries the properties of a single device and appends them to _properties.
// Note: despite its name, `device_list` is one device ID, not a list.
void UCL_Device::add_properties(cl_device_id device_list) {
  OCLProperties op;
  char buffer[1024];
  cl_bool ans_bool;

  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_NAME,1024,buffer,nullptr));
  op.name=buffer;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_GLOBAL_MEM_SIZE,
                               sizeof(op.global_mem),&op.global_mem,nullptr));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_LOCAL_MEM_SIZE,
                               sizeof(op.shared_mem),&op.shared_mem,nullptr));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE,
                               sizeof(op.const_mem),&op.const_mem,nullptr));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_TYPE,
                               sizeof(op.device_type),&op.device_type,nullptr));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_COMPUTE_UNITS,
                               sizeof(op.compute_units),&op.compute_units,
                               nullptr));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_CLOCK_FREQUENCY,
                               sizeof(op.clock),&op.clock,nullptr));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_GROUP_SIZE,
                               sizeof(op.work_group_size),&op.work_group_size,
                               nullptr));
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MAX_WORK_ITEM_SIZES,
                               3*sizeof(op.work_item_size[0]),op.work_item_size,
                               nullptr));

  // Read into a variable of the queried type, then store the value divided
  // by 8 (the accessor reports the alignment in bytes).
  cl_uint align_raw=0;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_MEM_BASE_ADDR_ALIGN,
                               sizeof(align_raw),&align_raw,nullptr));
  op.alignment=static_cast<int>(align_raw/8);

  cl_uint float_width=0;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PREFERRED_VECTOR_WIDTH_FLOAT,
                               sizeof(float_width),&float_width,nullptr));
  op.preferred_vector_width32=float_width;

  cl_uint double_width=0;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PREFERRED_VECTOR_WIDTH_DOUBLE,
                               sizeof(double_width),&double_width,nullptr));
  op.preferred_vector_width64=double_width;

  // Determine if double precision is supported: All bits in the mask must be set.
  const cl_device_fp_config double_mask =
    (CL_FP_FMA|CL_FP_ROUND_TO_NEAREST|CL_FP_ROUND_TO_ZERO|
     CL_FP_ROUND_TO_INF|CL_FP_INF_NAN|CL_FP_DENORM);
  cl_device_fp_config double_avail=0;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_DOUBLE_FP_CONFIG,
                               sizeof(double_avail),&double_avail,nullptr));
  op.double_precision=((double_avail & double_mask) == double_mask);

  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PROFILING_TIMER_RESOLUTION,
                               sizeof(size_t),&op.timer_resolution,nullptr));

  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_ERROR_CORRECTION_SUPPORT,
                               sizeof(ans_bool),&ans_bool,nullptr));
  op.ecc_support=(ans_bool==CL_TRUE);

  // Defaults used when the OpenCL 1.2 queries below are not compiled in
  op.c_version="";
  op.is_subdevice=false;
  op.partition_equal=false;
  op.partition_counts=false;
  op.partition_affinity=false;
  op.max_sub_devices=1;
  op.cl_device_version=0;
  op.has_subgroup_support=false;
  op.has_shuffle_support=false;

  #ifdef CL_VERSION_1_2
  size_t return_bytes=0;
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_OPENCL_C_VERSION,1024,
                               buffer,nullptr));
  op.c_version=buffer;

  // A non-empty partition type with a non-zero first entry marks a sub-device
  cl_device_partition_property pinfo[4];
  CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_PARTITION_TYPE,
                               4*sizeof(cl_device_partition_property),
                               &pinfo, &return_bytes));
  op.is_subdevice=(return_bytes != 0 && pinfo[0]);

  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PARTITION_PROPERTIES,
                               4*sizeof(cl_device_partition_property),
                               pinfo,&return_bytes));
  const int nprops=
    static_cast<int>(return_bytes/sizeof(cl_device_partition_property));
  for (int i=0; i<nprops; i++) {
    if (pinfo[i]==CL_DEVICE_PARTITION_EQUALLY)
      op.partition_equal=true;
    else if (pinfo[i]==CL_DEVICE_PARTITION_BY_COUNTS)
      op.partition_counts=true;
    else if (pinfo[i]==CL_DEVICE_PARTITION_BY_AFFINITY_DOMAIN)
      op.partition_affinity=true;
  }

  CL_SAFE_CALL(clGetDeviceInfo(device_list,
                               CL_DEVICE_PARTITION_MAX_SUB_DEVICES,
                               sizeof(cl_uint),&op.max_sub_devices,nullptr));

  // The version digits are read at fixed offsets 7 and 9 of the string
  // ("OpenCL M.m ..."). If those positions do not hold digits the version
  // is left at 0 instead of being computed from unrelated characters.
  CL_SAFE_CALL(clGetDeviceInfo(device_list,CL_DEVICE_VERSION,1024,buffer,nullptr));
  const std::string version(buffer);
  if (version.size()>9 &&
      version[7]>='0' && version[7]<='9' &&
      version[9]>='0' && version[9]<='9') {
    const int cl_version_maj = version[7] - '0';
    const int cl_version_min = version[9] - '0';
    op.cl_device_version = cl_version_maj * 100 + cl_version_min * 10;
  }

  // Fetch the extension list into a heap buffer sized by the first query
  // (a variable-length stack array is not standard C++); the extra byte
  // keeps the buffer terminated even if nothing is returned.
  size_t ext_str_size_ret=0;
  CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS, 0, nullptr,
                               &ext_str_size_ret));
  std::vector<char> ext_buffer(ext_str_size_ret+1,'\0');
  if (ext_str_size_ret>0)
    CL_SAFE_CALL(clGetDeviceInfo(device_list, CL_DEVICE_EXTENSIONS,
                                 ext_str_size_ret, ext_buffer.data(), nullptr));
  const std::string extensions(ext_buffer.data());

  #if defined(CL_VERSION_2_1) || defined(CL_VERSION_3_0)
  if (op.cl_device_version >= 210) {
    const bool intel_subgroups=
      (extensions.find("cl_intel_subgroups") != std::string::npos);
    if (intel_subgroups ||
        (extensions.find("cl_khr_subgroups") != std::string::npos))
      op.has_subgroup_support=true;
    if (intel_subgroups)
      op.has_shuffle_support=true;
  }
  #endif
  if (extensions.find("cl_nv_device_attribute_query") != std::string::npos) {
    #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV
    #define CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV 0x4000
    #endif
    #ifndef CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV
    #define CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV 0x4001
    #endif
    cl_uint major=0, minor=0;
    CL_SAFE_CALL(clGetDeviceInfo(device_list,
                                 CL_DEVICE_COMPUTE_CAPABILITY_MAJOR_NV,
                                 sizeof(cl_uint), &major, nullptr));
    CL_SAFE_CALL(clGetDeviceInfo(device_list,
                                 CL_DEVICE_COMPUTE_CAPABILITY_MINOR_NV,
                                 sizeof(cl_uint), &minor, nullptr));
    const double arch = static_cast<double>(minor)/10+major;
    if (arch >= 3.0)
      op.has_shuffle_support=true;
  }
  #endif

  _properties.push_back(op);
}

std::string UCL_Device::platform_name() {
  char info[1024];

  CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VENDOR,1024,info,
                                 nullptr));
  std::string ans=std::string(info)+' ';

  CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_NAME,1024,info,
                                 nullptr));
  ans+=std::string(info)+' ';

  CL_SAFE_CALL(clGetPlatformInfo(_cl_platform,CL_PLATFORM_VERSION,1024,info,
               nullptr));
  ans+=std::string(info);

  return ans;
}

// Name of the device type as text: "CPU", "GPU", "ACCELERATOR", or
// "DEFAULT" for anything else
std::string UCL_Device::device_type_name(const int i) {
  switch (_properties[i].device_type) {
    case CL_DEVICE_TYPE_CPU:
      return "CPU";
    case CL_DEVICE_TYPE_GPU:
      return "GPU";
    case CL_DEVICE_TYPE_ACCELERATOR:
      return "ACCELERATOR";
    default:
      return "DEFAULT";
  }
}

// Map the OpenCL device type to the UCL enum: UCL_CPU, UCL_GPU,
// UCL_ACCELERATOR, or UCL_DEFAULT for anything else
enum UCL_DEVICE_TYPE UCL_Device::device_type(const int i) {
  switch (_properties[i].device_type) {
    case CL_DEVICE_TYPE_CPU:
      return UCL_CPU;
    case CL_DEVICE_TYPE_GPU:
      return UCL_GPU;
    case CL_DEVICE_TYPE_ACCELERATOR:
      return UCL_ACCELERATOR;
    default:
      return UCL_DEFAULT;
  }
}

// Set the CUDA device to the specified device number
int UCL_Device::set(int num) {
  _device=num;
  _cl_device=_cl_devices[_device];
  return create_context();
}

// List all devices from all platforms along with all properties
// - set_platform() is called for every platform index in turn and the
//   previously selected platform is not restored here
void UCL_Device::print_all(std::ostream &out) {
  // Completes a property line with the answer to a yes/no question
  const auto yes_no = [&out](const bool flag) {
    if (flag)
      out << "Yes\n";
    else
      out << "No\n";
  };

  // --- loop through the platforms
  for (int p=0; p<_num_platforms; p++) {

    set_platform(p);

    out << "\nPlatform " << p << ":\n";

    if (num_devices() == 0)
      out << "There is no device supporting OpenCL\n";

    // --- loop through the devices of this platform
    for (int d=0; d<num_devices(); ++d) {
      const auto &prop = _properties[d];

      out << "\nDevice " << d << ": \"" << name(d).c_str() << "\"\n";
      out << "  Type of device:                                "
          << device_type_name(d).c_str() << std::endl;
      // The version is stored as a single number: major*100 + minor
      out << "  Supported OpenCL Version:                      "
          << prop.cl_device_version / 100 << "."
          << prop.cl_device_version % 100 << std::endl;
      out << "  Is a subdevice:                                ";
      yes_no(is_subdevice(d));
      out << "  Double precision support:                      ";
      yes_no(double_precision(d));
      out << "  Total amount of global memory:                 "
          << gigabytes(d) << " GB\n";
      out << "  Number of compute units/multiprocessors:       "
          << prop.compute_units << std::endl;
      //out << "  Number of cores:                               "
      //    << cores(i) << std::endl;
      out << "  Total amount of constant memory:               "
          << prop.const_mem << " bytes\n";
      out << "  Total amount of local/shared memory per block: "
          << prop.shared_mem << " bytes\n";
      //out << "  Total number of registers available per block: "
      //    << _properties[i].regsPerBlock << std::endl;
      //out << "  Warp size:                                     "
      //    << _properties[i].warpSize << std::endl;
      out << "  Maximum group size (# of threads per block)    "
          << prop.work_group_size << std::endl;
      out << "  Maximum item sizes (# threads for each dim)    "
          << prop.work_item_size[0] << " x "
          << prop.work_item_size[1] << " x "
          << prop.work_item_size[2] << std::endl;
      //out << "  Maximum sizes of each dimension of a grid:     "
      //    << _properties[i].maxGridSize[0] << " x "
      //    << _properties[i].maxGridSize[1] << " x "
      //    << _properties[i].maxGridSize[2] << std::endl;
      //out << "  Maximum memory pitch:                          "
      //    << _properties[i].memPitch) << " bytes\n";
      //out << "  Texture alignment:                             "
      //    << _properties[i].textureAlignment << " bytes\n";
      out << "  Clock rate:                                    "
          << clock_rate(d) << " GHz\n";
      //out << "  Concurrent copy and execution:                 ";
      out << "  ECC support:                                   ";
      yes_no(prop.ecc_support);
      out << "  Device fission into equal partitions:          ";
      yes_no(fission_equal(d));
      out << "  Device fission by counts:                      ";
      yes_no(fission_by_counts(d));
      out << "  Device fission by affinity:                    ";
      yes_no(fission_by_affinity(d));
      out << "  Maximum subdevices from fission:               "
          << max_sub_devices(d) << std::endl;
      out << "  Shared memory system:                          ";
      yes_no(shared_memory(d));
      out << "  Subgroup support:                              ";
      yes_no(prop.has_subgroup_support);
      out << "  Shuffle support:                               ";
      yes_no(prop.has_shuffle_support);
    }
  }
}

// Choose a platform based on device type, vendor, and compute units, and
// make it the active platform
// - type: requested device type; UCL_DEFAULT disables type matching
// - vendor: substring searched for in platform_name() after upper-casing
//   the ASCII letters of both strings; empty disables vendor matching
// - ndevices: platforms with fewer than ndevices devices are skipped
// - first_device: if > -1, only devices first_device through
//   first_device+ndevices-1 (or just first_device when ndevices is 0) are
//   examined and platforms without those device indices are skipped;
//   otherwise all devices of a platform are examined
// - Platforms are visited in index order. The first platform matching the
//   vendor, and the first platform containing a device of the requested
//   type, each become the provisional choice and reset the compute unit
//   count. From then on platforms that fail that criterion are skipped.
//   Among the platforms still considered, the one holding the examined
//   device with the most compute units is chosen.
// - With fewer than 2 platforms, platform 0 is set without any matching
// - Returns the result of set_platform() for the chosen platform
int UCL_Device::auto_set_platform(const enum UCL_DEVICE_TYPE type,
                                  const std::string vendor,
                                  const int ndevices,
                                  const int first_device) {
  if (_num_platforms < 2) return set_platform(0);

  int last_device = -1;
  if (first_device > -1) {
    if (ndevices)
      last_device = first_device + ndevices - 1;
    else
      last_device = first_device;
  }

  bool vendor_match=false;
  bool type_match=false;
  int max_cus=0;
  int best_platform=0;

  // Only ASCII lower case letters are converted
  std::string vendor_upper=vendor;
  for (char &c : vendor_upper)
    if (c<='z' && c>='a')
      c=toupper(c);

  for (int n=0; n<_num_platforms; n++) {
    set_platform(n);
    if (last_device > -1 && last_device >= num_devices()) continue;
    if (ndevices > num_devices()) continue;

    int first_id=0;
    int last_id=num_devices()-1;
    if (last_device > -1) {
      first_id=first_device;
      last_id=last_device;
    }

    if (!vendor_upper.empty()) {
      std::string pname = platform_name();
      for (char &c : pname)
        if (c<='z' && c>='a')
          c=toupper(c);

      if (pname.find(vendor_upper)!=std::string::npos) {
        if (vendor_match == false) {
          best_platform=n;
          max_cus=0;
          vendor_match=true;
        }
      } else if (vendor_match)
        continue;
    }

    if (type != UCL_DEFAULT) {
      // Decide whether THIS platform has a device of the requested type
      // independently of whether an earlier platform already matched.
      // Otherwise every platform after the first match would be skipped
      // and never compared by compute units.
      bool ptype_matched=false;
      for (int d=first_id; d<=last_id; d++) {
        if (type==device_type(d)) {
          ptype_matched=true;
          break;
        }
      }

      if (ptype_matched) {
        if (type_match == false) {
          best_platform=n;
          max_cus=0;
          type_match=true;
        }
      } else if (type_match)
        continue;
    }

    for (int d=first_id; d<=last_id; d++) {
      const auto units=cus(d);
      if (units > max_cus) {
        best_platform=n;
        max_cus=units;
      }
    }
  }
  return set_platform(best_platform);
}

} // namespace ucl_opencl

#endif