File: PerfTestGramSchmidt.cpp

package info (click to toggle)
kokkos 4.7.01-2
  • links: PTS, VCS
  • area: main
  • in suites: sid
  • size: 16,636 kB
  • sloc: cpp: 223,676; sh: 2,446; makefile: 2,437; python: 91; fortran: 4; ansic: 2
file content (197 lines) | stat: -rw-r--r-- 5,980 bytes parent folder | download | duplicates (2)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
//@HEADER
// ************************************************************************
//
//                        Kokkos v. 4.0
//       Copyright (2022) National Technology & Engineering
//               Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#include <Kokkos_Core.hpp>
#include <benchmark/benchmark.h>
#include "PerfTest_Category.hpp"

#include <cmath>
#include "PerfTestBlasKernels.hpp"

//----------------------------------------------------------------------------
//----------------------------------------------------------------------------

namespace Test {

// Reduction   : result = dot( Q(:,j) , Q(:,j) );
// PostProcess : R(j,j) = result ; inv = 1 / result ;
template <class VectorView, class ValueView>
struct InvNorm2 : public Kokkos::DotSingle<VectorView> {
  using value_type = typename Kokkos::DotSingle<VectorView>::value_type;

  ValueView Rjj;
  ValueView inv;

  InvNorm2(const VectorView& argX, const ValueView& argR,
           const ValueView& argInv)
      : Kokkos::DotSingle<VectorView>(argX), Rjj(argR), inv(argInv) {}

  KOKKOS_INLINE_FUNCTION
  void final(value_type& result) const {
    result = Kokkos::sqrt(result);
    Rjj()  = result;
    inv()  = (0 < result) ? 1.0 / result : 0;
  }
};

template <class VectorView, class ValueView>
inline void invnorm2(const VectorView& x, const ValueView& r,
                     const ValueView& r_inv) {
  Kokkos::parallel_reduce(x.extent(0),
                          InvNorm2<VectorView, ValueView>(x, r, r_inv));
}

// PostProcess : tmp = - ( R(j,k) = result );
template <class VectorView, class ValueView>
struct DotM : public Kokkos::Dot<VectorView> {
  using value_type = typename Kokkos::Dot<VectorView>::value_type;

  ValueView Rjk;
  ValueView tmp;

  DotM(const VectorView& argX, const VectorView& argY, const ValueView& argR,
       const ValueView& argTmp)
      : Kokkos::Dot<VectorView>(argX, argY), Rjk(argR), tmp(argTmp) {}

  KOKKOS_INLINE_FUNCTION
  void final(value_type& result) const {
    Rjk() = result;
    tmp() = -result;
  }
};

template <class VectorView, class ValueView>
inline void dot_neg(const VectorView& x, const VectorView& y,
                    const ValueView& r, const ValueView& r_neg) {
  Kokkos::parallel_reduce(x.extent(0),
                          DotM<VectorView, ValueView>(x, y, r, r_neg));
}

template <typename Scalar, class DeviceType>
struct ModifiedGramSchmidt {
  using execution_space = DeviceType;
  using size_type       = typename execution_space::size_type;

  using multivector_type =
      Kokkos::View<Scalar**, Kokkos::LayoutLeft, execution_space>;

  using vector_type =
      Kokkos::View<Scalar*, Kokkos::LayoutLeft, execution_space>;

  using value_view = Kokkos::View<Scalar, Kokkos::LayoutLeft, execution_space>;

  multivector_type Q;
  multivector_type R;

  static double factorization(const multivector_type Q_,
                              const multivector_type R_) {
    const size_type count = Q_.extent(1);
    value_view tmp("tmp");
    value_view one("one");

    Kokkos::deep_copy(one, (Scalar)1);

    Kokkos::Timer timer;

    for (size_type j = 0; j < count; ++j) {
      // Reduction   : tmp = dot( Q(:,j) , Q(:,j) );
      // PostProcess : tmp = std::sqrt( tmp ); R(j,j) = tmp ; tmp = 1 / tmp ;
      const vector_type Qj = Kokkos::subview(Q_, Kokkos::ALL(), j);
      const value_view Rjj = Kokkos::subview(R_, j, j);

      invnorm2(Qj, Rjj, tmp);

      // Q(:,j) *= ( 1 / R(j,j) ); => Q(:,j) *= tmp ;
      Kokkos::scale(tmp, Qj);

      for (size_type k = j + 1; k < count; ++k) {
        const vector_type Qk = Kokkos::subview(Q_, Kokkos::ALL(), k);
        const value_view Rjk = Kokkos::subview(R_, j, k);

        // Reduction   : R(j,k) = dot( Q(:,j) , Q(:,k) );
        // PostProcess : tmp = - R(j,k);
        dot_neg(Qj, Qk, Rjk, tmp);

        // Q(:,k) -= R(j,k) * Q(:,j); => Q(:,k) += tmp * Q(:,j)
        Kokkos::axpby(tmp, Qj, one, Qk);
      }
    }

    execution_space().fence();

    return timer.seconds();
  }

  //--------------------------------------------------------------------------

  static double test(const size_type length, const size_type count) {
    multivector_type Q_("Q", length, count);
    multivector_type R_("R", count, count);

    typename multivector_type::HostMirror A = Kokkos::create_mirror(Q_);

    // Create and fill A on the host
    for (size_type j = 0; j < count; ++j) {
      for (size_type i = 0; i < length; ++i) {
        A(i, j) = (i + 1) * (j + 1);
      }
    }

    Kokkos::deep_copy(Q_, A);

    // A = Q * R
    const double dt = factorization(Q_, R_);

    return dt;
  }
};

template <class Scalar>
static void GramSchmidt(benchmark::State& state) {
  const int parallel_work_length = state.range(0);

  for (auto _ : state) {
    const double seconds =
        ModifiedGramSchmidt<Scalar, Kokkos::DefaultExecutionSpace>::test(
            parallel_work_length, 32);

    state.SetIterationTime(seconds);
    state.counters["Count"] = benchmark::Counter(parallel_work_length);
    state.counters["Time normalized"] =
        benchmark::Counter(seconds / parallel_work_length);
  }
}

// FIXME_SYCL SYCL+Cuda reports "an illegal memory access was encountered"
#if defined(KOKKOS_ENABLE_SYCL) && defined(KOKKOS_IMPL_ARCH_NVIDIA_GPU)
BENCHMARK(GramSchmidt<double>)
    ->ArgName("Count")
    ->ArgsProduct({
        benchmark::CreateRange(1 << 10, 1 << 18, 2),
    })
    ->UseManualTime()
    ->Iterations(5);
#else
BENCHMARK(GramSchmidt<double>)
    ->ArgName("Count")
    ->ArgsProduct({
        benchmark::CreateRange(1 << 10, 1 << 19, 2),
    })
    ->UseManualTime()
    ->Iterations(5);
#endif

}  // namespace Test