From: Cordell Bloor <cgmb@slerp.xyz>
Date: Sun, 9 Apr 2023 02:33:46 -0600
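Subject: make openmp optional

Wrap the `#include <omp.h>` lines and `#pragma omp` directives touched by
this patch in `#ifdef _OPENMP`, so that the rocBLAS client sources and the
Tensile host library tests no longer require the OpenMP header or emit
OpenMP pragmas when the compiler is not run with OpenMP enabled.
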
Applied-Upstream: https://github.com/ROCmSoftwarePlatform/rocBLAS/commit/45bef63fc4988d11f25c9cc84b12513c7b8abc81
---
 clients/common/blis_interface.cpp                  |  2 +
 clients/common/cblas_interface.cpp                 | 12 +++++
 clients/include/rocblas_init.hpp                   | 52 ++++++++++++++++++++++
 clients/samples/example_openmp.cpp                 |  2 +
 tensile/HostLibraryTests/CachingLibrary_test.cpp   |  2 +
 .../HostLibraryTests/testlib/include/TestUtils.hpp |  7 ++-
 6 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/clients/common/blis_interface.cpp b/clients/common/blis_interface.cpp
index da7aef3..b302d57 100644
--- a/clients/common/blis_interface.cpp
+++ b/clients/common/blis_interface.cpp
@@ -21,7 +21,9 @@
  * ************************************************************************ */
 
 #include "blis.h"
+#ifdef _OPENMP
 #include "omp.h"
+#endif
 
 void setup_blis()
 {
diff --git a/clients/common/cblas_interface.cpp b/clients/common/cblas_interface.cpp
index 2eb50ab..86b10cd 100644
--- a/clients/common/cblas_interface.cpp
+++ b/clients/common/cblas_interface.cpp
@@ -23,7 +23,9 @@
 #include "rocblas_vector.hpp"
 #include "utility.hpp"
 #include <bitset>
+#ifdef _OPENMP
 #include <omp.h>
+#endif
 
 /*
  * ===========================================================================
@@ -462,7 +464,9 @@ void cblas_geam_helper(rocblas_operation transA,
     rocblas_int inc1_B = transB == rocblas_operation_none ? 1 : ldb;
     rocblas_int inc2_B = transB == rocblas_operation_none ? ldb : 1;
 
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int i = 0; i < M; i++)
     {
         for(rocblas_int j = 0; j < N; j++)
@@ -971,7 +975,9 @@ void cblas_herkx(rocblas_fill      uplo,
     {
         if(uplo == rocblas_fill_upper)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < n; ++j)
             {
                 for(int i = 0; i <= j; i++)
@@ -993,7 +999,9 @@ void cblas_herkx(rocblas_fill      uplo,
         }
         else // lower
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < n; ++j)
             {
                 for(int i = j; i < n; i++)
@@ -1018,7 +1026,9 @@ void cblas_herkx(rocblas_fill      uplo,
     {
         if(uplo == rocblas_fill_upper)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < n; ++j)
             {
                 for(int i = 0; i <= j; i++)
@@ -1042,7 +1052,9 @@ void cblas_herkx(rocblas_fill      uplo,
         }
         else // lower
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(int j = 0; j < n; ++j)
             {
                 for(int i = j; i < n; i++)
diff --git a/clients/include/rocblas_init.hpp b/clients/include/rocblas_init.hpp
index e971b72..4a07fb9 100644
--- a/clients/include/rocblas_init.hpp
+++ b/clients/include/rocblas_init.hpp
@@ -29,7 +29,9 @@
 #include "rocblas_random.hpp"
 #include <cinttypes>
 #include <iostream>
+#ifdef _OPENMP
 #include <omp.h>
+#endif
 #include <vector>
 
 //!
@@ -70,7 +72,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
     if(matrix_type == rocblas_client_general_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -81,7 +85,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_triangular_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -107,7 +113,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
 
         if(matrix_type == rocblas_client_general_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -117,7 +125,9 @@ void rocblas_init_matrix_alternating_sign(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_triangular_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -136,7 +146,9 @@ void rocblas_init_vector_alternating_sign(T rand_gen(), T* x, rocblas_int N, roc
     if(incx < 0)
         x -= (N - 1) * incx;
 
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int j = 0; j < N; ++j)
     {
         auto value  = rand_gen();
@@ -162,7 +174,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
     if(matrix_type == rocblas_client_general_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                     A[i + j * lda + b * stride] = rand_gen();
@@ -170,7 +184,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_hermitian_matrix)
     {
         for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -197,7 +213,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_symmetric_matrix)
     {
         for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -224,7 +242,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_triangular_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -249,14 +269,18 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
         auto  lda = hA.lda();
         if(matrix_type == rocblas_client_general_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                     A[i + j * lda] = rand_gen();
         }
         else if(matrix_type == rocblas_client_hermitian_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -282,7 +306,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_symmetric_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -308,7 +334,9 @@ void rocblas_init_matrix(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_triangular_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -329,7 +357,9 @@ void rocblas_init_vector(T rand_gen(), T* x, rocblas_int N, rocblas_stride incx)
     if(incx < 0)
         x -= (N - 1) * incx;
 
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int j = 0; j < N; ++j)
         x[j * incx] = rand_gen();
 }
@@ -352,7 +382,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
     if(matrix_type == rocblas_client_general_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                     A[i + j * lda + b * stride] = T(seedReset ? cos(i + j * lda + b * stride)
@@ -361,7 +393,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_hermitian_matrix)
     {
         for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -390,7 +424,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_symmetric_matrix)
     {
         for(size_t b = 0; b < batch_count; ++b)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -418,7 +454,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
     else if(matrix_type == rocblas_client_triangular_matrix)
     {
         for(size_t b = 0; b < batch_count; b++)
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -449,14 +487,18 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
 
         if(matrix_type == rocblas_client_general_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                     A[i + j * lda] = T(seedReset ? cos(i + j * lda) : sin(i + j * lda));
         }
         else if(matrix_type == rocblas_client_hermitian_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -483,7 +525,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_symmetric_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < N; ++i)
                 for(size_t j = 0; j <= i; ++j)
                 {
@@ -509,7 +553,9 @@ void rocblas_init_matrix_trig(rocblas_check_matrix_type matrix_type,
         }
         else if(matrix_type == rocblas_client_triangular_matrix)
         {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
             for(size_t i = 0; i < M; ++i)
                 for(size_t j = 0; j < N; ++j)
                 {
@@ -533,7 +579,9 @@ void rocblas_init_vector_trig(T* x, rocblas_int N, rocblas_stride incx, bool see
     if(incx < 0)
         x -= (N - 1) * incx;
 
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
     for(rocblas_int j = 0; j < N; ++j)
         x[j * incx] = T(seedReset ? cos(j * incx) : sin(j * incx));
 }
@@ -809,7 +857,9 @@ void rocblas_copy_matrix(const T* A,
     {
         size_t stride_offset_a = i_batch * stridea;
         size_t stride_offset_b = i_batch * strideb;
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
         for(size_t j = 0; j < N; ++j)
         {
             size_t offset_a = stride_offset_a + j * lda;
@@ -828,7 +878,9 @@ void rocblas_copy_matrix(
 
     for(size_t i_batch = 0; i_batch < batch_count; i_batch++)
     {
+#ifdef _OPENMP
 #pragma omp parallel for
+#endif
         for(size_t j = 0; j < N; ++j)
         {
             size_t offset_a = j * lda;
diff --git a/clients/samples/example_openmp.cpp b/clients/samples/example_openmp.cpp
index f62dae6..f844480 100644
--- a/clients/samples/example_openmp.cpp
+++ b/clients/samples/example_openmp.cpp
@@ -42,7 +42,9 @@
 #include <cstdlib>
 #include <hip/hip_runtime.h>
 #include <iostream>
+#ifdef _OPENMP
 #include <omp.h>
+#endif
 #include <vector>
 
 #define NUM_THREADS 4
diff --git a/tensile/HostLibraryTests/CachingLibrary_test.cpp b/tensile/HostLibraryTests/CachingLibrary_test.cpp
index 2868530..2bdf118 100644
--- a/tensile/HostLibraryTests/CachingLibrary_test.cpp
+++ b/tensile/HostLibraryTests/CachingLibrary_test.cpp
@@ -63,7 +63,9 @@ TEST(Cache, Threaded)
     using namespace Tensile;
     CacheMap<int, int> cache(-1);
 
+#ifdef _OPENMP
 #pragma omp parallel num_threads(32)
+#endif
     {
         int seed = 0;
 #ifdef _OPENMP
diff --git a/tensile/HostLibraryTests/testlib/include/TestUtils.hpp b/tensile/HostLibraryTests/testlib/include/TestUtils.hpp
index f146c58..10475ed 100644
--- a/tensile/HostLibraryTests/testlib/include/TestUtils.hpp
+++ b/tensile/HostLibraryTests/testlib/include/TestUtils.hpp
@@ -27,7 +27,9 @@
 #pragma once
 
 #include <cstddef>
+#ifdef _OPENMP
 #include <omp.h>
+#endif
 #include <random>
 
 #include <Tensile/ContractionProblem.hpp>
@@ -99,8 +101,9 @@ namespace Tensile
             throw std::runtime_error("Fix this function to work with dimensions != 3");
 
         auto seed_base = rng();
-
+#ifdef _OPENMP
 #pragma omp parallel num_threads(32)
+#endif
         {
             RNG  myrng = rng;
             auto seed  = seed_base;
@@ -111,7 +114,9 @@ namespace Tensile
 
             std::vector<size_t> index3{0, 0, 0};
 
+#ifdef _OPENMP
 #pragma omp for schedule(static) collapse(2)
+#endif
             for(size_t i = 0; i < desc.sizes()[2]; i++)
             {
                 for(size_t j = 0; j < desc.sizes()[1]; j++)
