Description: Avoid SSE2 usage on i386 without proper checks
Author: Nicholas Guriev <guriev-ns@ya.ru>
Forwarded: https://github.com/desktop-app/tg_owt/pull/45
Last-Update: Wed, 27 Jan 2021 18:59:30 +0300

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1206,7 +1206,6 @@ PRIVATE
     #modules/audio_processing/utility/ooura_fft.h
     #modules/audio_processing/utility/ooura_fft_tables_common.h
     #modules/audio_processing/utility/ooura_fft_neon.cc
-    #modules/audio_processing/utility/ooura_fft_sse2.cc
     #modules/audio_processing/utility/ooura_fft_tables_neon_sse2.h
     modules/audio_processing/vad/gmm.cc
     modules/audio_processing/vad/pitch_based_vad.cc
@@ -1562,7 +1561,6 @@ PRIVATE
     modules/video_processing/util/denoiser_filter.cc
     modules/video_processing/util/denoiser_filter_c.cc
     modules/video_processing/util/denoiser_filter_neon.cc
-    modules/video_processing/util/denoiser_filter_sse2.cc
     modules/video_processing/util/noise_estimation.cc
     modules/video_processing/util/skin_detection.cc
     modules/video_processing/video_denoiser.cc
@@ -1601,15 +1599,12 @@ PRIVATE
     common_audio/fir_filter_c.cc
     common_audio/fir_filter_neon.cc
     common_audio/fir_filter_neon.h
-    common_audio/fir_filter_sse.cc
-    common_audio/fir_filter_sse.h
     common_audio/real_fourier.cc
     common_audio/real_fourier_ooura.cc
     common_audio/resampler/push_resampler.cc
     common_audio/resampler/push_sinc_resampler.cc
     common_audio/resampler/resampler.cc
     common_audio/resampler/sinc_resampler_neon.cc
-    common_audio/resampler/sinc_resampler_sse.cc
     common_audio/resampler/sinc_resampler.cc
     common_audio/resampler/sinusoidal_linear_chirp_source.cc
     common_audio/signal_processing/dot_product_with_scale.cc
@@ -1657,8 +1652,6 @@ PRIVATE
     common_audio/third_party/spl_sqrt_floor/spl_sqrt_floor.c
     common_audio/third_party/ooura/fft_size_128/ooura_fft.cc
     common_audio/third_party/ooura/fft_size_128/ooura_fft.h
-    common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc
-    common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
     common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_common.h
     common_audio/third_party/ooura/fft_size_128/ooura_fft_neon.cc
     common_audio/third_party/ooura/fft_size_256/fft4g.cc
@@ -2001,8 +1994,15 @@ if (is_x86 OR is_x64)
         common_audio/fir_filter_avx2.h
     )
     add_sublibrary(sse2
+        common_audio/fir_filter_sse.cc
+        common_audio/fir_filter_sse.h
+        common_audio/resampler/sinc_resampler_sse.cc
+        common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc
+        common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
+        #modules/audio_processing/utility/ooura_fft_sse2.cc
         modules/desktop_capture/differ_vector_sse2.cc
         modules/desktop_capture/differ_vector_sse2.h
+        modules/video_processing/util/denoiser_filter_sse2.cc
     )
 endif()
 
@@ -2027,15 +2027,10 @@ if (NOT WIN32)
     )
 endif()
 
-if ((NOT is_x86) AND (NOT is_x64))
-    remove_target_sources(tg_owt ${webrtc_loc}
-        common_audio/fir_filter_sse.cc
-        common_audio/fir_filter_sse.h
-        common_audio/resampler/sinc_resampler_sse.cc
-        modules/audio_processing/utility/ooura_fft_sse2.cc
-        modules/video_processing/util/denoiser_filter_sse2.cc
-        common_audio/third_party/ooura/fft_size_128/ooura_fft_sse2.cc
-        common_audio/third_party/ooura/fft_size_128/ooura_fft_tables_neon_sse2.h
+if (x86_has_sse2)
+    target_compile_definitions(tg_owt
+    PRIVATE
+        WEBRTC_HAS_SSE2
     )
 endif()
 
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@@ -8,8 +8,10 @@ set(is_arm  0)
 set(is_arm8 0)
 set(is_arm7 0)
 set(arm_use_neon 0)
+set(x86_has_sse2 0)
 
 option(TG_OWT_ARCH_ARMV7_USE_NEON "Use NEON SIMD instructions when building for ARMv7" ON)
+option(TG_OWT_ARCH_X86_FORCE_SSE "Assume SSE instructions available when building for IA-32" ON)
 
 
 # Check for 64-bit x86 (aka x64):
@@ -117,3 +119,34 @@ endif() # arm32
 endif() # aarch64
 endif() # x86
 endif() # x64
+
+
+# Decide whether SSE2 code may be used without runtime CPU checks on x86/x64.
+# x86_has_sse2 becomes 1 when the compiler already defines __SSE__ and
+# __SSE2__, or when TG_OWT_ARCH_X86_FORCE_SSE is ON; otherwise it stays 0.
+if (is_x86 OR is_x64)
+    check_symbol_exists(__SSE__  "stddef.h" HAVE_SSE1_DEF)
+    check_symbol_exists(__SSE2__ "stddef.h" HAVE_SSE2_DEF)
+
+    if (HAVE_SSE1_DEF AND HAVE_SSE2_DEF)
+        message(STATUS "Compiler natively supports SSE and SSE2, these SIMD instructions now enabled")
+        set(x86_has_sse2 1)
+    elseif (TG_OWT_ARCH_X86_FORCE_SSE)
+        message(STATUS "SSE SIMD instructions enabled (can be disabled with -DTG_OWT_ARCH_X86_FORCE_SSE=OFF).")
+        set(x86_has_sse2 1)
+
+        if (WIN32)
+            # TODO: Add the correct flags for Windows here.
+        elseif (APPLE)
+            # TODO: Add the correct flags for Apple devices here.
+        else()
+            # Forced mode: pass -msse2 globally to every C, C++ and ASM source.
+            set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -msse2")
+            set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse2")
+            set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} -msse2")
+        endif()
+    else()
+        # x86_has_sse2 is left at 0 here; no extra compiler flags are added.
+        message(STATUS "Runtime checks of SSE SIMD activated (can be forced with -DTG_OWT_ARCH_X86_FORCE_SSE=ON).")
+    endif()
+endif()
--- a/cmake/init_target.cmake
+++ b/cmake/init_target.cmake
@@ -78,13 +78,6 @@ function(init_target target_name) # init
             endif()
         endif()
 
-        if (is_x86)
-            target_compile_options(${target_name}
-            PRIVATE
-                -msse2
-            )
-        endif()
-
         target_compile_definitions(${target_name}
         PRIVATE
             HAVE_NETINET_IN_H
--- a/cmake/libpffft.cmake
+++ b/cmake/libpffft.cmake
@@ -15,7 +15,7 @@ PRIVATE
     _USE_MATH_DEFINES
 )
 
-if (NOT is_x86 AND NOT is_x64 AND NOT arm_use_neon)
+if (NOT x86_has_sse2 AND NOT arm_use_neon)
     target_compile_definitions(libpffft
     PRIVATE
         PFFFT_SIMD_DISABLE
--- a/src/modules/audio_processing/aec3/adaptive_fir_filter.cc
+++ b/src/modules/audio_processing/aec3/adaptive_fir_filter.cc
@@ -88,7 +88,7 @@ void ComputeFrequencyResponse_Neon(
 
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Computes and stores the frequency response of the filter.
-void ComputeFrequencyResponse_Sse2(
+RTC_TARGET_SSE2 void ComputeFrequencyResponse_Sse2(
     size_t num_partitions,
     const std::vector<std::vector<FftData>>& H,
     std::vector<std::array<float, kFftLengthBy2Plus1>>* H2) {
@@ -210,10 +210,11 @@ void AdaptPartitions_Neon(const RenderBu
 
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Adapts the filter partitions. (SSE2 variant)
-void AdaptPartitions_Sse2(const RenderBuffer& render_buffer,
-                          const FftData& G,
-                          size_t num_partitions,
-                          std::vector<std::vector<FftData>>* H) {
+RTC_TARGET_SSE2 void AdaptPartitions_Sse2(
+    const RenderBuffer& render_buffer,
+    const FftData& G,
+    size_t num_partitions,
+    std::vector<std::vector<FftData>>* H) {
   rtc::ArrayView<const std::vector<FftData>> render_buffer_data =
       render_buffer.GetFftBuffer();
   const size_t num_render_channels = render_buffer_data[0].size();
@@ -375,10 +376,11 @@ void ApplyFilter_Neon(const RenderBuffer
 
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Produces the filter output (SSE2 variant).
-void ApplyFilter_Sse2(const RenderBuffer& render_buffer,
-                      size_t num_partitions,
-                      const std::vector<std::vector<FftData>>& H,
-                      FftData* S) {
+RTC_TARGET_SSE2 void ApplyFilter_Sse2(
+    const RenderBuffer& render_buffer,
+    size_t num_partitions,
+    const std::vector<std::vector<FftData>>& H,
+    FftData* S) {
   // const RenderBuffer& render_buffer,
   //                     rtc::ArrayView<const FftData> H,
   //                     FftData* S) {
--- a/src/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc
+++ b/src/modules/audio_processing/aec3/adaptive_fir_filter_erl.cc
@@ -57,7 +57,7 @@ void ErlComputer_NEON(
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 // Computes and stores the echo return loss estimate of the filter, which is the
 // sum of the partition frequency responses.
-void ErlComputer_SSE2(
+RTC_TARGET_SSE2 void ErlComputer_SSE2(
     const std::vector<std::array<float, kFftLengthBy2Plus1>>& H2,
     rtc::ArrayView<float> erl) {
   std::fill(erl.begin(), erl.end(), 0.f);
--- a/src/modules/audio_processing/aec3/fft_data.h
+++ b/src/modules/audio_processing/aec3/fft_data.h
@@ -48,7 +48,7 @@ struct FftData {
                 rtc::ArrayView<float> power_spectrum) const {
     RTC_DCHECK_EQ(kFftLengthBy2Plus1, power_spectrum.size());
     switch (optimization) {
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
       case Aec3Optimization::kSse2: {
         constexpr int kNumFourBinBands = kFftLengthBy2 / 4;
         constexpr int kLimit = kNumFourBinBands * 4;
--- a/src/modules/audio_processing/aec3/matched_filter.cc
+++ b/src/modules/audio_processing/aec3/matched_filter.cc
@@ -144,14 +144,14 @@ void MatchedFilterCore_NEON(size_t x_sta
 
 #if defined(WEBRTC_ARCH_X86_FAMILY)
 
-void MatchedFilterCore_SSE2(size_t x_start_index,
-                            float x2_sum_threshold,
-                            float smoothing,
-                            rtc::ArrayView<const float> x,
-                            rtc::ArrayView<const float> y,
-                            rtc::ArrayView<float> h,
-                            bool* filters_updated,
-                            float* error_sum) {
+RTC_TARGET_SSE2 void MatchedFilterCore_SSE2(size_t x_start_index,
+                                            float x2_sum_threshold,
+                                            float smoothing,
+                                            rtc::ArrayView<const float> x,
+                                            rtc::ArrayView<const float> y,
+                                            rtc::ArrayView<float> h,
+                                            bool* filters_updated,
+                                            float* error_sum) {
   const int h_size = static_cast<int>(h.size());
   const int x_size = static_cast<int>(x.size());
   RTC_DCHECK_EQ(0, h_size % 4);
--- a/src/modules/audio_processing/aec3/vector_math.h
+++ b/src/modules/audio_processing/aec3/vector_math.h
@@ -43,7 +43,7 @@ class VectorMath {
   void SqrtAVX2(rtc::ArrayView<float> x);
   void Sqrt(rtc::ArrayView<float> x) {
     switch (optimization_) {
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
       case Aec3Optimization::kSse2: {
         const int x_size = static_cast<int>(x.size());
         const int vector_limit = x_size >> 2;
@@ -123,7 +123,7 @@ class VectorMath {
     RTC_DCHECK_EQ(z.size(), x.size());
     RTC_DCHECK_EQ(z.size(), y.size());
     switch (optimization_) {
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
       case Aec3Optimization::kSse2: {
         const int x_size = static_cast<int>(x.size());
         const int vector_limit = x_size >> 2;
@@ -173,7 +173,7 @@ class VectorMath {
   void Accumulate(rtc::ArrayView<const float> x, rtc::ArrayView<float> z) {
     RTC_DCHECK_EQ(z.size(), x.size());
     switch (optimization_) {
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
       case Aec3Optimization::kSse2: {
         const int x_size = static_cast<int>(x.size());
         const int vector_limit = x_size >> 2;
--- a/src/modules/audio_processing/agc2/rnn_vad/vector_math.h
+++ b/src/modules/audio_processing/agc2/rnn_vad/vector_math.h
@@ -40,7 +40,7 @@ class VectorMath {
   float DotProduct(rtc::ArrayView<const float> x,
                    rtc::ArrayView<const float> y) const {
     RTC_DCHECK_EQ(x.size(), y.size());
-#if defined(WEBRTC_ARCH_X86_FAMILY)
+#if defined(WEBRTC_ARCH_X86_FAMILY) && defined(WEBRTC_HAS_SSE2)
     if (cpu_features_.avx2) {
       return DotProductAvx2(x, y);
     } else if (cpu_features_.sse2) {
--- a/src/rtc_base/system/inline.h
+++ b/src/rtc_base/system/inline.h
@@ -28,4 +28,10 @@
 
 #endif
 
+#if defined(__GNUC__) && !defined(__SSE2__)
+#define RTC_TARGET_SSE2 __attribute__((__target__("sse2")))
+#else
+#define RTC_TARGET_SSE2
+#endif
+
 #endif  // RTC_BASE_SYSTEM_INLINE_H_
