File: SseMath.hpp

package info (click to toggle)
consensuscore 1.1.1%2Bdfsg-8
  • links: PTS, VCS
  • area: main
  • in suites: sid, trixie
  • size: 3,492 kB
  • sloc: cpp: 38,945; python: 2,083; ansic: 543; sh: 184; makefile: 91; cs: 10
file content (52 lines) | stat: -rw-r--r-- 1,306 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
// Author: David Alexander

#pragma once

#define SIMDE_ENABLE_NATIVE_ALIASES
#include <simde/x86/sse.h>
#include <limits>

#include <ConsensusCore/Quiver/detail/sse_mathfun.h>

// todo: turn these into inline functions

// Lane-wise sum of two packed-float vectors.
#define ADD4(a, b) (_mm_add_ps((a), (b)))

// Loads four floats from `dataptr` with an unaligned load and returns
// offset + slope * data in each lane.  `offset` and `slope` are scalars
// that are broadcast to all four lanes.
#define AFFINE4(offset, slope, dataptr)  \
    (_mm_add_ps(_mm_set_ps1((offset)),   \
                _mm_mul_ps(_mm_set_ps1((slope)), _mm_loadu_ps((dataptr)))))

// Bitwise select: the result takes the bits of `a` where `mask` is set and
// the bits of `b` where it is clear.  `mask` is expanded twice, so do not
// pass an expression with side effects.
#define MUX4(mask, a, b) \
    (_mm_or_ps(_mm_and_ps((mask), (a)), _mm_andnot_ps((mask), (b))))

// Lane-wise maximum of two packed-float vectors.
#define MAX4(a, b) (_mm_max_ps((a), (b)))

namespace ConsensusCore {
namespace detail {
//
// Log-space arithmetic
//
// Packed-float vector holding 1.0f in all four lanes.  Because it is
// `static` at namespace scope in a header, every translation unit that
// includes this file gets its own copy.
static const __m128 ones = _mm_set_ps1(1.0f);

/// Lane-wise log-space addition of two packed-float vectors.
///
/// For each of the four lanes this evaluates
///     max(a, b) + log(1 + exp(min(a, b) - max(a, b)))
/// which is algebraically log(exp(a) + exp(b)).  Factoring out the larger
/// operand keeps the argument passed to exp_ps non-positive for ordered
/// (non-NaN) inputs.
///
/// @param aa  four log-space values
/// @param bb  four log-space values
/// @return    the four lane-wise log-space sums
inline __m128 logAdd4(__m128 aa, __m128 bb)
{
    // Built locally instead of reading a namespace-scope static: an inline
    // function defined in a header should not depend on a per-translation-unit,
    // dynamically initialised object.
    const __m128 one = _mm_set_ps1(1.0f);

    const __m128 hi = _mm_max_ps(aa, bb);
    const __m128 lo = _mm_min_ps(aa, bb);
    const __m128 delta = _mm_sub_ps(lo, hi);
    return _mm_add_ps(hi, log_ps(_mm_add_ps(one, exp_ps(delta))));

    // NOTE: logAddApprox_ps(aa, bb) is deliberately not used here.  Its
    // precision is currently too low -- it causes problems w/ Edna -- and a
    // more accurate implementation is needed before it can replace the
    // computation above.
}

inline float logAdd(float a, float b)
{
    __m128 aa = _mm_set_ps1(a);
    __m128 bb = _mm_set_ps1(b);
    __m128 acc = logAdd4(aa, bb);

    // Use an aligned buf to extract
    ALIGN16_BEG float buf[4] ALIGN16_END;
    _mm_store_ps(&buf[0], acc);
    return buf[0];
}
}
}