File: fslog4.cpp

package info (click to toggle)
flang 20181226-2
  • links: PTS, VCS
  • area: main
  • in suites: buster
  • size: 181,072 kB
  • sloc: cpp: 1,182,685; ansic: 598,652; objc: 103,775; f90: 57,054; python: 15,041; fortran: 13,601; lisp: 10,416; perl: 2,460; asm: 2,148; sh: 1,544; awk: 995; cs: 565; xml: 403; lex: 295; makefile: 225; pascal: 130
file content (116 lines) | stat: -rw-r--r-- 4,239 bytes parent folder | download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116

/*
 * Copyright (c) 2017-2018, NVIDIA CORPORATION.  All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 *
 */


#if defined(TARGET_LINUX_POWER)
#include "xmm2altivec.h"
#elif defined(TARGET_LINUX_ARM64)
#include "arm64intrin.h"
#else
#include <immintrin.h>
#endif
#include "fslog_defs.h"

/* Declared with C linkage so the symbol is exported under its unmangled name.
 * Takes and returns one __m128 (four packed single-precision lanes). */
extern "C" __m128 __fvs_log_fma3(__m128);

/*
 * __fvs_log_fma3 - logarithm of four packed single-precision values.
 *
 * Each lane is split into an exponent part e and a mantissa part m, the
 * mantissa is shifted so that it is centred on 1, and the result is
 *
 *     e * ln(2) + (m + m^2 * P(m))
 *
 * where P is a degree-9 polynomial evaluated with fused multiply-adds.
 * The polynomial coefficients (LOG_C1..LOG_CA) and the bit-manipulation
 * constants come from fslog_defs.h.
 *
 * Special inputs are handled without branches: a per-lane "spec" value is
 * built from comparison masks and added to the regular result at the end.
 *   - a <  0                       -> CANONICAL_NAN bit pattern
 *   - a == 0                       -> MINUS_INF bit pattern
 *   - a == NAN_INF_MASK (as float) -> a itself
 *   - a is NaN                     -> a + a
 * NOTE(review): this relies on the regular result being finite in those
 * lanes so that the addition simply propagates the NaN/infinity, and on
 * NAN_INF_MASK being the bit pattern of +infinity - confirm in fslog_defs.h.
 *
 * Inputs below TWO_TO_M126_F are pre-multiplied by TWO_TO_24_F and the
 * exponent bias is raised by U24 to compensate (presumably to bring
 * subnormals into the normal range - confirm the constants in fslog_defs.h).
 *
 * @param a  four packed floats.
 * @return   four packed floats, one result per lane.
 */
__m128 __fvs_log_fma3(__m128 a) {
    /* Polynomial coefficients, broadcast to all lanes. */
    __m128 const LOG_C1_VEC = _mm_set1_ps(LOG_C1);
    __m128 const LOG_C2_VEC = _mm_set1_ps(LOG_C2);
    __m128 const LOG_C3_VEC = _mm_set1_ps(LOG_C3);
    __m128 const LOG_C4_VEC = _mm_set1_ps(LOG_C4);
    __m128 const LOG_C5_VEC = _mm_set1_ps(LOG_C5);
    __m128 const LOG_C6_VEC = _mm_set1_ps(LOG_C6);
    __m128 const LOG_C7_VEC = _mm_set1_ps(LOG_C7);
    __m128 const LOG_C8_VEC = _mm_set1_ps(LOG_C8);
    __m128 const LOG_C9_VEC = _mm_set1_ps(LOG_C9);
    __m128 const LOG_CA_VEC = _mm_set1_ps(LOG_CA);

    /* Bit patterns used to build the special-case results. */
    __m128i const CANONICAL_NAN_VEC = _mm_set1_epi32(CANONICAL_NAN);
    __m128i const MINUS_INF_VEC = _mm_set1_epi32(MINUS_INF);
    __m128i const NAN_INF_MASK_VEC = _mm_set1_epi32(NAN_INF_MASK);

    __m128 const PARTITION_CONST_VEC = _mm_set1_ps(PARTITION_CONST);
    __m128 const TWO_TO_M126_F_VEC = _mm_set1_ps(TWO_TO_M126_F);
    __m128 const TWO_TO_24_F_VEC = _mm_set1_ps(TWO_TO_24_F);

    __m128 const ZERO_VEC = _mm_set1_ps(0.0f);
    __m128 const ONE_VEC = _mm_set1_ps(1.0f);
    __m128 const F24_VEC = _mm_set1_ps(U24);
    __m128i const BIT_MASK2_VEC = _mm_set1_epi32(BIT_MASK2);
    __m128i const OFFSET_VEC = _mm_set1_epi32(OFFSET);

    /* ln(2) rounded to single precision; scales the exponent part. */
    __m128 const LN2 = _mm_set1_ps(0x1.62E43p-01);

    /* 12582912.0f == 1.5 * 2^23. At this magnitude one ulp is 1.0, so
     * integer-adding a small non-negative k to its bit pattern yields the
     * float 12582912 + k. This converts an integer to float without a
     * dedicated conversion instruction. */
    __m128 const FLT2INT_CVT = _mm_set1_ps(12582912.0f);
    /* Same constant plus the amount subtracted from the raw exponent field. */
    __m128 FLT2INT_CVT_BIAS = _mm_set1_ps(12582912.0f + 126.0f);

    /* Rescale lanes below TWO_TO_M126_F (this also catches zero and every
     * negative input; NaN lanes compare false and are left alone), and
     * raise the bias by U24 in exactly those lanes. */
    __m128 mask = _mm_cmp_ps(a, TWO_TO_M126_F_VEC, _CMP_LT_OS);
    __m128 fix = _mm_blendv_ps(ONE_VEC, TWO_TO_24_F_VEC, mask);
    a = _mm_mul_ps(a, fix);
    FLT2INT_CVT_BIAS = _mm_add_ps(FLT2INT_CVT_BIAS, _mm_and_ps(mask, F24_VEC));

    /* Build the special-case contribution; lanes with ordinary inputs end
     * up with all-zero bits (+0.0f). */

    /* a < 0 -> CANONICAL_NAN */
    mask = _mm_cmp_ps(a, ZERO_VEC, _CMP_LT_OS);
    __m128 spec = _mm_and_ps((__m128)CANONICAL_NAN_VEC, mask);

    /* a == 0 (either sign) -> MINUS_INF */
    mask = _mm_cmp_ps(a, ZERO_VEC, _CMP_EQ_OS);
    __m128 tmpm = _mm_and_ps(mask, (__m128)MINUS_INF_VEC);
    spec = _mm_or_ps(tmpm, spec);

    /* a equal to the float whose bits are NAN_INF_MASK -> a */
    mask = _mm_cmp_ps(a, (__m128)NAN_INF_MASK_VEC, _CMP_EQ_OS);
    tmpm = _mm_and_ps(mask, a);
    spec = _mm_or_ps(tmpm, spec);

    /* a is NaN (a != a) -> a + a */
    mask = _mm_cmp_ps(a, a, _CMP_NEQ_UQ);
    tmpm = _mm_and_ps(mask, _mm_add_ps(a, a));
    spec = _mm_or_ps(tmpm, spec);

    /* Exponent: move the raw exponent field (bits 30..23, with the sign bit
     * above it) down to bit 0, turn that integer into a float through the
     * FLT2INT_CVT trick, then remove the bias with a float subtraction. */
    __m128 e = (__m128)_mm_srli_epi32((__m128i)a, 23);
           e = (__m128)_mm_add_epi32((__m128i)e, (__m128i)FLT2INT_CVT);
           e = _mm_sub_ps(e, FLT2INT_CVT_BIAS);

    /* Mantissa: keep the bits selected by BIT_MASK2 and integer-add OFFSET.
     * NOTE(review): presumably this re-attaches a fixed exponent so that m
     * lies in [0.5, 1) - confirm against fslog_defs.h. */
    __m128 m = _mm_and_ps((__m128)BIT_MASK2_VEC, a);
           m = (__m128)_mm_add_epi32((__m128i)m, OFFSET_VEC);

    /* Where m is below PARTITION_CONST, double m and decrement e, which
     * leaves m * 2^e unchanged. Then shift m down by one so that the
     * polynomial is evaluated around zero. */
    __m128 mask_shift = _mm_cmp_ps(m, PARTITION_CONST_VEC, _CMP_LT_OS);

    e = _mm_sub_ps(e, _mm_and_ps(mask_shift, ONE_VEC));
    m = _mm_add_ps(m, _mm_and_ps(mask_shift, m));
    m = _mm_sub_ps(m, ONE_VEC);

    e = _mm_mul_ps(e, LN2);

    /* Horner evaluation of the polynomial, highest coefficient first. */
    __m128 t =                       LOG_CA_VEC;
           t = _mm_fmadd_ps(t, m, LOG_C9_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C8_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C7_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C6_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C5_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C4_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C3_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C2_VEC);
           t = _mm_fmadd_ps(t, m, LOG_C1_VEC);

    /* result = m + m^2 * P(m) + e * ln(2), then fold in the special cases. */
    __m128 m2 = _mm_mul_ps(m, m);
           t = _mm_fmadd_ps(t, m2, m);
           t = _mm_add_ps(t, e);
           t = _mm_add_ps(t, spec);

    return t;
}