Line data Source code
1 : ///////////////////////////////////////////////////////////////////////////
2 : // Inastemp - Berenger Bramas MPCDF - 2016
3 : // Under MIT Licence, please you must read the LICENCE file.
4 : ///////////////////////////////////////////////////////////////////////////
5 : #ifndef INAVECSSE41DOUBLE_HPP
6 : #define INAVECSSE41DOUBLE_HPP
7 :
8 : #include "SSSE3/InaVecSSSE3Double.hpp"
9 :
10 : #ifndef INASTEMP_USE_SSE41
11 : #error InaVecSSE41<double> is included but SSE41 is not enable in the configuration
12 : #endif
13 :
14 : #include <tmmintrin.h>
15 : #include <emmintrin.h>
16 : #include <smmintrin.h>
17 :
18 : template <class RealType>
19 : class InaVecSSE41;
20 :
21 : template <>
22 : class alignas(16) InaVecSSE41<double> : public InaVecSSSE3<double> {
23 : using Parent = InaVecSSSE3<double>;
24 :
25 : public:
26 : using Parent::GetVecLength;
27 :
28 11449 : using InaVecSSSE3<double>::InaVecSSSE3;
29 :
30 216 : inline InaVecSSE41(){}
31 :
32 : inline InaVecSSE41(const InaVecSSSE3<double>& other)
33 : : Parent(other){}
34 :
35 : // Re-put exp to benefit from floor
36 26 : inline InaVecSSE41<double> exp() const {
37 : #ifdef __INTEL_COMPILER
38 : return _mm_exp_pd(Parent::vec);
39 : #else
40 26 : const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
41 26 : const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
42 26 : const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
43 26 : const __m128d COEFF_P5_X = _mm_set1_pd(double(InaFastExp::GetCoefficient9_8()));
44 26 : const __m128d COEFF_P5_Y = _mm_set1_pd(double(InaFastExp::GetCoefficient9_7()));
45 26 : const __m128d COEFF_P5_Z = _mm_set1_pd(double(InaFastExp::GetCoefficient9_6()));
46 26 : const __m128d COEFF_P5_A = _mm_set1_pd(double(InaFastExp::GetCoefficient9_5()));
47 26 : const __m128d COEFF_P5_B = _mm_set1_pd(double(InaFastExp::GetCoefficient9_4()));
48 26 : const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient9_3()));
49 26 : const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient9_2()));
50 26 : const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient9_1()));
51 26 : const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient9_0()));
52 :
53 52 : __m128d x = _mm_mul_pd(Parent::vec, COEFF_LOG2E);
54 :
55 78 : const __m128d fractional_part = _mm_sub_pd(x, InaVecSSE41(x).floor().vec);
56 :
57 390 : __m128d factor = _mm_add_pd(_mm_mul_pd(_mm_add_pd(
58 : _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(
59 : _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(
60 : _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(_mm_mul_pd(
61 : COEFF_P5_X, fractional_part), COEFF_P5_Y), fractional_part),
62 : COEFF_P5_Z),fractional_part), COEFF_P5_A), fractional_part),
63 : COEFF_P5_B), fractional_part), COEFF_P5_C),fractional_part),
64 : COEFF_P5_D), fractional_part), COEFF_P5_E),fractional_part),
65 26 : COEFF_P5_F);
66 :
67 26 : x = _mm_sub_pd(x,factor);
68 :
69 52 : x = _mm_add_pd(_mm_mul_pd(COEFF_A, x), COEFF_B);
70 :
71 26 : alignas(64) long int allvalint[GetVecLength()] = { _mm_cvtsd_si64(x),
72 78 : _mm_cvtsd_si64(_mm_shuffle_pd(x, x, 1)) };
73 :
74 104 : return _mm_castsi128_pd(_mm_set_epi64x(allvalint[1], allvalint[0]));
75 : #endif
76 : }
77 :
78 : inline InaVecSSE41<double> expLowAcc() const {
79 16 : const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
80 16 : const __m128d COEFF_A = _mm_set1_pd(double(InaFastExp::CoeffA64()));
81 16 : const __m128d COEFF_B = _mm_set1_pd(double(InaFastExp::CoeffB64()));
82 16 : const __m128d COEFF_P5_C = _mm_set1_pd(double(InaFastExp::GetCoefficient4_3()));
83 16 : const __m128d COEFF_P5_D = _mm_set1_pd(double(InaFastExp::GetCoefficient4_2()));
84 16 : const __m128d COEFF_P5_E = _mm_set1_pd(double(InaFastExp::GetCoefficient4_1()));
85 16 : const __m128d COEFF_P5_F = _mm_set1_pd(double(InaFastExp::GetCoefficient4_0()));
86 :
87 32 : __m128d x = _mm_mul_pd(Parent::vec, COEFF_LOG2E);
88 :
89 48 : const __m128d fractional_part = _mm_sub_pd(x, InaVecSSE41(x).floor().vec);
90 :
91 80 : __m128d factor = _mm_add_pd(_mm_mul_pd(_mm_add_pd(
92 : _mm_mul_pd(_mm_add_pd(_mm_mul_pd(
93 : COEFF_P5_C, fractional_part),
94 : COEFF_P5_D), fractional_part),
95 : COEFF_P5_E), fractional_part),
96 16 : COEFF_P5_F);
97 :
98 16 : x = _mm_sub_pd(x,factor);
99 :
100 32 : x = _mm_add_pd(_mm_mul_pd(COEFF_A, x), COEFF_B);
101 :
102 16 : alignas(64) long int allvalint[GetVecLength()] = { _mm_cvtsd_si64(x),
103 48 : _mm_cvtsd_si64(_mm_shuffle_pd(x, x, 1)) };
104 :
105 64 : return _mm_castsi128_pd(_mm_set_epi64x(allvalint[1], allvalint[0]));
106 : }
107 :
108 : inline InaVecSSE41<double> floor() const {
109 276 : return _mm_floor_pd(Parent::vec);
110 : }
111 :
112 : inline static const char* GetName(){
113 : return "InaVecSSE41<double>";
114 : }
115 :
116 : inline static InaIfElse< InaVecSSE41<double> >::ThenClass If(const typename Parent::MaskType& inTest) {
117 30 : return InaIfElse< InaVecSSE41<double> >::IfClass().If(inTest);
118 : }
119 : };
120 :
121 : #endif
|