Line data Source code
1 : ///////////////////////////////////////////////////////////////////////////
2 : // Inastemp - Berenger Bramas MPCDF - 2016
3 : // Under MIT Licence, please you must read the LICENCE file.
4 : ///////////////////////////////////////////////////////////////////////////
5 : #ifndef INAVECSSE41FLOAT_HPP
6 : #define INAVECSSE41FLOAT_HPP
7 :
8 : #include "InastempGlobal.h"
9 : #include "SSSE3/InaVecSSSE3Float.hpp"
10 :
11 : #ifndef INASTEMP_USE_SSE41
12 : #error InaVecSSE41<float> is included but SSE41 is not enable in the configuration
13 : #endif
14 :
15 : #include <tmmintrin.h>
16 : #include <emmintrin.h>
17 : #include <smmintrin.h>
18 :
19 : template <class RealType>
20 : class InaVecSSE41;
21 :
22 : template <>
23 : class alignas(16) InaVecSSE41<float> : public InaVecSSSE3<float> {
24 : using Parent = InaVecSSSE3<float>;
25 :
26 : public:
27 : using Parent::GetVecLength;
28 :
29 15818 : using InaVecSSSE3<float>::InaVecSSSE3;
30 :
31 288 : inline InaVecSSE41(){}
32 :
33 : inline InaVecSSE41(const InaVecSSSE3<float>& other)
34 : : Parent(other){}
35 :
36 : // Re-put exp to benefit from Floor
37 26 : inline InaVecSSE41<float> exp() const {
38 : #ifdef __INTEL_COMPILER
39 : return _mm_exp_ps(Parent::vec);
40 : #else
41 26 : const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
42 26 : const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
43 26 : const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
44 26 : const __m128 COEFF_P5_A = _mm_set1_ps(float(InaFastExp::GetCoefficient6_5()));
45 26 : const __m128 COEFF_P5_B = _mm_set1_ps(float(InaFastExp::GetCoefficient6_4()));
46 26 : const __m128 COEFF_P5_C = _mm_set1_ps(float(InaFastExp::GetCoefficient6_3()));
47 26 : const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient6_2()));
48 26 : const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient6_1()));
49 26 : const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient6_0()));
50 :
51 52 : __m128 x = _mm_mul_ps( Parent::vec , COEFF_LOG2E);
52 :
53 78 : const __m128 fractional_part = _mm_sub_ps(x, InaVecSSE41(x).floor().vec);
54 :
55 234 : __m128 factor = _mm_add_ps(_mm_mul_ps(_mm_add_ps( _mm_mul_ps(_mm_add_ps(
56 : _mm_mul_ps(_mm_add_ps( _mm_mul_ps(_mm_add_ps(_mm_mul_ps(
57 : COEFF_P5_A, fractional_part), COEFF_P5_B), fractional_part), COEFF_P5_C),fractional_part),
58 26 : COEFF_P5_D), fractional_part), COEFF_P5_E),fractional_part), COEFF_P5_F);
59 :
60 26 : x = _mm_sub_ps(x,factor);
61 :
62 78 : __m128i castedInteger = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(COEFF_A, x), COEFF_B));
63 :
64 52 : return _mm_castsi128_ps(castedInteger);
65 : #endif
66 : }
67 :
68 : inline InaVecSSE41<float> ExpLowAcc() const {
69 : const __m128 COEFF_LOG2E = _mm_set1_ps(float(InaFastExp::CoeffLog2E()));
70 : const __m128 COEFF_A = _mm_set1_ps(float(InaFastExp::CoeffA32()));
71 : const __m128 COEFF_B = _mm_set1_ps(float(InaFastExp::CoeffB32()));
72 : const __m128 COEFF_P5_D = _mm_set1_ps(float(InaFastExp::GetCoefficient3_2()));
73 : const __m128 COEFF_P5_E = _mm_set1_ps(float(InaFastExp::GetCoefficient3_1()));
74 : const __m128 COEFF_P5_F = _mm_set1_ps(float(InaFastExp::GetCoefficient3_0()));
75 :
76 : __m128 x = _mm_mul_ps( Parent::vec , COEFF_LOG2E);
77 :
78 : const __m128 fractional_part = _mm_sub_ps(x, InaVecSSE41(x).floor().vec);
79 :
80 : __m128 factor = _mm_add_ps(_mm_mul_ps(
81 : _mm_add_ps(_mm_mul_ps(
82 : COEFF_P5_D, fractional_part),
83 : COEFF_P5_E), fractional_part),
84 : COEFF_P5_F);
85 :
86 : x = _mm_sub_ps(x,factor);
87 :
88 : __m128i castedInteger = _mm_cvtps_epi32(_mm_add_ps(_mm_mul_ps(COEFF_A, x), COEFF_B));
89 :
90 : return _mm_castsi128_ps(castedInteger);
91 : }
92 :
93 : inline InaVecSSE41<float> floor() const {
94 228 : return _mm_floor_ps(Parent::vec);
95 : }
96 :
97 : inline static const char* GetName() {
98 : return "InaVecSSE41<float>";
99 : }
100 :
101 : inline static InaIfElse< InaVecSSE41<float> >::ThenClass If(const typename Parent::MaskType inTest) {
102 30 : return InaIfElse< InaVecSSE41<float> >::IfClass().If(inTest);
103 : }
104 : };
105 :
106 : #endif
|