LCOV - code coverage report
Current view: top level - Src/SSE41 - InaVecSSE41Double.hpp (source / functions) Hit Total Coverage
Test: Coverage inastemp Lines: 42 42 100.0 %
Date: 2022-03-17 09:48:28 Functions: 1 1 100.0 %

          Line data    Source code
       1             : ///////////////////////////////////////////////////////////////////////////
       2             : // Inastemp - Berenger Bramas MPCDF - 2016
       3             : // Under MIT Licence, please you must read the LICENCE file.
       4             : ///////////////////////////////////////////////////////////////////////////
       5             : #ifndef INAVECSSE41DOUBLE_HPP
       6             : #define INAVECSSE41DOUBLE_HPP
       7             : 
       8             : #include "SSSE3/InaVecSSSE3Double.hpp"
       9             : 
      10             : #ifndef INASTEMP_USE_SSE41
      11             : #error InaVecSSE41<double> is included but SSE41 is not enable in the configuration
      12             : #endif
      13             : 
      14             : #include <tmmintrin.h>
      15             : #include <emmintrin.h>
      16             : #include <smmintrin.h>
      17             : 
      18             : template <class RealType>
      19             : class InaVecSSE41;
      20             : 
      21             : template <>
      22             : class alignas(16) InaVecSSE41<double> : public InaVecSSSE3<double> {
      23             :     using Parent = InaVecSSSE3<double>;
      24             : 
      25             : public:
      26             :     using Parent::GetVecLength;
      27             : 
      28       11449 :     using InaVecSSSE3<double>::InaVecSSSE3;
      29             : 
      30         216 :     inline InaVecSSE41(){}
      31             : 
      32             :     inline InaVecSSE41(const InaVecSSSE3<double>& other)
      33             :         : Parent(other){}
      34             : 
      35             :     // Re-put exp to benefit from floor
      36          26 :     inline InaVecSSE41<double> exp() const {
      37             : #ifdef __INTEL_COMPILER
      38             :         return _mm_exp_pd(Parent::vec);
      39             : #else
      40          26 :         const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
      41          26 :         const __m128d COEFF_A     = _mm_set1_pd(double(InaFastExp::CoeffA64()));
      42          26 :         const __m128d COEFF_B     = _mm_set1_pd(double(InaFastExp::CoeffB64()));
      43          26 :         const __m128d COEFF_P5_X  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_8()));
      44          26 :         const __m128d COEFF_P5_Y  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_7()));
      45          26 :         const __m128d COEFF_P5_Z  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_6()));
      46          26 :         const __m128d COEFF_P5_A  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_5()));
      47          26 :         const __m128d COEFF_P5_B  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_4()));
      48          26 :         const __m128d COEFF_P5_C  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_3()));
      49          26 :         const __m128d COEFF_P5_D  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_2()));
      50          26 :         const __m128d COEFF_P5_E  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_1()));
      51          26 :         const __m128d COEFF_P5_F  = _mm_set1_pd(double(InaFastExp::GetCoefficient9_0()));
      52             : 
      53          52 :         __m128d x = _mm_mul_pd(Parent::vec, COEFF_LOG2E);
      54             : 
      55          78 :         const __m128d fractional_part = _mm_sub_pd(x, InaVecSSE41(x).floor().vec);
      56             : 
      57         390 :         __m128d factor = _mm_add_pd(_mm_mul_pd(_mm_add_pd(
      58             :                          _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(
      59             :                          _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(
      60             :                          _mm_mul_pd(_mm_add_pd( _mm_mul_pd(_mm_add_pd(_mm_mul_pd(
      61             :                          COEFF_P5_X, fractional_part), COEFF_P5_Y), fractional_part),
      62             :                          COEFF_P5_Z),fractional_part), COEFF_P5_A), fractional_part),
      63             :                          COEFF_P5_B), fractional_part), COEFF_P5_C),fractional_part),
      64             :                          COEFF_P5_D), fractional_part), COEFF_P5_E),fractional_part),
      65          26 :                          COEFF_P5_F);
      66             : 
      67          26 :         x = _mm_sub_pd(x,factor);
      68             : 
      69          52 :         x = _mm_add_pd(_mm_mul_pd(COEFF_A, x), COEFF_B);
      70             : 
      71          26 :         alignas(64) long int allvalint[GetVecLength()] = { _mm_cvtsd_si64(x),
      72          78 :                                                       _mm_cvtsd_si64(_mm_shuffle_pd(x, x, 1)) };
      73             : 
      74         104 :         return _mm_castsi128_pd(_mm_set_epi64x(allvalint[1], allvalint[0]));
      75             : #endif
      76             :     }
      77             : 
      78             :     inline InaVecSSE41<double> expLowAcc() const {
      79          16 :         const __m128d COEFF_LOG2E = _mm_set1_pd(double(InaFastExp::CoeffLog2E()));
      80          16 :         const __m128d COEFF_A     = _mm_set1_pd(double(InaFastExp::CoeffA64()));
      81          16 :         const __m128d COEFF_B     = _mm_set1_pd(double(InaFastExp::CoeffB64()));
      82          16 :         const __m128d COEFF_P5_C  = _mm_set1_pd(double(InaFastExp::GetCoefficient4_3()));
      83          16 :         const __m128d COEFF_P5_D  = _mm_set1_pd(double(InaFastExp::GetCoefficient4_2()));
      84          16 :         const __m128d COEFF_P5_E  = _mm_set1_pd(double(InaFastExp::GetCoefficient4_1()));
      85          16 :         const __m128d COEFF_P5_F  = _mm_set1_pd(double(InaFastExp::GetCoefficient4_0()));
      86             : 
      87          32 :         __m128d x = _mm_mul_pd(Parent::vec, COEFF_LOG2E);
      88             : 
      89          48 :         const __m128d fractional_part = _mm_sub_pd(x, InaVecSSE41(x).floor().vec);
      90             : 
      91          80 :         __m128d factor = _mm_add_pd(_mm_mul_pd(_mm_add_pd(
      92             :                          _mm_mul_pd(_mm_add_pd(_mm_mul_pd(
      93             :                                          COEFF_P5_C, fractional_part),
      94             :                                          COEFF_P5_D), fractional_part),
      95             :                                          COEFF_P5_E), fractional_part),
      96          16 :                                          COEFF_P5_F);
      97             : 
      98          16 :         x = _mm_sub_pd(x,factor);
      99             : 
     100          32 :         x = _mm_add_pd(_mm_mul_pd(COEFF_A, x), COEFF_B);
     101             : 
     102          16 :         alignas(64) long int allvalint[GetVecLength()] = { _mm_cvtsd_si64(x),
     103          48 :                                                       _mm_cvtsd_si64(_mm_shuffle_pd(x, x, 1)) };
     104             : 
     105          64 :         return _mm_castsi128_pd(_mm_set_epi64x(allvalint[1], allvalint[0]));
     106             :     }
     107             : 
     108             :     inline InaVecSSE41<double> floor() const {
     109         276 :         return _mm_floor_pd(Parent::vec);
     110             :     }
     111             : 
     112             :     inline static const char* GetName(){
     113             :         return "InaVecSSE41<double>";
     114             :     }
     115             : 
     116             :     inline static InaIfElse< InaVecSSE41<double> >::ThenClass If(const typename Parent::MaskType& inTest) {
     117          30 :         return InaIfElse< InaVecSSE41<double> >::IfClass().If(inTest);
     118             :     }
     119             : };
     120             : 
     121             : #endif

Generated by: LCOV version 1.13