119 lines
4.1 KiB
C
119 lines
4.1 KiB
C
#pragma once
|
|
|
|
#define SSE_MATHFUN_WITH_CODE
|
|
#include"kumb.h"
|
|
|
|
// exp2f4 and log2f4 by Jose Fonseca (MIT License)
|
|
|
|
#define EXP_POLY_DEGREE 3
|
|
|
|
#define POLY0(x, c0) _mm_set1_ps(c0)
|
|
#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
|
|
#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
|
|
#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
|
|
#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
|
|
#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
|
|
|
|
static inline __m128 exp2f4(__m128 x)
|
|
{
|
|
__m128i ipart;
|
|
__m128 fpart, expipart, expfpart;
|
|
|
|
x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
|
|
x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
|
|
|
|
/* ipart = int(x - 0.5) */
|
|
ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
|
|
|
|
/* fpart = x - ipart */
|
|
fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
|
|
|
|
/* expipart = (float) (1 << ipart) */
|
|
expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
|
|
|
|
/* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
|
|
#if EXP_POLY_DEGREE == 5
|
|
expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
|
|
#elif EXP_POLY_DEGREE == 4
|
|
expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
|
|
#elif EXP_POLY_DEGREE == 3
|
|
expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
|
|
#elif EXP_POLY_DEGREE == 2
|
|
expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
|
|
#else
|
|
#error
|
|
#endif
|
|
|
|
return _mm_mul_ps(expipart, expfpart);
|
|
}
|
|
|
|
#define LOG_POLY_DEGREE 5
|
|
|
|
static inline __m128 log2f4(__m128 x)
|
|
{
|
|
__m128i exp = _mm_set1_epi32(0x7F800000);
|
|
__m128i mant = _mm_set1_epi32(0x007FFFFF);
|
|
|
|
__m128 one = _mm_set1_ps( 1.0f);
|
|
|
|
__m128i i = _mm_castps_si128(x);
|
|
|
|
__m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23), _mm_set1_epi32(127)));
|
|
|
|
__m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
|
|
|
|
__m128 p;
|
|
|
|
/* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
|
|
#if LOG_POLY_DEGREE == 6
|
|
p = POLY5( m, 3.1157899f, -3.3241990f, 2.5988452f, -1.2315303f, 3.1821337e-1f, -3.4436006e-2f);
|
|
#elif LOG_POLY_DEGREE == 5
|
|
p = POLY4(m, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
|
|
#elif LOG_POLY_DEGREE == 4
|
|
p = POLY3(m, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
|
|
#elif LOG_POLY_DEGREE == 3
|
|
p = POLY2(m, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
|
|
#else
|
|
#error
|
|
#endif
|
|
|
|
/* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
|
|
p = _mm_mul_ps(p, _mm_sub_ps(m, one));
|
|
|
|
return _mm_add_ps(p, e);
|
|
}
|
|
|
|
__attribute__((optimize("O3"))) static inline __m128 apply_gamma_ps(__m128 z, __m128 gamma) {
|
|
return exp2f4(_mm_mul_ps(log2f4(z), gamma));
|
|
}
|
|
|
|
__attribute__((optimize("O3"))) static inline __m128i apply_gamma_epi32(__m128i z, __m128 gamma) {
|
|
__m128 zf = _mm_cvtepi32_ps(z);
|
|
zf = _mm_mul_ps(zf, _mm_set1_ps(1.f / 65535));
|
|
zf = log2f4(zf);
|
|
zf = _mm_mul_ps(zf, gamma);
|
|
zf = exp2f4(zf);
|
|
zf = _mm_mul_ps(zf, _mm_set1_ps(65535));
|
|
z = _mm_cvtps_epi32(zf);
|
|
|
|
/* Sometimes overflow causes the top 16 bits to be non-zero (e.g. when log(0) makes NaN) */
|
|
/* Those must be masked out */
|
|
z = _mm_and_si128(z, _mm_set1_epi32(0x0000FFFF));
|
|
|
|
return z;
|
|
}
|
|
|
|
__attribute__((optimize("O3"))) static inline __m128i apply_gamma_epi16(__m128i z, __m128 gamma) {
|
|
__m128i low = apply_gamma_epi32(_mm_unpacklo_epi16(z, _mm_setzero_si128()), gamma);
|
|
__m128i high = apply_gamma_epi32(_mm_unpackhi_epi16(z, _mm_setzero_si128()), gamma);
|
|
|
|
low = _mm_hadd_epi16(low, low);
|
|
low = _mm_and_si128(low, _mm_set_epi32(0, 0, 0xFFFFFFFF, 0xFFFFFFFF));
|
|
|
|
high = _mm_hadd_epi16(high, high);
|
|
high = _mm_slli_si128(high, 8);
|
|
|
|
return _mm_or_si128(low, high);
|
|
}
|
|
|