#pragma once #include #include"linearity.h" // strides must be 16-byte aligned static inline void bgra64toycbcr(uint8_t *bgra64, size_t bgra64stride, size_t imgW, size_t imgH, uint8_t *outY, uint8_t *outU, uint8_t *outV, size_t strideY, size_t strideU, size_t strideV) { #pragma omp parallel for simd for(size_t y = 0; y < imgH; y += 2) { for(size_t x = 0; x < imgW; x += 16) { __m128i rgb, partY, partU, partV, dotY, dotU, dotV; __m128i wipY0 = _mm_setzero_si128(); __m128i wipY1 = _mm_setzero_si128(); __m128i wipU = _mm_setzero_si128(); __m128i wipV = _mm_setzero_si128(); __m128i tempU = _mm_setzero_si128(); __m128i tempV = _mm_setzero_si128(); #define DO_DAH_DOO_DOO(LoOrHi, shufY, shufUV) \ /* Process top two */\ rgb = _mm_srli_epi16(apply_gamma_epi16(line0, _mm_set1_ps(1 / 2.2f)), 8); \ /* Start matrix multiplication (BT.709 + full->studio range) */\ partY = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 47, 157, 16, 0, 47, 157, 16));\ partU = _mm_mullo_epi16(rgb, _mm_set_epi16(0, -25, -85, 110, 0, -25, -85, 110));\ partV = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 110, -100, -10, 0, 110, -100, -10));\ /* Finish mat-mul with dot products */\ dotY = _mm_madd_epi16(partY, _mm_set1_epi16(1));\ dotY = _mm_hadd_epi32(dotY, _mm_setzero_si128());\ dotU = _mm_madd_epi16(partU, _mm_set1_epi16(1));\ dotU = _mm_hadd_epi32(dotU, _mm_setzero_si128());\ dotV = _mm_madd_epi16(partV, _mm_set1_epi16(1));\ dotV = _mm_hadd_epi32(dotV, _mm_setzero_si128());\ /* Insert Ys */\ wipY0 = _mm_or_si128(wipY0, _mm_shuffle_epi8(dotY, shufY));\ /* Save top UV */\ tempU = dotU;\ tempV = dotV;\ \ /* Process bottom two */\ rgb = _mm_srli_epi16(apply_gamma_epi16(line1, _mm_set1_ps(1 / 2.2f)), 8); \ /* Start matrix multiplication (BT.709 + full->studio range) */\ partY = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 47, 157, 16, 0, 47, 157, 16));\ partU = _mm_mullo_epi16(rgb, _mm_set_epi16(0, -25, -85, 110, 0, -25, -85, 110));\ partV = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 110, -100, -10, 0, 110, -100, -10));\ /* Finish mat-mul with dot products */\ dotY = _mm_madd_epi16(partY, _mm_set1_epi16(1));\ dotY = _mm_hadd_epi32(dotY, _mm_setzero_si128());\ dotU = _mm_madd_epi16(partU, _mm_set1_epi16(1));\ dotU = _mm_hadd_epi32(dotU, _mm_setzero_si128());\ dotV = _mm_madd_epi16(partV, _mm_set1_epi16(1));\ dotV = _mm_hadd_epi32(dotV, _mm_setzero_si128());\ /* Insert Ys */\ wipY1 = _mm_or_si128(wipY1, _mm_shuffle_epi8(dotY, shufY));\ /* Save bottom UVs */\ tempU = _mm_hadd_epi32(_mm_add_epi32(tempU, dotU), _mm_setzero_si128());\ tempV = _mm_hadd_epi32(_mm_add_epi32(tempV, dotV), _mm_setzero_si128());\ \ /* Insert UVs */\ wipU = _mm_or_si128(wipU, _mm_shuffle_epi8(_mm_srli_epi32(tempU, 2), shufUV));\ wipV = _mm_or_si128(wipV, _mm_shuffle_epi8(_mm_srli_epi32(tempV, 2), shufUV)); __m128i line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 0) * 8)); // Load two pixels __m128i line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 0) * 8)); // Load two pixels DO_DAH_DOO_DOO(_mm_unpacklo_epi8, _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1), _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1)); line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 2) * 8)); // Load two pixels line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 2) * 8)); // Load two pixels DO_DAH_DOO_DOO(_mm_unpacklo_epi8, _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128), _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128)); line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 4) * 8)); // Load two pixels line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 4) * 8)); // Load two pixels DO_DAH_DOO_DOO(_mm_unpacklo_epi8, _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128), _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128)); line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 6) * 8)); // Load two pixels line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 6) * 8)); // Load two pixels DO_DAH_DOO_DOO(_mm_unpacklo_epi8, _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128), _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128)); line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 8) * 8)); // Load two pixels line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 8) * 8)); // Load two pixels DO_DAH_DOO_DOO(_mm_unpacklo_epi8, _mm_set_epi8(-128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128), _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128)); line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 10) * 8)); // Load two pixels line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 10) * 8)); // Load two pixels DO_DAH_DOO_DOO(_mm_unpacklo_epi8, _mm_set_epi8(-128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128), _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128)); line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 12) * 8)); // Load two pixels line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 12) * 8)); // Load two pixels DO_DAH_DOO_DOO(_mm_unpacklo_epi8, _mm_set_epi8(-128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128), _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128, -128)); line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 14) * 8)); // Load two pixels line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 14) * 8)); // Load two pixels DO_DAH_DOO_DOO(_mm_unpacklo_epi8, _mm_set_epi8( 5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128), _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128, -128, -128)); _mm_stream_si128((__m128i*) &outY[strideY * (y + 0) + x], _mm_add_epi8(_mm_set1_epi8(16), wipY0)); _mm_stream_si128((__m128i*) &outY[strideY * (y + 1) + x], _mm_add_epi8(_mm_set1_epi8(16), wipY1)); _mm_storeu_si128((__m128i*) &outU[strideU * (y / 2) + x / 2], _mm_add_epi8(wipU, _mm_set1_epi8(128))); _mm_storeu_si128((__m128i*) &outV[strideV * (y / 2) + x / 2], _mm_add_epi8(wipV, _mm_set1_epi8(128))); } } }