cuticle/hi/yuv.h
2025-03-09 10:29:35 +02:00

128 lines
7.3 KiB
C

#pragma once
#include<smmintrin.h>
#include"linearity.h"
// Converts a BGRA image with 16 bits per channel (8 bytes per pixel) into
// planar 8-bit Y'CbCr using the BT.709 matrix with full->studio range scaling.
// Luma is produced for every pixel; each U/V sample is the average of a 2x2
// pixel block, so the chroma planes have half the width and half the height.
//
// bgra64, bgra64stride : source pixels and the byte distance between rows.
// imgW, imgH           : image size in pixels.
// outY, outU, outV     : destination planes.
// strideY/U/V          : byte distance between rows of the respective plane.
//
// Requirements that follow from the code below:
//  - The image is processed in tiles of 16x2 pixels with no tail handling, so
//    imgW must be a multiple of 16 and imgH a multiple of 2; otherwise rows
//    and columns past the stated size are read and written.
//  - bgra64, bgra64stride, outY and strideY must be 16-byte aligned (aligned
//    loads and non-temporal stores are used). The U/V planes are written with
//    unaligned 8-byte stores.
//  - Row pairs are distributed over threads by OpenMP when it is enabled.
// NOTE(review): apply_gamma_epi16 comes from linearity.h; presumably it
// gamma-encodes each 16-bit lane with the given exponent - confirm there.
static inline void bgra64toycbcr(uint8_t *bgra64, size_t bgra64stride, size_t imgW, size_t imgH, uint8_t *outY, uint8_t *outU, uint8_t *outV, size_t strideY, size_t strideU, size_t strideV) {
	const __m128i zero = _mm_setzero_si128();
	const __m128 invGamma = _mm_set1_ps(1 / 2.2f);
	/* Matrix rows scaled by 256; 16-bit lanes are (low to high) B, G, R, A for each of two pixels. */
	const __m128i coefY = _mm_set_epi16(0, 47, 157, 16, 0, 47, 157, 16);
	const __m128i coefU = _mm_set_epi16(0, -25, -85, 110, 0, -25, -85, 110);
	const __m128i coefV = _mm_set_epi16(0, 110, -100, -10, 0, 110, -100, -10);
	const __m128i biasY = _mm_set1_epi8(16);
	const __m128i biasC = _mm_set1_epi8((char) 0x80);

/* Converts the two horizontally adjacent pixels at pixel offset pixOff in both
 * rows of the current tile. shufY moves the two luma bytes and shufUV the single
 * chroma byte to their final positions; every other byte of the shuffle result
 * is zero, so the results can be OR-ed into the accumulators. */
#define BGRA64_CONVERT_PIXEL_PAIR(pixOff, shufY, shufUV) do { \
	const __m128i top16 = _mm_load_si128((const __m128i *) (row0 + (pixOff) * 8)); \
	const __m128i bot16 = _mm_load_si128((const __m128i *) (row1 + (pixOff) * 8)); \
	/* Gamma-encode, then keep the top 8 bits of every channel. */ \
	const __m128i top8 = _mm_srli_epi16(apply_gamma_epi16(top16, invGamma), 8); \
	const __m128i bot8 = _mm_srli_epi16(apply_gamma_epi16(bot16, invGamma), 8); \
	/* Per-pixel dot products: madd yields B*cb + G*cg and R*cr + A*0 as exact \
	 * 32-bit values, hadd joins them, leaving one sum per pixel in lanes 0 and 1. */ \
	const __m128i yTop = _mm_hadd_epi32(_mm_madd_epi16(top8, coefY), zero); \
	const __m128i yBot = _mm_hadd_epi32(_mm_madd_epi16(bot8, coefY), zero); \
	const __m128i uTop = _mm_hadd_epi32(_mm_madd_epi16(top8, coefU), zero); \
	const __m128i uBot = _mm_hadd_epi32(_mm_madd_epi16(bot8, coefU), zero); \
	const __m128i vTop = _mm_hadd_epi32(_mm_madd_epi16(top8, coefV), zero); \
	const __m128i vBot = _mm_hadd_epi32(_mm_madd_epi16(bot8, coefV), zero); \
	/* Sum of the four chroma values of the 2x2 block ends up in lane 0. */ \
	const __m128i uSum = _mm_hadd_epi32(_mm_add_epi32(uTop, uBot), zero); \
	const __m128i vSum = _mm_hadd_epi32(_mm_add_epi32(vTop, vBot), zero); \
	/* Byte 1 of each 32-bit lane is the value divided by 256. */ \
	wipY0 = _mm_or_si128(wipY0, _mm_shuffle_epi8(yTop, (shufY))); \
	wipY1 = _mm_or_si128(wipY1, _mm_shuffle_epi8(yBot, (shufY))); \
	/* The logical shift only disturbs bits 30..31, which are never extracted. */ \
	wipU = _mm_or_si128(wipU, _mm_shuffle_epi8(_mm_srli_epi32(uSum, 2), (shufUV))); \
	wipV = _mm_or_si128(wipV, _mm_shuffle_epi8(_mm_srli_epi32(vSum, 2), (shufUV))); \
} while(0)

#pragma omp parallel for simd
	for(size_t y = 0; y < imgH; y += 2) {
		for(size_t x = 0; x < imgW; x += 16) {
			const uint8_t *row0 = bgra64 + (y + 0) * bgra64stride + x * 8;
			const uint8_t *row1 = bgra64 + (y + 1) * bgra64stride + x * 8;

			/* Accumulators: 16 luma bytes per row, 8 chroma bytes per plane. */
			__m128i wipY0 = _mm_setzero_si128();
			__m128i wipY1 = _mm_setzero_si128();
			__m128i wipU = _mm_setzero_si128();
			__m128i wipV = _mm_setzero_si128();

			BGRA64_CONVERT_PIXEL_PAIR(0,
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1),
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1));
			BGRA64_CONVERT_PIXEL_PAIR(2,
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128),
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128));
			BGRA64_CONVERT_PIXEL_PAIR(4,
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128),
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128));
			BGRA64_CONVERT_PIXEL_PAIR(6,
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128),
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128));
			BGRA64_CONVERT_PIXEL_PAIR(8,
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128),
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128));
			BGRA64_CONVERT_PIXEL_PAIR(10,
				_mm_set_epi8(-128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128),
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128));
			BGRA64_CONVERT_PIXEL_PAIR(12,
				_mm_set_epi8(-128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128),
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128, -128));
			BGRA64_CONVERT_PIXEL_PAIR(14,
				_mm_set_epi8(5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128),
				_mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128, -128, -128));

			/* Luma: add the studio-range offset and write 16 bytes per row. */
			_mm_stream_si128((__m128i *) &outY[strideY * (y + 0) + x], _mm_add_epi8(biasY, wipY0));
			_mm_stream_si128((__m128i *) &outY[strideY * (y + 1) + x], _mm_add_epi8(biasY, wipY1));
			/* Chroma: only the low 8 bytes are valid, so store exactly 8 bytes.
			 * A 16-byte store here would write past the tile, clobbering the
			 * start of the next chroma row (possibly owned by another thread)
			 * or running off the end of the plane on the last row. */
			_mm_storel_epi64((__m128i *) &outU[strideU * (y / 2) + x / 2], _mm_add_epi8(wipU, biasC));
			_mm_storel_epi64((__m128i *) &outV[strideV * (y / 2) + x / 2], _mm_add_epi8(wipV, biasC));
		}
	}
#undef BGRA64_CONVERT_PIXEL_PAIR
}