diff --git a/hi/aaclc.c b/hi/aaclc.c new file mode 100644 index 0000000..3617c52 --- /dev/null +++ b/hi/aaclc.c @@ -0,0 +1,141 @@ +#include"node.h" + +#include +#include +#include"img.h" +#include +#include +#include +#include +#include + +typedef struct { + CHiPubNode pub; + + HANDLE_AACENCODER enc; + + int16_t *pcmbuf; + size_t pcmbufSz; + + uint64_t timestamp; // In samples (48000 Hz) + + AACENC_InfoStruct info; + bool extradataSent; +} Internal; + +static int encodeaac_perform(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + CHiImage *newpcm = CHi_Crawl(&pub->sinks[0])->data.sample; + if(newpcm && newpcm->width) { + n->pcmbuf = realloc(n->pcmbuf, sizeof(*n->pcmbuf) * (n->pcmbufSz + newpcm->width * newpcm->channels)); + memcpy(n->pcmbuf + n->pcmbufSz, newpcm->data8, newpcm->width * newpcm->channels * sizeof(*n->pcmbuf)); + /*for(size_t z = 0; z < newpcm->width; z++) { + static size_t lol = 0; + n->pcmbuf[n->pcmbufSz + z] = sinf(lol++ * 440.0 / 48000 * 2.0 * 3.14159) * 15000; + }*/ + n->pcmbufSz += newpcm->width * newpcm->channels; + } + + CHiBSFrames *frames = calloc(1, sizeof(*frames) + 1 * sizeof(CHiBSFrame)); + + if(!n->extradataSent) { + frames->count++; + + frames->data[frames->count - 1].timestamp = 0; + frames->data[frames->count - 1].flags = CUTIHI_BS_SETUP_PACKET; + frames->data[frames->count - 1].ptr = n->info.confBuf; + frames->data[frames->count - 1].sz = n->info.confSize; + + n->extradataSent = true; + } + + while(n->pcmbufSz) { + int inIdentifier = IN_AUDIO_DATA; + int inSize = n->pcmbufSz * sizeof(*n->pcmbuf); + int inElSize = 2; + AACENC_BufDesc inbuf = { .numBufs = 1, .bufs = &n->pcmbuf, .bufferIdentifiers = &inIdentifier, .bufSizes = &inSize, .bufElSizes = &inElSize }; + + int outIdentifier = OUT_BITSTREAM_DATA; + int outSize = 2048; + int outElSize = 1; + void *outBuf = malloc(outSize); + AACENC_BufDesc outbuf = { .numBufs = 1, .bufs = &outBuf, .bufferIdentifiers = &outIdentifier, .bufSizes = &outSize, .bufElSizes = &outElSize }; + + AACENC_InArgs inargs = { .numInSamples = n->pcmbufSz }; + AACENC_OutArgs outargs = {}; + + int err = aacEncEncode(n->enc, &inbuf, &outbuf, &inargs, &outargs); + + if(err == AACENC_OK) { + if(outargs.numOutBytes > 0) { + frames = realloc(frames, sizeof(*frames) + (++frames->count) * sizeof(CHiBSFrame)); + + frames->data[frames->count - 1].timestamp = n->timestamp / 48; // ms + frames->data[frames->count - 1].flags = 0; + frames->data[frames->count - 1].ptr = outBuf; + frames->data[frames->count - 1].sz = outargs.numOutBytes; + } + if(outargs.numInSamples > 0) { + memmove(n->pcmbuf, n->pcmbuf + outargs.numInSamples, sizeof(*n->pcmbuf) * (n->pcmbufSz - outargs.numInSamples)); + n->pcmbufSz -= outargs.numInSamples; + + n->timestamp += outargs.numInSamples; + } + + if(outargs.numOutBytes == 0 || outargs.numInSamples == 0) { + break; + } + } else if(err == AACENC_ENCODE_EOF) { + break; + } else { + abort(); + return 0; + } + } + + n->pub.sources[0].type = CUTIHI_VAL_AACBS; + n->pub.sources[0].data.bitstream = frames; + + return 1; +} +static int encodeaac_start(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + assert(aacEncOpen(&n->enc, 0, 1) == AACENC_OK); + assert(aacEncoder_SetParam(n->enc, AACENC_AOT, 2) == AACENC_OK); // Low complexity profile + assert(aacEncoder_SetParam(n->enc, AACENC_SAMPLERATE, 48000) == AACENC_OK); + assert(aacEncoder_SetParam(n->enc, AACENC_CHANNELMODE, MODE_1) == AACENC_OK); + assert(aacEncoder_SetParam(n->enc, AACENC_CHANNELORDER, 1) == AACENC_OK); + assert(aacEncoder_SetParam(n->enc, AACENC_BITRATEMODE, 3) == AACENC_OK); + + assert(aacEncEncode(n->enc, NULL, NULL, NULL, NULL) == AACENC_OK); + + memset(&n->info, 0, sizeof(n->info)); + assert(aacEncInfo(n->enc, &n->info) == AACENC_OK); + + n->pcmbuf = NULL; + n->pcmbufSz = 0; + + n->timestamp = 0; + n->extradataSent = false; + + return 1; +} +static int encodeaac_stop(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + aacEncClose(&n->enc); + + return 1; +} +CUTIVIS CHiPubNode *CHi_EncodeAAC() { + Internal *ret = calloc(1, sizeof(*ret)); + ret->pub.type = CUTIHI_T('CEnc','AACL'); + ret->pub.Start = encodeaac_start; + ret->pub.Perform = encodeaac_perform; + ret->pub.Stop = encodeaac_stop; + ret->pub.sinks = calloc(sizeof(*ret->pub.sinks), ret->pub.sinkCount = 1); + ret->pub.sources = calloc(sizeof(*ret->pub.sources), ret->pub.sourceCount = 1); + return &ret->pub; +} diff --git a/hi/h264enc.c b/hi/h264enc.c new file mode 100644 index 0000000..d54a0b9 --- /dev/null +++ b/hi/h264enc.c @@ -0,0 +1,289 @@ +#include"node.h" + +#include +#include +#include +#include +#include +#include + +#define MINIH264_IMPLEMENTATION +#include + +#include"mode.h" +#include"img.h" +#include"yuv.h" + +#include"h264enc_sys.c" + +typedef struct +{ + void *event_start; + void *event_done; + void (*callback)(void*); + void *job; + void *thread; + int terminated; +} h264e_thread_t; + +static THREAD_RET THRAPI minih264_thread_func(void *arg) { + h264e_thread_t *t = (h264e_thread_t *)arg; + thread_name("h264"); + for(;;) { + event_wait(t->event_start, INFINITE); + if(t->terminated) + break; + t->callback(t->job); + event_set(t->event_done); + } + return 0; +} + +static void *h264e_thread_pool_init(int max_threads) { + int i; + h264e_thread_t *threads = (h264e_thread_t*) calloc(sizeof(h264e_thread_t), max_threads); + if(!threads) + return 0; + for(i = 0; i < max_threads; i++) { + h264e_thread_t *t = threads + i; + t->event_start = event_create(0, 0); + t->event_done = event_create(0, 0); + t->thread = thread_create(minih264_thread_func, t); + } + return threads; +} + +static void h264e_thread_pool_close(void *pool, int max_threads) { + int i; + h264e_thread_t *threads = (h264e_thread_t *)pool; + for(i = 0; i < max_threads; i++) { + h264e_thread_t *t = threads + i; + t->terminated = 1; + event_set(t->event_start); + thread_wait(t->thread); + thread_close(t->thread); + event_destroy(t->event_start); + event_destroy(t->event_done); + } + free(pool); +} + +static void h264e_thread_pool_run(void *pool, void (*callback)(void*), void *callback_job[], int njobs) { + h264e_thread_t *threads = (h264e_thread_t*)pool; + int i; + for(i = 0; i < njobs; i++) { + h264e_thread_t *t = threads + i; + t->callback = (void (*)(void *))callback; + t->job = callback_job[i]; + event_set(t->event_start); + } + for(i = 0; i < njobs; i++) { + h264e_thread_t *t = threads + i; + event_wait(t->event_done, INFINITE); + } +} + +typedef struct { + CHiPubNode pub; + + H264E_persist_t *enc; + H264E_scratch_t *scratch; + int threadpoolsize; + void *threadpool; + + bool firstFrame; +} Internal; + +int encodeh264_start(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + CHiValue *firstFrameVal = CHi_Crawl(&pub->sinks[0]); + + if(!firstFrameVal || firstFrameVal->type != CUTIHI_VAL_SAMPLE) { + pub->errors.active[0] = true; + strncpy(pub->errors.code[0], "frame not found", CUTIHI_ERR_SIZE); + pub->errors.sink[0] = 0; + + return 0; + } + + CHiImage *firstFrame = firstFrameVal->data.sample; + + if(firstFrame->width % 16 != 0 || firstFrame->height % 16 != 0) { + pub->errors.active[0] = true; + strncpy(pub->errors.code[0], "size mod16 not 0", CUTIHI_ERR_SIZE); + pub->errors.sink[0] = 0; + + return 0; + } + + H264E_create_param_t params; + memset(¶ms, 0, sizeof(params)); + params.enableNEON = 1; + params.num_layers = 1; + params.inter_layer_pred_flag = 0; + params.gop = 30; + params.width = firstFrame->width; + params.height = firstFrame->height; + params.max_long_term_reference_frames = 0; + params.fine_rate_control_flag = 0; + params.const_input_flag = 0; + params.vbv_size_bytes = 1024 * 1024; + params.temporal_denoise_flag = 1; + params.const_input_flag = 1; + + params.max_threads = n->threadpoolsize = 4; + params.token = n->threadpool = h264e_thread_pool_init(n->threadpoolsize); + params.run_func_in_thread = h264e_thread_pool_run; + + int sizeofPersist, sizeofScratch; + assert(!H264E_sizeof(¶ms, &sizeofPersist, &sizeofScratch)); + + n->enc = _mm_malloc(sizeofPersist, 64); + n->scratch = _mm_malloc(sizeofScratch, 64); + + assert(!H264E_init(n->enc, ¶ms)); + + n->firstFrame = true; + + return 1; +} + +int encodeh264_stop(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + h264e_thread_pool_close(n->threadpool, n->threadpoolsize); + + return 1; +} + +static bool contains_nal(uint8_t *data, size_t sz, int nalType) { + uint8_t *dataEnd = data + sz; + + int zeros = 0; + while(data != dataEnd) { + if(*data == 0) { + zeros++; + } else if((zeros == 2 || zeros == 3) && *data == 1 && data + 1 != dataEnd && (*(data + 1) & 0x1F) == nalType) { + return true; + } else { + zeros = 0; + } + data++; + } + + return false; +} + +static uint8_t *find_nal(uint8_t *data, uint8_t *dataEnd, int *nalType) { + int zeros = 0; + while(data != dataEnd) { + if(*data == 0) { + zeros++; + } else if((zeros == 2 || zeros == 3) && *data == 1 && data + 1 != dataEnd) { + *nalType = (*(data + 1)) & 0x1F; + return data - zeros; + } else { + zeros = 0; + } + data++; + } + return dataEnd; +} + +static size_t delete_nals(uint8_t *data, size_t sz, int targetNalType) { + uint8_t *dataStart = data; + uint8_t *dataEnd = data + sz; + + while(1) { + int nalType; + uint8_t *start = find_nal(data, dataEnd, &nalType); + + if(start == dataEnd) { + break; + } + + if(nalType == targetNalType) { + uint8_t *start2 = find_nal(start + 3, dataEnd, &nalType); + + memmove(start, start2, dataEnd - start2); + dataEnd -= start2 - start; + } + + data = start + 3; + } + + return dataEnd - dataStart; +} + +int encodeh264_perform(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + CHiImage *img = CHi_Crawl(&pub->sinks[0])->data.sample; + + size_t strideY = (img->width + 15) & ~15; + size_t strideU = (((img->width + 1) / 2) + 15) & ~15; + size_t strideV = (((img->width + 1) / 2) + 15) & ~15; + + uint8_t *outY = malloc(strideY * img->height + 15); + uint8_t *outU = malloc(strideU * ((img->height + 1) / 2) + 15); + uint8_t *outV = malloc(strideV * ((img->height + 1) / 2) + 15); + bgra64toycbcr(img->data8, img->stride, img->width, img->height, outY, outU, outV, strideY, strideU, strideV); + + H264E_run_param_t params; + memset(¶ms, 0, sizeof(params)); + params.encode_speed = H264E_SPEED_FASTEST; + params.frame_type = 0; + params.desired_frame_bytes = 10000; + params.qp_min = 10; + params.qp_max = 30; + params.desired_nalu_bytes = 0; + + size_t sizeofCodedData = 0; + uint8_t *codedData = NULL; + assert(!H264E_encode(n->enc, n->scratch, ¶ms, &(H264E_io_yuv_t) { + .yuv = {outY, outU, outV}, + .stride = {strideY, strideU, strideV} + }, &codedData, &sizeofCodedData)); + + free(outY); + free(outU); + free(outV); + + /* We only want SPS and PPS NALs once */ + if(!n->firstFrame) { + sizeofCodedData = delete_nals(codedData, sizeofCodedData, 7); + sizeofCodedData = delete_nals(codedData, sizeofCodedData, 8); + } + + CHiBSFrames *frames = malloc(sizeof(*frames) + sizeof(CHiBSFrame)); + frames->count = 1; + frames->data[0].timestamp = CHi_Time_Get(pub->ng) * 1000; + frames->data[0].sz = sizeofCodedData; + frames->data[0].ptr = codedData; + frames->data[0].flags = 0; + if(contains_nal(codedData, sizeofCodedData, 5)) { + frames->data[0].flags |= CUTIHI_BS_FLAG_KEY; + } + if(n->firstFrame) { + frames->data[0].flags |= CUTIHI_BS_SETUP_PACKET; + } + + pub->sources[0].type = CUTIHI_VAL_H264BS; + pub->sources[0].data.bitstream = frames; + + n->firstFrame = false; + + return 1; +} + +CUTIVIS CHiPubNode *CHi_EncodeH264() { + Internal *ret = calloc(1, sizeof(*ret)); + ret->pub.type = CUTIHI_T('CEnc', 'H264'); + ret->pub.Start = encodeh264_start; + ret->pub.Perform = encodeh264_perform; + ret->pub.Stop = encodeh264_stop; + ret->pub.sinks = calloc(sizeof(*ret->pub.sinks), ret->pub.sinkCount = 1); + ret->pub.sources = calloc(sizeof(*ret->pub.sources), ret->pub.sourceCount = 1); + return &ret->pub; +} diff --git a/hi/h264enc_sys.c b/hi/h264enc_sys.c new file mode 100644 index 0000000..5c7af61 --- /dev/null +++ b/hi/h264enc_sys.c @@ -0,0 +1,573 @@ +#ifndef __LGE_SYSTEM_H__ +#define __LGE_SYSTEM_H__ + +#ifdef _WIN32 + +#include +typedef DWORD THREAD_RET; +#define THRAPI __stdcall +#include + +#else //_WIN32 + +#include +#include + +typedef void * THREAD_RET; +typedef THREAD_RET (*PTHREAD_START_ROUTINE)(void *lpThreadParameter); +typedef PTHREAD_START_ROUTINE LPTHREAD_START_ROUTINE; + +typedef pthread_mutex_t CRITICAL_SECTION, *PCRITICAL_SECTION, *LPCRITICAL_SECTION; + +#define THRAPI + +#ifndef FALSE +#define FALSE 0 +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +typedef void * HANDLE; +#define MAXIMUM_WAIT_OBJECTS 64 +#define INFINITE (uint32_t)(-1) +#define WAIT_FAILED (-1) +#define WAIT_TIMEOUT 0x102 +#define WAIT_OBJECT 0 +#define WAIT_OBJECT_0 0 +#define WAIT_ABANDONED 128 +#define WAIT_ABANDONED_0 128 + +#endif //_WIN32 + +#ifdef __cplusplus +extern "C" { +#else +#ifndef bool +#define bool int +#endif +#endif + +HANDLE event_create(bool manualReset, bool initialState); +bool event_destroy(HANDLE event); + +#ifndef _WIN32 +#define SetEvent event_set +#define ResetEvent event_reset +#define WaitForSingleObject event_wait +#define WaitForMultipleObjects event_wait_multiple +bool event_set(HANDLE event); +bool event_reset(HANDLE event); +int event_wait(HANDLE event, uint32_t milliseconds); +int event_wait_multiple(uint32_t count, const HANDLE *events, bool waitAll, uint32_t milliseconds); +bool InitializeCriticalSection(LPCRITICAL_SECTION lpCriticalSection); +bool DeleteCriticalSection(LPCRITICAL_SECTION lpCriticalSection); +bool EnterCriticalSection(LPCRITICAL_SECTION lpCriticalSection); +bool LeaveCriticalSection(LPCRITICAL_SECTION lpCriticalSection); +#else +#define event_set SetEvent +#define event_reset ResetEvent +#define event_wait WaitForSingleObject +#define event_wait_multiple WaitForMultipleObjects +#endif + +HANDLE thread_create(LPTHREAD_START_ROUTINE lpStartAddress, void *lpParameter); +bool thread_close(HANDLE thread); +void *thread_wait(HANDLE thread); +bool thread_name(const char *name); +void thread_sleep(uint32_t milliseconds); + +uint64_t GetTime(); + +#ifdef __cplusplus +} +#endif + +#endif //__LGE_SYSTEM_H__ + +#ifndef _WIN32 + +#include +#include +#include +#include +#if defined(__linux) || defined(__linux__) +#include +#endif + +#define nullptr 0 + +typedef struct Event Event; + +typedef struct Event +{ + Event * volatile pMultipleCond; + pthread_mutex_t mutex; + pthread_cond_t cond; + volatile bool signaled; + bool manual_reset; +} Event; + +static bool InitEvent(Event *e) +{ +#if (defined(ANDROID) && !defined(__LP64__)) || defined(__APPLE__) + if (pthread_cond_init(&e->cond, NULL)) + return FALSE; +#else + pthread_condattr_t attr; + if (pthread_condattr_init(&attr)) + return FALSE; + if (pthread_condattr_setclock(&attr, CLOCK_MONOTONIC)) + { + pthread_condattr_destroy(&attr); + return FALSE; + } + if (pthread_cond_init(&e->cond, &attr)) + { + pthread_condattr_destroy(&attr); + return FALSE; + } + pthread_condattr_destroy(&attr); +#endif + if (pthread_mutex_init(&e->mutex, NULL)) + { + pthread_cond_destroy(&e->cond); + return FALSE; + } + e->pMultipleCond = NULL; + return TRUE; +} + +#ifdef __APPLE__ +#include +static inline uint64_t GetAbsTimeInNanoseconds() +{ + static mach_timebase_info_data_t g_timebase_info; + if (g_timebase_info.denom == 0) + mach_timebase_info(&g_timebase_info); + return mach_absolute_time()*g_timebase_info.numer/g_timebase_info.denom; +} +#endif + +static inline void GetAbsTime(struct timespec *ts, uint32_t timeout) +{ +#if defined(__APPLE__) + uint64_t cur_time = GetAbsTimeInNanoseconds(); + ts->tv_sec = cur_time/1000000000u + timeout/1000u; + ts->tv_nsec = (cur_time % 1000000000u) + (timeout % 1000u)*1000000u; +#else + clock_gettime(CLOCK_MONOTONIC, ts); + ts->tv_sec += timeout/1000u; + ts->tv_nsec += (timeout % 1000u)*1000000u; +#endif + if (ts->tv_nsec >= 1000000000) + { + ts->tv_nsec -= 1000000000; + ts->tv_sec++; + } +} + +static inline int CondTimedWait(pthread_cond_t *cond, pthread_mutex_t *mutex, const struct timespec *abstime) +{ +#if defined(ANDROID) && !defined(__LP64__) + return pthread_cond_timedwait_monotonic_np(cond, mutex, abstime); +#elif defined(__APPLE__) + struct timespec reltime; + uint64_t cur_time = GetAbsTimeInNanoseconds(); + reltime.tv_sec = abstime->tv_sec - cur_time/1000000000u; + reltime.tv_nsec = abstime->tv_nsec - (cur_time % 1000000000u); + if (reltime.tv_nsec < 0) + { + reltime.tv_nsec += 1000000000; + reltime.tv_sec--; + } + if ((reltime.tv_sec < 0) || ((reltime.tv_sec == 0) && (reltime.tv_nsec == 0))) + return ETIMEDOUT; + return pthread_cond_timedwait_relative_np(cond, mutex, &reltime); +#else + return pthread_cond_timedwait(cond, mutex, abstime); +#endif +} + +static bool WaitForEvent(Event *e, uint32_t timeout, bool *signaled) +{ + if (pthread_mutex_lock(&e->mutex)) + return FALSE; + + if (timeout == INFINITE) + { + while (!e->signaled) + pthread_cond_wait(&e->cond, &e->mutex); + } else if (timeout != 0) + { + struct timespec t; + GetAbsTime(&t, timeout); + while (!e->signaled) + { + if (CondTimedWait(&e->cond, &e->mutex, &t)) + break; + } + } + *signaled = e->signaled; + if (!e->manual_reset) + e->signaled = FALSE; + + if (pthread_mutex_unlock(&e->mutex)) + return FALSE; + return TRUE; +} + +static bool WaitForMultipleEvents(Event **e, uint32_t count, uint32_t timeout, bool waitAll, int *signaled_num) +{ + uint32_t i; +#define PTHR(func, num) for (i = num; i < count; i++)\ + if (func(&e[i]->mutex))\ + return FALSE; + PTHR(pthread_mutex_lock, 0); + + int sig_num = -1; + if (timeout == 0) + { +#define CHECK_SIGNALED \ + if (waitAll)\ + {\ + for (i = 0; i < count; i++)\ + if (!e[i]->signaled)\ + break;\ + if (i == count)\ + for (i = 0; i < count; i++)\ + {\ + if (sig_num < 0 && e[i]->signaled)\ + sig_num = (int)i;\ + if (!e[i]->manual_reset)\ + e[i]->signaled = FALSE;\ + }\ + } else\ + {\ + for (i = 0; i < count; i++)\ + if (e[i]->signaled)\ + {\ + sig_num = (int)i;\ + if (!e[i]->manual_reset)\ + e[i]->signaled = FALSE;\ + break;\ + }\ + } + CHECK_SIGNALED; + } else + if (timeout == INFINITE) + { +#define SET_MULTIPLE(val) for (i = 1; i < count; i++)\ + e[i]->pMultipleCond = val; + SET_MULTIPLE(e[0]); + for (;;) + { + CHECK_SIGNALED; + if (sig_num >= 0) + break; + PTHR(pthread_mutex_unlock, 1); + pthread_cond_wait(&e[0]->cond, &e[0]->mutex); + PTHR(pthread_mutex_lock, 1); + } + SET_MULTIPLE(0); + } else + { + SET_MULTIPLE(e[0]); + struct timespec t; + GetAbsTime(&t, timeout); + for (;;) + { + CHECK_SIGNALED; + if (sig_num >= 0) + break; + PTHR(pthread_mutex_unlock, 1); + int res = CondTimedWait(&e[0]->cond, &e[0]->mutex, &t); + PTHR(pthread_mutex_lock, 1); + if (res) + break; + } + SET_MULTIPLE(0); + } + PTHR(pthread_mutex_unlock, 0); + *signaled_num = sig_num; + return TRUE; +} + +HANDLE event_create(bool manualReset, bool initialState) +{ + Event *e = (Event *)malloc(sizeof(*e)); + if (!e) + return NULL; + if (!InitEvent(e)) + { + free(e); + return NULL; + } + e->manual_reset = manualReset; + e->signaled = initialState; + return (HANDLE)e; +} + +bool event_destroy(HANDLE event) +{ + Event *e = (Event *)event; + if (!e) + return FALSE; + if (pthread_cond_destroy(&e->cond)) + return FALSE; + if (pthread_mutex_destroy(&e->mutex)) + return FALSE; + free((void *)e); + return TRUE; +} + +bool event_set(HANDLE event) +{ + Event *e = (Event *)event; + if (pthread_mutex_lock(&e->mutex)) + return FALSE; + + Event *pMultipleCond = e->pMultipleCond; + e->signaled = TRUE; + if (pthread_cond_signal(&e->cond)) + return FALSE; + + if (pthread_mutex_unlock(&e->mutex)) + return FALSE; + + if (pMultipleCond && pMultipleCond != e) + { + if (pthread_mutex_lock(&pMultipleCond->mutex)) + return FALSE; + if (pthread_cond_signal(&pMultipleCond->cond)) + return FALSE; + if (pthread_mutex_unlock(&pMultipleCond->mutex)) + return FALSE; + } + return TRUE; +} + +bool event_reset(HANDLE event) +{ + Event *e = (Event *)event; + if (pthread_mutex_lock(&e->mutex)) + return FALSE; + e->signaled = FALSE; + if (pthread_mutex_unlock(&e->mutex)) + return FALSE; + return TRUE; +} + +int event_wait(HANDLE event, uint32_t milliseconds) +{ + bool signaled; + if (!WaitForEvent((Event *)event, milliseconds, &signaled)) + return WAIT_FAILED; + return signaled ? WAIT_OBJECT : WAIT_TIMEOUT; +} + +int event_wait_multiple(uint32_t count, const HANDLE *events, bool waitAll, uint32_t milliseconds) +{ + if (count == 1) + return event_wait(events[0], milliseconds); + int signaled_num = -1; + if (!WaitForMultipleEvents((Event **)events, count, milliseconds, waitAll, &signaled_num)) + return WAIT_FAILED; + return (signaled_num < 0) ? WAIT_TIMEOUT : (WAIT_OBJECT_0 + signaled_num); +} + +bool InitializeCriticalSection(LPCRITICAL_SECTION lpCriticalSection) +{ + pthread_mutexattr_t ma; + if (pthread_mutexattr_init(&ma)) + return FALSE; + if (pthread_mutexattr_settype(&ma, PTHREAD_MUTEX_RECURSIVE)) + { + pthread_mutexattr_destroy(&ma); + return FALSE; + } + if (pthread_mutex_init((pthread_mutex_t *)lpCriticalSection, &ma)) + { + pthread_mutexattr_destroy(&ma); + return FALSE; + } + if (pthread_mutexattr_destroy(&ma)) + return FALSE; + return TRUE; +} + +bool DeleteCriticalSection(LPCRITICAL_SECTION lpCriticalSection) +{ + if (pthread_mutex_destroy((pthread_mutex_t *)lpCriticalSection)) + return FALSE; + return TRUE; +} + +bool EnterCriticalSection(LPCRITICAL_SECTION lpCriticalSection) +{ + if (pthread_mutex_lock((pthread_mutex_t *)lpCriticalSection)) + return FALSE; + return TRUE; +} + +bool LeaveCriticalSection(LPCRITICAL_SECTION lpCriticalSection) +{ + if (pthread_mutex_unlock((pthread_mutex_t *)lpCriticalSection)) + return FALSE; + return TRUE; +} + +HANDLE thread_create(LPTHREAD_START_ROUTINE lpStartAddress, void *lpParameter) +{ + pthread_t *t = (pthread_t *)malloc(sizeof(*t)); + if (!t) + return nullptr; + if (pthread_create(t, 0, lpStartAddress, lpParameter)) + { + free(t); + return nullptr; + } + //if (lpThreadId) + // *lpThreadId = (uint32_t)*t; + return (HANDLE)t; +} + +bool thread_close(HANDLE thread) +{ + if (!thread) + return FALSE; + pthread_t *t = (pthread_t *)thread; + if (*t) + pthread_detach(*t); + free(t); + return TRUE; +} + +void *thread_wait(HANDLE thread) +{ + if (!thread) + return (void*)-1; + void *ret = 0; + pthread_t *t = (pthread_t *)thread; + if (!*t) + return ret; + int res = pthread_join(*t, &ret); + if (res) + return (void*)-1; +#if 0 + if (timeout == 0) + { + int res = pthread_tryjoin_np(*t, &ret); + if (res) + return FALSE; + } else + if (timeout == INFINITE) + { + int res = pthread_join(*t, &ret); + if (res) + return FALSE; + } else + { + struct timespec ts; + GetAbsTime(&ts, timeout); + int res = pthread_timedjoin_np(*t, &ret, &ts); + if (res) + return FALSE; + } +#endif + *t = 0; // thread joined - no need to detach + return ret; +} + +#else //_WIN32 + +HANDLE thread_create(LPTHREAD_START_ROUTINE lpStartAddress, void *lpParameter) +{ + DWORD tid; + return CreateThread(0, 0, lpStartAddress, lpParameter, 0, &tid); +} + +HANDLE event_create(bool manualReset, bool initialState) +{ + return CreateEvent(0, manualReset, initialState, 0); +} + +bool event_destroy(HANDLE event) +{ + CloseHandle(event); + return TRUE; +} + +bool thread_close(HANDLE thread) +{ + CloseHandle(thread); + return TRUE; +} + +void *thread_wait(HANDLE thread) +{ + if (WaitForSingleObject(thread, INFINITE) == WAIT_OBJECT_0) + { + DWORD ExitCode; + GetExitCodeThread(thread, &ExitCode); + return (void *)(intptr_t)ExitCode; + } + return (void *)(intptr_t)-1; +} + +#endif //_WIN32 + +bool thread_name(const char *name) +{ +#ifdef _WIN32 +#ifdef _MSC_VER + struct tagTHREADNAME_INFO + { + DWORD dwType; + LPCSTR szName; + DWORD dwThreadID; + DWORD dwFlags; + } info = { 0x1000, name, (DWORD)-1, 0 }; + __try + { + RaiseException(0x406D1388, 0, sizeof(info)/sizeof(ULONG_PTR), (ULONG_PTR*)&info); + } + __except(EXCEPTION_EXECUTE_HANDLER) + { + } +#endif + return TRUE; +#elif defined(__linux) || defined(__linux__) + return (0 == prctl(PR_SET_NAME, name, 0, 0, 0)); + //return (0 == pthread_setname_np(pthread_self(), name)); +#else // macos, ios + return (0 == pthread_setname_np(name)); +#endif +} + +void thread_sleep(uint32_t milliseconds) +{ +#ifdef _WIN32 + Sleep(milliseconds); +#else + usleep((useconds_t)milliseconds*1000); +#endif +} + +uint64_t GetTime() +{ + uint64_t time; +#ifdef _WIN32 + GetSystemTimeAsFileTime((FILETIME*)&time); + time = time/10 - 11644473600000000; +#elif defined(__APPLE__) + time = GetAbsTimeInNanoseconds() / 1000u; +#else + struct timespec ts; + // CLOCK_PROCESS_CPUTIME_ID CLOCK_THREAD_CPUTIME_ID + clock_gettime(CLOCK_MONOTONIC, &ts); + time = (uint64_t)ts.tv_sec * 1000000u + ts.tv_nsec / 1000u; +#endif + return time; +} diff --git a/hi/rtmp.c b/hi/rtmp.c new file mode 100644 index 0000000..fa9cfbf --- /dev/null +++ b/hi/rtmp.c @@ -0,0 +1,301 @@ +#include"node.h" + +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include"img.h" + +#define NALLENSZ 4 + +static size_t annexb_parse(const uint8_t *src, const uint8_t *srcEnd) { + int zeros = 0; + const uint8_t *src2; + for(src2 = src; src2 != srcEnd; src2++) { + if(*src2 == 0) { + zeros++; + } else if((zeros == 2 || zeros == 3) && *src2 == 1) { + src2 -= zeros; + break; + } else { + zeros = 0; + } + } + return src2 - src; +} + +static uint8_t *annexb_to_extradata(const uint8_t *src, const uint8_t *srcEnd, size_t *szRet, size_t *srcSzEnd) { + const uint8_t *sps = src; + while(*sps == 0) sps++; + assert(sps[0] == 1); + sps++; + + size_t szSps = annexb_parse(sps, srcEnd); + + const uint8_t *pps = sps + szSps; + while(*pps == 0) pps++; + assert(pps[0] == 1); + pps++; + + size_t szPps = annexb_parse(pps, srcEnd); + + uint8_t *ret = malloc(*szRet = (6 + 2 + szSps + 1 + 2 + szPps)); + ret[0] = 1; + ret[1] = sps[1]; + ret[2] = sps[2]; + ret[3] = sps[3]; + ret[4] = 0xFC | (NALLENSZ - 1); + + ret[5] = 0xE0 | 1; + ret[6] = szSps >> 8; + ret[7] = szSps & 0xFF; + + memcpy(&ret[8], sps, szSps); + + ret[8 + szSps + 0] = 1; + ret[8 + szSps + 1] = szPps >> 8; + ret[8 + szSps + 2] = szPps & 0xFF; + + memcpy(&ret[8 + szSps + 3], pps, szPps); + + *srcSzEnd = pps + szPps - src; + + return ret; +} + +static uint8_t *annexb_to_avcc(const uint8_t *src, size_t szSrc, size_t *szRet) { + size_t cap = 4096, sz = 0; + uint8_t *ret = malloc(cap); + + const uint8_t *srcEnd = src + szSrc; + while(src != srcEnd) { + assert(*src == 0); + while(*src == 0) { + src++; + } + assert(*src == 1); + src++; + + size_t nalSize = annexb_parse(src, srcEnd); + + size_t additionSz = NALLENSZ + nalSize; + + if(sz + additionSz > cap) { + ret = realloc(ret, cap = (sz + additionSz)); + } + + *(uint32_t*) &ret[sz] = htonl(nalSize); + memcpy(&ret[sz + NALLENSZ], src, nalSize); + + sz += additionSz; + src += nalSize; + } + + *szRet = sz; + + return ret; +} + +typedef struct { + CHiPubNode pub; + + RTMP *rtmp; + + RTMPPacket rtmppkt; +} Internal; + +static int streamrtmp_start(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + n->rtmp = RTMP_Alloc(); + + if(!n->rtmp) return 0; + + RTMP_Init(n->rtmp); + + RTMP_LogSetLevel(RTMP_LOGINFO); + RTMP_LogSetOutput(stderr); + + RTMP_SetupURL(n->rtmp, CHi_Crawl(&pub->sinks[2])->data.text); + RTMP_EnableWrite(n->rtmp); + + if(!RTMP_Connect(n->rtmp, NULL)) { + return 0; + } + + if(!RTMP_ConnectStream(n->rtmp, 0)) { + return 0; + } + + memset(&n->rtmppkt, 0, sizeof(n->rtmppkt)); + RTMPPacket_Alloc(&n->rtmppkt, 4096); + + return 1; +} + +static int streamrtmp_stop(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + RTMP_Free(n->rtmp); + n->rtmp = NULL; + + return 1; +} + +#define FLV_TAG_HEADER_SIZE 11 +#define FLV_VIDEO_HDR_SIZE 5 +#define FLV_AUDIO_HDR_SIZE 2 +#define FLV_PREV_TAG_SIZE_SIZE 4 + +static int do_video(Internal *n) { + if(!CHi_Crawl(&n->pub.sinks[0]) || !CHi_Crawl(&n->pub.sinks[0])->data.bitstream) { + return 1; + } + + CHiBSFrames *frames = CHi_Crawl(&n->pub.sinks[0])->data.bitstream; + + for(size_t fi = 0; fi < frames->count; fi++) { + CHiBSFrame *f = &frames->data[fi]; + + size_t avccSize; + uint8_t *avcc; + + if(f->flags & CUTIHI_BS_SETUP_PACKET) { + size_t annexbextradatasrcsize; + avcc = annexb_to_extradata(f->ptr, f->ptr + f->sz, &avccSize, &annexbextradatasrcsize); + } else { + avcc = annexb_to_avcc(f->ptr, f->sz, &avccSize); + } + + size_t dataSize = FLV_VIDEO_HDR_SIZE + avccSize; + size_t tagSize = FLV_TAG_HEADER_SIZE + dataSize; + size_t rtmpPacketSize = tagSize + FLV_PREV_TAG_SIZE_SIZE; + uint8_t *packet = malloc(rtmpPacketSize); + + // Tag + packet[0] = 9; // video + packet[1] = (dataSize >> 16) & 0xFF; + packet[2] = (dataSize >> 8) & 0xFF; + packet[3] = (dataSize >> 0) & 0xFF; + packet[4] = (f->timestamp >> 16) & 0xFF; + packet[5] = (f->timestamp >> 8) & 0xFF; + packet[6] = (f->timestamp >> 0) & 0xFF; + packet[7] = (f->timestamp >> 24) & 0xFF; + packet[8] = 0; + packet[9] = 0; + packet[10] = 0; + + // Video Header + packet[11] = (f->flags & CUTIHI_BS_FLAG_KEY) ? 0x17 : 0x27; + packet[12] = (f->flags & CUTIHI_BS_SETUP_PACKET) ? 0 : 1; + packet[13] = 0; + packet[14] = 0; + packet[15] = 0; + + memcpy(&packet[16], avcc, avccSize); + + packet[16 + avccSize + 0] = (tagSize >> 24) & 0xFF; + packet[16 + avccSize + 1] = (tagSize >> 16) & 0xFF; + packet[16 + avccSize + 2] = (tagSize >> 8) & 0xFF; + packet[16 + avccSize + 3] = (tagSize >> 0) & 0xFF; + + RTMP_Write(n->rtmp, packet, rtmpPacketSize); + + free(packet); + free(avcc); + } + + return 1; +} + +static int do_audio(Internal *n) { + if(!CHi_Crawl(&n->pub.sinks[1]) || !CHi_Crawl(&n->pub.sinks[1])->data.bitstream) { + return 1; + } + + CHiBSFrames *frames = CHi_Crawl(&n->pub.sinks[1])->data.bitstream; + + for(size_t fi = 0; fi < frames->count; fi++) { + CHiBSFrame *f = &frames->data[fi]; + + size_t avccSize = f->sz; + uint8_t *avcc = f->ptr; + + size_t dataSize = FLV_AUDIO_HDR_SIZE + avccSize; + size_t tagSize = FLV_TAG_HEADER_SIZE + dataSize; + size_t rtmpPacketSize = tagSize + FLV_PREV_TAG_SIZE_SIZE; + uint8_t *packet = malloc(rtmpPacketSize); + + // Tag + packet[0] = 8; // audio + packet[1] = (dataSize >> 16) & 0xFF; + packet[2] = (dataSize >> 8) & 0xFF; + packet[3] = (dataSize >> 0) & 0xFF; + packet[4] = (f->timestamp >> 16) & 0xFF; + packet[5] = (f->timestamp >> 8) & 0xFF; + packet[6] = (f->timestamp >> 0) & 0xFF; + packet[7] = (f->timestamp >> 24) & 0xFF; + packet[8] = 0; + packet[9] = 0; + packet[10] = 0; + + // Audio Header + packet[11] = (1 << 0) | (1 << 1) | (3 << 2) | (10 << 4); + packet[12] = (f->flags & CUTIHI_BS_SETUP_PACKET) ? 0 : 1; + + memcpy(&packet[13], avcc, avccSize); + + packet[13 + avccSize + 0] = (tagSize >> 24) & 0xFF; + packet[13 + avccSize + 1] = (tagSize >> 16) & 0xFF; + packet[13 + avccSize + 2] = (tagSize >> 8) & 0xFF; + packet[13 + avccSize + 3] = (tagSize >> 0) & 0xFF; + + RTMP_Write(n->rtmp, packet, rtmpPacketSize); + + free(packet); + } + + return 1; +} + +static int streamrtmp_perform(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + if(!do_video(n)) return 0; + + if(!do_audio(n)) return 0; + + int fd = RTMP_Socket(n->rtmp); + fd_set sockset; + struct timeval timeout = {}; + FD_ZERO(&sockset); + FD_SET(fd, &sockset); + int result = select(fd + 1, &sockset, NULL, NULL, &timeout); + if(result == 1 && FD_ISSET(fd, &sockset)) { + RTMP_ReadPacket(n->rtmp, &n->rtmppkt); + if(!RTMPPacket_IsReady(&n->rtmppkt)) { + RTMP_ClientPacket(n->rtmp, &n->rtmppkt); + } + } + + return 1; +} + +CUTIVIS CHiPubNode *CHi_StreamRTMP() { + Internal *ret = calloc(1, sizeof(*ret)); + ret->pub.type = CUTIHI_T('CStr', 'RTMP'); + ret->pub.Start = streamrtmp_start; + ret->pub.Perform = streamrtmp_perform; + ret->pub.Stop = streamrtmp_stop; + ret->pub.sinks = calloc(sizeof(*ret->pub.sinks), ret->pub.sinkCount = 3); + ret->pub.sources = calloc(sizeof(*ret->pub.sources), ret->pub.sourceCount = 0); + return &ret->pub; +} diff --git a/hi/vpxenc.c b/hi/vpxenc.c new file mode 100644 index 0000000..e42051f --- /dev/null +++ b/hi/vpxenc.c @@ -0,0 +1,182 @@ +#include"node.h" + +#include +#include + +#include + +#include"mode.h" + +#include"img.h" +#include + +#include + +#include + +#include"minitrace.h" + +#include"linearity.h" + +#include"yuv.h" + +typedef struct CHiEncodeVP9Node { + CHiPubNode pub; + + vpx_codec_ctx_t codec; + vpx_codec_enc_cfg_t cfg; + + enum { + WAITING, IN_PROGRESS + } state; + uint8_t *outY, *outU, *outV; + uint16_t strideY, strideU, strideV; + + vpx_codec_iface_t *iface; +} CHiEncodeVP9Node; + +static int encodevpx_perform(CHiPubNode *pub) { + CHiEncodeVP9Node *node = (void*) pub; + + MTR_BEGIN("CHi", "encodevp9_perform"); + + pub->sources[0].type = CUTIHI_VAL_VP9BS; + pub->sources[0].data.bitstream = NULL; + + if(node->state == WAITING) return 1; + + CHiImage *rgbIn = (CHiImage*) CHi_Crawl(&pub->sinks[0])->data.sample; + + bgra64toycbcr(rgbIn->data8, rgbIn->stride, rgbIn->width, rgbIn->height, node->outY, node->outU, node->outV, node->strideY, node->strideU, node->strideV); + + vpx_image_t vpxraw; + vpxraw.fmt = VPX_IMG_FMT_I420; + vpxraw.cs = VPX_CS_BT_709; + vpxraw.range = VPX_CR_STUDIO_RANGE; + vpxraw.bit_depth = 8; + vpxraw.w = vpxraw.d_w = node->cfg.g_w; + vpxraw.h = vpxraw.d_h = node->cfg.g_h; + vpxraw.r_w = vpxraw.r_h = 0; + vpxraw.x_chroma_shift = vpxraw.y_chroma_shift = 1; + vpxraw.img_data_owner = 0; + vpxraw.self_allocd = 0; + vpxraw.bps = 12; + vpxraw.stride[VPX_PLANE_Y] = node->strideY; + vpxraw.planes[VPX_PLANE_Y] = node->outY; + vpxraw.stride[VPX_PLANE_U] = node->strideU; + vpxraw.planes[VPX_PLANE_U] = node->outU; + vpxraw.stride[VPX_PLANE_V] = node->strideV; + vpxraw.planes[VPX_PLANE_V] = node->outV; + + vpx_codec_encode(&node->codec, &vpxraw, CHi_Time_Get(pub->ng) * 1000.f, 1, 0, VPX_DL_REALTIME); + + CHiBSFrames *ret = malloc(sizeof(CHiBSFrames)); + ret->count = 0; + + vpx_codec_iter_t iter = NULL; + const vpx_codec_cx_pkt_t *pkt; + while((pkt = vpx_codec_get_cx_data(&node->codec, &iter)) != NULL) { + if(pkt->kind == VPX_CODEC_CX_FRAME_PKT) { + ret = (CHiBSFrames*) realloc(ret, sizeof(CHiBSFrames) + sizeof(CHiBSFrame) * (ret->count + 1)); + ret->data[ret->count].timestamp = pkt->data.frame.pts; + ret->data[ret->count].sz = pkt->data.frame.sz; + ret->data[ret->count].flags = pkt->data.frame.flags & VPX_FRAME_IS_KEY; + ret->data[ret->count].ptr = malloc(ret->data[ret->count].sz); + memcpy(ret->data[ret->count].ptr, pkt->data.frame.buf, ret->data[ret->count].sz); + ret->count++; + } + } + + pub->sources[0].data.bitstream = ret; + + MTR_END("CHi", "encodevp9_perform"); + + return 1; +} + +static void encodevpx_destroy(CHiPubNode *pub) { + CHiEncodeVP9Node *node = (void*) pub; + + free(node); +} + +static int encodevpx_start(CHiPubNode *pubn) { + CHiEncodeVP9Node *node = (void*) pubn; + + node->state = IN_PROGRESS; + + CHiImage *firstFrame = (CHiImage*) CHi_Crawl(&pubn->sinks[0])->data.sample; + + vpx_codec_enc_config_default(node->iface, &node->cfg, 0); + node->cfg.g_w = firstFrame->width; + node->cfg.g_h = firstFrame->height; + node->cfg.g_timebase.num = 1; + node->cfg.g_timebase.den = 30; + node->cfg.g_lag_in_frames = 0; + node->cfg.g_threads = 8; + node->cfg.kf_mode = VPX_KF_AUTO; + node->cfg.kf_max_dist = 300; + node->cfg.rc_end_usage = VPX_VBR; + node->cfg.rc_target_bitrate = 512; + node->cfg.rc_min_quantizer = 4; + node->cfg.rc_max_quantizer = 48; + + vpx_codec_enc_init(&node->codec, node->iface, &node->cfg, 0); + vpx_codec_control(&node->codec, VP8E_SET_CPUUSED, 8); + vpx_codec_control(&node->codec, VP9E_SET_ROW_MT, 1); + vpx_codec_control(&node->codec, VP9E_SET_TILE_COLUMNS, 2); + vpx_codec_control(&node->codec, VP9E_SET_TILE_ROWS, 1); + vpx_codec_control(&node->codec, VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN); + + node->strideY = (node->cfg.g_w + 64) & ~63; + node->strideU = (node->cfg.g_w / 2 + 64) & ~63; + node->strideV = (node->cfg.g_w / 2 + 64) & ~63; + + node->outY = (uint8_t*) _mm_malloc(node->strideY * node->cfg.g_h, 16); + node->outU = (uint8_t*) _mm_malloc(node->strideU * node->cfg.g_h / 2, 16); + node->outV = (uint8_t*) _mm_malloc(node->strideV * node->cfg.g_h / 2, 16); + + return 1; +} + +static int encodevpx_stop(CHiPubNode *pubn) { + CHiEncodeVP9Node *node = (void*) pubn; + + node->state = WAITING; + + _mm_free(node->outY); + _mm_free(node->outU); + _mm_free(node->outV); + + vpx_codec_destroy(&node->codec); + + return 1; +} + +CUTIVIS CHiPubNode *CHi_EncodeVP8() { + CHiEncodeVP9Node *n = calloc(1, sizeof(*n)); + n->pub.type = CUTIHI_T('CEnc','GVP8'); + n->pub.Start = encodevpx_start; + n->pub.Perform = encodevpx_perform; + n->pub.Stop = encodevpx_stop; + n->pub.Destroy = encodevpx_destroy; + n->pub.sinks = (CHiValue*) calloc(sizeof(*n->pub.sinks), n->pub.sinkCount = 1); + n->pub.sources = (CHiValue*) calloc(sizeof(*n->pub.sources), n->pub.sourceCount = 1); + n->state = WAITING; + n->iface = vpx_codec_vp8_cx(); + return &n->pub; +} + +CUTIVIS CHiPubNode *CHi_EncodeVP9() { + CHiEncodeVP9Node *n = calloc(1, sizeof(*n)); + n->pub.type = CUTIHI_T('CEnc','GVP9'); + n->pub.Start = encodevpx_start; + n->pub.Perform = encodevpx_perform; + n->pub.Stop = encodevpx_stop; + n->pub.Destroy = encodevpx_destroy; + n->pub.sinks = (CHiValue*) calloc(sizeof(*n->pub.sinks), n->pub.sinkCount = 1); + n->pub.sources = (CHiValue*) calloc(sizeof(*n->pub.sources), n->pub.sourceCount = 1); + n->state = WAITING; + n->iface = vpx_codec_vp9_cx(); + return &n->pub; +} diff --git a/hi/x264enc.c b/hi/x264enc.c new file mode 100644 index 0000000..a92640e --- /dev/null +++ b/hi/x264enc.c @@ -0,0 +1,259 @@ +#include"node.h" + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include"mode.h" +#include"img.h" + +typedef struct { + CHiPubNode pub; + + char fifoIn[256]; + + int fdOut; + int fdIn; + + pthread_t iothread; + pthread_mutex_t iomutex; + + size_t outQueueLen; + uint8_t *outQueue; + + size_t inQueueLen; + size_t inQueueCap; + uint8_t *inQueue; + + size_t waitingForNum; +} Internal; + +void *iothread_func(void *ud) { + Internal *n = ud; + + while(1) { + pthread_mutex_lock(&n->iomutex); + + while(1) { + uint8_t toread[4096]; + ssize_t readcount = read(n->fdIn, toread, sizeof(toread)); + + if(readcount == -1) { + if(errno != EAGAIN && errno != EWOULDBLOCK) { + goto error; + } else { + break; + } + } + + if(n->inQueueLen + readcount > n->inQueueCap) { + n->inQueue = realloc(n->inQueue, n->inQueueCap += 8192); + } + + memcpy(n->inQueue + n->inQueueLen, toread, readcount); + n->inQueueLen += readcount; + + if(readcount < sizeof(toread)) { + break; + } + } + + //printf("inqueue %lu outqueue %lu\n\n", n->inQueueLen, n->outQueueLen); + + while(n->outQueueLen > 0) { + ssize_t wrotecount = write(n->fdOut, n->outQueue, n->outQueueLen); + + if(wrotecount == -1) { + if(errno != EAGAIN && errno != EWOULDBLOCK) { + goto error; + } else { + break; + } + } + + memmove(n->outQueue, n->outQueue + wrotecount, n->outQueueLen - wrotecount); + n->outQueueLen -= wrotecount; + + if(wrotecount < n->outQueueLen) { + break; + } + } + + pthread_mutex_unlock(&n->iomutex); + } + + return NULL; +error: + pthread_mutex_unlock(&n->iomutex); + return NULL; +} + +int encodeh264_start(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + CHiImage *firstFrame = CHi_Crawl(&pub->sinks[0])->data.sample; + +#ifndef _WIN32 + uint8_t randoms[4]; + getrandom(randoms, sizeof(randoms), 0); + + snprintf(n->fifoIn, sizeof(n->fifoIn), "/tmp/cuticlex264_%02X%02X%02X%02X.264", randoms[0], randoms[1], randoms[2], randoms[3]); + + char res[256]; + snprintf(res, sizeof(res), "%ix%i", firstFrame->width, firstFrame->height); + + assert(mkfifo(n->fifoIn, 0666) == 0); + + int fd[2]; + pipe(fd); + + if(fork() == 0) { + close(fd[1]); + dup2(fd[0], STDIN_FILENO); + close(fd[0]); + + if(execlp("x264", "x264", "--profile", "main", "--tune", "zerolatency", "--preset", "ultrafast", "--input-csp", "bgra", "--input-depth", "16", "--input-res", res, "--aud", "--demuxer", "raw", "--fps", "30", "--qp", "50", "-o", n->fifoIn, "-", NULL) == -1) { + abort(); + } + } else { + close(fd[0]); + n->fdOut = fd[1]; + } + + n->fdIn = open(n->fifoIn, O_RDONLY | O_NONBLOCK, 0); + + fcntl(n->fdOut, F_SETFL, fcntl(n->fdOut, F_GETFL) | O_NONBLOCK); +#endif + + n->outQueueLen = 0; + n->outQueue = NULL; + + n->inQueueLen = 0; + n->inQueueCap = 8192; + n->inQueue = malloc(sizeof(*n->inQueue) * n->inQueueCap); + + n->waitingForNum = 0; + + assert(pthread_create(&n->iothread, NULL, iothread_func, n) == 0); + + return 1; +} + +int encodeh264_stop(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + close(n->fdOut); + close(n->fdIn); + + unlink(n->fifoIn); + + return 1; +} + +static uint8_t *find_aud(uint8_t *bs, size_t len, int *sz) { + if(len <= 3) { + return NULL; + } + + for(size_t i = 3; i < len; i++) { + if(bs[i] == 9 && bs[i - 1] == 1 && bs[i - 2] == 0 && bs[i - 3] == 0) { + if(i >= 4 && bs[i - 4] == 0) { + *sz = 4; + return &bs[i - 4]; + } else { + *sz = 3; + return &bs[i - 3]; + } + } + } + + return NULL; +} + +int encodeh264_perform(CHiPubNode *pub) { + Internal *n = (Internal*) pub; + + pthread_mutex_lock(&n->iomutex); + { + //if(CHi_GetMode() == CUTIHI_MODE_OFFLINE || n->waitingForNum == 0) { + CHiImage *frame = CHi_Crawl(&pub->sinks[0])->data.sample; + + size_t frameSize = frame->width * 8 * frame->height; + + n->outQueue = realloc(n->outQueue, n->outQueueLen + frameSize); + CHi_Restride(frame->data8, n->outQueue + n->outQueueLen, frame->stride, frame->width * 8, frame->height); + n->outQueueLen += frameSize; + + // n->waitingForNum++; + //} + } + + CHiBSFrames *frames = calloc(1, sizeof(*frames)); + + while(1) { + int aud0sz; + uint8_t *aud0 = find_aud(n->inQueue, n->inQueueLen, &aud0sz); + + if(aud0 == NULL) { + break; + } + + // First AUD must always be at the start because it's the delimiter + assert(aud0 == n->inQueue); + + int aud1sz; + uint8_t *aud1 = find_aud(n->inQueue + aud0sz, n->inQueueLen - aud0sz, &aud1sz); + + // Second AUD must exist, otherwise we don't actually know if the packet's over + if(aud1 == NULL) { + break; + } + + size_t framesz = aud1 - aud0; + + static int nextts = 0; + nextts++; + + frames = realloc(frames, sizeof(*frames) + sizeof(CHiBSFrame) * (frames->count + 1)); + //frames->data[frames->count].timestamp = CHi_Time_Get(pub->ng) * 1000; + frames->data[frames->count].timestamp = nextts * 33; + frames->data[frames->count].sz = framesz - 6; + memcpy(frames->data[frames->count].ptr = malloc(framesz - 6), aud0 + 6, framesz - 6); + frames->data[frames->count].flags = 0; + frames->count++; + + memmove(n->inQueue, n->inQueue + framesz, n->inQueueLen - framesz); + n->inQueueLen -= framesz; + + //if(n->waitingForNum > 0) { + // n->waitingForNum--; + //} + } + + pthread_mutex_unlock(&n->iomutex); + + pub->sources[0].type = CUTIHI_VAL_H264BS; + pub->sources[0].data.bitstream = frames; + + return 1; +} + +CUTIVIS CHiPubNode *CHi_EncodeH264() { + Internal *ret = calloc(1, sizeof(*ret)); + ret->pub.type = CUTIHI_T('CEnc', 'H264'); + ret->pub.Start = encodeh264_start; + ret->pub.Perform = encodeh264_perform; + ret->pub.Stop = encodeh264_stop; + ret->pub.sinks = calloc(sizeof(*ret->pub.sinks), ret->pub.sinkCount = 1); + ret->pub.sources = calloc(sizeof(*ret->pub.sources), ret->pub.sourceCount = 1); + return &ret->pub; +} diff --git a/hi/yuv.h b/hi/yuv.h new file mode 100644 index 0000000..c31aef8 --- /dev/null +++ b/hi/yuv.h @@ -0,0 +1,127 @@ +#pragma once + +#include + +#include"linearity.h" + +// strides must be 16-byte aligned +static inline void bgra64toycbcr(uint8_t *bgra64, size_t bgra64stride, size_t imgW, size_t imgH, uint8_t *outY, uint8_t *outU, uint8_t *outV, size_t strideY, size_t strideU, size_t strideV) { + #pragma omp parallel for simd + for(size_t y = 0; y < imgH; y += 2) { + for(size_t x = 0; x < imgW; x += 16) { + __m128i rgb, partY, partU, partV, dotY, dotU, dotV; + + __m128i wipY0 = _mm_setzero_si128(); + __m128i wipY1 = _mm_setzero_si128(); + __m128i wipU = _mm_setzero_si128(); + __m128i wipV = _mm_setzero_si128(); + + __m128i tempU = _mm_setzero_si128(); + __m128i tempV = _mm_setzero_si128(); + +#define DO_DAH_DOO_DOO(LoOrHi, shufY, shufUV) \ + /* Process top two */\ + rgb = _mm_srli_epi16(apply_gamma_epi16(line0, _mm_set1_ps(1 / 2.2f)), 8); \ + /* Start matrix multiplication (BT.709 + full->studio range) */\ + partY = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 47, 157, 16, 0, 47, 157, 16));\ + partU = _mm_mullo_epi16(rgb, _mm_set_epi16(0, -25, -85, 110, 0, -25, -85, 110));\ + partV = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 110, -100, -10, 0, 110, -100, -10));\ + /* Finish mat-mul with dot products */\ + dotY = _mm_madd_epi16(partY, _mm_set1_epi16(1));\ + dotY = _mm_hadd_epi32(dotY, _mm_setzero_si128());\ + dotU = _mm_madd_epi16(partU, _mm_set1_epi16(1));\ + dotU = _mm_hadd_epi32(dotU, _mm_setzero_si128());\ + dotV = _mm_madd_epi16(partV, _mm_set1_epi16(1));\ + dotV = _mm_hadd_epi32(dotV, _mm_setzero_si128());\ + /* Insert Ys */\ + wipY0 = _mm_or_si128(wipY0, _mm_shuffle_epi8(dotY, shufY));\ + /* Save top UV */\ + tempU = dotU;\ + tempV = dotV;\ + \ + /* Process bottom two */\ + rgb = _mm_srli_epi16(apply_gamma_epi16(line1, _mm_set1_ps(1 / 2.2f)), 8); \ + /* Start matrix multiplication (BT.709 + full->studio range) */\ + partY = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 47, 157, 16, 0, 47, 157, 16));\ + partU = _mm_mullo_epi16(rgb, _mm_set_epi16(0, -25, -85, 110, 0, -25, -85, 110));\ + partV = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 110, -100, -10, 0, 110, -100, -10));\ + /* Finish mat-mul with dot products */\ + dotY = _mm_madd_epi16(partY, _mm_set1_epi16(1));\ + dotY = _mm_hadd_epi32(dotY, _mm_setzero_si128());\ + dotU = _mm_madd_epi16(partU, _mm_set1_epi16(1));\ + dotU = _mm_hadd_epi32(dotU, _mm_setzero_si128());\ + dotV = _mm_madd_epi16(partV, _mm_set1_epi16(1));\ + dotV = _mm_hadd_epi32(dotV, _mm_setzero_si128());\ + /* Insert Ys */\ + wipY1 = _mm_or_si128(wipY1, _mm_shuffle_epi8(dotY, shufY));\ + /* Save bottom UVs */\ + tempU = _mm_hadd_epi32(_mm_add_epi32(tempU, dotU), _mm_setzero_si128());\ + tempV = _mm_hadd_epi32(_mm_add_epi32(tempV, dotV), _mm_setzero_si128());\ + \ + /* Insert UVs */\ + wipU = _mm_or_si128(wipU, _mm_shuffle_epi8(_mm_srli_epi32(tempU, 2), shufUV));\ + wipV = _mm_or_si128(wipV, _mm_shuffle_epi8(_mm_srli_epi32(tempV, 2), shufUV)); + + __m128i line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 0) * 8)); // Load two pixels + __m128i line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 0) * 8)); // Load two pixels + + DO_DAH_DOO_DOO(_mm_unpacklo_epi8, + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1), + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1)); + + line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 2) * 8)); // Load two pixels + line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 2) * 8)); // Load two pixels + + DO_DAH_DOO_DOO(_mm_unpacklo_epi8, + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128), + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128)); + + line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 4) * 8)); // Load two pixels + line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 4) * 8)); // Load two pixels + + DO_DAH_DOO_DOO(_mm_unpacklo_epi8, + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128), + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128)); + + line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 6) * 8)); // Load two pixels + line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 6) * 8)); // Load two pixels + + DO_DAH_DOO_DOO(_mm_unpacklo_epi8, + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128), + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128)); + + line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 8) * 8)); // Load two pixels + line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 8) * 8)); // Load two pixels + + DO_DAH_DOO_DOO(_mm_unpacklo_epi8, + _mm_set_epi8(-128, -128, -128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128), + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128)); + + line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 10) * 8)); // Load two pixels + line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 10) * 8)); // Load two pixels + + DO_DAH_DOO_DOO(_mm_unpacklo_epi8, + _mm_set_epi8(-128, -128, -128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128), + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128)); + + line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 12) * 8)); // Load two pixels + line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 12) * 8)); // Load two pixels + + DO_DAH_DOO_DOO(_mm_unpacklo_epi8, + _mm_set_epi8(-128, -128, 5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128), + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128, -128)); + + line0 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 0) * bgra64stride + (x + 14) * 8)); // Load two pixels + line1 = _mm_load_si128((__m128i*) ((uintptr_t) bgra64 + (y + 1) * bgra64stride + (x + 14) * 8)); // Load two pixels + + DO_DAH_DOO_DOO(_mm_unpacklo_epi8, + _mm_set_epi8( 5, 1, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128, -128), + _mm_set_epi8(-128, -128, -128, -128, -128, -128, -128, -128, 1, -128, -128, -128, -128, -128, -128, -128)); + + _mm_stream_si128((__m128i*) &outY[strideY * (y + 0) + x], _mm_add_epi8(_mm_set1_epi8(16), wipY0)); + _mm_stream_si128((__m128i*) &outY[strideY * (y + 1) + x], _mm_add_epi8(_mm_set1_epi8(16), wipY1)); + _mm_storeu_si128((__m128i*) &outU[strideU * (y / 2) + x / 2], _mm_add_epi8(wipU, _mm_set1_epi8(128))); + _mm_storeu_si128((__m128i*) &outV[strideV * (y / 2) + x / 2], _mm_add_epi8(wipV, _mm_set1_epi8(128))); + } + } +}