#include"node.h"

// NOTE(review): the names of the angle-bracket headers were missing from the
// reviewed text. The system/library headers below are reconstructed from the
// symbols this file uses; the libvpx and libwebm include paths in particular
// depend on how those libraries are installed -- confirm against the build.
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include"mode.h"
#include"img.h"

#include <new>
#include <queue>
#include <tmmintrin.h>
#include <vpx/vpx_encoder.h>
#include <vpx/vp8cx.h>
#include <mkvmuxer/mkvmuxer.h>
#include <mkvmuxer/mkvwriter.h>

#include"linearity.h"

/*
 * State of a VP8/VP9 encoder node.
 *
 * `pub` is the node header handed to the rest of the graph; the functions in
 * this file recover the enclosing struct from it with offsetof().
 */
struct CHiEncodeVP9Node {
    vpx_codec_ctx_t codec;      // libvpx encoder context, valid between Start and Stop
    vpx_codec_enc_cfg_t cfg;    // encoder configuration filled in by Start

    enum {
        WAITING, IN_PROGRESS
    } state;                    // IN_PROGRESS only between Start and Stop

    uint8_t *outY, *outU, *outV;          // I420 planes fed to the encoder
    uint16_t strideY, strideU, strideV;   // byte strides of those planes

    vpx_codec_iface_t *iface;   // vpx_codec_vp8_cx() or vpx_codec_vp9_cx()

    CHiPubNode pub;
};

/*
 * Perform callback of the encoder node.
 *
 * Converts the image on sink 0 to 8-bit I420, encodes it, and publishes the
 * resulting packets on source 0 as a freshly malloc'ed CHiBSFrames (each
 * frame's `ptr` is malloc'ed as well). While the node is WAITING, source 0 is
 * set to a NULL bitstream. Always returns 1.
 *
 * NOTE(review): source 0 is tagged CUTIHI_VAL_VP9BS even when the node was
 * created by CHi_EncodeVP8(); confirm whether that is intended.
 */
static int encodevp9_perform(CHiPubNode *pub) {
    CHiEncodeVP9Node *node = (CHiEncodeVP9Node*) ((uintptr_t) pub - offsetof(CHiEncodeVP9Node, pub));

    pub->sources[0].type = CUTIHI_VAL_VP9BS;
    pub->sources[0].data.bitstream = NULL;

    if(node->state == CHiEncodeVP9Node::WAITING) return 1;

    CHiImage *rgbIn = (CHiImage*) CHi_Crawl(&pub->sinks[0])->data.sample;

    // A shuffle-control byte with its top bit set makes _mm_shuffle_epi8 write zero.
    enum { Z = -128 };

    // Each iteration converts a 16x2 block of input pixels into 16+16 luma
    // bytes and 8+8 chroma bytes. Input pixels are read as 8 bytes each
    // (presumably four 16-bit channels in B,G,R,A order, judging by the
    // coefficient layout below -- TODO confirm against CHiImage).
    #pragma omp parallel for simd
    for(size_t y = 0; y < node->cfg.g_h; y += 2) {
        for(size_t x = 0; x < node->cfg.g_w; x += 16) {
            __m128i rgb, partY, partU, partV, dotY, dotU, dotV;

            __m128i wipY0 = _mm_setzero_si128();
            __m128i wipY1 = _mm_setzero_si128();
            __m128i wipU = _mm_setzero_si128();
            __m128i wipV = _mm_setzero_si128();

            __m128i tempU = _mm_setzero_si128();
            __m128i tempV = _mm_setzero_si128();

            // Converts one 2x2 pixel block: `line0`/`line1` each hold two
            // horizontally adjacent pixels of the top/bottom row. Two luma
            // bytes per row are OR-ed into wipY0/wipY1 at the positions given
            // by `shufY`; the four chroma sums are averaged (>> 2) and one
            // byte each is OR-ed into wipU/wipV at the position given by
            // `shufUV`. The shuffles pick byte 1 of each 32-bit sum, i.e. the
            // sum divided by 256. `LoOrHi` is unused.
            #define DO_DAH_DOO_DOO(LoOrHi, shufY, shufUV) \
                /* Process top two */\
                rgb = _mm_srli_epi16(apply_gamma_epi16(line0, _mm_set1_ps(1 / 2.2f)), 8); \
                /* Start matrix multiplication (BT.709 + full->studio range) */\
                partY = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 47, 157, 16, 0, 47, 157, 16));\
                partU = _mm_mullo_epi16(rgb, _mm_set_epi16(0, -25, -85, 110, 0, -25, -85, 110));\
                partV = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 110, -100, -10, 0, 110, -100, -10));\
                /* Finish mat-mul with dot products */\
                dotY = _mm_madd_epi16(partY, _mm_set1_epi16(1));\
                dotY = _mm_hadd_epi32(dotY, _mm_setzero_si128());\
                dotU = _mm_madd_epi16(partU, _mm_set1_epi16(1));\
                dotU = _mm_hadd_epi32(dotU, _mm_setzero_si128());\
                dotV = _mm_madd_epi16(partV, _mm_set1_epi16(1));\
                dotV = _mm_hadd_epi32(dotV, _mm_setzero_si128());\
                /* Insert Ys */\
                wipY0 = _mm_or_si128(wipY0, _mm_shuffle_epi8(dotY, shufY));\
                /* Save top UV */\
                tempU = dotU;\
                tempV = dotV;\
                \
                /* Process bottom two */\
                rgb = _mm_srli_epi16(apply_gamma_epi16(line1, _mm_set1_ps(1 / 2.2f)), 8); \
                /* Start matrix multiplication (BT.709 + full->studio range) */\
                partY = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 47, 157, 16, 0, 47, 157, 16));\
                partU = _mm_mullo_epi16(rgb, _mm_set_epi16(0, -25, -85, 110, 0, -25, -85, 110));\
                partV = _mm_mullo_epi16(rgb, _mm_set_epi16(0, 110, -100, -10, 0, 110, -100, -10));\
                /* Finish mat-mul with dot products */\
                dotY = _mm_madd_epi16(partY, _mm_set1_epi16(1));\
                dotY = _mm_hadd_epi32(dotY, _mm_setzero_si128());\
                dotU = _mm_madd_epi16(partU, _mm_set1_epi16(1));\
                dotU = _mm_hadd_epi32(dotU, _mm_setzero_si128());\
                dotV = _mm_madd_epi16(partV, _mm_set1_epi16(1));\
                dotV = _mm_hadd_epi32(dotV, _mm_setzero_si128());\
                /* Insert Ys */\
                wipY1 = _mm_or_si128(wipY1, _mm_shuffle_epi8(dotY, shufY));\
                /* Save bottom UVs */\
                tempU = _mm_hadd_epi32(_mm_add_epi32(tempU, dotU), _mm_setzero_si128());\
                tempV = _mm_hadd_epi32(_mm_add_epi32(tempV, dotV), _mm_setzero_si128());\
                \
                /* Insert UVs */\
                wipU = _mm_or_si128(wipU, _mm_shuffle_epi8(_mm_srli_epi32(tempU, 2), shufUV));\
                wipV = _mm_or_si128(wipV, _mm_shuffle_epi8(_mm_srli_epi32(tempV, 2), shufUV));

            // Byte addresses of pixel x in the top and bottom input rows.
            // Every 16-byte load below fetches two pixels.
            const uintptr_t row0 = (uintptr_t) rgbIn->data16 + (y + 0) * rgbIn->stride + x * 8;
            const uintptr_t row1 = (uintptr_t) rgbIn->data16 + (y + 1) * rgbIn->stride + x * 8;

            // Pixels x+0, x+1
            __m128i line0 = _mm_load_si128((__m128i*) (row0 + 0 * 16));
            __m128i line1 = _mm_load_si128((__m128i*) (row1 + 0 * 16));
            DO_DAH_DOO_DOO(_mm_unpacklo_epi8,
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 5, 1),
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1));

            // Pixels x+2, x+3
            line0 = _mm_load_si128((__m128i*) (row0 + 1 * 16));
            line1 = _mm_load_si128((__m128i*) (row1 + 1 * 16));
            DO_DAH_DOO_DOO(_mm_unpacklo_epi8,
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 5, 1, Z, Z),
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, Z));

            // Pixels x+4, x+5
            line0 = _mm_load_si128((__m128i*) (row0 + 2 * 16));
            line1 = _mm_load_si128((__m128i*) (row1 + 2 * 16));
            DO_DAH_DOO_DOO(_mm_unpacklo_epi8,
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 5, 1, Z, Z, Z, Z),
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, Z, Z));

            // Pixels x+6, x+7
            line0 = _mm_load_si128((__m128i*) (row0 + 3 * 16));
            line1 = _mm_load_si128((__m128i*) (row1 + 3 * 16));
            DO_DAH_DOO_DOO(_mm_unpacklo_epi8,
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, 5, 1, Z, Z, Z, Z, Z, Z),
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, Z, Z, Z));

            // Pixels x+8, x+9
            line0 = _mm_load_si128((__m128i*) (row0 + 4 * 16));
            line1 = _mm_load_si128((__m128i*) (row1 + 4 * 16));
            DO_DAH_DOO_DOO(_mm_unpacklo_epi8,
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, 5, 1, Z, Z, Z, Z, Z, Z, Z, Z),
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, Z, Z, Z, Z));

            // Pixels x+10, x+11
            line0 = _mm_load_si128((__m128i*) (row0 + 5 * 16));
            line1 = _mm_load_si128((__m128i*) (row1 + 5 * 16));
            DO_DAH_DOO_DOO(_mm_unpacklo_epi8,
                _mm_set_epi8(Z, Z, Z, Z, 5, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z),
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, Z, Z, Z, Z, Z));

            // Pixels x+12, x+13
            line0 = _mm_load_si128((__m128i*) (row0 + 6 * 16));
            line1 = _mm_load_si128((__m128i*) (row1 + 6 * 16));
            DO_DAH_DOO_DOO(_mm_unpacklo_epi8,
                _mm_set_epi8(Z, Z, 5, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z),
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, Z, 1, Z, Z, Z, Z, Z, Z));

            // Pixels x+14, x+15
            line0 = _mm_load_si128((__m128i*) (row0 + 7 * 16));
            line1 = _mm_load_si128((__m128i*) (row1 + 7 * 16));
            DO_DAH_DOO_DOO(_mm_unpacklo_epi8,
                _mm_set_epi8(5, 1, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z, Z),
                _mm_set_epi8(Z, Z, Z, Z, Z, Z, Z, Z, 1, Z, Z, Z, Z, Z, Z, Z));

            #undef DO_DAH_DOO_DOO

            // Luma: add the studio-range offset of 16 and write 16 bytes per row.
            _mm_stream_si128((__m128i*) &node->outY[node->strideY * (y + 0) + x], _mm_add_epi8(_mm_set1_epi8(16), wipY0));
            _mm_stream_si128((__m128i*) &node->outY[node->strideY * (y + 1) + x], _mm_add_epi8(_mm_set1_epi8(16), wipY1));

            // Chroma: only the low 8 bytes carry samples, so store exactly 8.
            // A full 16-byte store would spill into the next row (or past the
            // buffer on the last row) for widths whose chroma row ends close
            // to the stride.
            _mm_storel_epi64((__m128i*) &node->outU[node->strideU * (y / 2) + x / 2], _mm_add_epi8(wipU, _mm_set1_epi8(128)));
            _mm_storel_epi64((__m128i*) &node->outV[node->strideV * (y / 2) + x / 2], _mm_add_epi8(wipV, _mm_set1_epi8(128)));
        }
    }

    // Describe the converted planes to libvpx. Zero-initialise first so the
    // fields not set below do not hold indeterminate values.
    vpx_image_t vpxraw = {};
    vpxraw.fmt = VPX_IMG_FMT_I420;
    vpxraw.cs = VPX_CS_BT_709;
    vpxraw.range = VPX_CR_STUDIO_RANGE;
    vpxraw.bit_depth = 8;
    vpxraw.w = vpxraw.d_w = node->cfg.g_w;
    vpxraw.h = vpxraw.d_h = node->cfg.g_h;
    vpxraw.r_w = vpxraw.r_h = 0;
    vpxraw.x_chroma_shift = vpxraw.y_chroma_shift = 1;
    vpxraw.img_data_owner = 0;
    vpxraw.self_allocd = 0;
    vpxraw.bps = 12;
    vpxraw.stride[VPX_PLANE_Y] = node->strideY;
    vpxraw.planes[VPX_PLANE_Y] = node->outY;
    vpxraw.stride[VPX_PLANE_U] = node->strideU;
    vpxraw.planes[VPX_PLANE_U] = node->outU;
    vpxraw.stride[VPX_PLANE_V] = node->strideV;
    vpxraw.planes[VPX_PLANE_V] = node->outV;

    vpx_codec_encode(&node->codec, &vpxraw, CHi_Time_Get(pub->ng) * 1000.f, 1, 0, VPX_DL_REALTIME);

    // Copy every compressed frame packet out of the encoder.
    auto ret = (CHiBSFrames*) malloc(sizeof(CHiBSFrames));
    ret->count = 0;

    vpx_codec_iter_t iter = NULL;
    const vpx_codec_cx_pkt_t *pkt;
    while((pkt = vpx_codec_get_cx_data(&node->codec, &iter)) != NULL) {
        if(pkt->kind != VPX_CODEC_CX_FRAME_PKT) continue;

        ret = (CHiBSFrames*) realloc(ret, sizeof(CHiBSFrames) + sizeof(CHiBSFrame) * (ret->count + 1));

        CHiBSFrame *out = &ret->data[ret->count];
        out->timestamp = pkt->data.frame.pts;
        out->sz = pkt->data.frame.sz;
        // Translate to the flag the muxer in this file tests for, rather than
        // relying on VPX_FRAME_IS_KEY and CUTIHI_BS_FLAG_KEY sharing a value.
        out->flags = (pkt->data.frame.flags & VPX_FRAME_IS_KEY) ? CUTIHI_BS_FLAG_KEY : 0;
        out->ptr = malloc(out->sz);
        memcpy(out->ptr, pkt->data.frame.buf, out->sz);

        ret->count++;
    }

    pub->sources[0].data.bitstream = ret;

    return 1;
}

/* Allocates an encoder node and fills in everything VP8 and VP9 share. */
static CHiEncodeVP9Node *encodevpx_alloc(vpx_codec_iface_t *iface) {
    CHiEncodeVP9Node *n = (CHiEncodeVP9Node*) malloc(sizeof(*n));
    new (n) CHiEncodeVP9Node();

    n->pub.Start = CHi_EncodeVP9_Start;
    n->pub.Perform = encodevp9_perform;
    n->pub.Stop = CHi_EncodeVP9_Stop;
    n->pub.clean = 0;

    n->pub.sinkCount = 1;
    n->pub.sinks = (CHiValue*) calloc(n->pub.sinkCount, sizeof(*n->pub.sinks));
    n->pub.sourceCount = 1;
    n->pub.sources = (CHiValue*) calloc(n->pub.sourceCount, sizeof(*n->pub.sources));

    n->state = CHiEncodeVP9Node::WAITING;
    n->iface = iface;

    return n;
}

/* Creates a VP8 encoder node (one image sink, one bitstream source). */
CUTIVIS CHiPubNode *CHi_EncodeVP8() {
    CHiEncodeVP9Node *n = encodevpx_alloc(vpx_codec_vp8_cx());
    n->pub.type = CUTIHI_T('CEnc','GVP8');
    return &n->pub;
}

/* Creates a VP9 encoder node (one image sink, one bitstream source). */
CUTIVIS CHiPubNode *CHi_EncodeVP9() {
    CHiEncodeVP9Node *n = encodevpx_alloc(vpx_codec_vp9_cx());
    n->pub.type = CUTIHI_T('CEnc','GVP9');
    return &n->pub;
}

/*
 * Start callback of the encoder node: configures and opens the encoder using
 * the dimensions of the image currently on sink 0, and allocates the I420
 * plane buffers. Always returns 1.
 */
CUTIVIS int CHi_EncodeVP9_Start(CHiPubNode *pubn) {
    CHiEncodeVP9Node *node = (CHiEncodeVP9Node*) ((uintptr_t) pubn - offsetof(CHiEncodeVP9Node, pub));

    node->state = CHiEncodeVP9Node::IN_PROGRESS;

    CHiImage *firstFrame = (CHiImage*) CHi_Crawl(&pubn->sinks[0])->data.sample;

    vpx_codec_enc_config_default(node->iface, &node->cfg, 0);
    node->cfg.g_w = firstFrame->width;
    node->cfg.g_h = firstFrame->height;
    node->cfg.g_timebase.num = 1;
    node->cfg.g_timebase.den = 30;
    node->cfg.g_lag_in_frames = 0;
    node->cfg.g_threads = 8;
    node->cfg.kf_mode = VPX_KF_AUTO;
    node->cfg.kf_max_dist = 300;
    node->cfg.rc_end_usage = VPX_VBR;
    node->cfg.rc_target_bitrate = 512;
    node->cfg.rc_min_quantizer = 4;
    node->cfg.rc_max_quantizer = 48;

    vpx_codec_enc_init(&node->codec, node->iface, &node->cfg, 0);
    vpx_codec_control(&node->codec, VP8E_SET_CPUUSED, 8);
    // The controls below exist only in the VP9 encoder; do not issue them to
    // a VP8 context, where they can only fail.
    if(node->iface == vpx_codec_vp9_cx()) {
        vpx_codec_control(&node->codec, VP9E_SET_ROW_MT, 1);
        vpx_codec_control(&node->codec, VP9E_SET_TILE_COLUMNS, 2);
        vpx_codec_control(&node->codec, VP9E_SET_TILE_ROWS, 1);
        vpx_codec_control(&node->codec, VP9E_SET_TUNE_CONTENT, VP9E_CONTENT_SCREEN);
    }

    node->strideY = (node->cfg.g_w + 64) & ~63;
    node->strideU = (node->cfg.g_w / 2 + 64) & ~63;
    node->strideV = (node->cfg.g_w / 2 + 64) & ~63;

    // The converter always writes luma rows in pairs and one chroma row per
    // pair, so size the planes for the height rounded up to an even number.
    const size_t lumaRows = ((size_t) node->cfg.g_h + 1) & ~(size_t) 1;
    const size_t chromaRows = lumaRows / 2;
    node->outY = (uint8_t*) _mm_malloc((size_t) node->strideY * lumaRows, 16);
    node->outU = (uint8_t*) _mm_malloc((size_t) node->strideU * chromaRows, 16);
    node->outV = (uint8_t*) _mm_malloc((size_t) node->strideV * chromaRows, 16);

    return 1;
}

/*
 * Stop callback of the encoder node: closes the encoder and releases the
 * plane buffers. Always returns 1.
 */
CUTIVIS int CHi_EncodeVP9_Stop(CHiPubNode *pubn) {
    CHiEncodeVP9Node *node = (CHiEncodeVP9Node*) ((uintptr_t) pubn - offsetof(CHiEncodeVP9Node, pub));

    node->state = CHiEncodeVP9Node::WAITING;

    // Release the encoder opened by Start; without this every Start/Stop
    // cycle leaks the codec's internal buffers.
    vpx_codec_destroy(&node->codec);

    _mm_free(node->outY);
    _mm_free(node->outU);
    _mm_free(node->outV);
    node->outY = node->outU = node->outV = NULL;

    return 1;
}

/*
 * State of a WebM muxer node. `pub` must stay the first member: the callbacks
 * cast the CHiPubNode pointer straight to CHiMuxWebmNode.
 *
 * `w` and `seg` are constructed in Start; the backlogs are constructed when
 * the node is created.
 */
struct CHiMuxWebmNode {
    CHiPubNode pub;

    mkvmuxer::MkvWriter w;
    mkvmuxer::Segment seg;

    size_t videoTrack, audioTrack;

    // Frames received but not yet written to the segment.
    std::queue<CHiBSFrame> audioBacklog;
    std::queue<CHiBSFrame> videoBacklog;
};

/*
 * Perform callback of the muxer node.
 *
 * Queues the frames arriving on sink 0 (video) and, when linked, sink 1
 * (audio). It then writes every queued audio frame that is not later than the
 * oldest queued video frame, followed by at most one video frame. Always
 * returns 1.
 */
static int muxwebm_perform(CHiPubNode *pubn) {
    using namespace mkvmuxer;

    CHiMuxWebmNode *alln = (CHiMuxWebmNode*) pubn;

    const bool hasAudio = pubn->sinks[1].data.linked.to != NULL;

    if(hasAudio) {
        CHiBSFrames *opus = CHi_Crawl(&pubn->sinks[1])->data.bitstream;
        // Treat a missing audio bitstream like a missing video one.
        if(opus) {
            for(size_t i = 0; i < opus->count; i++) {
                alln->audioBacklog.push(opus->data[i]);
            }
        }
    }

    auto vp9bs = CHi_Crawl(&pubn->sinks[0])->data.bitstream;
    if(vp9bs) {
        for(size_t i = 0; i < vp9bs->count; i++) {
            alln->videoBacklog.push(vp9bs->data[i]);
        }
    }

    while(hasAudio && !alln->audioBacklog.empty() && !alln->videoBacklog.empty()
            && alln->audioBacklog.front().timestamp <= alln->videoBacklog.front().timestamp) {
        const CHiBSFrame &queued = alln->audioBacklog.front();

        Frame frame;
        frame.Init((const uint8_t*) queued.ptr, queued.sz);
        frame.set_track_number(alln->audioTrack);
        frame.set_timestamp(queued.timestamp * 1000000L);
        frame.set_is_key(true);
        alln->seg.AddGenericFrame(&frame);

        alln->audioBacklog.pop();
    }

    // Without audio, a video frame is written as soon as one is queued; with
    // audio, only once the oldest audio frame is not older than it. The
    // emptiness check comes first because the encoder may deliver no frame
    // at all, and front() on an empty queue is undefined.
    if(!alln->videoBacklog.empty()
            && (!hasAudio || (!alln->audioBacklog.empty()
                && alln->audioBacklog.front().timestamp >= alln->videoBacklog.front().timestamp))) {
        const CHiBSFrame &queued = alln->videoBacklog.front();

        Frame frame;
        if(!frame.Init((const uint8_t*) queued.ptr, queued.sz)) puts("INIT FAIL");
        frame.set_track_number(alln->videoTrack);
        frame.set_timestamp(queued.timestamp * 1000000L);
        frame.set_is_key(!!(queued.flags & CUTIHI_BS_FLAG_KEY));
        if(!alln->seg.AddGenericFrame(&frame)) puts("ADD FAIL");

        alln->videoBacklog.pop();
    }

    return 1;
}

/*
 * Creates a WebM muxer node with three sinks (video on 0, optional audio on
 * 1, file name on CUTIHI_MUXWEBM_IN_FILENAME) and no sources.
 */
CUTIVIS CHiPubNode *CHi_MuxWebm() {
    CHiMuxWebmNode *n = (CHiMuxWebmNode*) malloc(sizeof(*n));

    n->pub.type = CUTIHI_T('CExp','Webm');
    n->pub.Start = CHi_MuxWebm_Start;
    n->pub.Perform = muxwebm_perform;
    n->pub.Stop = CHi_MuxWebm_Stop;
    n->pub.clean = 0;

    n->pub.sinkCount = 3;
    n->pub.sinks = (CHiValue*) calloc(n->pub.sinkCount, sizeof(*n->pub.sinks));
    n->pub.sourceCount = 0;
    n->pub.sources = NULL;

    // The node comes from malloc, so the queues must be constructed in place.
    new (&n->audioBacklog) std::queue<CHiBSFrame>();
    new (&n->videoBacklog) std::queue<CHiBSFrame>();

    return &n->pub;
}

/*
 * Start callback of the muxer node: opens the output file, sets up the
 * segment, and adds the video track and (when sink 1 is linked) a mono Opus
 * audio track. Always returns 1.
 */
CUTIVIS int CHi_MuxWebm_Start(CHiPubNode *pubn) {
    using namespace mkvmuxer;

    CHiMuxWebmNode *alln = (CHiMuxWebmNode*) pubn;

    new (&alln->w) MkvWriter{};
    alln->w.Open(CHi_Crawl(&pubn->sinks[CUTIHI_MUXWEBM_IN_FILENAME])->data.text);

    new (&alln->seg) Segment{};
    alln->seg.Init(&alln->w);
    alln->seg.AccurateClusterDuration(true);
    alln->seg.UseFixedSizeClusterTimecode(false);
    alln->seg.set_mode(Segment::kFile);
    alln->seg.OutputCues(true);
    alln->seg.set_duration(pubn->ng->duration * 1000);
    alln->seg.GetSegmentInfo()->set_timecode_scale(1000000);
    alln->seg.GetSegmentInfo()->set_writing_app("Cuticle");

    /* Hack into first frame to get resolution */
    CHiPubNode *evp9 = pubn->sinks[0].data.linked.to;
    CHiImage *firstFrame = (CHiImage*) CHi_Crawl(&evp9->sinks[0])->data.sample;

    alln->videoTrack = alln->seg.AddVideoTrack(firstFrame->width, firstFrame->height, 0);
    VideoTrack *track = (VideoTrack*) alln->seg.GetTrackByNumber(alln->videoTrack);
    track->set_codec_id(CHi_Crawl(&pubn->sinks[0])->type == CUTIHI_VAL_VP9BS ? Tracks::kVp9CodecId : Tracks::kVp8CodecId);
    track->set_frame_rate(30);

    Colour colourspace;
    colourspace.set_matrix_coefficients(Colour::MatrixCoefficients::kBt709);
    colourspace.set_range(Colour::Range::kBroadcastRange);
    colourspace.set_transfer_characteristics(Colour::TransferCharacteristics::kIturBt709Tc);
    colourspace.set_primaries(Colour::Primaries::kIturBt709P);
    track->SetColour(colourspace);

    alln->seg.CuesTrack(alln->videoTrack);

    if(pubn->sinks[1].data.linked.to) {
        // Codec-private block for the Opus track; the two magic words spell
        // "OpusHead" when stored little-endian.
        struct __attribute__((packed)) {
            uint32_t magic1;
            uint32_t magic2;
            uint8_t version;
            uint8_t channels; // NUMBER OF CHANNELS IS HARDCODED TO ONE
            uint16_t preskip;
            uint32_t sampleRate;
            uint16_t gain;
            uint8_t family;
        } header = {0x7375704f, 0x64616548, 1, 1, 3840, 48000, 0, 0};

        alln->audioTrack = alln->seg.AddAudioTrack(48000, 1, 0);
        AudioTrack *atrack = (AudioTrack*) alln->seg.GetTrackByNumber(alln->audioTrack);
        atrack->set_codec_id(Tracks::kOpusCodecId);
        atrack->set_seek_pre_roll(80000000);
        atrack->SetCodecPrivate((const uint8_t*) &header, sizeof(header));
    }

    return 1;
}

/*
 * Stop callback of the muxer node: finalizes the segment, closes the file and
 * destroys the writer objects constructed by Start. Always returns 1.
 */
CUTIVIS int CHi_MuxWebm_Stop(CHiPubNode *pubn) {
    using namespace mkvmuxer;

    CHiMuxWebmNode *alln = (CHiMuxWebmNode*) pubn;

    alln->seg.Finalize();
    alln->w.Close();

    // Start placement-constructs both objects on every run, so they must be
    // destroyed here or their internal allocations leak each cycle.
    alln->seg.~Segment();
    alln->w.~MkvWriter();

    return 1;
}