// cuticle/hi/webmdec.cpp
#include"node.h"
#include<stdlib.h>
#include<webm/webm_parser.h>
#include<webm/file_reader.h>
#include<vpx/vpx_decoder.h>
#include<vpx/vp8dx.h>
#include<assert.h>
#include<time.h>
#include"img.h"
#include<string.h>
#include<tmmintrin.h>
#include<smmintrin.h>
#include<opus.h>
#include<math.h>
#include"minitrace.h"
#include"linearity.h"
struct CHiMovieNode;
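/* First pass over a freshly opened file: gathers what movie_perform needs
   before decoding starts -- duration (OnInfo), track numbers, codec id and
   video dimensions (OnTrackEntry), the segment's byte offset
   (OnSegmentBegin), and the cue points used for seeking (OnCuePoint). */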
struct CueParser : webm::Callback {
CHiMovieNode *node;
CueParser(CHiMovieNode *node) : node(node) {}
webm::Status OnInfo(const webm::ElementMetadata& metadata, const webm::Info& info) final override;
webm::Status OnTrackEntry(const webm::ElementMetadata &metadata, const webm::TrackEntry &info) override;
webm::Status OnSegmentBegin(const webm::ElementMetadata &metadata, webm::Action *action) override;
webm::Status OnCuePoint(const webm::ElementMetadata &metadata, const webm::CuePoint &cue) override;
};
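/* Decodes the Opus audio track into a fixed ring buffer. SAMPLE_ARR is one
   second of mono samples at 48 kHz; sampleI is the decode-side write index,
   sampleReadI the read index drained by movie_perform. */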
struct AudioParser final : webm::Callback {
uint64_t audioTrack;
uint64_t currentClusterTimecode;
uint64_t until;
bool stop = true;
bool skip = false;
#define SAMPLE_ARR 48000
OpusDecoder *opus = nullptr; /* initialized so the destructor is safe even before opus_decoder_create runs */
size_t sampleI = 0;
size_t sampleReadI = 0;
int16_t sampleArray[SAMPLE_ARR];
~AudioParser() {
if(opus) {
opus_decoder_destroy(opus);
}
}
webm::Status OnClusterBegin(const webm::ElementMetadata &metadata, const webm::Cluster &cluster, webm::Action *action) final override {
currentClusterTimecode = cluster.timecode.value();
return webm::Status(webm::Status::kOkCompleted);
}
webm::Status OnBlockBegin(const webm::ElementMetadata &metadata, const webm::Block &block, webm::Action *action) final override {
if(block.track_number != audioTrack) {
skip = true;
*action = webm::Action::kSkip;
} else {
skip = false;
if(currentClusterTimecode + block.timecode >= until) {
stop = true;
} else {
stop = false;
}
}
return webm::Status(webm::Status::kOkCompleted);
}
webm::Status OnSimpleBlockBegin(const webm::ElementMetadata &metadata, const webm::SimpleBlock &block, webm::Action *action) final override {
return OnBlockBegin(metadata, block, action);
}
webm::Status OnFrame(const webm::FrameMetadata &metadata, webm::Reader *reader, uint64_t *bytes_remaining) final override {
uint8_t *data = new uint8_t[metadata.size];
/* webm::Reader::Read may deliver fewer bytes than requested, so loop until
   the whole frame is buffered. */
uint64_t totalRead = 0;
while(totalRead < metadata.size) {
uint64_t actuallyRead = 0;
webm::Status status = reader->Read(metadata.size - totalRead, data + totalRead, &actuallyRead);
totalRead += actuallyRead;
if(actuallyRead == 0 || status.code != webm::Status::kOkPartial) break;
}
if(!skip) {
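/* An Opus frame holds at most 120 ms = 5760 samples at 48 kHz mono, so a
   6400-sample scratch buffer always suffices. */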
int16_t *f = new int16_t[6400];
int numSamples = opus_decode(opus, data, metadata.size, f, 6400, 0);
if(numSamples >= 0) {
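/* Append to the ring buffer, splitting the copy in two when the write runs
   past the end of the array. */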
if(SAMPLE_ARR - sampleI >= (size_t) numSamples) {
memcpy(&sampleArray[sampleI], f, sizeof(*sampleArray) * numSamples);
sampleI = (sampleI + numSamples) % SAMPLE_ARR;
} else {
memcpy(&sampleArray[sampleI], f, sizeof(*sampleArray) * (SAMPLE_ARR - sampleI));
memcpy(sampleArray, &f[SAMPLE_ARR - sampleI], sizeof(*sampleArray) * (numSamples - SAMPLE_ARR + sampleI));
sampleI = (sampleI + numSamples) % SAMPLE_ARR;
}
}
delete[] f;
}
delete[] data;
*bytes_remaining = 0;
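/* Returning kOkPartial makes WebmParser::Feed return while staying
   resumable, which is how decoding stops once the target time is reached. */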
return webm::Status{stop ? webm::Status::kOkPartial : webm::Status::kOkCompleted};
}
};
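/* Decodes the VP8/VP9 video track. Every block is fed to the codec so the
   decoder stays in sync, but only the frame straddling the requested
   timestamp ("until") is converted to a 16-bit output image. */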
struct FrameParser final : webm::Callback {
uint64_t videoTrack, audioTrack;
uint64_t currentClusterTimecode;
uint64_t until;
bool skip = true;
vpx_image *lastImg = nullptr;
CHiImage *output = nullptr;
vpx_codec_ctx_t *codec;
vpx_codec_iter_t *iter;
uint64_t currentlyAt = 0;
webm::Status OnClusterBegin(const webm::ElementMetadata &metadata, const webm::Cluster &cluster, webm::Action *action) final override {
currentClusterTimecode = cluster.timecode.value();
return webm::Status(webm::Status::kOkCompleted);
}
webm::Status OnBlockBegin(const webm::ElementMetadata &metadata, const webm::Block &block, webm::Action *action) final override {
if(block.track_number != videoTrack) {
*action = webm::Action::kSkip;
} else {
if(currentlyAt <= until && currentClusterTimecode + block.timecode >= until) {
skip = false;
} else {
skip = true;
}
currentlyAt = currentClusterTimecode + block.timecode;
}
return webm::Status(webm::Status::kOkCompleted);
}
webm::Status OnSimpleBlockBegin(const webm::ElementMetadata &metadata, const webm::SimpleBlock &block, webm::Action *action) final override {
return OnBlockBegin(metadata, block, action);
}
webm::Status OnFrame(const webm::FrameMetadata &metadata, webm::Reader *reader, uint64_t *bytes_remaining) final override {
uint8_t *data = new uint8_t[metadata.size];
/* webm::Reader::Read may deliver fewer bytes than requested, so loop until
   the whole frame is buffered. */
uint64_t totalRead = 0;
while(totalRead < metadata.size) {
uint64_t actuallyRead = 0;
webm::Status status = reader->Read(metadata.size - totalRead, data + totalRead, &actuallyRead);
totalRead += actuallyRead;
if(actuallyRead == 0 || status.code != webm::Status::kOkPartial) break;
}
if(vpx_codec_decode(codec, data, metadata.size, NULL, 0) == VPX_CODEC_OK) {
/* libvpx expects the frame iterator to be reset to NULL before iterating
   the output of each decode call. */
*iter = NULL;
vpx_image *img = NULL;
while((img = vpx_codec_get_frame(codec, iter)) != NULL) {
if(lastImg) vpx_img_free(lastImg);
lastImg = img;
}
}
if(!skip && lastImg) {
assert(lastImg->fmt & VPX_IMG_FMT_PLANAR);
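/* CHi_Image_New(2, 4, ...) appears to be (bytes per channel, channels,
   stride, width, height, data): four 16-bit channels, with the stride
   rounded up to 16 pixels so the _mm_stream_si128 stores below stay
   16-byte aligned. */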
output = CHi_Image_New(2, 4, 8 * ((lastImg->d_w + 15) & ~15), lastImg->d_w, lastImg->d_h, NULL);
__m128i z = _mm_set1_epi32(0);
__m128i alpha = _mm_set_epi32(0xFFFF0000, 0, 0xFFFF0000, 0);
__m128i sub16 = _mm_set1_epi32(-16);
__m128i sub128 = _mm_set1_epi32(-128);
#pragma omp parallel for simd
for(size_t y = 0; y < lastImg->d_h; y++) {
for(size_t x = 0; x < lastImg->d_w; x += 4) {
__m128i ychannel = _mm_loadu_si128((__m128i*) (lastImg->planes[VPX_PLANE_Y] + y * lastImg->stride[VPX_PLANE_Y] + x));
__m128i uchannel = _mm_loadu_si128((__m128i*) (lastImg->planes[VPX_PLANE_U] + y / 2 * lastImg->stride[VPX_PLANE_U] + x / 2));
uchannel = _mm_unpacklo_epi8(uchannel, uchannel); // stretch color channels
__m128i vchannel = _mm_loadu_si128((__m128i*) (lastImg->planes[VPX_PLANE_V] + y / 2 * lastImg->stride[VPX_PLANE_V] + x / 2));
vchannel = _mm_unpacklo_epi8(vchannel, vchannel); // stretch color channels
/* Interleave with zeroes to push out 12 of 16 pixels (we're working in groups of four) */
__m128i ylo = _mm_add_epi32(sub16, _mm_unpacklo_epi16(_mm_unpacklo_epi8(ychannel, z), z));
__m128i ulo = _mm_add_epi32(sub128, _mm_unpacklo_epi16(_mm_unpacklo_epi8(uchannel, z), z));
__m128i vlo = _mm_add_epi32(sub128, _mm_unpacklo_epi16(_mm_unpacklo_epi8(vchannel, z), z));
/* Start parallel matrix multiplication (BT.709 matrix * 255/219 to turn from studio to full range) */
/*
/ 1.164 0 1.833 \
RGB = | 1.164 -0.218 -0.545 | * (Y - 16, U - 128, V - 128)
\ 1.164 2.160 0 /
*/
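/* The multipliers below are those matrix entries in roughly 8.8 fixed point
   (entry * 256), so 8-bit studio-range input expands to full 16-bit output. */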
__m128i partY = _mm_mullo_epi32(ylo, _mm_set1_epi32(297));
__m128i partVR = _mm_mullo_epi32(vlo, _mm_set1_epi32(467));
__m128i partUG = _mm_mullo_epi32(ulo, _mm_set1_epi32(-56));
__m128i partVG = _mm_mullo_epi32(vlo, _mm_set1_epi32(-139));
__m128i partUB = _mm_mullo_epi32(ulo, _mm_set1_epi32(551));
/* Finish matrix multiplication by summing up parts (finishing the dot products), clip */
__m128i r = _mm_max_epi32(z, _mm_min_epi32(_mm_set1_epi32(0xFFFF), _mm_add_epi32(partY, partVR)));
__m128i g = _mm_max_epi32(z, _mm_min_epi32(_mm_set1_epi32(0xFFFF), _mm_add_epi32(partY, _mm_add_epi32(partUG, partVG))));
__m128i b = _mm_max_epi32(z, _mm_min_epi32(_mm_set1_epi32(0xFFFF), _mm_add_epi32(partY, partUB)));
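/* apply_gamma_epi32 (linearity.h) presumably converts to linear light,
   using 2.2 as an approximation of the transfer curve. */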
r = apply_gamma_epi32(r, _mm_set1_ps(2.2f));
g = apply_gamma_epi32(g, _mm_set1_ps(2.2f));
b = apply_gamma_epi32(b, _mm_set1_ps(2.2f));
__m128i rgblo = _mm_or_si128(alpha, _mm_or_si128(_mm_or_si128(_mm_unpacklo_epi32(b, z), _mm_slli_si128(_mm_unpacklo_epi32(g, z), 2)), _mm_slli_si128(_mm_unpacklo_epi32(r, z), 4)));
_mm_stream_si128((__m128i*) ((uintptr_t) output->data16 + y * output->stride + x * 8 + 0), rgblo);
__m128i rgbhi = _mm_or_si128(alpha, _mm_or_si128(_mm_or_si128(_mm_unpackhi_epi32(b, z), _mm_slli_si128(_mm_unpackhi_epi32(g, z), 2)), _mm_slli_si128(_mm_unpackhi_epi32(r, z), 4)));
_mm_stream_si128((__m128i*) ((uintptr_t) output->data16 + y * output->stride + x * 8 + 16), rgbhi);
}
}
}
delete[] data;
*bytes_remaining = 0;
webm::Status ret{skip ? webm::Status::kOkCompleted : webm::Status::kOkPartial};
skip = true;
return ret;
}
};
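/* Node state. Video and audio each get their own FILE/FileReader/WebmParser
   so the two tracks can be parsed and stopped independently; decoders and
   parse positions persist across movie_perform calls. */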
struct CHiMovieNode {
int64_t timeCache = -1;
char *filepathCache = nullptr;
FILE *vf = nullptr;
webm::FileReader vreader;
webm::WebmParser vparser;
FrameParser fp;
std::string vcodecid;
size_t vw, vh;
FILE *af = nullptr;
webm::FileReader areader;
webm::WebmParser aparser;
AudioParser ap;
std::vector<webm::CuePoint> cuepoints;
uint64_t segmentOff, videoTrack, audioTrack;
double duration;
vpx_codec_ctx_t codec;
vpx_codec_iter_t iter;
CHiPubNode pub;
};
webm::Status CueParser::OnInfo(const webm::ElementMetadata &metadata, const webm::Info &info) {
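/* Assumes the default TimecodeScale of 1,000,000 ns (1 ms per tick), so
   Duration / 1000 is seconds. */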
node->duration = info.duration.value() / 1000;
return webm::Status(webm::Status::kOkCompleted);
}
webm::Status CueParser::OnTrackEntry(const webm::ElementMetadata &metadata, const webm::TrackEntry &info) {
if(info.track_type.value() == webm::TrackType::kVideo && info.is_enabled.value() /*&& !info.uses_lacing.value()*/) {
node->vcodecid = info.codec_id.value();
node->videoTrack = info.track_number.value();
node->vw = info.video.value().pixel_width.value();
node->vh = info.video.value().pixel_height.value();
}
if(info.track_type.value() == webm::TrackType::kAudio && info.is_enabled.value()) {
node->audioTrack = info.track_number.value();
}
return webm::Status(webm::Status::kOkCompleted);
}
webm::Status CueParser::OnSegmentBegin(const webm::ElementMetadata &metadata, webm::Action *action) {
node->segmentOff = metadata.position + metadata.header_size;
return webm::Status(webm::Status::kOkCompleted);
}
webm::Status CueParser::OnCuePoint(const webm::ElementMetadata &metadata, const webm::CuePoint &cue) {
node->cuepoints.push_back(cue);
return webm::Status(webm::Status::kOkCompleted);
}
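/* Per-frame entry point. Rough flow:
   1. If the filepath sink changed, (re)open the file, gather metadata with
      CueParser, and (re)initialize the VP8/VP9 and Opus decoders.
   2. On a backward seek or a jump of more than 5 s forward, reposition both
      file handles at the nearest preceding cue point.
   3. Feed both parsers until they pass the requested timestamp.
   4. Publish the decoded frame on source 0 and one tick of audio on source 1. */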
static int movie_perform(CHiPubNode *pub) {
CHiMovieNode *node = (CHiMovieNode*) ((uintptr_t) pub - offsetof(CHiMovieNode, pub));
MTR_BEGIN("CHi", "movie_perform");
int64_t t;
if(pub->sinks[1].type == CUTIHI_VAL_NONE) t = CHi_Time_Get(pub->ng) * 1000;
else t = CHi_Crawl(&pub->sinks[1])->data.vec4[0] * 1000;
pub->sources[0].type = CUTIHI_VAL_SAMPLE;
char *filepath = CHi_Crawl(&pub->sinks[0])->data.text;
if(!node->filepathCache || strcmp(node->filepathCache, filepath) != 0) {
/* Release resources from a previously opened file before reopening. */
if(node->vf) {
fclose(node->vf);
vpx_codec_destroy(&node->codec);
}
if(node->af) fclose(node->af);
node->vf = fopen(filepath, "rb");
node->af = fopen(filepath, "rb");
if(!node->vf || !node->af) {
if(node->vf) { fclose(node->vf); node->vf = nullptr; }
if(node->af) { fclose(node->af); node->af = nullptr; }
return 1;
}
new (&node->vreader) webm::FileReader{node->vf};
new (&node->vparser) webm::WebmParser{};
new (&node->areader) webm::FileReader{node->af};
new (&node->aparser) webm::WebmParser{};
node->cuepoints.clear();
CueParser cp{node};
node->vparser.Feed(&cp, &node->vreader);
free(node->filepathCache);
node->filepathCache = strdup(filepath);
node->timeCache = std::numeric_limits<int64_t>::max();
if(node->vcodecid == "V_VP9") {
vpx_codec_dec_init(&node->codec, vpx_codec_vp9_dx(), NULL, 0);
} else if(node->vcodecid == "V_VP8") {
vpx_codec_dec_init(&node->codec, vpx_codec_vp8_dx(), NULL, 0);
} else {
return 1;
}
new (&node->fp) FrameParser{};
node->fp.videoTrack = node->videoTrack;
node->fp.codec = &node->codec;
node->fp.iter = &node->iter;
/* Destroy the old parser first so a previous Opus decoder is not leaked. */
node->ap.~AudioParser();
new (&node->ap) AudioParser{};
int error;
node->ap.opus = opus_decoder_create(48000, 1, &error);
node->ap.audioTrack = node->audioTrack;
}
if(t == node->timeCache) {
return 1;
}
if(pub->sources[0].data.sample) {
CHi_Image_Free(pub->sources[0].data.sample);
pub->sources[0].data.sample = nullptr;
}
if(t >= 0 && t < 1000 * node->duration) {
if(t < node->timeCache || (t - node->timeCache) > 5000) {
if(node->cuepoints.size() > 0) {
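/* Linear scan for the last cue point at or before t (falling back to the
   first); both streams then seek to that cue's cluster. */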
size_t i;
for(i = 0; i < node->cuepoints.size(); i++) {
if(t < node->cuepoints[i].time.value()) {
break;
}
}
if(i != 0) i--;
for(webm::Element<webm::CueTrackPositions> &p : node->cuepoints[i].cue_track_positions) {
if(p.value().track.value() == node->videoTrack) {
fseek(node->vf, node->segmentOff + p.value().cluster_position.value(), SEEK_SET);
fseek(node->af, node->segmentOff + p.value().cluster_position.value(), SEEK_SET);
break;
}
}
} else {
fseek(node->vf, 0, SEEK_SET);
fseek(node->af, 0, SEEK_SET);
}
}
node->fp.until = t;
node->ap.until = t;
/* Always necessary, even when no fseek() happened; without it the parsers
   resume from stale internal state and invoke no callbacks. */
node->vparser.DidSeek();
node->aparser.DidSeek();
node->vparser.Feed(&node->fp, &node->vreader);
node->aparser.Feed(&node->ap, &node->areader);
pub->sources[0].data.sample = node->fp.output;
node->timeCache = t;
}
if(!pub->sources[0].data.sample) {
pub->sources[0].data.sample = CHi_Image_New(2, 4, 8 * node->vw, node->vw, node->vh, NULL);
}
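/* Emit one tick's worth of mono 48 kHz audio, copied out of the AudioParser
   ring buffer (in two pieces when the read position wraps). */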
size_t width = roundf(CHi_Time_GetDelta(pub->ng) * 48000);
CHiImage *aud = CHi_Image_New(4, 1, 4 * width, width, 1, NULL);
if(node->pub.ng->compilationStatus == CUTIHI_COMP_RUNNING) {
if(node->ap.sampleReadI + width > SAMPLE_ARR) {
memcpy(aud->data16, node->ap.sampleArray + node->ap.sampleReadI, sizeof(*node->ap.sampleArray) * (SAMPLE_ARR - node->ap.sampleReadI));
memcpy(aud->data16 + SAMPLE_ARR - node->ap.sampleReadI, node->ap.sampleArray, sizeof(*node->ap.sampleArray) * (width - SAMPLE_ARR + node->ap.sampleReadI));
} else {
memcpy(aud->data16, node->ap.sampleArray + node->ap.sampleReadI, sizeof(*node->ap.sampleArray) * width);
}
node->ap.sampleReadI = (node->ap.sampleReadI + width) % SAMPLE_ARR;
} else {
memset(aud->data16, 0, aud->stride * aud->height);
}
if(pub->sources[1].data.sample) CHi_Image_Free(pub->sources[1].data.sample);
pub->sources[1].type = CUTIHI_VAL_SAMPLE;
pub->sources[1].data.sample = aud;
pub->clean = 0;
MTR_END("CHi", "movie_perform");
return 1;
}
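/* Tear down: file handles, the vpx codec (only initialized once a video file
   was opened, hence the vf guard), and, via ~AudioParser, the Opus decoder. */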
static void movie_destroy(CHiPubNode *pub) {
CHiMovieNode *node = (CHiMovieNode*) ((uintptr_t) pub - offsetof(CHiMovieNode, pub));
if(node->filepathCache) {
free(node->filepathCache);
}
if(node->af) {
fclose(node->af);
}
if(node->vf) {
fclose(node->vf);
vpx_codec_destroy(&node->codec);
}
node->~CHiMovieNode();
free(node);
}
extern "C" {
CUTIVIS CHiPubNode *CHi_Movie() {
CHiMovieNode *n = (CHiMovieNode*) calloc(1, sizeof(*n));
new (n) CHiMovieNode();
n->pub.type = CUTIHI_T('CMov','ie ');
n->pub.Perform = movie_perform;
n->pub.Destroy = movie_destroy;
n->pub.clean = 0;
n->pub.sinkCount = 2;
n->pub.sinks = (CHiValue*) calloc(sizeof(*n->pub.sinks), n->pub.sinkCount);
n->pub.sinks[1].type = CUTIHI_VAL_VEC4;
n->pub.sinks[1].data.vec4[0] = 0;
n->pub.sourceCount = 2;
n->pub.sources = (CHiValue*) calloc(sizeof(*n->pub.sources), n->pub.sourceCount);
return &n->pub;
}
}