--- /dev/null
+#include "overlayframe.h"
+
+/* Fully resampled scale / translate / blend ******************************/
+/* resample into a temporary row vector, then blend
+ *
+ * XSAMPLE(FN, temp_type, type, max, components, ofs, round) expands to the
+ * body of one colour-model case and ends with a break statement.
+ *   FN         - forwarded unchanged to ALPHA3_BLEND / ALPHA4_BLEND
+ *   temp_type  - type of opcty/trnsp and of the r,g,b(,a) blend temporaries
+ *   type       - storage type of one pixel component in the frame rows
+ *   max        - scales fade into opcty; also passed to the blend/store macros
+ *   components - components per pixel; the value 4 enables the alpha paths
+ *   ofs        - subtracted from the 2nd and 3rd component when loading
+ *   round      - added to fade * max when computing opcty
+ *
+ * The expansion site must provide these names: pkg, engine, vinput, voutput,
+ * fade, k, kd, i1i, i2i, i1f, i2f, o1i, o1f, o2f, oh, lookup_sx0, lookup_sx1
+ * and lookup_sk.
+ *
+ * For every i in [pkg->out_col1, pkg->out_col2) one input row is resampled
+ * (or copied) into temp[] as oh float pixels, the first and last temp pixel
+ * are scaled by o1f / o2f, and temp pixel j is blended into column i of
+ * output row o1i + j.
+ */
+
+#define XSAMPLE(FN, temp_type, type, max, components, ofs, round) { \
+ float temp[oh*components]; /* runtime-sized array: oh pixels of float components */ \
+ temp_type opcty = fade * max + round, trnsp = max - opcty; /* not referenced below; presumably read by ALPHA3/ALPHA4_BLEND -- confirm */ \
+ type **output_rows = (type**)voutput->get_rows() + o1i; /* index 0 is output row o1i */ \
+ type **input_rows = (type**)vinput->get_rows(); \
+ \
+ for(int i = pkg->out_col1; i < pkg->out_col2; i++) { \
+ type *input = input_rows[i - engine->col_out1 + engine->row_in]; /* input row is selected by the output column index */ \
+ float *tempp = temp; \
+ if( !k ) { /* direct copy case: taken when the kernel lookup pointer is null */ \
+ type *ip = input + i1i * components; /* start at input pixel i1i */ \
+ for(int j = 0; j < oh; j++) { \
+ *tempp++ = *ip++; \
+ *tempp++ = *ip++ - ofs; \
+ *tempp++ = *ip++ - ofs; \
+ if( components == 4 ) *tempp++ = *ip++; \
+ } \
+ } \
+ else { /* resample */ \
+ for(int j = 0; j < oh; j++) { \
+ float racc=0.f, gacc=0.f, bacc=0.f, aacc=0.f; \
+ int ki = lookup_sk[j], x = lookup_sx0[j]; /* fixed-point kernel index and first source pixel for output j */ \
+ type *ip = input + x * components; \
+ float wacc = 0, awacc = 0; /* colour weight sum / raw kernel weight sum */ \
+ while(x++ < lookup_sx1[j]) { \
+ float kv = k[abs(ki >> INDEX_FRACTION)]; /* the table is indexed by |ki| */ \
+ /* handle fractional pixels on edges of input \
+  * NOTE(review): x was already post-incremented by the loop test, so it is \
+  * one past the pixel ip points at, while init_packages compares the pixel \
+  * index itself (j == i1i, j + 1 == i2i) -- confirm this offset is intended */ \
+ if(x == i1i) kv *= i1f; \
+ if(x + 1 == i2i) kv *= i2f; \
+ if( components == 4 ) { awacc += kv; kv *= ip[3]; } /* weight the colour taps by the pixel's alpha */ \
+ wacc += kv; \
+ racc += kv * *ip++; \
+ gacc += kv * (*ip++ - ofs); \
+ bacc += kv * (*ip++ - ofs); \
+ if( components == 4 ) { aacc += kv; ++ip; } /* kv already includes alpha here */ \
+ ki += kd; \
+ } \
+ if(wacc > 0.) wacc = 1. / wacc; /* normalise; a zero sum is left as zero */ \
+ *tempp++ = racc * wacc; \
+ *tempp++ = gacc * wacc; \
+ *tempp++ = bacc * wacc; \
+ if( components == 4 ) { \
+ if(awacc > 0.) awacc = 1. / awacc; \
+ *tempp++ = aacc * awacc; /* alpha-weighted sum divided by the raw kernel weight sum */ \
+ } \
+ } \
+ } \
+ \
+ /* handle fractional pixels on edges of output */ \
+ temp[0] *= o1f; temp[1] *= o1f; temp[2] *= o1f; \
+ if( components == 4 ) temp[3] *= o1f; \
+ tempp = temp + (oh-1)*components; \
+ tempp[0] *= o2f; tempp[1] *= o2f; tempp[2] *= o2f; \
+ if( components == 4 ) tempp[3] *= o2f; \
+ tempp = temp; \
+ /* blend output: temp pixel j goes to column i of output_rows[j] */ \
+ for(int j = 0; j < oh; j++) { \
+ type *output = output_rows[j] + i * components; \
+ if( components == 4 ) { \
+ temp_type r, g, b, a; \
+ ALPHA4_BLEND(FN, temp_type, tempp, output, max, 0, ofs, round); \
+ ALPHA4_STORE(output, ofs, max); \
+ } \
+ else { \
+ temp_type r, g, b; \
+ ALPHA3_BLEND(FN, temp_type, tempp, output, max, 0, ofs, round); \
+ ALPHA3_STORE(output, ofs, max); \
+ } \
+ tempp += components; \
+ } \
+ } \
+ break; /* exits the switch this expansion is placed in */ \
+}
+
+/*
+ * Pick the XSAMPLE expansion matching the input frame's colour model.
+ * XSAMPLE arguments after FN: blend temporary type, component storage type,
+ * maximum component value, components per pixel, offset applied to the 2nd
+ * and 3rd component (non-zero for the YUV models), rounding term.
+ * Colour models without a case are skipped.  The trailing break applies to
+ * whatever statement encloses the expansion site.
+ */
+#define XBLEND_SAMPLE(FN) { \
+    switch(vinput->get_color_model()) { \
+    /* 8 bit components */ \
+    case BC_RGB888:       XSAMPLE(FN, z_int32_t, z_uint8_t, 0xff, 3, 0, .5f); \
+    case BC_RGBA8888:     XSAMPLE(FN, z_int32_t, z_uint8_t, 0xff, 4, 0, .5f); \
+    case BC_YUV888:       XSAMPLE(FN, z_int32_t, z_uint8_t, 0xff, 3, 0x80, .5f); \
+    case BC_YUVA8888:     XSAMPLE(FN, z_int32_t, z_uint8_t, 0xff, 4, 0x80, .5f); \
+    /* 16 bit components */ \
+    case BC_RGB161616:    XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0, .5f); \
+    case BC_RGBA16161616: XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0, .5f); \
+    case BC_YUV161616:    XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
+    case BC_YUVA16161616: XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
+    /* float components */ \
+    case BC_RGB_FLOAT:    XSAMPLE(FN, z_float, z_float, 1.f, 3, 0.f, 0.f); \
+    case BC_RGBA_FLOAT:   XSAMPLE(FN, z_float, z_float, 1.f, 4, 0.f, 0.f); \
+    } \
+    break; \
+}
+
+
+/* Nothing to set up here; out_col1 / out_col2 are assigned by SampleEngine::init_packages(). */
+SamplePackage::SamplePackage() {}
+
+/* Bind the worker to its engine; process_package() reads all parameters through this pointer. */
+SampleUnit::SampleUnit(SampleEngine *server)
+    : LoadClient(server)
+{
+    engine = server;
+}
+
+/* Nothing is released here. */
+SampleUnit::~SampleUnit() {}
+
+/*
+ * Resample and blend the output columns [pkg->out_col1, pkg->out_col2)
+ * assigned to one package.
+ *
+ * The per-pixel work is generated by BLEND_SWITCH(XBLEND_SAMPLE).  Those
+ * macros refer to the locals declared here by name, so the names below are
+ * part of the macro contract and must not be changed.
+ */
+void SampleUnit::process_package(LoadPackage *package)
+{
+    SamplePackage *pkg = static_cast<SamplePackage*>(package);
+
+    /* fractional source (i) and destination (o) extents */
+    float i1 = engine->in1;
+    float i2 = engine->in2;
+    float o1 = engine->out1;
+    float o2 = engine->out2;
+
+    /* an empty source or destination span leaves nothing to draw */
+    if(i2 - i1 <= 0)
+        return;
+    if(o2 - o1 <= 0)
+        return;
+
+    VFrame *voutput = engine->output;
+    VFrame *vinput = engine->input;
+    int mode = engine->mode;
+
+    /* replacing with an alpha-carrying source ignores the engine's alpha */
+    float fade;
+    if(BC_CModels::has_alpha(vinput->get_color_model()) && mode == TRANSFER_REPLACE)
+        fade = 1.f;
+    else
+        fade = engine->alpha;
+
+    /* whole-pixel source bounds and the covered fraction of each edge pixel */
+    int i1i = floor(i1);
+    int i2i = ceil(i2);
+    float i1f = (1.f - i1) + i1i;
+    float i2f = (1.f - i2i) + i2;
+
+    /* the same for the destination; oh is the number of whole pixels touched */
+    int o1i = floor(o1);
+    int o2i = ceil(o2);
+    float o1f = (1.f - o1) + o1i;
+    float o2f = (1.f - o2i) + o2;
+    int oh = o2i - o1i;
+
+    /* kernel table and its fixed-point step per source pixel */
+    float *k = engine->kernel->lookup;
+    int kd = engine->kd;
+
+    /* tables filled in by SampleEngine::init_packages() */
+    int *lookup_sx0 = engine->lookup_sx0;
+    int *lookup_sx1 = engine->lookup_sx1;
+    int *lookup_sk = engine->lookup_sk;
+
+    BLEND_SWITCH(XBLEND_SAMPLE);
+}
+
+
+/*
+ * Passes cpus as both LoadServer constructor arguments and starts with no
+ * lookup tables; they are allocated by init_packages().
+ */
+SampleEngine::SampleEngine(int cpus)
+    : LoadServer(cpus, cpus)
+{
+    kd = 0;
+    lookup_wacc = 0;
+    lookup_sx0 = lookup_sx1 = lookup_sk = 0;
+}
+
+/*
+ * Free the lookup tables.  delete [] on a null pointer does nothing, so
+ * tables that were never allocated need no separate test.
+ */
+SampleEngine::~SampleEngine()
+{
+    delete [] lookup_wacc;
+    delete [] lookup_sk;
+    delete [] lookup_sx1;
+    delete [] lookup_sx0;
+}
+
+/*
+ * unlike the Direct and NN engines, the Sample engine works across
+ * output columns (it makes for more economical memory addressing
+ * during convolution)
+ *
+ * Rebuilds the per-output-pixel lookup tables (first/last source pixel,
+ * starting fixed-point kernel index, reciprocal weight sum) for the current
+ * in1..in2 -> out1..out2 mapping, then splits the column range
+ * [col_out1, col_out2) evenly over the packages.
+ *
+ * Does nothing when the source or destination span is empty; the previous
+ * tables and package ranges are left as they were.
+ */
+void SampleEngine::init_packages()
+{
+    /* check the spans before touching input or kernel */
+    if(in2 - in1 <= 0 || out2 - out1 <= 0)
+        return;
+
+    int iw = input->get_w();
+    int i1i = floor(in1);
+    int i2i = ceil(in2);
+    float i1f = 1.f - in1 + i1i;
+    float i2f = 1.f - i2i + in2;
+
+    int oy = floor(out1);
+    float oyf = out1 - oy;
+    int oh = ceil(out2) - oy;
+
+    float *k = kernel->lookup;
+    float kw = kernel->width;
+    int kn = kernel->n;
+
+    /* determine kernel spatial coverage */
+    float scale = (out2 - out1) / (in2 - in1);
+    float iscale = (in2 - in1) / (out2 - out1);
+    float coverage = fabs(1.f / scale);
+    float bound = (coverage < 1.f ? kw : kw * coverage) - (.5f / TRANSFORM_SPP);
+    float coeff = (coverage < 1.f ? 1.f : scale) * TRANSFORM_SPP;
+
+    delete [] lookup_sx0;
+    delete [] lookup_sx1;
+    delete [] lookup_sk;
+    delete [] lookup_wacc;
+
+    lookup_sx0 = new int[oh];
+    lookup_sx1 = new int[oh];
+    lookup_sk = new int[oh];
+    lookup_wacc = new float[oh];
+
+    /* fixed-point kernel index advance per source pixel */
+    kd = (double)coeff * (1 << INDEX_FRACTION) + .5;
+
+    /* precompute kernel values and weight sums */
+    for(int i = 0; i < oh; i++) {
+        /* map destination back to source */
+        double sx = (i - oyf + .5) * iscale + in1 - .5;
+
+        /*
+         * clip iteration to source area but not source plane. Points
+         * outside the source plane count as transparent. Points outside
+         * the source area don't count at all. The actual convolution
+         * later will be clipped to both, but we need to compute
+         * weights.
+         */
+        int sx0 = mmax((int)floor(sx - bound) + 1, i1i);
+        int sx1 = mmin((int)ceil(sx + bound), i2i);
+        int ki = (double)(sx0 - sx) * coeff * (1 << INDEX_FRACTION)
+            + (1 << (INDEX_FRACTION - 1)) + .5;
+        float wacc = 0.;
+
+        /* -1 marks "no tap inside the source plane yet" */
+        lookup_sx0[i] = -1;
+        lookup_sx1[i] = -1;
+        /*
+         * the sampler reads lookup_sk[i] even when no tap was recorded, so
+         * give it a defined value instead of whatever new[] left behind
+         */
+        lookup_sk[i] = 0;
+
+        for(int j = sx0; j < sx1; j++) {
+            int kv = (ki >> INDEX_FRACTION);
+            if(kv > kn) break;
+            if(kv >= -kn) {
+                /*
+                 * the contribution of the first and last input pixel (if
+                 * fractional) are linearly weighted by the fraction
+                 */
+                if(j == i1i)
+                    wacc += k[abs(kv)] * i1f;
+                else if(j + 1 == i2i)
+                    wacc += k[abs(kv)] * i2f;
+                else
+                    wacc += k[abs(kv)];
+
+                /* this is where we clip the kernel convolution to the source plane */
+                if(j >= 0 && j < iw) {
+                    if(lookup_sx0[i] == -1) {
+                        lookup_sx0[i] = j;
+                        lookup_sk[i] = ki;
+                    }
+                    lookup_sx1[i] = j + 1;
+                }
+            }
+            ki += kd;
+        }
+        lookup_wacc[i] = wacc > 0. ? 1. / wacc : 0.;
+    }
+
+    /* hand each package a contiguous, near-equal slice of the output columns */
+    int cols = col_out2 - col_out1;
+    int pkgs = get_total_packages();
+    int col1 = col_out1;
+    for(int i = 0; i < pkgs; i++) {
+        SamplePackage *package = (SamplePackage*)get_package(i);
+        int col2 = (i + 1) * cols / pkgs + col_out1;
+        package->out_col1 = col1;
+        package->out_col2 = col2;
+        col1 = col2;
+    }
+}
+
+/* Returns a newly allocated SampleUnit constructed with this engine. */
+LoadClient* SampleEngine::new_client()
+{
+    LoadClient *unit = new SampleUnit(this);
+    return unit;
+}
+
+/* Returns a newly allocated SamplePackage; its column range is set by init_packages(). */
+LoadPackage* SampleEngine::new_package()
+{
+    LoadPackage *pkg = new SamplePackage;
+    return pkg;
+}
+
+