cinelerra-5.0/cinelerra/overlayframe.C

   1
   2 /*
   3  * CINELERRA
   4  * Copyright (C) 2008 Adam Williams <broadcast at earthling dot net>
   5  * Copyright (C) 2012 Monty <monty@xiph.org>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  */
  22
  23 #include <math.h>
  24 #include <stdio.h>
  25 #include <string.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <unistd.h>
  29
  30 #include "clip.h"
  31 #include "edl.inc"
  32 #include "mutex.h"
  33 #include "overlayframe.h"
  34 #include "units.h"
  35 #include "vframe.h"
  36
  37 static inline int   mabs(int32_t v) { return abs(v); }
  38 static inline int   mabs(int64_t v) { return llabs(v); }
  39 static inline float mabs(float v)   { return fabsf(v); }
  40
  41 static inline int32_t aclip(int32_t v, int mx) {
  42         return v < 0 ? 0 : v > mx ? mx : v;
  43 }
  44 static inline int64_t aclip(int64_t v, int mx) {
  45         return v < 0 ? 0 : v > mx ? mx : v;
  46 }
  47 static inline float   aclip(float v, float mx) {
  48         return v < 0 ? 0 : v > mx ? mx : v;
  49 }
  50 static inline float   aclip(float v, int mx) {
  51         return v < 0 ? 0 : v > mx ? mx : v;
  52 }
  53 static inline int   aclip(int v, float mx) {
  54         return v < 0 ? 0 : v > mx ? mx : v;
  55 }
  56 static inline int32_t cclip(int32_t v, int mx) {
  57         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  58 }
  59 static inline int64_t cclip(int64_t v, int mx) {
  60         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  61 }
  62 static inline float   cclip(float v, float mx) {
  63         return v > (mx/=2) ? mx : v < (mx=(-mx)) ? mx : v;
  64 }
  65 static inline float   cclip(float v, int mx) {
  66         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  67 }
  68 static inline int   cclip(int v, float mx) {
  69         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  70 }
  71
  72 /*
   73  * New resampler code; replace the original somewhat blurry engine
  74  * with a fairly standard kernel resampling core.  This could be used
  75  * for full affine transformation but only implements scale/translate.
  76  * Mostly reuses the old blending macro code.
  77  *
  78  * Pixel convention:
  79  *
  80  *  1) Pixels are points, not areas or squares.
  81  *
  82  *  2) To maintain the usual edge and scaling conventions, pixels are
  83  *     set inward from the image edge, eg, the left edge of an image is
  84  *     at pixel location x=-.5, not x=0.  Although pixels are not
  85  *     squares, the usual way of stating this is 'the pixel is located
  86  *     at the center of its square'.
  87  *
  88  *  3) Because of 1 and 2, we must truncate and weight the kernel
  89  *     convolution at the edge of the input area.  Otherwise, all
  90  *     resampled areas would be bordered by a transparency halo. E.g.
  91  *     in the old engine, upsampling HDV to 1920x1080 results in the
  92  *     left and right edges being partially transparent and underlying
  93  *     layers shining through.
  94  *
   95  *  4) The contribution of fractional pixels at the edges of input
   96  *     ranges is weighted according to the fraction.  Note that the
  97  *     kernel weighting is adjusted, not the opacity.  This is one
  98  *     exception to 'pixels have no area'.
  99  *
 100  *  5) The opacity of fractional pixels at the edges of the output
 101  *     range is adjusted according to the fraction. This is the other
 102  *     exception to 'pixels have no area'.
 103  *
 104  * Fractional alpha blending has been modified across the board from:
 105  *    output_alpha = input_alpha > output_alpha ? input_alpha : output_alpha;
 106  *  to:
 107  *    output_alpha = output_alpha + ((max - output_alpha) * input_alpha) / max;
 108  */
 109
      /* Resolution of the 1D kernel lookup tables built by OverlayKernel. */
  110 #define TRANSFORM_SPP    (4096)    /* number of data pts per unit x in lookup table */
  111 #define INDEX_FRACTION   (8)       /* bits of fraction past TRANSFORM_SPP on kernel
  112                                       index accumulation */
      /* Half of one lookup-table step; sinc() treats |x| below this as zero. */
  113 #define TRANSFORM_MIN    (.5 / TRANSFORM_SPP)
 114
 115 /* Sinc needed for Lanczos kernel */
 116 static float sinc(const float x)
 117 {
 118         float y = x * M_PI;
 119
 120         if(fabsf(x) < TRANSFORM_MIN)
 121                 return 1.0f;
 122
 123         return sinf(y) / y;
 124 }
 125
 126 /*
 127  * All resampling (except Nearest Neighbor) is performed via
  128  *   transformed 2D resampling kernels built from 1D lookups.
 129  */
 130 OverlayKernel::OverlayKernel(int interpolation_type)
 131 {
 132         int i;
 133         this->type = interpolation_type;
 134
 135         switch(interpolation_type)
 136         {
 137         case BILINEAR:
 138                 width = 1.f;
 139                 lookup = new float[(n = TRANSFORM_SPP) + 1];
 140                 for (i = 0; i <= TRANSFORM_SPP; i++)
 141                         lookup[i] = (float)(TRANSFORM_SPP - i) / TRANSFORM_SPP;
 142                 break;
 143
 144         /* Use a Catmull-Rom filter (not b-spline) */
 145         case BICUBIC:
 146                 width = 2.;
 147                 lookup = new float[(n = 2 * TRANSFORM_SPP) + 1];
 148                 for(i = 0; i <= TRANSFORM_SPP; i++) {
 149                         float x = i / (float)TRANSFORM_SPP;
 150                         lookup[i] = 1.f - 2.5f * x * x + 1.5f * x * x * x;
 151                 }
 152                 for(; i <= 2 * TRANSFORM_SPP; i++) {
 153                         float x = i / (float)TRANSFORM_SPP;
 154                         lookup[i] = 2.f - 4.f * x  + 2.5f * x * x - .5f * x * x * x;
 155                 }
 156                 break;
 157
 158         case LANCZOS:
 159                 width = 3.;
 160                 lookup = new float[(n = 3 * TRANSFORM_SPP) + 1];
 161                 for (i = 0; i <= 3 * TRANSFORM_SPP; i++)
 162                         lookup[i] = sinc((float)i / TRANSFORM_SPP) *
 163                                 sinc((float)i / TRANSFORM_SPP / 3.0f);
 164                 break;
 165
 166         default:
 167                 width = 0.;
 168                 lookup = 0;
 169                 n = 0;
 170                 break;
 171         }
 172 }
 173
 174 OverlayKernel::~OverlayKernel()
 175 {
 176         if(lookup) delete [] lookup;
 177 }
 178
 179 OverlayFrame::OverlayFrame(int cpus)
 180 {
 181         direct_engine = 0;
 182         nn_engine = 0;
 183         sample_engine = 0;
 184         temp_frame = 0;
 185         memset(kernel, 0, sizeof(kernel));
 186         this->cpus = cpus;
 187 }
 188
 189 OverlayFrame::~OverlayFrame()
 190 {
 191         if(temp_frame) delete temp_frame;
 192
 193         if(direct_engine) delete direct_engine;
 194         if(nn_engine) delete nn_engine;
 195         if(sample_engine) delete sample_engine;
 196
 197         if(kernel[NEAREST_NEIGHBOR]) delete kernel[NEAREST_NEIGHBOR];
 198         if(kernel[BILINEAR]) delete kernel[BILINEAR];
 199         if(kernel[BICUBIC]) delete kernel[BICUBIC];
 200         if(kernel[LANCZOS]) delete kernel[LANCZOS];
 201 }
 202
 203 static float epsilon_snap(float f)
 204 {
 205         return rintf(f * 1024) / 1024.;
 206 }
 207
 208 int OverlayFrame::overlay(VFrame *output, VFrame *input,
 209         float in_x1, float in_y1, float in_x2, float in_y2,
 210         float out_x1, float out_y1, float out_x2, float out_y2,
 211         float alpha, int mode, int interpolation_type)
 212 {
 213         in_x1 = epsilon_snap(in_x1);
 214         in_x2 = epsilon_snap(in_x2);
 215         in_y1 = epsilon_snap(in_y1);
 216         in_y2 = epsilon_snap(in_y2);
 217         out_x1 = epsilon_snap(out_x1);
 218         out_x2 = epsilon_snap(out_x2);
 219         out_y1 = epsilon_snap(out_y1);
 220         out_y2 = epsilon_snap(out_y2);
 221
 222         if (isnan(in_x1) || isnan(in_x2) ||
 223                 isnan(in_y1) || isnan(in_y2) ||
 224                 isnan(out_x1) || isnan(out_x2) ||
 225                 isnan(out_y1) || isnan(out_y2)) return 1;
 226
 227         if(in_x1 < 0) in_x1 = 0;
 228         if(in_y1 < 0) in_y1 = 0;
 229         if(in_x2 > input->get_w()) in_x2 = input->get_w();
 230         if(in_y2 > input->get_h()) in_y2 = input->get_h();
 231         if(out_x1 < 0) out_x1 = 0;
 232         if(out_y1 < 0) out_y1 = 0;
 233         if(out_x2 > output->get_w()) out_x2 = output->get_w();
 234         if(out_y2 > output->get_h()) out_y2 = output->get_h();
 235
 236         float xscale = (out_x2 - out_x1) / (in_x2 - in_x1);
 237         float yscale = (out_y2 - out_y1) / (in_y2 - in_y1);
 238
 239         /* don't interpolate integer translations, or scale no-ops */
 240         if(xscale == 1. && yscale == 1. &&
 241                 (int)in_x1 == in_x1 && (int)in_x2 == in_x2 &&
 242                 (int)in_y1 == in_y1 && (int)in_y2 == in_y2 &&
 243                 (int)out_x1 == out_x1 && (int)out_x2 == out_x2 &&
 244                 (int)out_y1 == out_y1 && (int)out_y2 == out_y2) {
 245                 if(!direct_engine) direct_engine = new DirectEngine(cpus);
 246
 247                 direct_engine->output = output;   direct_engine->input = input;
 248                 direct_engine->in_x1 = in_x1;     direct_engine->in_y1 = in_y1;
 249                 direct_engine->out_x1 = out_x1;   direct_engine->out_x2 = out_x2;
 250                 direct_engine->out_y1 = out_y1;   direct_engine->out_y2 = out_y2;
 251                 direct_engine->alpha = alpha;     direct_engine->mode = mode;
 252                 direct_engine->process_packages();
 253         }
 254         else if(interpolation_type == NEAREST_NEIGHBOR) {
 255                 if(!nn_engine) nn_engine = new NNEngine(cpus);
 256                 nn_engine->output = output;       nn_engine->input = input;
 257                 nn_engine->in_x1 = in_x1;         nn_engine->in_x2 = in_x2;
 258                 nn_engine->in_y1 = in_y1;         nn_engine->in_y2 = in_y2;
 259                 nn_engine->out_x1 = out_x1;       nn_engine->out_x2 = out_x2;
 260                 nn_engine->out_y1 = out_y1;       nn_engine->out_y2 = out_y2;
 261                 nn_engine->alpha = alpha;         nn_engine->mode = mode;
 262                 nn_engine->process_packages();
 263         }
 264         else {
 265                 int xtype = BILINEAR;
 266                 int ytype = BILINEAR;
 267
 268                 switch(interpolation_type)
 269                 {
 270                 case CUBIC_CUBIC: // Bicubic enlargement and reduction
 271                         xtype = ytype = BICUBIC;
 272                         break;
 273                 case CUBIC_LINEAR: // Bicubic enlargement and bilinear reduction
 274                         xtype = xscale > 1. ? BICUBIC : BILINEAR;
 275                         ytype = yscale > 1. ? BICUBIC : BILINEAR;
 276                         break;
 277                 case LINEAR_LINEAR: // Bilinear enlargement and bilinear reduction
 278                         xtype = ytype = BILINEAR;
 279                         break;
 280                 case LANCZOS_LANCZOS: // Because we can
 281                         xtype = ytype = LANCZOS;
 282                         break;
 283                 }
 284
 285                 if(xscale == 1. && (int)in_x1 == in_x1 && (int)in_x2 == in_x2 &&
 286                                 (int)out_x1 == out_x1 && (int)out_x2 == out_x2)
 287                         xtype = DIRECT_COPY;
 288
 289                 if(yscale == 1. && (int)in_y1 == in_y1 && (int)in_y2 == in_y2 &&
 290                                 (int)out_y1 == out_y1 && (int)out_y2 == out_y2)
 291                         ytype = DIRECT_COPY;
 292
 293                 if(!kernel[xtype])
 294                         kernel[xtype] = new OverlayKernel(xtype);
 295                 if(!kernel[ytype])
 296                         kernel[ytype] = new OverlayKernel(ytype);
 297
 298 /*
 299  * horizontal and vertical are separately resampled.  First we
 300  * resample the input along X into a transposed, temporary frame,
 301  * then resample/transpose the temporary space along X into the
 302  * output.  Fractional pixels along the edge are handled in the X
 303  * direction of each step
 304  */
 305                 // resampled dimension matches the transposed output space
 306                 float temp_y1 = out_x1 - floor(out_x1);
 307                 float temp_y2 = temp_y1 + (out_x2 - out_x1);
 308                 int temp_h = ceil(temp_y2);
 309
 310                 // non-resampled dimension merely cropped
 311                 float temp_x1 = in_y1 - floor(in_y1);
 312                 float temp_x2 = temp_x1 + (in_y2 - in_y1);
 313                 int temp_w = ceil(temp_x2);
 314
 315                 if( temp_frame &&
 316                    (temp_frame->get_color_model() != input->get_color_model() ||
 317                     temp_frame->get_w() != temp_w || temp_frame->get_h() != temp_h) ) {
 318                         delete temp_frame;
 319                         temp_frame = 0;
 320                 }
 321
 322                 if(!temp_frame) {
 323                         temp_frame = new VFrame(0, -1, temp_w, temp_h,
 324                                 input->get_color_model(), -1);
 325                 }
 326
 327                 temp_frame->clear_frame();
 328
 329                 if(!sample_engine) sample_engine = new SampleEngine(cpus);
 330
 331                 sample_engine->output = temp_frame;
 332                 sample_engine->input = input;
 333                 sample_engine->kernel = kernel[xtype];
 334                 sample_engine->col_out1 = 0;
 335                 sample_engine->col_out2 = temp_w;
 336                 sample_engine->row_in = floor(in_y1);
 337
 338                 sample_engine->in1 = in_x1;
 339                 sample_engine->in2 = in_x2;
 340                 sample_engine->out1 = temp_y1;
 341                 sample_engine->out2 = temp_y2;
 342                 sample_engine->alpha = 1.;
 343                 sample_engine->mode = TRANSFER_REPLACE;
 344                 sample_engine->process_packages();
 345
 346                 sample_engine->output = output;
 347                 sample_engine->input = temp_frame;
 348                 sample_engine->kernel = kernel[ytype];
 349                 sample_engine->col_out1 = floor(out_x1);
 350                 sample_engine->col_out2 = ceil(out_x2);
 351                 sample_engine->row_in = 0;
 352
 353                 sample_engine->in1 = temp_x1;
 354                 sample_engine->in2 = temp_x2;
 355                 sample_engine->out1 = out_y1;
 356                 sample_engine->out2 = out_y2;
 357                 sample_engine->alpha = alpha;
 358                 sample_engine->mode = mode;
 359                 sample_engine->process_packages();
 360         }
 361         return 0;
 362 }
 363
 364 // NORMAL       [Sa * Sa + Da * (1 - Sa), Sc * Sa + Dc * (1 - Sa)])
 365 #define ALPHA_NORMAL(mx, Sa, Da) ((Sa * Sa + Da * (mx - Sa)) / mx)
 366 #define COLOR_NORMAL(mx, Sc, Sa, Dc, Da) ((Sc * Sa + Dc * (mx - Sa)) / mx)
 367 #define CHROMA_NORMAL COLOR_NORMAL
 368
 369 // ADDITION     [(Sa + Da), (Sc + Dc)]
 370 #define ALPHA_ADDITION(mx, Sa, Da) (Sa + Da)
 371 #define COLOR_ADDITION(mx, Sc, Sa, Dc, Da) (Sc + Dc)
 372 #define CHROMA_ADDITION(mx, Sc, Sa, Dc, Da) (Sc + Dc)
 373
 374 // SUBTRACT     [(Sa - Da), (Sc - Dc)]
 375 #define ALPHA_SUBTRACT(mx, Sa, Da) (Sa - Da)
 376 #define COLOR_SUBTRACT(mx, Sc, Sa, Dc, Da) (Sc - Dc)
 377 #define CHROMA_SUBTRACT(mx, Sc, Sa, Dc, Da) (Sc - Dc)
 378
 379 // MULTIPLY     [(Sa * Da), Sc * Dc]
 380 #define ALPHA_MULTIPLY(mx, Sa, Da) ((Sa * Da) / mx)
 381 #define COLOR_MULTIPLY(mx, Sc, Sa, Dc, Da) ((Sc * Dc) / mx)
 382 #define CHROMA_MULTIPLY(mx, Sc, Sa, Dc, Da) ((Sc * Dc) / mx)
 383
 384 // DIVIDE       [(Sa / Da), (Sc / Dc)]
 385 #define ALPHA_DIVIDE(mx, Sa, Da) (Da ? ((Sa * mx) / Da) : mx)
 386 #define COLOR_DIVIDE(mx, Sc, Sa, Dc, Da) (Dc ? ((Sc * mx) / Dc) : mx)
 387 #define CHROMA_DIVIDE(mx, Sc, Sa, Dc, Da) (Dc ? ((Sc * mx) / Dc) : mx)
 388
 389 // REPLACE      [Sa, Sc] (fade = 1)
 390 #define ALPHA_REPLACE(mx, Sa, Da) Sa
 391 #define COLOR_REPLACE(mx, Sc, Sa, Dc, Da) Sc
 392 #define CHROMA_REPLACE COLOR_REPLACE
 393
 394 // MAX          [max(Sa, Da), MAX(Sc, Dc)]
 395 #define ALPHA_MAX(mx, Sa, Da) (Sa > Da ? Sa : Da)
 396 #define COLOR_MAX(mx, Sc, Sa, Dc, Da) (Sc > Dc ? Sc : Dc)
 397 #define CHROMA_MAX(mx, Sc, Sa, Dc, Da) (Sc > Dc ? Sc : Dc)
 398
 399 // MIN          [min(Sa, Da), MIN(Sc, Dc)]
 400 #define ALPHA_MIN(mx, Sa, Da) (Sa < Da ? Sa : Da)
 401 #define COLOR_MIN(mx, Sc, Sa, Dc, Da) (Sc < Dc ? Sc : Dc)
 402 #define CHROMA_MIN(mx, Sc, Sa, Dc, Da) (Sc < Dc ? Sc : Dc)
 403
 404 // AVERAGE      [(Sa + Da) * 0.5, (Sc + Dc) * 0.5]
 405 #define ALPHA_AVERAGE(mx, Sa, Da) ((Sa + Da) / 2)
 406 #define COLOR_AVERAGE(mx, Sc, Sa, Dc, Da) ((Sc + Dc) / 2)
 407 #define CHROMA_AVERAGE COLOR_AVERAGE
 408
 409 // DARKEN       [Sa + Da - Sa*Da, Sc*(1 - Da) + Dc*(1 - Sa) + min(Sc, Dc)]
 410 #define ALPHA_DARKEN(mx, Sa, Da) (Sa + Da - (Sa * Da) / mx)
 411 #define COLOR_DARKEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (Sc < Dc ? Sc : Dc))
 412 #define CHROMA_DARKEN COLOR_DARKEN
 413
 414 // LIGHTEN      [Sa + Da - Sa*Da, Sc*(1 - Da) + Dc*(1 - Sa) + max(Sc, Dc)]
 415 #define ALPHA_LIGHTEN(mx, Sa, Da) (Sa + Da - Sa * Da / mx)
 416 #define COLOR_LIGHTEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (Sc > Dc ? Sc : Dc))
 417 #define CHROMA_LIGHTEN COLOR_LIGHTEN
 418
 419 // DST          [Da, Dc]
 420 #define ALPHA_DST(mx, Sa, Da) Da
 421 #define COLOR_DST(mx, Sc, Sa, Dc, Da) Dc
 422 #define CHROMA_DST COLOR_DST
 423
 424 // DST_ATOP     [Sa, Sc * (1 - Da) + Dc * Sa]
 425 #define ALPHA_DST_ATOP(mx, Sa, Da) Sa
 426 #define COLOR_DST_ATOP(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * Sa) / mx)
 427 #define CHROMA_DST_ATOP COLOR_DST_ATOP
 428
 429 // DST_IN       [Da * Sa, Dc * Sa]
 430 #define ALPHA_DST_IN(mx, Sa, Da) ((Da * Sa) / mx)
 431 #define COLOR_DST_IN(mx, Sc, Sa, Dc, Da) ((Dc * Sa) / mx)
 432 #define CHROMA_DST_IN COLOR_DST_IN
 433
 434 // DST_OUT      [Da * (1 - Sa), Dc * (1 - Sa)]
 435 #define ALPHA_DST_OUT(mx, Sa, Da) (Da * (mx - Sa) / mx)
 436 #define COLOR_DST_OUT(mx, Sc, Sa, Dc, Da) (Dc * (mx - Sa) / mx)
 437 #define CHROMA_DST_OUT COLOR_DST_OUT
 438
 439 // DST_OVER     [Sa * (1 - Da) + Da, Sc * (1 - Da) + Dc]
 440 #define ALPHA_DST_OVER(mx, Sa, Da) ((Sa * (mx - Da)) / mx + Da)
 441 #define COLOR_DST_OVER(mx, Sc, Sa, Dc, Da) (Sc * (mx - Da)/ mx + Dc)
 442 #define CHROMA_DST_OVER COLOR_DST_OVER
 443
 444 // SRC                  [Sa, Sc]
 445 #define ALPHA_SRC(mx, Sa, Da) Sa
 446 #define COLOR_SRC(mx, Sc, Sa, Dc, Da) Sc
 447 #define CHROMA_SRC COLOR_SRC
 448
 449 // SRC_ATOP     [Da, Sc * Da + Dc * (1 - Sa)]
 450 #define ALPHA_SRC_ATOP(mx, Sa, Da) Da
 451 #define COLOR_SRC_ATOP(mx, Sc, Sa, Dc, Da) ((Sc * Da + Dc * (mx - Sa)) / mx)
 452 #define CHROMA_SRC_ATOP COLOR_SRC_ATOP
 453
 454 // SRC_IN       [Sa * Da, Sc * Da]
 455 #define ALPHA_SRC_IN(mx, Sa, Da) ((Sa * Da) / mx)
 456 #define COLOR_SRC_IN(mx, Sc, Sa, Dc, Da) (Sc * Da / mx)
 457 #define CHROMA_SRC_IN COLOR_SRC_IN
 458
 459 // SRC_OUT      [Sa * (1 - Da), Sc * (1 - Da)]
 460 #define ALPHA_SRC_OUT(mx, Sa, Da) (Sa * (mx - Da) / mx)
 461 #define COLOR_SRC_OUT(mx, Sc, Sa, Dc, Da) (Sc * (mx - Da) / mx)
 462 #define CHROMA_SRC_OUT COLOR_SRC_OUT
 463
 464 // SRC_OVER     [Sa + Da * (1 - Sa), Sc + (1 - Sa) * Dc]
 465 #define ALPHA_SRC_OVER(mx, Sa, Da) (Sa + Da * (mx - Sa) / mx)
 466 #define COLOR_SRC_OVER(mx, Sc, Sa, Dc, Da) (Sc + Dc * (mx - Sa) / mx)
 467 #define CHROMA_SRC_OVER COLOR_SRC_OVER
 468
 469 // OR   [Sa + Da - Sa * Da, Sc + Dc - Sc * Dc]
 470 #define ALPHA_OR(mx, Sa, Da) (Sa + Da - (Sa * Da) / mx)
 471 #define COLOR_OR(mx, Sc, Sa, Dc, Da) (Sc + Dc - (Sc * Dc) / mx)
 472 #define CHROMA_OR COLOR_OR
 473
 474 // XOR          [Sa * (1 - Da) + Da * (1 - Sa), Sc * (1 - Da) + Dc * (1 - Sa)]
 475 #define ALPHA_XOR(mx, Sa, Da) ((Sa * (mx - Da) + Da * (mx - Sa)) / mx)
 476 #define COLOR_XOR(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx)
 477 #define CHROMA_XOR COLOR_XOR
 478
      /*
       * ZTYP(ty) declares z_<ty> as an alias of <ty>, marked __unused__ so an
       * alias that is never referenced is not reported as unused.  The color
       * model dispatch tables below name their sample and accumulator types
       * through these aliases (z_float, z_int32_t, z_uint8_t, ...).
       */
  479 #define ZTYP(ty) typedef ty z_##ty __attribute__ ((__unused__))
  480 ZTYP(int8_t);   ZTYP(uint8_t);
  481 ZTYP(int16_t);  ZTYP(uint16_t);
  482 ZTYP(int32_t);  ZTYP(uint32_t);
  483 ZTYP(int64_t);  ZTYP(uint64_t);
  484 ZTYP(float);    ZTYP(double);
 485
      /*
       * ALPHA3_BLEND: blend one pixel that has no alpha channel.
       *   FN   blend name pasted onto COLOR_/CHROMA_ (NORMAL, ADDITION, ...)
       *   typ  accumulator type;  inp/out  source and destination pixel pointers
       *   mx   full-scale value;  ofs  chroma offset (0 for RGB);  rnd  rounding term
       * Needs `fade` in scope and r, g, b already declared by the caller; declares
       * inp0..inp3 and out0..out3 in the enclosing scope.  Components 1 and 2 have
       * ofs subtracted on load.  Source alpha is synthesised as fade * mx + rnd and
       * the destination is taken as fully opaque (out3 = mx).  CHROMA_ formulas are
       * used for components 1 and 2 when ofs is non-zero, COLOR_ formulas otherwise.
       */
  486 #define ALPHA3_BLEND(FN, typ, inp, out, mx, ofs, rnd) \
  487   typ inp0 = (typ)inp[0], inp1 = (typ)inp[1] - ofs; \
  488   typ inp2 = (typ)inp[2] - ofs, inp3 = fade * mx + rnd; \
  489   typ out0 = (typ)out[0], out1 = (typ)out[1] - ofs; \
  490   typ out2 = (typ)out[2] - ofs, out3 = mx; \
  491   r = COLOR_##FN(mx, inp0, inp3, out0, out3); \
  492   if( ofs ) { \
  493     g = CHROMA_##FN(mx, inp1, inp3, out1, out3); \
  494     b = CHROMA_##FN(mx, inp2, inp3, out2, out3); \
  495   } \
  496   else { \
  497     g = COLOR_##FN(mx, inp1, inp3, out1, out3); \
  498     b = COLOR_##FN(mx, inp2, inp3, out2, out3); \
  499   }
  500
      /*
       * ALPHA4_BLEND: as ALPHA3_BLEND, for pixels with an alpha channel in
       * component 3.  Source alpha is inp[3] * fade + rnd, destination alpha is
       * out[3], and the blended alpha is left in `a` (also declared by the caller).
       * The expansion ends without a semicolon; the caller supplies it.
       */
  501 #define ALPHA4_BLEND(FN, typ, inp, out, mx, ofs, rnd) \
  502   typ inp0 = (typ)inp[0], inp1 = (typ)inp[1] - ofs; \
  503   typ inp2 = (typ)inp[2] - ofs, inp3 = (typ)inp[3] * fade + rnd; \
  504   typ out0 = (typ)out[0], out1 = (typ)out[1] - ofs; \
  505   typ out2 = (typ)out[2] - ofs, out3 = out[3]; \
  506   r = COLOR_##FN(mx, inp0, inp3, out0, out3); \
  507   if( ofs ) { \
  508     g = CHROMA_##FN(mx, inp1, inp3, out1, out3); \
  509     b = CHROMA_##FN(mx, inp2, inp3, out2, out3); \
  510   } \
  511   else { \
  512     g = COLOR_##FN(mx, inp1, inp3, out1, out3); \
  513     b = COLOR_##FN(mx, inp2, inp3, out2, out3); \
  514   } \
  515   a = ALPHA_##FN(mx, inp3, out3)
  516
      /* ALPHA_STORE: write r, g, b back to out[0..2], restoring the chroma offset. */
  517 #define ALPHA_STORE(out, ofs, mx) \
  518   out[0] = r; \
  519   out[1] = g + ofs; \
  520   out[2] = b + ofs
  521
      /*
       * ALPHA3_STORE: clamp and store a blended pixel without alpha.
       * r is clamped to [0, mx]; g and b use cclip (range centred on zero) when ofs
       * is non-zero and aclip otherwise.  Needs `opcty` and `trnsp` in scope: when
       * trnsp is non-zero the clamped result is mixed with the original destination
       * (out0..out2 as loaded by the BLEND macro), weighted opcty : trnsp.
       */
  522 #define ALPHA3_STORE(out, ofs, mx) \
  523   r = aclip(r, mx); \
  524   g = ofs ? cclip(g, mx) : aclip(g, mx); \
  525   b = ofs ? cclip(b, mx) : aclip(b, mx); \
  526   if( trnsp ) { \
  527     r = (r * opcty + out0 * trnsp) / mx; \
  528     g = (g * opcty + out1 * trnsp) / mx; \
  529     b = (b * opcty + out2 * trnsp) / mx; \
  530   } \
  531   ALPHA_STORE(out, ofs, mx)
  532
      /*
       * ALPHA4_STORE: as ALPHA3_STORE, plus the alpha channel: `a` takes part in
       * the opcty : trnsp mix and is clamped to [0, mx] as it is written to out[3].
       */
  533 #define ALPHA4_STORE(out, ofs, mx) \
  534   r = aclip(r, mx); \
  535   g = ofs ? cclip(g, mx) : aclip(g, mx); \
  536   b = ofs ? cclip(b, mx) : aclip(b, mx); \
  537   if( trnsp ) { \
  538     r = (r * opcty + out0 * trnsp) / mx; \
  539     g = (g * opcty + out1 * trnsp) / mx; \
  540     b = (b * opcty + out2 * trnsp) / mx; \
  541     a = (a * opcty + out3 * trnsp) / mx; \
  542   } \
  543   ALPHA_STORE(out, ofs, mx); \
  544   out[3] = aclip(a, mx)
 545
      /*
       * XBLEND: body of one color-model case for the direct (unscaled) engine.
       *   FN             blend name (NORMAL, ADDITION, ...)
       *   temp_type      accumulator type;  type  stored sample type
       *   max            full-scale value;  components  samples per pixel (3 or 4)
       *   chroma_offset  offset of components 1 and 2 (0 for RGB);  round  rounding term
       * Uses these names from the expanding scope: alpha, fade, output, input, pkg,
       * ix, iy, ox, ow.  ix and ox are scaled in place from pixels to samples.
       * Rows pkg->out_row1 up to (not including) pkg->out_row2 are processed; output
       * row i reads input row i + iy.  Inside the row loop `output` is shadowed by
       * the current destination pixel pointer.  The expansion ends in `break`, so
       * it is meant to be used directly as a switch case body.
       */
  546 #define XBLEND(FN, temp_type, type, max, components, chroma_offset, round) { \
  547         temp_type opcty = alpha * max + round, trnsp = max - opcty; \
  548         type** output_rows = (type**)output->get_rows(); \
  549         type** input_rows = (type**)input->get_rows(); \
  550         ix *= components;  ox *= components; \
  551  \
  552         for(int i = pkg->out_row1; i < pkg->out_row2; i++) { \
  553                 type* in_row = input_rows[i + iy] + ix; \
  554                 type* output = output_rows[i] + ox; \
  555                 for(int j = 0; j < ow; j++) { \
  556                         if( components == 4 ) { \
  557                                 temp_type r, g, b, a; \
  558                                 ALPHA4_BLEND(FN, temp_type, in_row, output, max, chroma_offset, round); \
  559                                 ALPHA4_STORE(output, chroma_offset, max); \
  560                         } \
  561                         else { \
  562                                 temp_type r, g, b; \
  563                                 ALPHA3_BLEND(FN, temp_type, in_row, output, max, chroma_offset, round); \
  564                                 ALPHA3_STORE(output, chroma_offset, max); \
  565                         } \
  566                         in_row += components;  output += components; \
  567                 } \
  568         } \
  569         break; \
  570 }
  571
      /*
       * XBLEND_ONLY: dispatch blend FN on the input frame's color model, choosing
       * accumulator type, sample type, full-scale value, component count, chroma
       * offset and rounding term for each model.  A color model that is not listed
       * is left untouched.  The trailing `break` leaves the switch this macro is
       * expanded in.
       */
  572 #define XBLEND_ONLY(FN) { \
  573         switch(input->get_color_model()) { \
  574         case BC_RGB_FLOAT:      XBLEND(FN, z_float,   z_float,    1.f,    3, 0,      0.f); \
  575         case BC_RGBA_FLOAT:     XBLEND(FN, z_float,   z_float,    1.f,    4, 0,      0.f); \
  576         case BC_RGB888:         XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
  577         case BC_YUV888:         XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
  578         case BC_RGBA8888:       XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
  579         case BC_YUVA8888:       XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
  580         case BC_RGB161616:      XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
  581         case BC_YUV161616:      XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
  582         case BC_RGBA16161616:   XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
  583         case BC_YUVA16161616:   XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
  584         } \
  585         break; \
  586 }
 587
 588 /* Direct translate / blend **********************************************/
 589
 590 DirectPackage::DirectPackage()
 591 {
 592 }
 593
 594 DirectUnit::DirectUnit(DirectEngine *server)
 595  : LoadClient(server)
 596 {
 597         this->engine = server;
 598 }
 599
 600 DirectUnit::~DirectUnit()
 601 {
 602 }
 603
      /*
       * Blend the rows assigned to one package, copying pixels 1:1 from the
       * engine's input frame to its output frame (no scaling).
       *
       * The local names below (pkg, output, input, fade, alpha, ix, ox, ow, iy)
       * are referenced by name inside the XBLEND / ALPHA*_BLEND / ALPHA*_STORE
       * macros, so they must not be renamed.
       */
  604 void DirectUnit::process_package(LoadPackage *package)
  605 {
  606         DirectPackage *pkg = (DirectPackage*)package;
  607
  608         VFrame *output = engine->output;
  609         VFrame *input = engine->input;
  610         int mode = engine->mode;
              // fade scales the source alpha inside the blend macros
  611         float fade = engine->alpha;
              // alpha becomes the opcty/trnsp mix in XBLEND.  The condition groups as
              // (has_alpha && mode == TRANSFER_REPLACE) ? 1.f : engine->alpha, so a
              // REPLACE from a source with an alpha channel is never mixed back
              // with the destination.
  612         float alpha =
  613                 BC_CModels::has_alpha(input->get_color_model()) &&
  614                 mode == TRANSFER_REPLACE ? 1.f : engine->alpha;
  615
              // ix/ox: first input/output column (in pixels; XBLEND rescales them to
              // samples), ow: columns per row, iy: added to an output row index to
              // get the matching input row
  616         int ix = engine->in_x1;
  617         int ox = engine->out_x1;
  618         int ow = engine->out_x2 - ox;
  619         int iy = engine->in_y1 - engine->out_y1;
  620
              // every XBLEND_ONLY expansion ends in `break`; a mode that is not
              // listed falls out of the switch without drawing
  621         switch( mode ) {
  622         case TRANSFER_NORMAL:           XBLEND_ONLY(NORMAL);
  623         case TRANSFER_ADDITION:         XBLEND_ONLY(ADDITION);
  624         case TRANSFER_SUBTRACT:         XBLEND_ONLY(SUBTRACT);
  625         case TRANSFER_MULTIPLY:         XBLEND_ONLY(MULTIPLY);
  626         case TRANSFER_DIVIDE:           XBLEND_ONLY(DIVIDE);
  627         case TRANSFER_REPLACE:          XBLEND_ONLY(REPLACE);
  628         case TRANSFER_MAX:              XBLEND_ONLY(MAX);
  629         case TRANSFER_MIN:              XBLEND_ONLY(MIN);
  630         case TRANSFER_AVERAGE:          XBLEND_ONLY(AVERAGE);
  631         case TRANSFER_DARKEN:           XBLEND_ONLY(DARKEN);
  632         case TRANSFER_LIGHTEN:          XBLEND_ONLY(LIGHTEN);
  633         case TRANSFER_DST:              XBLEND_ONLY(DST);
  634         case TRANSFER_DST_ATOP:         XBLEND_ONLY(DST_ATOP);
  635         case TRANSFER_DST_IN:           XBLEND_ONLY(DST_IN);
  636         case TRANSFER_DST_OUT:          XBLEND_ONLY(DST_OUT);
  637         case TRANSFER_DST_OVER:         XBLEND_ONLY(DST_OVER);
  638         case TRANSFER_SRC:              XBLEND_ONLY(SRC);
  639         case TRANSFER_SRC_ATOP:         XBLEND_ONLY(SRC_ATOP);
  640         case TRANSFER_SRC_IN:           XBLEND_ONLY(SRC_IN);
  641         case TRANSFER_SRC_OUT:          XBLEND_ONLY(SRC_OUT);
  642         case TRANSFER_SRC_OVER:         XBLEND_ONLY(SRC_OVER);
  643         case TRANSFER_OR:               XBLEND_ONLY(OR);
  644         case TRANSFER_XOR:              XBLEND_ONLY(XOR);
  645         }
  646 }
 647
 648 DirectEngine::DirectEngine(int cpus)
 649  : LoadServer(cpus, cpus)
 650 {
 651 }
 652
 653 DirectEngine::~DirectEngine()
 654 {
 655 }
 656
 657 void DirectEngine::init_packages()
 658 {
 659         if(in_x1 < 0) { out_x1 -= in_x1; in_x1 = 0; }
 660         if(in_y1 < 0) { out_y1 -= in_y1; in_y1 = 0; }
 661         if(out_x1 < 0) { in_x1 -= out_x1; out_x1 = 0; }
 662         if(out_y1 < 0) { in_y1 -= out_y1; out_y1 = 0; }
 663         if(out_x2 > output->get_w()) out_x2 = output->get_w();
 664         if(out_y2 > output->get_h()) out_y2 = output->get_h();
 665         int out_w = out_x2 - out_x1;
 666         int out_h = out_y2 - out_y1;
 667         if( !out_w || !out_h ) return;
 668
 669         int rows = out_h;
 670         int pkgs = get_total_packages();
 671         int row1 = out_y1, row2 = row1;
 672         for(int i = 0; i < pkgs; row1=row2 ) {
 673                 DirectPackage *package = (DirectPackage*)get_package(i);
 674                 row2 = ++i * rows / pkgs + out_y1;
 675                 package->out_row1 = row1;
 676                 package->out_row2 = row2;
 677         }
 678 }
 679
 680 LoadClient* DirectEngine::new_client()
 681 {
 682         return new DirectUnit(this);
 683 }
 684
 685 LoadPackage* DirectEngine::new_package()
 686 {
 687         return new DirectPackage;
 688 }
 689
 690 /* Nearest Neighbor scale / translate / blend ********************/
 691
      /*
       * XBLEND_3NN: body of one color-model case for the nearest-neighbor engine.
       * Same parameters as XBLEND.  Uses these names from the expanding scope:
       * alpha, fade, output, input, pkg, engine, ox, ow, ly.  ox is scaled in
       * place from pixels to samples.
       * For each output row the input row index is read through `ly`, which
       * advances one entry per row.  Before each pixel the input pointer is advanced
       * by the next entry of engine->in_lookup_x, so those entries act as increments
       * from the previous position rather than absolute columns (presumably already
       * in samples, not pixels -- confirm where the table is built).
       * The expansion ends in `break`, for use as a switch case body.
       */
  692 #define XBLEND_3NN(FN, temp_type, type, max, components, chroma_offset, round) { \
  693         temp_type opcty = alpha * max + round, trnsp = max - opcty; \
  694         type** output_rows = (type**)output->get_rows(); \
  695         type** input_rows = (type**)input->get_rows(); \
  696         ox *= components; \
  697  \
  698         for(int i = pkg->out_row1; i < pkg->out_row2; i++) { \
  699                 int *lx = engine->in_lookup_x; \
  700                 type* in_row = input_rows[*ly++]; \
  701                 type* output = output_rows[i] + ox; \
  702                 for(int j = 0; j < ow; j++) { \
  703                         in_row += *lx++; \
  704                         if( components == 4 ) { \
  705                                 temp_type r, g, b, a; \
  706                                 ALPHA4_BLEND(FN, temp_type, in_row, output, max, chroma_offset, round); \
  707                                 ALPHA4_STORE(output, chroma_offset, max); \
  708                         } \
  709                         else { \
  710                                 temp_type r, g, b; \
  711                                 ALPHA3_BLEND(FN, temp_type, in_row, output, max, chroma_offset, round); \
  712                                 ALPHA3_STORE(output, chroma_offset, max); \
  713                         } \
  714                         output += components; \
  715                 } \
  716         } \
  717         break; \
  718 }
  719
      /*
       * XBLEND_NN: dispatch blend FN on the input frame's color model for the
       * nearest-neighbor engine, with the same per-model parameters as XBLEND_ONLY.
       * The trailing `break` leaves the switch this macro is expanded in.
       */
  720 #define XBLEND_NN(FN) { \
  721         switch(input->get_color_model()) { \
  722         case BC_RGB_FLOAT:      XBLEND_3NN(FN, z_float,   z_float,    1.f,    3, 0,       0.f); \
  723         case BC_RGBA_FLOAT:     XBLEND_3NN(FN, z_float,   z_float,    1.f,    4, 0,       0.f); \
  724         case BC_RGB888:         XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
  725         case BC_YUV888:         XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
  726         case BC_RGBA8888:       XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
  727         case BC_YUVA8888:       XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
  728         case BC_RGB161616:      XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
  729         case BC_YUV161616:      XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
  730         case BC_RGBA16161616:   XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
  731         case BC_YUVA16161616:   XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
  732         } \
  733         break; \
  734 }
 735
 736 NNPackage::NNPackage()
 737 {
 738 }
 739
 740 NNUnit::NNUnit(NNEngine *server)
 741  : LoadClient(server)
 742 {
 743         this->engine = server;
 744 }
 745
 746 NNUnit::~NNUnit()
 747 {
 748 }
 749
 750 void NNUnit::process_package(LoadPackage *package)
 751 {
 752         NNPackage *pkg = (NNPackage*)package;
 753         VFrame *output = engine->output;
 754         VFrame *input = engine->input;
 755         int mode = engine->mode;
 756         float fade = engine->alpha;
 757         float alpha =
 758                 BC_CModels::has_alpha(input->get_color_model()) &&
 759                 mode == TRANSFER_REPLACE ? 1.f : engine->alpha;
 760
 761         int ox = engine->out_x1i;
 762         int ow = engine->out_x2i - ox;
 763         int *ly = engine->in_lookup_y + pkg->out_row1;
 764
 765         switch( mode ) {
 766         case TRANSFER_NORMAL:           XBLEND_NN(NORMAL);
 767         case TRANSFER_ADDITION:         XBLEND_NN(ADDITION);
 768         case TRANSFER_SUBTRACT:         XBLEND_NN(SUBTRACT);
 769         case TRANSFER_MULTIPLY:         XBLEND_NN(MULTIPLY);
 770         case TRANSFER_DIVIDE:           XBLEND_NN(DIVIDE);
 771         case TRANSFER_REPLACE:          XBLEND_NN(REPLACE);
 772         case TRANSFER_MAX:              XBLEND_NN(MAX);
 773         case TRANSFER_MIN:              XBLEND_NN(MIN);
 774         case TRANSFER_AVERAGE:          XBLEND_NN(AVERAGE);
 775         case TRANSFER_DARKEN:           XBLEND_NN(DARKEN);
 776         case TRANSFER_LIGHTEN:          XBLEND_NN(LIGHTEN);
 777         case TRANSFER_DST:              XBLEND_NN(DST);
 778         case TRANSFER_DST_ATOP:         XBLEND_NN(DST_ATOP);
 779         case TRANSFER_DST_IN:           XBLEND_NN(DST_IN);
 780         case TRANSFER_DST_OUT:          XBLEND_NN(DST_OUT);
 781         case TRANSFER_DST_OVER:         XBLEND_NN(DST_OVER);
 782         case TRANSFER_SRC:              XBLEND_NN(SRC);
 783         case TRANSFER_SRC_ATOP:         XBLEND_NN(SRC_ATOP);
 784         case TRANSFER_SRC_IN:           XBLEND_NN(SRC_IN);
 785         case TRANSFER_SRC_OUT:          XBLEND_NN(SRC_OUT);
 786         case TRANSFER_SRC_OVER:         XBLEND_NN(SRC_OVER);
 787         case TRANSFER_OR:               XBLEND_NN(OR);
 788         case TRANSFER_XOR:              XBLEND_NN(XOR);
 789         }
 790 }
 791
 792 NNEngine::NNEngine(int cpus)
 793  : LoadServer(cpus, cpus)
 794 {
 795         in_lookup_x = 0;
 796         in_lookup_y = 0;
 797 }
 798
 799 NNEngine::~NNEngine()
 800 {
 801         if(in_lookup_x)
 802                 delete[] in_lookup_x;
 803         if(in_lookup_y)
 804                 delete[] in_lookup_y;
 805 }
 806
 807 void NNEngine::init_packages()
 808 {
 809         int in_w = input->get_w();
 810         int in_h = input->get_h();
 811         int out_w = output->get_w();
 812         int out_h = output->get_h();
 813
 814         float in_subw = in_x2 - in_x1;
 815         float in_subh = in_y2 - in_y1;
 816         float out_subw = out_x2 - out_x1;
 817         float out_subh = out_y2 - out_y1;
 818         int first, last, count, i;
 819         int components = 3;
 820
 821         out_x1i = rint(out_x1);
 822         out_x2i = rint(out_x2);
 823         if(out_x1i < 0) out_x1i = 0;
 824         if(out_x1i > out_w) out_x1i = out_w;
 825         if(out_x2i < 0) out_x2i = 0;
 826         if(out_x2i > out_w) out_x2i = out_w;
 827         int out_wi = out_x2i - out_x1i;
 828         if( !out_wi ) return;
 829
 830         delete[] in_lookup_x;
 831         in_lookup_x = new int[out_wi];
 832         delete[] in_lookup_y;
 833         in_lookup_y = new int[out_h];
 834
 835         switch(input->get_color_model()) {
 836         case BC_RGBA_FLOAT:
 837         case BC_RGBA8888:
 838         case BC_YUVA8888:
 839         case BC_RGBA16161616:
 840                 components = 4;
 841                 break;
 842         }
 843
 844         first = count = 0;
 845
 846         for(i = out_x1i; i < out_x2i; i++) {
 847                 int in = (i - out_x1 + .5) * in_subw / out_subw + in_x1;
 848                 if(in < in_x1)
 849                         in = in_x1;
 850                 if(in > in_x2)
 851                         in = in_x2;
 852
 853                 if(in >= 0 && in < in_w && in >= in_x1 && i >= 0 && i < out_w) {
 854                         if(count == 0) {
 855                                 first = i;
 856                                 in_lookup_x[0] = in * components;
 857                         }
 858                         else {
 859                                 in_lookup_x[count] = (in-last)*components;
 860                         }
 861                         last = in;
 862                         count++;
 863                 }
 864                 else if(count)
 865                         break;
 866         }
 867         out_x1i = first;
 868         out_x2i = first + count;
 869         first = count = 0;
 870
 871         for(i = out_y1; i < out_y2; i++) {
 872                 int in = (i - out_y1+.5) * in_subh / out_subh + in_y1;
 873                 if(in < in_y1) in = in_y1;
 874                 if(in > in_y2) in = in_y2;
 875                 if(in >= 0 && in < in_h && i >= 0 && i < out_h) {
 876                         if(count == 0) first = i;
 877                         in_lookup_y[i] = in;
 878                         count++;
 879                 }
 880                 else if(count)
 881                         break;
 882         }
 883         out_y1 = first;
 884         out_y2 = first + count;
 885
 886         int rows = count;
 887         int pkgs = get_total_packages();
 888         int row1 = out_y1, row2 = row1;
 889         for(int i = 0; i < pkgs; row1=row2 ) {
 890                 NNPackage *package = (NNPackage*)get_package(i);
 891                 row2 = ++i * rows / pkgs + out_y1;
 892                 package->out_row1 = row1;
 893                 package->out_row2 = row2;
 894         }
 895 }
 896
 897 LoadClient* NNEngine::new_client()
 898 {
 899         return new NNUnit(this);
 900 }
 901
 902 LoadPackage* NNEngine::new_package()
 903 {
 904         return new NNPackage;
 905 }
 906
 907 /* Fully resampled scale / translate / blend ******************************/
 908 /* resample into a temporary row vector, then blend */
 909
 910 #define XSAMPLE(FN, temp_type, type, max, components, chroma_offset, round) { \
 911         float temp[oh*components]; \
 912         temp_type opcty = alpha * max + round, trnsp = max - opcty; \
 913         type **output_rows = (type**)voutput->get_rows() + o1i; \
 914         type **input_rows = (type**)vinput->get_rows(); \
 915  \
 916         for(int i = pkg->out_col1; i < pkg->out_col2; i++) { \
 917                 type *input = input_rows[i - engine->col_out1 + engine->row_in]; \
 918                 float *tempp = temp; \
 919                 if( !k ) { /* direct copy case */ \
 920                         type *ip = input + i1i * components; \
 921                         for(int j = 0; j < oh; j++) { \
 922                                 *tempp++ = *ip++; \
 923                                 *tempp++ = *ip++ - chroma_offset; \
 924                                 *tempp++ = *ip++ - chroma_offset; \
 925                                 if( components == 4 ) *tempp++ = *ip++; \
 926                         } \
 927                 } \
 928                 else { /* resample */ \
 929                         for(int j = 0; j < oh; j++) { \
 930                                 float racc=0.f, gacc=0.f, bacc=0.f, aacc=0.f; \
 931                                 int ki = lookup_sk[j], x = lookup_sx0[j]; \
 932                                 type *ip = input + x * components; \
 933                                 float wacc = 0, awacc = 0; \
 934                                 while(x++ < lookup_sx1[j]) { \
 935                                         float kv = k[abs(ki >> INDEX_FRACTION)]; \
 936                                         /* handle fractional pixels on edges of input */ \
 937                                         if(x == i1i) kv *= i1f; \
 938                                         if(x + 1 == i2i) kv *= i2f; \
 939                                         if( components == 4 ) { awacc += kv;  kv *= ip[3]; } \
 940                                         wacc += kv; \
 941                                         racc += kv * *ip++; \
 942                                         gacc += kv * (*ip++ - chroma_offset); \
 943                                         bacc += kv * (*ip++ - chroma_offset); \
 944                                         if( components == 4 ) { aacc += kv;  ++ip; } \
 945                                         ki += kd; \
 946                                 } \
 947                                 if(wacc > 0.) wacc = 1. / wacc; \
 948                                 *tempp++ = racc * wacc; \
 949                                 *tempp++ = gacc * wacc; \
 950                                 *tempp++ = bacc * wacc; \
 951                                 if( components == 4 ) { \
 952                                         if(awacc > 0.) awacc = 1. / awacc; \
 953                                         *tempp++ = aacc * awacc; \
 954                                 } \
 955                         } \
 956                 } \
 957  \
 958                 /* handle fractional pixels on edges of output */ \
 959                 temp[0] *= o1f;   temp[1] *= o1f;   temp[2] *= o1f; \
 960                 if( components == 4 ) temp[3] *= o1f; \
 961                 tempp = temp + (oh-1)*components; \
 962                 tempp[0] *= o2f;  tempp[1] *= o2f;  tempp[2] *= o2f; \
 963                 if( components == 4 ) tempp[3] *= o2f; \
 964                 tempp = temp; \
 965                 /* blend output */ \
 966                 for(int j = 0; j < oh; j++) { \
 967                         type *output = output_rows[j] + i * components; \
 968                         if( components == 4 ) { \
 969                                 temp_type r, g, b, a; \
 970                                 ALPHA4_BLEND(FN, temp_type, tempp, output, max, 0, round); \
 971                                 ALPHA4_STORE(output, chroma_offset, max); \
 972                         } \
 973                         else { \
 974                                 temp_type r, g, b; \
 975                                 ALPHA3_BLEND(FN, temp_type, tempp, output, max, 0, round); \
 976                                 ALPHA3_STORE(output, chroma_offset, max); \
 977                         } \
 978                         tempp += components; \
 979                 } \
 980         } \
 981         break; \
 982 }
 983
 984 #define XBLEND_SAMPLE(FN) { \
 985         switch(vinput->get_color_model()) { \
 986         case BC_RGB_FLOAT:      XSAMPLE(FN, z_float,   z_float,    1.f,    3, 0.f,    0.f); \
 987         case BC_RGBA_FLOAT:     XSAMPLE(FN, z_float,   z_float,    1.f,    4, 0.f,    0.f); \
 988         case BC_RGB888:         XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
 989         case BC_YUV888:         XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
 990         case BC_RGBA8888:       XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
 991         case BC_YUVA8888:       XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
 992         case BC_RGB161616:      XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
 993         case BC_YUV161616:      XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
 994         case BC_RGBA16161616:   XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
 995         case BC_YUVA16161616:   XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
 996         } \
 997         break; \
 998 }
 999
1000
1001 SamplePackage::SamplePackage()
1002 {
1003 }
1004
1005 SampleUnit::SampleUnit(SampleEngine *server)
1006  : LoadClient(server)
1007 {
1008         this->engine = server;
1009 }
1010
1011 SampleUnit::~SampleUnit()
1012 {
1013 }
1014
1015 void SampleUnit::process_package(LoadPackage *package)
1016 {
1017         SamplePackage *pkg = (SamplePackage*)package;
1018
1019         float i1  = engine->in1;
1020         float i2  = engine->in2;
1021         float o1  = engine->out1;
1022         float o2  = engine->out2;
1023
1024         if(i2 - i1 <= 0 || o2 - o1 <= 0)
1025                 return;
1026
1027         VFrame *voutput = engine->output;
1028         VFrame *vinput = engine->input;
1029         int mode = engine->mode;
1030         float fade = engine->alpha;
1031         float alpha =
1032                 BC_CModels::has_alpha(vinput->get_color_model()) &&
1033                 mode == TRANSFER_REPLACE ? 1.f : engine->alpha;
1034
1035         //int   iw  = vinput->get_w();
1036         int   i1i = floor(i1);
1037         int   i2i = ceil(i2);
1038         float i1f = 1.f - i1 + i1i;
1039         float i2f = 1.f - i2i + i2;
1040
1041         int   o1i = floor(o1);
1042         int   o2i = ceil(o2);
1043         float o1f = 1.f - o1 + o1i;
1044         float o2f = 1.f - o2i + o2;
1045         int   oh  = o2i - o1i;
1046
1047         float *k  = engine->kernel->lookup;
1048         //float kw  = engine->kernel->width;
1049         //int   kn  = engine->kernel->n;
1050         int   kd = engine->kd;
1051
1052         int *lookup_sx0 = engine->lookup_sx0;
1053         int *lookup_sx1 = engine->lookup_sx1;
1054         int *lookup_sk = engine->lookup_sk;
1055         //float *lookup_wacc = engine->lookup_wacc;
1056
1057         switch( mode ) {
1058         case TRANSFER_NORMAL:           XBLEND_SAMPLE(NORMAL);
1059         case TRANSFER_ADDITION:         XBLEND_SAMPLE(ADDITION);
1060         case TRANSFER_SUBTRACT:         XBLEND_SAMPLE(SUBTRACT);
1061         case TRANSFER_MULTIPLY:         XBLEND_SAMPLE(MULTIPLY);
1062         case TRANSFER_DIVIDE:           XBLEND_SAMPLE(DIVIDE);
1063         case TRANSFER_REPLACE:          XBLEND_SAMPLE(REPLACE);
1064         case TRANSFER_MAX:              XBLEND_SAMPLE(MAX);
1065         case TRANSFER_MIN:              XBLEND_SAMPLE(MIN);
1066         case TRANSFER_AVERAGE:          XBLEND_SAMPLE(AVERAGE);
1067         case TRANSFER_DARKEN:           XBLEND_SAMPLE(DARKEN);
1068         case TRANSFER_LIGHTEN:          XBLEND_SAMPLE(LIGHTEN);
1069         case TRANSFER_DST:              XBLEND_SAMPLE(DST);
1070         case TRANSFER_DST_ATOP:         XBLEND_SAMPLE(DST_ATOP);
1071         case TRANSFER_DST_IN:           XBLEND_SAMPLE(DST_IN);
1072         case TRANSFER_DST_OUT:          XBLEND_SAMPLE(DST_OUT);
1073         case TRANSFER_DST_OVER:         XBLEND_SAMPLE(DST_OVER);
1074         case TRANSFER_SRC:              XBLEND_SAMPLE(SRC);
1075         case TRANSFER_SRC_ATOP:         XBLEND_SAMPLE(SRC_ATOP);
1076         case TRANSFER_SRC_IN:           XBLEND_SAMPLE(SRC_IN);
1077         case TRANSFER_SRC_OUT:          XBLEND_SAMPLE(SRC_OUT);
1078         case TRANSFER_SRC_OVER:         XBLEND_SAMPLE(SRC_OVER);
1079         case TRANSFER_OR:               XBLEND_SAMPLE(OR);
1080         case TRANSFER_XOR:              XBLEND_SAMPLE(XOR);
1081         }
1082 }
1083
1084
1085 SampleEngine::SampleEngine(int cpus)
1086  : LoadServer(cpus, cpus)
1087 {
1088         lookup_sx0 = 0;
1089         lookup_sx1 = 0;
1090         lookup_sk = 0;
1091         lookup_wacc = 0;
1092         kd = 0;
1093 }
1094
1095 SampleEngine::~SampleEngine()
1096 {
1097         if(lookup_sx0) delete [] lookup_sx0;
1098         if(lookup_sx1) delete [] lookup_sx1;
1099         if(lookup_sk) delete [] lookup_sk;
1100         if(lookup_wacc) delete [] lookup_wacc;
1101 }
1102
1103 /*
1104  * unlike the Direct and NN engines, the Sample engine works across
1105  * output columns (it makes for more economical memory addressing
1106  * during convolution)
1107  */
1108 void SampleEngine::init_packages()
1109 {
1110         int   iw  = input->get_w();
1111         int   i1i = floor(in1);
1112         int   i2i = ceil(in2);
1113         float i1f = 1.f - in1 + i1i;
1114         float i2f = 1.f - i2i + in2;
1115
1116         int   oy  = floor(out1);
1117         float oyf = out1 - oy;
1118         int   oh  = ceil(out2) - oy;
1119
1120         float *k  = kernel->lookup;
1121         float kw  = kernel->width;
1122         int   kn  = kernel->n;
1123
1124         if(in2 - in1 <= 0 || out2 - out1 <= 0)
1125                 return;
1126
1127         /* determine kernel spatial coverage */
1128         float scale = (out2 - out1) / (in2 - in1);
1129         float iscale = (in2 - in1) / (out2 - out1);
1130         float coverage = fabs(1.f / scale);
1131         float bound = (coverage < 1.f ? kw : kw * coverage) - (.5f / TRANSFORM_SPP);
1132         float coeff = (coverage < 1.f ? 1.f : scale) * TRANSFORM_SPP;
1133
1134         delete [] lookup_sx0;
1135         delete [] lookup_sx1;
1136         delete [] lookup_sk;
1137         delete [] lookup_wacc;
1138
1139         lookup_sx0 = new int[oh];
1140         lookup_sx1 = new int[oh];
1141         lookup_sk = new int[oh];
1142         lookup_wacc = new float[oh];
1143
1144         kd = (double)coeff * (1 << INDEX_FRACTION) + .5;
1145
1146         /* precompute kernel values and weight sums */
1147         for(int i = 0; i < oh; i++) {
1148                 /* map destination back to source */
1149                 double sx = (i - oyf + .5) * iscale + in1 - .5;
1150
1151                 /*
1152                  * clip iteration to source area but not source plane. Points
1153                  * outside the source plane count as transparent. Points outside
1154                  * the source area don't count at all.  The actual convolution
1155                  * later will be clipped to both, but we need to compute
1156                  * weights.
1157                  */
1158                 int sx0 = MAX((int)floor(sx - bound) + 1, i1i);
1159                 int sx1 = MIN((int)ceil(sx + bound), i2i);
1160                 int ki = (double)(sx0 - sx) * coeff * (1 << INDEX_FRACTION)
1161                                 + (1 << (INDEX_FRACTION - 1)) + .5;
1162                 float wacc=0.;
1163
1164                 lookup_sx0[i] = -1;
1165                 lookup_sx1[i] = -1;
1166
1167                 for(int j= sx0; j < sx1; j++) {
1168                         int kv = (ki >> INDEX_FRACTION);
1169                         if(kv > kn) break;
1170                         if(kv >= -kn) {
1171                                 /*
1172                                  * the contribution of the first and last input pixel (if
1173                                  * fractional) are linearly weighted by the fraction
1174                                  */
1175                                 if(j == i1i)
1176                                         wacc += k[abs(kv)] * i1f;
1177                                 else if(j + 1 == i2i)
1178                                         wacc += k[abs(kv)] * i2f;
1179                                 else
1180                                         wacc += k[abs(kv)];
1181
1182                                 /* this is where we clip the kernel convolution to the source plane */
1183                                 if(j >= 0 && j < iw) {
1184                                         if(lookup_sx0[i] == -1) {
1185                                                 lookup_sx0[i] = j;
1186                                                 lookup_sk[i] = ki;
1187                                         }
1188                                         lookup_sx1[i] = j + 1;
1189                                 }
1190                         }
1191                         ki += kd;
1192                 }
1193                 lookup_wacc[i] = wacc > 0. ? 1. / wacc : 0.;
1194         }
1195
1196         int cols = col_out2 - col_out1;
1197         int pkgs = get_total_packages();
1198         int col1 = col_out1, col2 = col1;
1199         for(int i = 0; i < pkgs; col1=col2 ) {
1200                 SamplePackage *package = (SamplePackage*)get_package(i);
1201                 col2 = ++i * cols / pkgs + col_out1;
1202                 package->out_col1 = col1;
1203                 package->out_col2 = col2;
1204         }
1205 }
1206
1207 LoadClient* SampleEngine::new_client()
1208 {
1209         return new SampleUnit(this);
1210 }
1211
1212 LoadPackage* SampleEngine::new_package()
1213 {
1214         return new SamplePackage;
1215 }