cinelerra-5.1/cinelerra/overlayframe.C

   1
   2 /*
   3  * CINELERRA
   4  * Copyright (C) 2008 Adam Williams <broadcast at earthling dot net>
   5  * Copyright (C) 2012 Monty <monty@xiph.org>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  */
  22
  23 #include <math.h>
  24 #include <stdio.h>
  25 #include <string.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <unistd.h>
  29
  30 #include "clip.h"
  31 #include "edl.inc"
  32 #include "mutex.h"
  33 #include "overlayframe.h"
  34 #include "units.h"
  35 #include "vframe.h"
  36
  37 static inline int   mabs(int32_t v) { return abs(v); }
  38 static inline int   mabs(int64_t v) { return llabs(v); }
  39 static inline float mabs(float v)   { return fabsf(v); }
  40
  41 static inline int32_t aclip(int32_t v, int mx) {
  42         return v < 0 ? 0 : v > mx ? mx : v;
  43 }
  44 static inline int64_t aclip(int64_t v, int mx) {
  45         return v < 0 ? 0 : v > mx ? mx : v;
  46 }
  47 static inline float   aclip(float v, float mx) {
  48         return v < 0 ? 0 : v > mx ? mx : v;
  49 }
  50 static inline float   aclip(float v, int mx) {
  51         return v < 0 ? 0 : v > mx ? mx : v;
  52 }
  53 static inline int   aclip(int v, float mx) {
  54         return v < 0 ? 0 : v > mx ? mx : v;
  55 }
  56 static inline int32_t cclip(int32_t v, int mx) {
  57         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  58 }
  59 static inline int64_t cclip(int64_t v, int mx) {
  60         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  61 }
  62 static inline float   cclip(float v, float mx) {
  63         return v > (mx/=2) ? mx : v < (mx=(-mx)) ? mx : v;
  64 }
  65 static inline float   cclip(float v, int mx) {
  66         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  67 }
  68 static inline int   cclip(int v, float mx) {
  69         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  70 }
  71
  72 /*
 * New resampler code; replace the original somewhat blurry engine
  74  * with a fairly standard kernel resampling core.  This could be used
  75  * for full affine transformation but only implements scale/translate.
  76  * Mostly reuses the old blending macro code.
  77  *
  78  * Pixel convention:
  79  *
  80  *  1) Pixels are points, not areas or squares.
  81  *
  82  *  2) To maintain the usual edge and scaling conventions, pixels are
  83  *     set inward from the image edge, eg, the left edge of an image is
  84  *     at pixel location x=-.5, not x=0.  Although pixels are not
  85  *     squares, the usual way of stating this is 'the pixel is located
  86  *     at the center of its square'.
  87  *
  88  *  3) Because of 1 and 2, we must truncate and weight the kernel
  89  *     convolution at the edge of the input area.  Otherwise, all
  90  *     resampled areas would be bordered by a transparency halo. E.g.
  91  *     in the old engine, upsampling HDV to 1920x1080 results in the
  92  *     left and right edges being partially transparent and underlying
  93  *     layers shining through.
  94  *
  95  *   4) The contribution of fractional pixels at the edges of input
  96  *     ranges are weighted according to the fraction.  Note that the
  97  *     kernel weighting is adjusted, not the opacity.  This is one
  98  *     exception to 'pixels have no area'.
  99  *
 100  *  5) The opacity of fractional pixels at the edges of the output
 101  *     range is adjusted according to the fraction. This is the other
 102  *     exception to 'pixels have no area'.
 103  *
 104  * Fractional alpha blending has been modified across the board from:
 105  *    output_alpha = input_alpha > output_alpha ? input_alpha : output_alpha;
 106  *  to:
 107  *    output_alpha = output_alpha + ((max - output_alpha) * input_alpha) / max;
 108  */
 109
 110 #define TRANSFORM_SPP    (4096)    /* number of data pts per unit x in lookup table */
 111 #define INDEX_FRACTION   (8)       /* bits of fraction past TRANSFORM_SPP on kernel
 112                                       index accumulation */
 113 #define TRANSFORM_MIN    (.5 / TRANSFORM_SPP)
 114
 115 /* Sinc needed for Lanczos kernel */
 116 static float sinc(const float x)
 117 {
 118         float y = x * M_PI;
 119
 120         if(fabsf(x) < TRANSFORM_MIN)
 121                 return 1.0f;
 122
 123         return sinf(y) / y;
 124 }
 125
 126 /*
 127  * All resampling (except Nearest Neighbor) is performed via
 *   transformed 2D resampling kernels built from 1D lookups.
 129  */
 130 OverlayKernel::OverlayKernel(int interpolation_type)
 131 {
 132         int i;
 133         this->type = interpolation_type;
 134
 135         switch(interpolation_type)
 136         {
 137         case BILINEAR:
 138                 width = 1.f;
 139                 lookup = new float[(n = TRANSFORM_SPP) + 1];
 140                 for (i = 0; i <= TRANSFORM_SPP; i++)
 141                         lookup[i] = (float)(TRANSFORM_SPP - i) / TRANSFORM_SPP;
 142                 break;
 143
 144         /* Use a Catmull-Rom filter (not b-spline) */
 145         case BICUBIC:
 146                 width = 2.;
 147                 lookup = new float[(n = 2 * TRANSFORM_SPP) + 1];
 148                 for(i = 0; i <= TRANSFORM_SPP; i++) {
 149                         float x = i / (float)TRANSFORM_SPP;
 150                         lookup[i] = 1.f - 2.5f * x * x + 1.5f * x * x * x;
 151                 }
 152                 for(; i <= 2 * TRANSFORM_SPP; i++) {
 153                         float x = i / (float)TRANSFORM_SPP;
 154                         lookup[i] = 2.f - 4.f * x  + 2.5f * x * x - .5f * x * x * x;
 155                 }
 156                 break;
 157
 158         case LANCZOS:
 159                 width = 3.;
 160                 lookup = new float[(n = 3 * TRANSFORM_SPP) + 1];
 161                 for (i = 0; i <= 3 * TRANSFORM_SPP; i++)
 162                         lookup[i] = sinc((float)i / TRANSFORM_SPP) *
 163                                 sinc((float)i / TRANSFORM_SPP / 3.0f);
 164                 break;
 165
 166         default:
 167                 width = 0.;
 168                 lookup = 0;
 169                 n = 0;
 170                 break;
 171         }
 172 }
 173
 174 OverlayKernel::~OverlayKernel()
 175 {
 176         if(lookup) delete [] lookup;
 177 }
 178
 179 OverlayFrame::OverlayFrame(int cpus)
 180 {
 181         direct_engine = 0;
 182         nn_engine = 0;
 183         sample_engine = 0;
 184         temp_frame = 0;
 185         memset(kernel, 0, sizeof(kernel));
 186         this->cpus = cpus;
 187 }
 188
 189 OverlayFrame::~OverlayFrame()
 190 {
 191         if(temp_frame) delete temp_frame;
 192
 193         if(direct_engine) delete direct_engine;
 194         if(nn_engine) delete nn_engine;
 195         if(sample_engine) delete sample_engine;
 196
 197         if(kernel[NEAREST_NEIGHBOR]) delete kernel[NEAREST_NEIGHBOR];
 198         if(kernel[BILINEAR]) delete kernel[BILINEAR];
 199         if(kernel[BICUBIC]) delete kernel[BICUBIC];
 200         if(kernel[LANCZOS]) delete kernel[LANCZOS];
 201 }
 202
 203 static float epsilon_snap(float f)
 204 {
 205         return rintf(f * 1024) / 1024.;
 206 }
 207
 208 int OverlayFrame::overlay(VFrame *output, VFrame *input,
 209         float in_x1, float in_y1, float in_x2, float in_y2,
 210         float out_x1, float out_y1, float out_x2, float out_y2,
 211         float alpha, int mode, int interpolation_type)
 212 {
 213         in_x1 = epsilon_snap(in_x1);
 214         in_x2 = epsilon_snap(in_x2);
 215         in_y1 = epsilon_snap(in_y1);
 216         in_y2 = epsilon_snap(in_y2);
 217         out_x1 = epsilon_snap(out_x1);
 218         out_x2 = epsilon_snap(out_x2);
 219         out_y1 = epsilon_snap(out_y1);
 220         out_y2 = epsilon_snap(out_y2);
 221
 222         if (isnan(in_x1) || isnan(in_x2) ||
 223                 isnan(in_y1) || isnan(in_y2) ||
 224                 isnan(out_x1) || isnan(out_x2) ||
 225                 isnan(out_y1) || isnan(out_y2)) return 1;
 226
 227         if(in_x1 < 0) in_x1 = 0;
 228         if(in_y1 < 0) in_y1 = 0;
 229         if(in_x2 > input->get_w()) in_x2 = input->get_w();
 230         if(in_y2 > input->get_h()) in_y2 = input->get_h();
 231         if(out_x1 < 0) out_x1 = 0;
 232         if(out_y1 < 0) out_y1 = 0;
 233         if(out_x2 > output->get_w()) out_x2 = output->get_w();
 234         if(out_y2 > output->get_h()) out_y2 = output->get_h();
 235
 236         float xscale = (out_x2 - out_x1) / (in_x2 - in_x1);
 237         float yscale = (out_y2 - out_y1) / (in_y2 - in_y1);
 238
 239         /* don't interpolate integer translations, or scale no-ops */
 240         if(xscale == 1. && yscale == 1. &&
 241                 (int)in_x1 == in_x1 && (int)in_x2 == in_x2 &&
 242                 (int)in_y1 == in_y1 && (int)in_y2 == in_y2 &&
 243                 (int)out_x1 == out_x1 && (int)out_x2 == out_x2 &&
 244                 (int)out_y1 == out_y1 && (int)out_y2 == out_y2) {
 245                 if(!direct_engine) direct_engine = new DirectEngine(cpus);
 246
 247                 direct_engine->output = output;   direct_engine->input = input;
 248                 direct_engine->in_x1 = in_x1;     direct_engine->in_y1 = in_y1;
 249                 direct_engine->out_x1 = out_x1;   direct_engine->out_x2 = out_x2;
 250                 direct_engine->out_y1 = out_y1;   direct_engine->out_y2 = out_y2;
 251                 direct_engine->alpha = alpha;     direct_engine->mode = mode;
 252                 direct_engine->process_packages();
 253         }
 254         else if(interpolation_type == NEAREST_NEIGHBOR) {
 255                 if(!nn_engine) nn_engine = new NNEngine(cpus);
 256                 nn_engine->output = output;       nn_engine->input = input;
 257                 nn_engine->in_x1 = in_x1;         nn_engine->in_x2 = in_x2;
 258                 nn_engine->in_y1 = in_y1;         nn_engine->in_y2 = in_y2;
 259                 nn_engine->out_x1 = out_x1;       nn_engine->out_x2 = out_x2;
 260                 nn_engine->out_y1 = out_y1;       nn_engine->out_y2 = out_y2;
 261                 nn_engine->alpha = alpha;         nn_engine->mode = mode;
 262                 nn_engine->process_packages();
 263         }
 264         else {
 265                 int xtype = BILINEAR;
 266                 int ytype = BILINEAR;
 267
 268                 switch(interpolation_type)
 269                 {
 270                 case CUBIC_CUBIC: // Bicubic enlargement and reduction
 271                         xtype = ytype = BICUBIC;
 272                         break;
 273                 case CUBIC_LINEAR: // Bicubic enlargement and bilinear reduction
 274                         xtype = xscale > 1. ? BICUBIC : BILINEAR;
 275                         ytype = yscale > 1. ? BICUBIC : BILINEAR;
 276                         break;
 277                 case LINEAR_LINEAR: // Bilinear enlargement and bilinear reduction
 278                         xtype = ytype = BILINEAR;
 279                         break;
 280                 case LANCZOS_LANCZOS: // Because we can
 281                         xtype = ytype = LANCZOS;
 282                         break;
 283                 }
 284
 285                 if(xscale == 1. && (int)in_x1 == in_x1 && (int)in_x2 == in_x2 &&
 286                                 (int)out_x1 == out_x1 && (int)out_x2 == out_x2)
 287                         xtype = DIRECT_COPY;
 288
 289                 if(yscale == 1. && (int)in_y1 == in_y1 && (int)in_y2 == in_y2 &&
 290                                 (int)out_y1 == out_y1 && (int)out_y2 == out_y2)
 291                         ytype = DIRECT_COPY;
 292
 293                 if(!kernel[xtype])
 294                         kernel[xtype] = new OverlayKernel(xtype);
 295                 if(!kernel[ytype])
 296                         kernel[ytype] = new OverlayKernel(ytype);
 297
 298 /*
 299  * horizontal and vertical are separately resampled.  First we
 300  * resample the input along X into a transposed, temporary frame,
 301  * then resample/transpose the temporary space along X into the
 302  * output.  Fractional pixels along the edge are handled in the X
 303  * direction of each step
 304  */
 305                 // resampled dimension matches the transposed output space
 306                 float temp_y1 = out_x1 - floor(out_x1);
 307                 float temp_y2 = temp_y1 + (out_x2 - out_x1);
 308                 int temp_h = ceil(temp_y2);
 309
 310                 // non-resampled dimension merely cropped
 311                 float temp_x1 = in_y1 - floor(in_y1);
 312                 float temp_x2 = temp_x1 + (in_y2 - in_y1);
 313                 int temp_w = ceil(temp_x2);
 314
 315                 if( temp_frame &&
 316                    (temp_frame->get_color_model() != input->get_color_model() ||
 317                     temp_frame->get_w() != temp_w || temp_frame->get_h() != temp_h) ) {
 318                         delete temp_frame;
 319                         temp_frame = 0;
 320                 }
 321
 322                 if(!temp_frame) {
 323                         temp_frame = new VFrame(0, -1, temp_w, temp_h,
 324                                 input->get_color_model(), -1);
 325                 }
 326
 327                 temp_frame->clear_frame();
 328
 329                 if(!sample_engine) sample_engine = new SampleEngine(cpus);
 330
 331                 sample_engine->output = temp_frame;
 332                 sample_engine->input = input;
 333                 sample_engine->kernel = kernel[xtype];
 334                 sample_engine->col_out1 = 0;
 335                 sample_engine->col_out2 = temp_w;
 336                 sample_engine->row_in = floor(in_y1);
 337
 338                 sample_engine->in1 = in_x1;
 339                 sample_engine->in2 = in_x2;
 340                 sample_engine->out1 = temp_y1;
 341                 sample_engine->out2 = temp_y2;
 342                 sample_engine->alpha = 1.;
 343                 sample_engine->mode = TRANSFER_REPLACE;
 344                 sample_engine->process_packages();
 345
 346                 sample_engine->output = output;
 347                 sample_engine->input = temp_frame;
 348                 sample_engine->kernel = kernel[ytype];
 349                 sample_engine->col_out1 = floor(out_x1);
 350                 sample_engine->col_out2 = ceil(out_x2);
 351                 sample_engine->row_in = 0;
 352
 353                 sample_engine->in1 = temp_x1;
 354                 sample_engine->in2 = temp_x2;
 355                 sample_engine->out1 = out_y1;
 356                 sample_engine->out2 = out_y2;
 357                 sample_engine->alpha = alpha;
 358                 sample_engine->mode = mode;
 359                 sample_engine->process_packages();
 360         }
 361         return 0;
 362 }
 363
 364 // NORMAL       [Sa + Da * (1 - Sa), Sc * Sa + Dc * (1 - Sa)])
 365 #define ALPHA_NORMAL(mx, Sa, Da) (Sa + (Da * (mx - Sa)) / mx)
 366 #define COLOR_NORMAL(mx, Sc, Sa, Dc, Da) ((Sc * Sa + Dc * (mx - Sa)) / mx)
 367 #define CHROMA_NORMAL COLOR_NORMAL
 368
 369 // ADDITION     [(Sa + Da), (Sc + Dc)]
 370 #define ALPHA_ADDITION(mx, Sa, Da) (Sa + Da)
 371 #define COLOR_ADDITION(mx, Sc, Sa, Dc, Da) (Sc + Dc)
 372 #define CHROMA_ADDITION(mx, Sc, Sa, Dc, Da) (Sc + Dc)
 373
 374 // SUBTRACT     [(Sa - Da), (Sc - Dc)]
 375 #define ALPHA_SUBTRACT(mx, Sa, Da) (Sa - Da)
 376 #define COLOR_SUBTRACT(mx, Sc, Sa, Dc, Da) (Sc - Dc)
 377 #define CHROMA_SUBTRACT(mx, Sc, Sa, Dc, Da) (Sc - Dc)
 378
 379 // MULTIPLY     [(Sa * Da), Sc * Dc]
 380 #define ALPHA_MULTIPLY(mx, Sa, Da) ((Sa * Da) / mx)
 381 #define COLOR_MULTIPLY(mx, Sc, Sa, Dc, Da) ((Sc * Dc) / mx)
 382 #define CHROMA_MULTIPLY(mx, Sc, Sa, Dc, Da) ((Sc * Dc) / mx)
 383
 384 // DIVIDE       [(Sa / Da), (Sc / Dc)]
 385 #define ALPHA_DIVIDE(mx, Sa, Da) (Da ? ((Sa * mx) / Da) : mx)
 386 #define COLOR_DIVIDE(mx, Sc, Sa, Dc, Da) (Dc ? ((Sc * mx) / Dc) : mx)
 387 #define CHROMA_DIVIDE(mx, Sc, Sa, Dc, Da) (Dc ? ((Sc * mx) / Dc) : mx)
 388
 389 // REPLACE      [Sa, Sc] (fade = 1)
 390 #define ALPHA_REPLACE(mx, Sa, Da) Sa
 391 #define COLOR_REPLACE(mx, Sc, Sa, Dc, Da) Sc
 392 #define CHROMA_REPLACE COLOR_REPLACE
 393
 394 // MAX          [max(Sa, Da), MAX(Sc, Dc)]
 395 #define ALPHA_MAX(mx, Sa, Da) (Sa > Da ? Sa : Da)
 396 #define COLOR_MAX(mx, Sc, Sa, Dc, Da) (Sc > Dc ? Sc : Dc)
 397 #define CHROMA_MAX(mx, Sc, Sa, Dc, Da) (mabs(Sc) > mabs(Dc) ? Sc : Dc)
 398
 399 // MIN          [min(Sa, Da), MIN(Sc, Dc)]
 400 #define ALPHA_MIN(mx, Sa, Da) (Sa < Da ? Sa : Da)
 401 #define COLOR_MIN(mx, Sc, Sa, Dc, Da) (Sc < Dc ? Sc : Dc)
 402 #define CHROMA_MIN(mx, Sc, Sa, Dc, Da) (mabs(Sc) < mabs(Dc) ? Sc : Dc)
 403
 404 // AVERAGE      [(Sa + Da) * 0.5, (Sc + Dc) * 0.5]
 405 #define ALPHA_AVERAGE(mx, Sa, Da) ((Sa + Da) / 2)
 406 #define COLOR_AVERAGE(mx, Sc, Sa, Dc, Da) ((Sc + Dc) / 2)
 407 #define CHROMA_AVERAGE COLOR_AVERAGE
 408
 409 // DARKEN       [Sa + Da - Sa*Da, Sc*(1 - Da) + Dc*(1 - Sa) + min(Sc, Dc)]
 410 #define ALPHA_DARKEN(mx, Sa, Da) (Sa + Da - (Sa * Da) / mx)
 411 #define COLOR_DARKEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (Sc < Dc ? Sc : Dc))
 412 #define CHROMA_DARKEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (mabs(Sc) < mabs(Dc) ? Sc : Dc))
 413
 414 // LIGHTEN      [Sa + Da - Sa*Da, Sc*(1 - Da) + Dc*(1 - Sa) + max(Sc, Dc)]
 415 #define ALPHA_LIGHTEN(mx, Sa, Da) (Sa + Da - Sa * Da / mx)
 416 #define COLOR_LIGHTEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (Sc > Dc ? Sc : Dc))
 417 #define CHROMA_LIGHTEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (mabs(Sc) > mabs(Dc) ? Sc : Dc))
 418
 419 // DST          [Da, Dc]
 420 #define ALPHA_DST(mx, Sa, Da) Da
 421 #define COLOR_DST(mx, Sc, Sa, Dc, Da) Dc
 422 #define CHROMA_DST COLOR_DST
 423
 424 // DST_ATOP     [Sa, Sc * (1 - Da) + Dc * Sa]
 425 #define ALPHA_DST_ATOP(mx, Sa, Da) Sa
 426 #define COLOR_DST_ATOP(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * Sa) / mx)
 427 #define CHROMA_DST_ATOP COLOR_DST_ATOP
 428
 429 // DST_IN       [Da * Sa, Dc * Sa]
 430 #define ALPHA_DST_IN(mx, Sa, Da) ((Da * Sa) / mx)
 431 #define COLOR_DST_IN(mx, Sc, Sa, Dc, Da) ((Dc * Sa) / mx)
 432 #define CHROMA_DST_IN COLOR_DST_IN
 433
 434 // DST_OUT      [Da * (1 - Sa), Dc * (1 - Sa)]
 435 #define ALPHA_DST_OUT(mx, Sa, Da) (Da * (mx - Sa) / mx)
 436 #define COLOR_DST_OUT(mx, Sc, Sa, Dc, Da) (Dc * (mx - Sa) / mx)
 437 #define CHROMA_DST_OUT COLOR_DST_OUT
 438
 439 // DST_OVER     [Sa * (1 - Da) + Da, Sc * (1 - Da) + Dc]
 440 #define ALPHA_DST_OVER(mx, Sa, Da) ((Sa * (mx - Da)) / mx + Da)
 441 #define COLOR_DST_OVER(mx, Sc, Sa, Dc, Da) (Sc * (mx - Da)/ mx + Dc)
 442 #define CHROMA_DST_OVER COLOR_DST_OVER
 443
 444 // SRC                  [Sa, Sc]
 445 #define ALPHA_SRC(mx, Sa, Da) Sa
 446 #define COLOR_SRC(mx, Sc, Sa, Dc, Da) Sc
 447 #define CHROMA_SRC COLOR_SRC
 448
 449 // SRC_ATOP     [Da, Sc * Da + Dc * (1 - Sa)]
 450 #define ALPHA_SRC_ATOP(mx, Sa, Da) Da
 451 #define COLOR_SRC_ATOP(mx, Sc, Sa, Dc, Da) ((Sc * Da + Dc * (mx - Sa)) / mx)
 452 #define CHROMA_SRC_ATOP COLOR_SRC_ATOP
 453
 454 // SRC_IN       [Sa * Da, Sc * Da]
 455 #define ALPHA_SRC_IN(mx, Sa, Da) ((Sa * Da) / mx)
 456 #define COLOR_SRC_IN(mx, Sc, Sa, Dc, Da) (Sc * Da / mx)
 457 #define CHROMA_SRC_IN COLOR_SRC_IN
 458
 459 // SRC_OUT      [Sa * (1 - Da), Sc * (1 - Da)]
 460 #define ALPHA_SRC_OUT(mx, Sa, Da) (Sa * (mx - Da) / mx)
 461 #define COLOR_SRC_OUT(mx, Sc, Sa, Dc, Da) (Sc * (mx - Da) / mx)
 462 #define CHROMA_SRC_OUT COLOR_SRC_OUT
 463
 464 // SRC_OVER     [Sa + Da * (1 - Sa), Sc + (1 - Sa) * Dc]
 465 #define ALPHA_SRC_OVER(mx, Sa, Da) (Sa + Da * (mx - Sa) / mx)
 466 #define COLOR_SRC_OVER(mx, Sc, Sa, Dc, Da) (Sc + Dc * (mx - Sa) / mx)
 467 #define CHROMA_SRC_OVER COLOR_SRC_OVER
 468
 469 // OR   [Sa + Da - Sa * Da, Sc + Dc - Sc * Dc]
 470 #define ALPHA_OR(mx, Sa, Da) (Sa + Da - (Sa * Da) / mx)
 471 #define COLOR_OR(mx, Sc, Sa, Dc, Da) (Sc + Dc - (Sc * Dc) / mx)
 472 #define CHROMA_OR COLOR_OR
 473
 474 // XOR          [Sa * (1 - Da) + Da * (1 - Sa), Sc * (1 - Da) + Dc * (1 - Sa)]
 475 #define ALPHA_XOR(mx, Sa, Da) ((Sa * (mx - Da) + Da * (mx - Sa)) / mx)
 476 #define COLOR_XOR(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx)
 477 #define CHROMA_XOR COLOR_XOR
 478
/*
 * ZTYP(ty) declares z_<ty> as an alias of <ty>, tagged __unused__ so an
 * alias that ends up unreferenced does not produce a warning.  The z_ names
 * are what the XBLEND* tables pass as their temp_type / type arguments.
 */
#define ZTYP(ty) typedef ty z_##ty __attribute__ ((__unused__))
ZTYP(int8_t);   ZTYP(uint8_t);
ZTYP(int16_t);  ZTYP(uint16_t);
ZTYP(int32_t);  ZTYP(uint32_t);
ZTYP(int64_t);  ZTYP(uint64_t);
ZTYP(float);    ZTYP(double);
 485
/*
 * ALPHA3_BLEND: blend one 3-component pixel with mode FN.
 *   typ       accumulator type for the arithmetic
 *   inp, out  pointers to the source / destination pixel
 *   mx        full-scale value; also used as the alpha of both pixels,
 *             since neither has an alpha channel
 *   ofs       chroma offset removed from components 1 and 2; when nonzero
 *             those components use CHROMA_FN instead of COLOR_FN
 *   rnd       not used
 * Declares inp0..inp3 and out0..out3 in the enclosing scope and assigns
 * r, g, b, which the caller must already have declared.
 */
#define ALPHA3_BLEND(FN, typ, inp, out, mx, ofs, rnd) \
  typ inp0 = (typ)inp[0], inp1 = (typ)inp[1] - ofs; \
  typ inp2 = (typ)inp[2] - ofs, inp3 = mx; \
  typ out0 = (typ)out[0], out1 = (typ)out[1] - ofs; \
  typ out2 = (typ)out[2] - ofs, out3 = mx; \
  r = COLOR_##FN(mx, inp0, inp3, out0, out3); \
  if( ofs ) { \
    g = CHROMA_##FN(mx, inp1, inp3, out1, out3); \
    b = CHROMA_##FN(mx, inp2, inp3, out2, out3); \
  } \
  else { \
    g = COLOR_##FN(mx, inp1, inp3, out1, out3); \
    b = COLOR_##FN(mx, inp2, inp3, out2, out3); \
  }

/*
 * ALPHA4_BLEND: as ALPHA3_BLEND, but for 4-component pixels.  The alpha of
 * each pixel is read from its component 3, and the blended alpha is
 * assigned to a (declared by the caller) through ALPHA_FN.  The expansion
 * ends without a semicolon; the invocation supplies it.
 */
#define ALPHA4_BLEND(FN, typ, inp, out, mx, ofs, rnd) \
  typ inp0 = (typ)inp[0], inp1 = (typ)inp[1] - ofs; \
  typ inp2 = (typ)inp[2] - ofs, inp3 = inp[3]; \
  typ out0 = (typ)out[0], out1 = (typ)out[1] - ofs; \
  typ out2 = (typ)out[2] - ofs, out3 = out[3]; \
  r = COLOR_##FN(mx, inp0, inp3, out0, out3); \
  if( ofs ) { \
    g = CHROMA_##FN(mx, inp1, inp3, out1, out3); \
    b = CHROMA_##FN(mx, inp2, inp3, out2, out3); \
  } \
  else { \
    g = COLOR_##FN(mx, inp1, inp3, out1, out3); \
    b = COLOR_##FN(mx, inp2, inp3, out2, out3); \
  } \
  a = ALPHA_##FN(mx, inp3, out3)
 516
/*
 * ALPHA_STORE: write r, g, b to components 0..2 of out, adding the chroma
 * offset ofs back onto components 1 and 2.  mx is not used.
 */
#define ALPHA_STORE(out, ofs, mx) \
  out[0] = r; \
  out[1] = g + ofs; \
  out[2] = b + ofs

/*
 * ALPHA3_STORE: finish a 3-component blend started by ALPHA3_BLEND.
 * r is clipped to [0, mx]; g and b are clipped with cclip() when ofs is
 * nonzero and aclip() otherwise.  When trnsp is nonzero the result is then
 * mixed with the pre-blend destination values out0..out2, weighted
 * opcty : trnsp.  opcty and trnsp must be in scope (XBLEND and XBLEND_3NN
 * declare them).
 */
#define ALPHA3_STORE(out, ofs, mx) \
  r = aclip(r, mx); \
  g = ofs ? cclip(g, mx) : aclip(g, mx); \
  b = ofs ? cclip(b, mx) : aclip(b, mx); \
  if( trnsp ) { \
    r = (r * opcty + out0 * trnsp) / mx; \
    g = (g * opcty + out1 * trnsp) / mx; \
    b = (b * opcty + out2 * trnsp) / mx; \
  } \
  ALPHA_STORE(out, ofs, mx)

/*
 * ALPHA4_STORE: as ALPHA3_STORE for 4-component pixels.  Alpha takes part
 * in the opcty : trnsp mix against out3 and is clipped to [0, mx] only as
 * it is written to component 3.
 */
#define ALPHA4_STORE(out, ofs, mx) \
  r = aclip(r, mx); \
  g = ofs ? cclip(g, mx) : aclip(g, mx); \
  b = ofs ? cclip(b, mx) : aclip(b, mx); \
  if( trnsp ) { \
    r = (r * opcty + out0 * trnsp) / mx; \
    g = (g * opcty + out1 * trnsp) / mx; \
    b = (b * opcty + out2 * trnsp) / mx; \
    a = (a * opcty + out3 * trnsp) / mx; \
  } \
  ALPHA_STORE(out, ofs, mx); \
  out[3] = aclip(a, mx)
 545
/*
 * XBLEND: unscaled row-by-row blend of input onto output with mode FN, for
 * one pixel layout.
 *   temp_type      accumulator type
 *   type           stored component type
 *   max            full-scale component value
 *   components     components per pixel (3 or 4)
 *   chroma_offset  offset of components 1 and 2 (0 for none)
 *   round          added to fade * max when computing opcty
 * Uses these names from the expansion site: fade, output and input (VFrame
 * pointers), pkg (row range), ix, ox, iy, ow.  ix and ox are scaled by
 * components in place, so they are changed for the caller too.  Inside the
 * row loop a local pointer named output hides the VFrame pointer.
 * The expansion is a braced block ending in break, so it can directly
 * follow a case label.
 */
#define XBLEND(FN, temp_type, type, max, components, chroma_offset, round) { \
        temp_type opcty = fade * max + round, trnsp = max - opcty; \
        type** output_rows = (type**)output->get_rows(); \
        type** input_rows = (type**)input->get_rows(); \
        ix *= components;  ox *= components; \
 \
        for(int i = pkg->out_row1; i < pkg->out_row2; i++) { \
                type* in_row = input_rows[i + iy] + ix; \
                type* output = output_rows[i] + ox; \
                for(int j = 0; j < ow; j++) { \
                        if( components == 4 ) { \
                                temp_type r, g, b, a; \
                                ALPHA4_BLEND(FN, temp_type, in_row, output, max, chroma_offset, round); \
                                ALPHA4_STORE(output, chroma_offset, max); \
                        } \
                        else { \
                                temp_type r, g, b; \
                                ALPHA3_BLEND(FN, temp_type, in_row, output, max, chroma_offset, round); \
                                ALPHA3_STORE(output, chroma_offset, max); \
                        } \
                        in_row += components;  output += components; \
                } \
        } \
        break; \
}
 571
/*
 * XBLEND_ONLY: run XBLEND for mode FN with the parameters that match the
 * input frame's color model.  Table columns are: accumulator type, stored
 * type, full-scale value, component count, chroma offset, rounding term.
 * A color model that is not listed matches no case and nothing is drawn.
 * The trailing break leaves the switch this macro is expanded in.
 */
#define XBLEND_ONLY(FN) { \
        switch(input->get_color_model()) { \
        case BC_RGB_FLOAT:      XBLEND(FN, z_float,   z_float,    1.f,    3, 0,      0.f); \
        case BC_RGBA_FLOAT:     XBLEND(FN, z_float,   z_float,    1.f,    4, 0,      0.f); \
        case BC_RGB888:         XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
        case BC_YUV888:         XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
        case BC_RGBA8888:       XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
        case BC_YUVA8888:       XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
        case BC_RGB161616:      XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
        case BC_YUV161616:      XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
        case BC_RGBA16161616:   XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
        case BC_YUVA16161616:   XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
        } \
        break; \
}
 587
 588 /* Direct translate / blend **********************************************/
 589
 590 DirectPackage::DirectPackage()
 591 {
 592 }
 593
 594 DirectUnit::DirectUnit(DirectEngine *server)
 595  : LoadClient(server)
 596 {
 597         this->engine = server;
 598 }
 599
 600 DirectUnit::~DirectUnit()
 601 {
 602 }
 603
/*
 * Blend the output rows [pkg->out_row1, pkg->out_row2) of one package,
 * copying pixels 1:1 from the input with the engine's blend mode.
 * The local names below are referenced by the XBLEND macros and must keep
 * these spellings.
 */
void DirectUnit::process_package(LoadPackage *package)
{
        DirectPackage *pkg = (DirectPackage*)package;

        VFrame *output = engine->output;
        VFrame *input = engine->input;
        int mode = engine->mode;
        // && binds tighter than ?:, so fade is forced to 1 only when the
        // input has alpha and the mode is TRANSFER_REPLACE
        float fade =
                BC_CModels::has_alpha(input->get_color_model()) &&
                mode == TRANSFER_REPLACE ? 1.f : engine->alpha;

        int ix = engine->in_x1;                    // first input column
        int ox = engine->out_x1;                   // first output column
        int ow = engine->out_x2 - ox;              // columns to process
        int iy = engine->in_y1 - engine->out_y1;   // output row -> input row offset

        // each XBLEND_ONLY expansion ends in break; an unlisted mode draws nothing
        switch( mode ) {
        case TRANSFER_NORMAL:           XBLEND_ONLY(NORMAL);
        case TRANSFER_ADDITION:         XBLEND_ONLY(ADDITION);
        case TRANSFER_SUBTRACT:         XBLEND_ONLY(SUBTRACT);
        case TRANSFER_MULTIPLY:         XBLEND_ONLY(MULTIPLY);
        case TRANSFER_DIVIDE:           XBLEND_ONLY(DIVIDE);
        case TRANSFER_REPLACE:          XBLEND_ONLY(REPLACE);
        case TRANSFER_MAX:              XBLEND_ONLY(MAX);
        case TRANSFER_MIN:              XBLEND_ONLY(MIN);
        case TRANSFER_AVERAGE:          XBLEND_ONLY(AVERAGE);
        case TRANSFER_DARKEN:           XBLEND_ONLY(DARKEN);
        case TRANSFER_LIGHTEN:          XBLEND_ONLY(LIGHTEN);
        case TRANSFER_DST:              XBLEND_ONLY(DST);
        case TRANSFER_DST_ATOP:         XBLEND_ONLY(DST_ATOP);
        case TRANSFER_DST_IN:           XBLEND_ONLY(DST_IN);
        case TRANSFER_DST_OUT:          XBLEND_ONLY(DST_OUT);
        case TRANSFER_DST_OVER:         XBLEND_ONLY(DST_OVER);
        case TRANSFER_SRC:              XBLEND_ONLY(SRC);
        case TRANSFER_SRC_ATOP:         XBLEND_ONLY(SRC_ATOP);
        case TRANSFER_SRC_IN:           XBLEND_ONLY(SRC_IN);
        case TRANSFER_SRC_OUT:          XBLEND_ONLY(SRC_OUT);
        case TRANSFER_SRC_OVER:         XBLEND_ONLY(SRC_OVER);
        case TRANSFER_OR:               XBLEND_ONLY(OR);
        case TRANSFER_XOR:              XBLEND_ONLY(XOR);
        }
}
 646
 647 DirectEngine::DirectEngine(int cpus)
 648  : LoadServer(cpus, cpus)
 649 {
 650 }
 651
 652 DirectEngine::~DirectEngine()
 653 {
 654 }
 655
 656 void DirectEngine::init_packages()
 657 {
 658         if(in_x1 < 0) { out_x1 -= in_x1; in_x1 = 0; }
 659         if(in_y1 < 0) { out_y1 -= in_y1; in_y1 = 0; }
 660         if(out_x1 < 0) { in_x1 -= out_x1; out_x1 = 0; }
 661         if(out_y1 < 0) { in_y1 -= out_y1; out_y1 = 0; }
 662         if(out_x2 > output->get_w()) out_x2 = output->get_w();
 663         if(out_y2 > output->get_h()) out_y2 = output->get_h();
 664         int out_w = out_x2 - out_x1;
 665         int out_h = out_y2 - out_y1;
 666         if( !out_w || !out_h ) return;
 667
 668         int rows = out_h;
 669         int pkgs = get_total_packages();
 670         int row1 = out_y1, row2 = row1;
 671         for(int i = 0; i < pkgs; row1=row2 ) {
 672                 DirectPackage *package = (DirectPackage*)get_package(i);
 673                 row2 = ++i * rows / pkgs + out_y1;
 674                 package->out_row1 = row1;
 675                 package->out_row2 = row2;
 676         }
 677 }
 678
 679 LoadClient* DirectEngine::new_client()
 680 {
 681         return new DirectUnit(this);
 682 }
 683
 684 LoadPackage* DirectEngine::new_package()
 685 {
 686         return new DirectPackage;
 687 }
 688
 689 /* Nearest Neighbor scale / translate / blend ********************/
 690
/*
 * XBLEND_3NN: nearest-neighbor counterpart of XBLEND; the macro parameters
 * have the same meaning.  Source pixels are chosen through lookup tables:
 *   ly                   supplies the input row index for each output row
 *                        and is advanced once per row
 *   engine->in_lookup_x  supplies, for each output column, the amount added
 *                        to the running in_row pointer before that pixel
 *                        is read
 * Uses these names from the expansion site: fade, output and input (VFrame
 * pointers), pkg, engine, ly, ox, ow.  ox is scaled by components in place.
 * Inside the row loop a local pointer named output hides the VFrame pointer.
 * The expansion is a braced block ending in break.
 */
#define XBLEND_3NN(FN, temp_type, type, max, components, chroma_offset, round) { \
        temp_type opcty = fade * max + round, trnsp = max - opcty; \
        type** output_rows = (type**)output->get_rows(); \
        type** input_rows = (type**)input->get_rows(); \
        ox *= components; \
 \
        for(int i = pkg->out_row1; i < pkg->out_row2; i++) { \
                int *lx = engine->in_lookup_x; \
                type* in_row = input_rows[*ly++]; \
                type* output = output_rows[i] + ox; \
                for(int j = 0; j < ow; j++) { \
                        in_row += *lx++; \
                        if( components == 4 ) { \
                                temp_type r, g, b, a; \
                                ALPHA4_BLEND(FN, temp_type, in_row, output, max, chroma_offset, round); \
                                ALPHA4_STORE(output, chroma_offset, max); \
                        } \
                        else { \
                                temp_type r, g, b; \
                                ALPHA3_BLEND(FN, temp_type, in_row, output, max, chroma_offset, round); \
                                ALPHA3_STORE(output, chroma_offset, max); \
                        } \
                        output += components; \
                } \
        } \
        break; \
}
 718
/*
 * XBLEND_NN: run XBLEND_3NN for mode FN with the parameters that match the
 * input frame's color model.  Table columns are: accumulator type, stored
 * type, full-scale value, component count, chroma offset, rounding term.
 * A color model that is not listed matches no case and nothing is drawn.
 * The trailing break leaves the switch this macro is expanded in.
 */
#define XBLEND_NN(FN) { \
        switch(input->get_color_model()) { \
        case BC_RGB_FLOAT:      XBLEND_3NN(FN, z_float,   z_float,    1.f,    3, 0,       0.f); \
        case BC_RGBA_FLOAT:     XBLEND_3NN(FN, z_float,   z_float,    1.f,    4, 0,       0.f); \
        case BC_RGB888:         XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
        case BC_YUV888:         XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
        case BC_RGBA8888:       XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
        case BC_YUVA8888:       XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
        case BC_RGB161616:      XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
        case BC_YUV161616:      XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
        case BC_RGBA16161616:   XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
        case BC_YUVA16161616:   XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
        } \
        break; \
}
 734
 735 NNPackage::NNPackage()
 736 {
 737 }
 738
 739 NNUnit::NNUnit(NNEngine *server)
 740  : LoadClient(server)
 741 {
 742         this->engine = server;
 743 }
 744
 745 NNUnit::~NNUnit()
 746 {
 747 }
 748
 749 void NNUnit::process_package(LoadPackage *package)
 750 {
 751         NNPackage *pkg = (NNPackage*)package;
 752         VFrame *output = engine->output;
 753         VFrame *input = engine->input;
 754         int mode = engine->mode;
 755         float fade =
 756                 BC_CModels::has_alpha(input->get_color_model()) &&
 757                 mode == TRANSFER_REPLACE ? 1.f : engine->alpha;
 758
 759         int ox = engine->out_x1i;
 760         int ow = engine->out_x2i - ox;
 761         int *ly = engine->in_lookup_y + pkg->out_row1;
 762
 763         switch( mode ) {
 764         case TRANSFER_NORMAL:           XBLEND_NN(NORMAL);
 765         case TRANSFER_ADDITION:         XBLEND_NN(ADDITION);
 766         case TRANSFER_SUBTRACT:         XBLEND_NN(SUBTRACT);
 767         case TRANSFER_MULTIPLY:         XBLEND_NN(MULTIPLY);
 768         case TRANSFER_DIVIDE:           XBLEND_NN(DIVIDE);
 769         case TRANSFER_REPLACE:          XBLEND_NN(REPLACE);
 770         case TRANSFER_MAX:              XBLEND_NN(MAX);
 771         case TRANSFER_MIN:              XBLEND_NN(MIN);
 772         case TRANSFER_AVERAGE:          XBLEND_NN(AVERAGE);
 773         case TRANSFER_DARKEN:           XBLEND_NN(DARKEN);
 774         case TRANSFER_LIGHTEN:          XBLEND_NN(LIGHTEN);
 775         case TRANSFER_DST:              XBLEND_NN(DST);
 776         case TRANSFER_DST_ATOP:         XBLEND_NN(DST_ATOP);
 777         case TRANSFER_DST_IN:           XBLEND_NN(DST_IN);
 778         case TRANSFER_DST_OUT:          XBLEND_NN(DST_OUT);
 779         case TRANSFER_DST_OVER:         XBLEND_NN(DST_OVER);
 780         case TRANSFER_SRC:              XBLEND_NN(SRC);
 781         case TRANSFER_SRC_ATOP:         XBLEND_NN(SRC_ATOP);
 782         case TRANSFER_SRC_IN:           XBLEND_NN(SRC_IN);
 783         case TRANSFER_SRC_OUT:          XBLEND_NN(SRC_OUT);
 784         case TRANSFER_SRC_OVER:         XBLEND_NN(SRC_OVER);
 785         case TRANSFER_OR:               XBLEND_NN(OR);
 786         case TRANSFER_XOR:              XBLEND_NN(XOR);
 787         }
 788 }
 789
 790 NNEngine::NNEngine(int cpus)
 791  : LoadServer(cpus, cpus)
 792 {
 793         in_lookup_x = 0;
 794         in_lookup_y = 0;
 795 }
 796
 797 NNEngine::~NNEngine()
 798 {
 799         if(in_lookup_x)
 800                 delete[] in_lookup_x;
 801         if(in_lookup_y)
 802                 delete[] in_lookup_y;
 803 }
 804
 805 void NNEngine::init_packages()
 806 {
 807         int in_w = input->get_w();
 808         int in_h = input->get_h();
 809         int out_w = output->get_w();
 810         int out_h = output->get_h();
 811
 812         float in_subw = in_x2 - in_x1;
 813         float in_subh = in_y2 - in_y1;
 814         float out_subw = out_x2 - out_x1;
 815         float out_subh = out_y2 - out_y1;
 816         int first, last, count, i;
 817         int components = 3;
 818
 819         out_x1i = rint(out_x1);
 820         out_x2i = rint(out_x2);
 821         if(out_x1i < 0) out_x1i = 0;
 822         if(out_x1i > out_w) out_x1i = out_w;
 823         if(out_x2i < 0) out_x2i = 0;
 824         if(out_x2i > out_w) out_x2i = out_w;
 825         int out_wi = out_x2i - out_x1i;
 826         if( !out_wi ) return;
 827
 828         delete[] in_lookup_x;
 829         in_lookup_x = new int[out_wi];
 830         delete[] in_lookup_y;
 831         in_lookup_y = new int[out_h];
 832
 833         switch(input->get_color_model()) {
 834         case BC_RGBA_FLOAT:
 835         case BC_RGBA8888:
 836         case BC_YUVA8888:
 837         case BC_RGBA16161616:
 838                 components = 4;
 839                 break;
 840         }
 841
 842         first = count = 0;
 843
 844         for(i = out_x1i; i < out_x2i; i++) {
 845                 int in = (i - out_x1 + .5) * in_subw / out_subw + in_x1;
 846                 if(in < in_x1)
 847                         in = in_x1;
 848                 if(in > in_x2)
 849                         in = in_x2;
 850
 851                 if(in >= 0 && in < in_w && in >= in_x1 && i >= 0 && i < out_w) {
 852                         if(count == 0) {
 853                                 first = i;
 854                                 in_lookup_x[0] = in * components;
 855                         }
 856                         else {
 857                                 in_lookup_x[count] = (in-last)*components;
 858                         }
 859                         last = in;
 860                         count++;
 861                 }
 862                 else if(count)
 863                         break;
 864         }
 865         out_x1i = first;
 866         out_x2i = first + count;
 867         first = count = 0;
 868
 869         for(i = out_y1; i < out_y2; i++) {
 870                 int in = (i - out_y1+.5) * in_subh / out_subh + in_y1;
 871                 if(in < in_y1) in = in_y1;
 872                 if(in > in_y2) in = in_y2;
 873                 if(in >= 0 && in < in_h && i >= 0 && i < out_h) {
 874                         if(count == 0) first = i;
 875                         in_lookup_y[i] = in;
 876                         count++;
 877                 }
 878                 else if(count)
 879                         break;
 880         }
 881         out_y1 = first;
 882         out_y2 = first + count;
 883
 884         int rows = count;
 885         int pkgs = get_total_packages();
 886         int row1 = out_y1, row2 = row1;
 887         for(int i = 0; i < pkgs; row1=row2 ) {
 888                 NNPackage *package = (NNPackage*)get_package(i);
 889                 row2 = ++i * rows / pkgs + out_y1;
 890                 package->out_row1 = row1;
 891                 package->out_row2 = row2;
 892         }
 893 }
 894
 895 LoadClient* NNEngine::new_client()
 896 {
 897         return new NNUnit(this);
 898 }
 899
 900 LoadPackage* NNEngine::new_package()
 901 {
 902         return new NNPackage;
 903 }
 904
 905 /* Fully resampled scale / translate / blend ******************************/
 906 /* resample into a temporary row vector, then blend */
 907
 908 #define XSAMPLE(FN, temp_type, type, max, components, chroma_offset, round) { \
 909         float temp[oh*components]; \
 910         temp_type opcty = fade * max + round, trnsp = max - opcty; \
 911         type **output_rows = (type**)voutput->get_rows() + o1i; \
 912         type **input_rows = (type**)vinput->get_rows(); \
 913  \
 914         for(int i = pkg->out_col1; i < pkg->out_col2; i++) { \
 915                 type *input = input_rows[i - engine->col_out1 + engine->row_in]; \
 916                 float *tempp = temp; \
 917                 if( !k ) { /* direct copy case */ \
 918                         type *ip = input + i1i * components; \
 919                         for(int j = 0; j < oh; j++) { \
 920                                 *tempp++ = *ip++; \
 921                                 *tempp++ = *ip++ - chroma_offset; \
 922                                 *tempp++ = *ip++ - chroma_offset; \
 923                                 if( components == 4 ) *tempp++ = *ip++; \
 924                         } \
 925                 } \
 926                 else { /* resample */ \
 927                         for(int j = 0; j < oh; j++) { \
 928                                 float racc=0.f, gacc=0.f, bacc=0.f, aacc=0.f; \
 929                                 int ki = lookup_sk[j], x = lookup_sx0[j]; \
 930                                 type *ip = input + x * components; \
 931                                 float wacc = 0, awacc = 0; \
 932                                 while(x++ < lookup_sx1[j]) { \
 933                                         float kv = k[abs(ki >> INDEX_FRACTION)]; \
 934                                         /* handle fractional pixels on edges of input */ \
 935                                         if(x == i1i) kv *= i1f; \
 936                                         if(x + 1 == i2i) kv *= i2f; \
 937                                         if( components == 4 ) { awacc += kv;  kv *= ip[3]; } \
 938                                         wacc += kv; \
 939                                         racc += kv * *ip++; \
 940                                         gacc += kv * (*ip++ - chroma_offset); \
 941                                         bacc += kv * (*ip++ - chroma_offset); \
 942                                         if( components == 4 ) { aacc += kv;  ++ip; } \
 943                                         ki += kd; \
 944                                 } \
 945                                 if(wacc > 0.) wacc = 1. / wacc; \
 946                                 *tempp++ = racc * wacc; \
 947                                 *tempp++ = gacc * wacc; \
 948                                 *tempp++ = bacc * wacc; \
 949                                 if( components == 4 ) { \
 950                                         if(awacc > 0.) awacc = 1. / awacc; \
 951                                         *tempp++ = aacc * awacc; \
 952                                 } \
 953                         } \
 954                 } \
 955  \
 956                 /* handle fractional pixels on edges of output */ \
 957                 temp[0] *= o1f;   temp[1] *= o1f;   temp[2] *= o1f; \
 958                 if( components == 4 ) temp[3] *= o1f; \
 959                 tempp = temp + (oh-1)*components; \
 960                 tempp[0] *= o2f;  tempp[1] *= o2f;  tempp[2] *= o2f; \
 961                 if( components == 4 ) tempp[3] *= o2f; \
 962                 tempp = temp; \
 963                 /* blend output */ \
 964                 for(int j = 0; j < oh; j++) { \
 965                         type *output = output_rows[j] + i * components; \
 966                         if( components == 4 ) { \
 967                                 temp_type r, g, b, a; \
 968                                 ALPHA4_BLEND(FN, temp_type, tempp, output, max, 0, round); \
 969                                 ALPHA4_STORE(output, chroma_offset, max); \
 970                         } \
 971                         else { \
 972                                 temp_type r, g, b; \
 973                                 ALPHA3_BLEND(FN, temp_type, tempp, output, max, 0, round); \
 974                                 ALPHA3_STORE(output, chroma_offset, max); \
 975                         } \
 976                         tempp += components; \
 977                 } \
 978         } \
 979         break; \
 980 }
 981
 982 #define XBLEND_SAMPLE(FN) { \
 983         switch(vinput->get_color_model()) { \
 984         case BC_RGB_FLOAT:      XSAMPLE(FN, z_float,   z_float,    1.f,    3, 0.f,    0.f); \
 985         case BC_RGBA_FLOAT:     XSAMPLE(FN, z_float,   z_float,    1.f,    4, 0.f,    0.f); \
 986         case BC_RGB888:         XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
 987         case BC_YUV888:         XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
 988         case BC_RGBA8888:       XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
 989         case BC_YUVA8888:       XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
 990         case BC_RGB161616:      XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
 991         case BC_YUV161616:      XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
 992         case BC_RGBA16161616:   XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
 993         case BC_YUVA16161616:   XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
 994         } \
 995         break; \
 996 }
 997
 998
 999 SamplePackage::SamplePackage()
1000 {
1001 }
1002
1003 SampleUnit::SampleUnit(SampleEngine *server)
1004  : LoadClient(server)
1005 {
1006         this->engine = server;
1007 }
1008
1009 SampleUnit::~SampleUnit()
1010 {
1011 }
1012
1013 void SampleUnit::process_package(LoadPackage *package)
1014 {
1015         SamplePackage *pkg = (SamplePackage*)package;
1016
1017         float i1  = engine->in1;
1018         float i2  = engine->in2;
1019         float o1  = engine->out1;
1020         float o2  = engine->out2;
1021
1022         if(i2 - i1 <= 0 || o2 - o1 <= 0)
1023                 return;
1024
1025         VFrame *voutput = engine->output;
1026         VFrame *vinput = engine->input;
1027         int mode = engine->mode;
1028         float fade =
1029                 BC_CModels::has_alpha(vinput->get_color_model()) &&
1030                 mode == TRANSFER_REPLACE ? 1.f : engine->alpha;
1031
1032         //int   iw  = vinput->get_w();
1033         int   i1i = floor(i1);
1034         int   i2i = ceil(i2);
1035         float i1f = 1.f - i1 + i1i;
1036         float i2f = 1.f - i2i + i2;
1037
1038         int   o1i = floor(o1);
1039         int   o2i = ceil(o2);
1040         float o1f = 1.f - o1 + o1i;
1041         float o2f = 1.f - o2i + o2;
1042         int   oh  = o2i - o1i;
1043
1044         float *k  = engine->kernel->lookup;
1045         //float kw  = engine->kernel->width;
1046         //int   kn  = engine->kernel->n;
1047         int   kd = engine->kd;
1048
1049         int *lookup_sx0 = engine->lookup_sx0;
1050         int *lookup_sx1 = engine->lookup_sx1;
1051         int *lookup_sk = engine->lookup_sk;
1052         //float *lookup_wacc = engine->lookup_wacc;
1053
1054         switch( mode ) {
1055         case TRANSFER_NORMAL:           XBLEND_SAMPLE(NORMAL);
1056         case TRANSFER_ADDITION:         XBLEND_SAMPLE(ADDITION);
1057         case TRANSFER_SUBTRACT:         XBLEND_SAMPLE(SUBTRACT);
1058         case TRANSFER_MULTIPLY:         XBLEND_SAMPLE(MULTIPLY);
1059         case TRANSFER_DIVIDE:           XBLEND_SAMPLE(DIVIDE);
1060         case TRANSFER_REPLACE:          XBLEND_SAMPLE(REPLACE);
1061         case TRANSFER_MAX:              XBLEND_SAMPLE(MAX);
1062         case TRANSFER_MIN:              XBLEND_SAMPLE(MIN);
1063         case TRANSFER_AVERAGE:          XBLEND_SAMPLE(AVERAGE);
1064         case TRANSFER_DARKEN:           XBLEND_SAMPLE(DARKEN);
1065         case TRANSFER_LIGHTEN:          XBLEND_SAMPLE(LIGHTEN);
1066         case TRANSFER_DST:              XBLEND_SAMPLE(DST);
1067         case TRANSFER_DST_ATOP:         XBLEND_SAMPLE(DST_ATOP);
1068         case TRANSFER_DST_IN:           XBLEND_SAMPLE(DST_IN);
1069         case TRANSFER_DST_OUT:          XBLEND_SAMPLE(DST_OUT);
1070         case TRANSFER_DST_OVER:         XBLEND_SAMPLE(DST_OVER);
1071         case TRANSFER_SRC:              XBLEND_SAMPLE(SRC);
1072         case TRANSFER_SRC_ATOP:         XBLEND_SAMPLE(SRC_ATOP);
1073         case TRANSFER_SRC_IN:           XBLEND_SAMPLE(SRC_IN);
1074         case TRANSFER_SRC_OUT:          XBLEND_SAMPLE(SRC_OUT);
1075         case TRANSFER_SRC_OVER:         XBLEND_SAMPLE(SRC_OVER);
1076         case TRANSFER_OR:               XBLEND_SAMPLE(OR);
1077         case TRANSFER_XOR:              XBLEND_SAMPLE(XOR);
1078         }
1079 }
1080
1081
1082 SampleEngine::SampleEngine(int cpus)
1083  : LoadServer(cpus, cpus)
1084 {
1085         lookup_sx0 = 0;
1086         lookup_sx1 = 0;
1087         lookup_sk = 0;
1088         lookup_wacc = 0;
1089         kd = 0;
1090 }
1091
1092 SampleEngine::~SampleEngine()
1093 {
1094         if(lookup_sx0) delete [] lookup_sx0;
1095         if(lookup_sx1) delete [] lookup_sx1;
1096         if(lookup_sk) delete [] lookup_sk;
1097         if(lookup_wacc) delete [] lookup_wacc;
1098 }
1099
1100 /*
1101  * unlike the Direct and NN engines, the Sample engine works across
1102  * output columns (it makes for more economical memory addressing
1103  * during convolution)
1104  */
1105 void SampleEngine::init_packages()
1106 {
1107         int   iw  = input->get_w();
1108         int   i1i = floor(in1);
1109         int   i2i = ceil(in2);
1110         float i1f = 1.f - in1 + i1i;
1111         float i2f = 1.f - i2i + in2;
1112
1113         int   oy  = floor(out1);
1114         float oyf = out1 - oy;
1115         int   oh  = ceil(out2) - oy;
1116
1117         float *k  = kernel->lookup;
1118         float kw  = kernel->width;
1119         int   kn  = kernel->n;
1120
1121         if(in2 - in1 <= 0 || out2 - out1 <= 0)
1122                 return;
1123
1124         /* determine kernel spatial coverage */
1125         float scale = (out2 - out1) / (in2 - in1);
1126         float iscale = (in2 - in1) / (out2 - out1);
1127         float coverage = fabs(1.f / scale);
1128         float bound = (coverage < 1.f ? kw : kw * coverage) - (.5f / TRANSFORM_SPP);
1129         float coeff = (coverage < 1.f ? 1.f : scale) * TRANSFORM_SPP;
1130
1131         delete [] lookup_sx0;
1132         delete [] lookup_sx1;
1133         delete [] lookup_sk;
1134         delete [] lookup_wacc;
1135
1136         lookup_sx0 = new int[oh];
1137         lookup_sx1 = new int[oh];
1138         lookup_sk = new int[oh];
1139         lookup_wacc = new float[oh];
1140
1141         kd = (double)coeff * (1 << INDEX_FRACTION) + .5;
1142
1143         /* precompute kernel values and weight sums */
1144         for(int i = 0; i < oh; i++) {
1145                 /* map destination back to source */
1146                 double sx = (i - oyf + .5) * iscale + in1 - .5;
1147
1148                 /*
1149                  * clip iteration to source area but not source plane. Points
1150                  * outside the source plane count as transparent. Points outside
1151                  * the source area don't count at all.  The actual convolution
1152                  * later will be clipped to both, but we need to compute
1153                  * weights.
1154                  */
1155                 int sx0 = MAX((int)floor(sx - bound) + 1, i1i);
1156                 int sx1 = MIN((int)ceil(sx + bound), i2i);
1157                 int ki = (double)(sx0 - sx) * coeff * (1 << INDEX_FRACTION)
1158                                 + (1 << (INDEX_FRACTION - 1)) + .5;
1159                 float wacc=0.;
1160
1161                 lookup_sx0[i] = -1;
1162                 lookup_sx1[i] = -1;
1163
1164                 for(int j= sx0; j < sx1; j++) {
1165                         int kv = (ki >> INDEX_FRACTION);
1166                         if(kv > kn) break;
1167                         if(kv >= -kn) {
1168                                 /*
1169                                  * the contribution of the first and last input pixel (if
1170                                  * fractional) are linearly weighted by the fraction
1171                                  */
1172                                 if(j == i1i)
1173                                         wacc += k[abs(kv)] * i1f;
1174                                 else if(j + 1 == i2i)
1175                                         wacc += k[abs(kv)] * i2f;
1176                                 else
1177                                         wacc += k[abs(kv)];
1178
1179                                 /* this is where we clip the kernel convolution to the source plane */
1180                                 if(j >= 0 && j < iw) {
1181                                         if(lookup_sx0[i] == -1) {
1182                                                 lookup_sx0[i] = j;
1183                                                 lookup_sk[i] = ki;
1184                                         }
1185                                         lookup_sx1[i] = j + 1;
1186                                 }
1187                         }
1188                         ki += kd;
1189                 }
1190                 lookup_wacc[i] = wacc > 0. ? 1. / wacc : 0.;
1191         }
1192
1193         int cols = col_out2 - col_out1;
1194         int pkgs = get_total_packages();
1195         int col1 = col_out1, col2 = col1;
1196         for(int i = 0; i < pkgs; col1=col2 ) {
1197                 SamplePackage *package = (SamplePackage*)get_package(i);
1198                 col2 = ++i * cols / pkgs + col_out1;
1199                 package->out_col1 = col1;
1200                 package->out_col2 = col2;
1201         }
1202 }
1203
1204 LoadClient* SampleEngine::new_client()
1205 {
1206         return new SampleUnit(this);
1207 }
1208
1209 LoadPackage* SampleEngine::new_package()
1210 {
1211         return new SamplePackage;
1212 }