cinelerra-5.1/cinelerra/overlayframe.C

   1
   2 /*
   3  * CINELERRA
   4  * Copyright (C) 2008 Adam Williams <broadcast at earthling dot net>
   5  * Copyright (C) 2012 Monty <monty@xiph.org>
   6  *
   7  * This program is free software; you can redistribute it and/or modify
   8  * it under the terms of the GNU General Public License as published by
   9  * the Free Software Foundation; either version 2 of the License, or
  10  * (at your option) any later version.
  11  *
  12  * This program is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  15  * GNU General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  20  *
  21  */
  22
  23 #include <math.h>
  24 #include <stdio.h>
  25 #include <string.h>
  26 #include <stdint.h>
  27 #include <stdlib.h>
  28 #include <unistd.h>
  29
  30 #include "clip.h"
  31 #include "edl.inc"
  32 #include "mutex.h"
  33 #include "overlayframe.h"
  34 #include "units.h"
  35 #include "vframe.h"
  36
  37 static inline int   mabs(int32_t v) { return abs(v); }
  38 static inline int   mabs(int64_t v) { return llabs(v); }
  39 static inline float mabs(float v)   { return fabsf(v); }
  40
  41 static inline int32_t aclip(int32_t v, int mx) {
  42         return v < 0 ? 0 : v > mx ? mx : v;
  43 }
  44 static inline int64_t aclip(int64_t v, int mx) {
  45         return v < 0 ? 0 : v > mx ? mx : v;
  46 }
  47 static inline float   aclip(float v, float mx) {
  48         return v < 0 ? 0 : v > mx ? mx : v;
  49 }
  50 static inline float   aclip(float v, int mx) {
  51         return v < 0 ? 0 : v > mx ? mx : v;
  52 }
  53 static inline int   aclip(int v, float mx) {
  54         return v < 0 ? 0 : v > mx ? mx : v;
  55 }
  56 static inline int32_t cclip(int32_t v, int mx) {
  57         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  58 }
  59 static inline int64_t cclip(int64_t v, int mx) {
  60         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  61 }
  62 static inline float   cclip(float v, float mx) {
  63         return v > (mx/=2) ? mx : v < (mx=(-mx)) ? mx : v;
  64 }
  65 static inline float   cclip(float v, int mx) {
  66         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  67 }
  68 static inline int   cclip(int v, float mx) {
  69         return v > (mx/=2) ? mx : v < (mx=(-mx-1)) ? mx : v;
  70 }
  71
  72 /*
 * New resampler code; replace the original somewhat blurry engine
  74  * with a fairly standard kernel resampling core.  This could be used
  75  * for full affine transformation but only implements scale/translate.
  76  * Mostly reuses the old blending macro code.
  77  *
  78  * Pixel convention:
  79  *
  80  *  1) Pixels are points, not areas or squares.
  81  *
  82  *  2) To maintain the usual edge and scaling conventions, pixels are
  83  *     set inward from the image edge, eg, the left edge of an image is
  84  *     at pixel location x=-.5, not x=0.  Although pixels are not
  85  *     squares, the usual way of stating this is 'the pixel is located
  86  *     at the center of its square'.
  87  *
  88  *  3) Because of 1 and 2, we must truncate and weight the kernel
  89  *     convolution at the edge of the input area.  Otherwise, all
  90  *     resampled areas would be bordered by a transparency halo. E.g.
  91  *     in the old engine, upsampling HDV to 1920x1080 results in the
  92  *     left and right edges being partially transparent and underlying
  93  *     layers shining through.
  94  *
  95  *   4) The contribution of fractional pixels at the edges of input
  96  *     ranges are weighted according to the fraction.  Note that the
  97  *     kernel weighting is adjusted, not the opacity.  This is one
  98  *     exception to 'pixels have no area'.
  99  *
 100  *  5) The opacity of fractional pixels at the edges of the output
 101  *     range is adjusted according to the fraction. This is the other
 102  *     exception to 'pixels have no area'.
 103  *
 104  * Fractional alpha blending has been modified across the board from:
 105  *    output_alpha = input_alpha > output_alpha ? input_alpha : output_alpha;
 106  *  to:
 107  *    output_alpha = output_alpha + ((max - output_alpha) * input_alpha) / max;
 108  */
 109
 110 #define TRANSFORM_SPP    (4096)    /* number of data pts per unit x in lookup table */
 111 #define INDEX_FRACTION   (8)       /* bits of fraction past TRANSFORM_SPP on kernel
 112                                       index accumulation */
 113 #define TRANSFORM_MIN    (.5 / TRANSFORM_SPP)
 114
 115 /* Sinc needed for Lanczos kernel */
 116 static float sinc(const float x)
 117 {
 118         float y = x * M_PI;
 119
 120         if(fabsf(x) < TRANSFORM_MIN)
 121                 return 1.0f;
 122
 123         return sinf(y) / y;
 124 }
 125
 126 /*
 127  * All resampling (except Nearest Neighbor) is performed via
 *   transformed 2D resampling kernels built from 1D lookups.
 129  */
 130 OverlayKernel::OverlayKernel(int interpolation_type)
 131 {
 132         int i;
 133         this->type = interpolation_type;
 134
 135         switch(interpolation_type)
 136         {
 137         case BILINEAR:
 138                 width = 1.f;
 139                 lookup = new float[(n = TRANSFORM_SPP) + 1];
 140                 for (i = 0; i <= TRANSFORM_SPP; i++)
 141                         lookup[i] = (float)(TRANSFORM_SPP - i) / TRANSFORM_SPP;
 142                 break;
 143
 144         /* Use a Catmull-Rom filter (not b-spline) */
 145         case BICUBIC:
 146                 width = 2.;
 147                 lookup = new float[(n = 2 * TRANSFORM_SPP) + 1];
 148                 for(i = 0; i <= TRANSFORM_SPP; i++) {
 149                         float x = i / (float)TRANSFORM_SPP;
 150                         lookup[i] = 1.f - 2.5f * x * x + 1.5f * x * x * x;
 151                 }
 152                 for(; i <= 2 * TRANSFORM_SPP; i++) {
 153                         float x = i / (float)TRANSFORM_SPP;
 154                         lookup[i] = 2.f - 4.f * x  + 2.5f * x * x - .5f * x * x * x;
 155                 }
 156                 break;
 157
 158         case LANCZOS:
 159                 width = 3.;
 160                 lookup = new float[(n = 3 * TRANSFORM_SPP) + 1];
 161                 for (i = 0; i <= 3 * TRANSFORM_SPP; i++)
 162                         lookup[i] = sinc((float)i / TRANSFORM_SPP) *
 163                                 sinc((float)i / TRANSFORM_SPP / 3.0f);
 164                 break;
 165
 166         default:
 167                 width = 0.;
 168                 lookup = 0;
 169                 n = 0;
 170                 break;
 171         }
 172 }
 173
 174 OverlayKernel::~OverlayKernel()
 175 {
 176         if(lookup) delete [] lookup;
 177 }
 178
 179 OverlayFrame::OverlayFrame(int cpus)
 180 {
 181         direct_engine = 0;
 182         nn_engine = 0;
 183         sample_engine = 0;
 184         temp_frame = 0;
 185         memset(kernel, 0, sizeof(kernel));
 186         this->cpus = cpus;
 187 }
 188
 189 OverlayFrame::~OverlayFrame()
 190 {
 191         if(temp_frame) delete temp_frame;
 192
 193         if(direct_engine) delete direct_engine;
 194         if(nn_engine) delete nn_engine;
 195         if(sample_engine) delete sample_engine;
 196
 197         if(kernel[NEAREST_NEIGHBOR]) delete kernel[NEAREST_NEIGHBOR];
 198         if(kernel[BILINEAR]) delete kernel[BILINEAR];
 199         if(kernel[BICUBIC]) delete kernel[BICUBIC];
 200         if(kernel[LANCZOS]) delete kernel[LANCZOS];
 201 }
 202
 203 static float epsilon_snap(float f)
 204 {
 205         return rintf(f * 1024) / 1024.;
 206 }
 207
/*
 * Composite a rectangle of input onto a rectangle of output.
 *
 *   in_x1, in_y1, in_x2, in_y2      source rectangle, input pixel coordinates
 *   out_x1, out_y1, out_x2, out_y2  destination rectangle, output pixel
 *                                   coordinates
 *   alpha                opacity handed to the blending engine
 *   mode                 TRANSFER_* blend mode handed to the blending engine
 *   interpolation_type   NEAREST_NEIGHBOR, CUBIC_CUBIC, CUBIC_LINEAR,
 *                        LINEAR_LINEAR or LANCZOS_LANCZOS; any other value
 *                        resamples with the bilinear kernel
 *
 * Both rectangles are clipped to their frames.  Returns 1 without drawing
 * when a coordinate is NaN or either rectangle is empty (before or after
 * clipping), 0 otherwise.
 */
int OverlayFrame::overlay(VFrame *output, VFrame *input,
        float in_x1, float in_y1, float in_x2, float in_y2,
        float out_x1, float out_y1, float out_x2, float out_y2,
        float alpha, int mode, int interpolation_type)
{
        // snap every coordinate to a multiple of 1/1024
        in_x1 = epsilon_snap(in_x1);
        in_x2 = epsilon_snap(in_x2);
        in_y1 = epsilon_snap(in_y1);
        in_y2 = epsilon_snap(in_y2);
        out_x1 = epsilon_snap(out_x1);
        out_x2 = epsilon_snap(out_x2);
        out_y1 = epsilon_snap(out_y1);
        out_y2 = epsilon_snap(out_y2);

        if (isnan(in_x1) || isnan(in_x2) ||
                isnan(in_y1) || isnan(in_y2) ||
                isnan(out_x1) || isnan(out_x2) ||
                isnan(out_y1) || isnan(out_y2)) return 1;

        // reject empty or inverted rectangles
        if( in_x2 <= in_x1 || in_y2 <= in_y1 ) return 1;
        if( out_x2 <= out_x1 || out_y2 <= out_y1 ) return 1;

        float xscale = (out_x2 - out_x1) / (in_x2 - in_x1);
        float yscale = (out_y2 - out_y1) / (in_y2 - in_y1);
        int in_w = input->get_w(), in_h = input->get_h();
        int out_w = output->get_w(), out_h = output->get_h();

        // clip the source rectangle to the input frame, moving the matching
        // destination edge by the clipped amount times the scale
        if( in_x1 < 0 ) {
                out_x1 -= in_x1 * xscale;
                in_x1 = 0;
        }
        if( in_x2 > in_w ) {
                out_x2 -= (in_x2 - in_w) * xscale;
                in_x2 = in_w;
        }
        if( in_y1 < 0 ) {
                out_y1 -= in_y1 * yscale;
                in_y1 = 0;
        }
        if( in_y2 > in_h ) {
                out_y2 -= (in_y2 - in_h) * yscale;
                in_y2 = in_h;
        }

        // clip the destination rectangle to the output frame, moving the
        // matching source edge by the clipped amount divided by the scale
        if( out_x1 < 0 ) {
                in_x1 -= out_x1 / xscale;
                out_x1 = 0;
        }
        if( out_x2 > out_w ) {
                in_x2 -= (out_x2 - out_w) / xscale;
                out_x2 = out_w;
        }
        if( out_y1 < 0 ) {
                in_y1 -= out_y1 / yscale;
                out_y1 = 0;
        }
        if( out_y2 > out_h ) {
                in_y2 -= (out_y2 - out_h) / yscale;
                out_y2 = out_h;
        }

        // final hard clamp of every edge to its frame
        if( in_x1 < 0) in_x1 = 0;
        if( in_y1 < 0) in_y1 = 0;
        if( in_x2 > in_w ) in_x2 = in_w;
        if( in_y2 > in_h ) in_y2 = in_h;
        if( out_x1 < 0) out_x1 = 0;
        if( out_y1 < 0) out_y1 = 0;
        if( out_x2 > out_w ) out_x2 = out_w;
        if( out_y2 > out_h ) out_y2 = out_h;

        // clipping may have emptied a rectangle; recompute the scale from
        // what is left
        if( in_x2 <= in_x1 || in_y2 <= in_y1 ) return 1;
        if( out_x2 <= out_x1 || out_y2 <= out_y1 ) return 1;
        xscale = (out_x2 - out_x1) / (in_x2 - in_x1);
        yscale = (out_y2 - out_y1) / (in_y2 - in_y1);

        /* don't interpolate integer translations, or scale no-ops */
        if(xscale == 1. && yscale == 1. &&
                (int)in_x1 == in_x1 && (int)in_x2 == in_x2 &&
                (int)in_y1 == in_y1 && (int)in_y2 == in_y2 &&
                (int)out_x1 == out_x1 && (int)out_x2 == out_x2 &&
                (int)out_y1 == out_y1 && (int)out_y2 == out_y2) {
                // engines are created on first use and kept for later calls
                if(!direct_engine) direct_engine = new DirectEngine(cpus);

                direct_engine->output = output;   direct_engine->input = input;
                direct_engine->in_x1 = in_x1;     direct_engine->in_y1 = in_y1;
                direct_engine->out_x1 = out_x1;   direct_engine->out_x2 = out_x2;
                direct_engine->out_y1 = out_y1;   direct_engine->out_y2 = out_y2;
                direct_engine->alpha = alpha;     direct_engine->mode = mode;
                direct_engine->process_packages();
        }
        else if(interpolation_type == NEAREST_NEIGHBOR) {
                if(!nn_engine) nn_engine = new NNEngine(cpus);
                nn_engine->output = output;       nn_engine->input = input;
                nn_engine->in_x1 = in_x1;         nn_engine->in_x2 = in_x2;
                nn_engine->in_y1 = in_y1;         nn_engine->in_y2 = in_y2;
                nn_engine->out_x1 = out_x1;       nn_engine->out_x2 = out_x2;
                nn_engine->out_y1 = out_y1;       nn_engine->out_y2 = out_y2;
                nn_engine->alpha = alpha;         nn_engine->mode = mode;
                nn_engine->process_packages();
        }
        else {
                // kernel type per axis; bilinear unless the switch says otherwise
                int xtype = BILINEAR;
                int ytype = BILINEAR;

                switch(interpolation_type)
                {
                case CUBIC_CUBIC: // Bicubic enlargement and reduction
                        xtype = ytype = BICUBIC;
                        break;
                case CUBIC_LINEAR: // Bicubic enlargement and bilinear reduction
                        xtype = xscale > 1. ? BICUBIC : BILINEAR;
                        ytype = yscale > 1. ? BICUBIC : BILINEAR;
                        break;
                case LINEAR_LINEAR: // Bilinear enlargement and bilinear reduction
                        xtype = ytype = BILINEAR;
                        break;
                case LANCZOS_LANCZOS: // Because we can
                        xtype = ytype = LANCZOS;
                        break;
                }

                // an axis that is unscaled and integer aligned needs no kernel
                if(xscale == 1. && (int)in_x1 == in_x1 && (int)in_x2 == in_x2 &&
                                (int)out_x1 == out_x1 && (int)out_x2 == out_x2)
                        xtype = DIRECT_COPY;

                if(yscale == 1. && (int)in_y1 == in_y1 && (int)in_y2 == in_y2 &&
                                (int)out_y1 == out_y1 && (int)out_y2 == out_y2)
                        ytype = DIRECT_COPY;

                // kernels are built on first use and cached by type
                if(!kernel[xtype])
                        kernel[xtype] = new OverlayKernel(xtype);
                if(!kernel[ytype])
                        kernel[ytype] = new OverlayKernel(ytype);

/*
 * horizontal and vertical are separately resampled.  First we
 * resample the input along X into a transposed, temporary frame,
 * then resample/transpose the temporary space along X into the
 * output.  Fractional pixels along the edge are handled in the X
 * direction of each step
 */
                // resampled dimension matches the transposed output space
                float temp_y1 = out_x1 - floor(out_x1);
                float temp_y2 = temp_y1 + (out_x2 - out_x1);
                int temp_h = ceil(temp_y2);

                // non-resampled dimension merely cropped
                float temp_x1 = in_y1 - floor(in_y1);
                float temp_x2 = temp_x1 + (in_y2 - in_y1);
                int temp_w = ceil(temp_x2);

                // keep the cached temporary frame only if its colour model
                // and size still match
                if( temp_frame &&
                   (temp_frame->get_color_model() != input->get_color_model() ||
                    temp_frame->get_w() != temp_w || temp_frame->get_h() != temp_h) ) {
                        delete temp_frame;
                        temp_frame = 0;
                }

                if(!temp_frame) {
                        temp_frame = new VFrame(0, -1, temp_w, temp_h,
                                input->get_color_model(), -1);
                }

                temp_frame->clear_frame();

                if(!sample_engine) sample_engine = new SampleEngine(cpus);

                // pass 1: input -> temp_frame with the x kernel, fully opaque
                // and in replace mode
                sample_engine->output = temp_frame;
                sample_engine->input = input;
                sample_engine->kernel = kernel[xtype];
                sample_engine->col_out1 = 0;
                sample_engine->col_out2 = temp_w;
                sample_engine->row_in = floor(in_y1);

                sample_engine->in1 = in_x1;
                sample_engine->in2 = in_x2;
                sample_engine->out1 = temp_y1;
                sample_engine->out2 = temp_y2;
                sample_engine->alpha = 1.;
                sample_engine->mode = TRANSFER_REPLACE;
                sample_engine->process_packages();

                // pass 2: temp_frame -> output with the y kernel, applying the
                // caller's alpha and blend mode
                sample_engine->output = output;
                sample_engine->input = temp_frame;
                sample_engine->kernel = kernel[ytype];
                sample_engine->col_out1 = floor(out_x1);
                sample_engine->col_out2 = ceil(out_x2);
                sample_engine->row_in = 0;

                sample_engine->in1 = temp_x1;
                sample_engine->in2 = temp_x2;
                sample_engine->out1 = out_y1;
                sample_engine->out2 = out_y2;
                sample_engine->alpha = alpha;
                sample_engine->mode = mode;
                sample_engine->process_packages();
        }
        return 0;
}
 407
/*
 * Per-mode blend formulas.  Each mode provides three macros:
 *   ALPHA_<MODE>(mx, Sa, Da)           resulting alpha
 *   COLOR_<MODE>(mx, Sc, Sa, Dc, Da)   resulting colour component
 *   CHROMA_<MODE>(mx, Sc, Sa, Dc, Da)  resulting offset-removed chroma component
 * Sc/Sa are the source (input) component and alpha, Dc/Da the destination
 * (output) component and alpha, and mx is the full-scale value.  The
 * bracketed formula in each comment is written with values normalized to
 * 0..1; the macros scale by mx instead.  Modes that compare components use
 * mabs() in the CHROMA variant; the rest alias CHROMA to COLOR.
 * Arguments are substituted without parentheses, so pass plain variables
 * or constants only.
 */

// NORMAL       [Sa + Da * (1 - Sa), Sc * Sa + Dc * (1 - Sa)]
#define ALPHA_NORMAL(mx, Sa, Da) (Sa + (Da * (mx - Sa)) / mx)
#define COLOR_NORMAL(mx, Sc, Sa, Dc, Da) ((Sc * Sa + Dc * (mx - Sa)) / mx)
#define CHROMA_NORMAL COLOR_NORMAL

// ADDITION     [(Sa + Da), (Sc + Dc)]
#define ALPHA_ADDITION(mx, Sa, Da) (Sa + Da)
#define COLOR_ADDITION(mx, Sc, Sa, Dc, Da) (Sc + Dc)
#define CHROMA_ADDITION(mx, Sc, Sa, Dc, Da) (Sc + Dc)

// SUBTRACT     [(Sa - Da), (Sc - Dc)]
#define ALPHA_SUBTRACT(mx, Sa, Da) (Sa - Da)
#define COLOR_SUBTRACT(mx, Sc, Sa, Dc, Da) (Sc - Dc)
#define CHROMA_SUBTRACT(mx, Sc, Sa, Dc, Da) (Sc - Dc)

// MULTIPLY     [(Sa * Da), Sc * Dc]
#define ALPHA_MULTIPLY(mx, Sa, Da) ((Sa * Da) / mx)
#define COLOR_MULTIPLY(mx, Sc, Sa, Dc, Da) ((Sc * Dc) / mx)
#define CHROMA_MULTIPLY(mx, Sc, Sa, Dc, Da) ((Sc * Dc) / mx)

// DIVIDE       [(Sa / Da), (Sc / Dc)]
// a zero divisor yields mx
#define ALPHA_DIVIDE(mx, Sa, Da) (Da ? ((Sa * mx) / Da) : mx)
#define COLOR_DIVIDE(mx, Sc, Sa, Dc, Da) (Dc ? ((Sc * mx) / Dc) : mx)
#define CHROMA_DIVIDE(mx, Sc, Sa, Dc, Da) (Dc ? ((Sc * mx) / Dc) : mx)

// REPLACE      [Sa, Sc] (fade = 1)
#define ALPHA_REPLACE(mx, Sa, Da) Sa
#define COLOR_REPLACE(mx, Sc, Sa, Dc, Da) Sc
#define CHROMA_REPLACE COLOR_REPLACE

// MAX          [max(Sa, Da), MAX(Sc, Dc)]
// chroma picks the component with the larger magnitude
#define ALPHA_MAX(mx, Sa, Da) (Sa > Da ? Sa : Da)
#define COLOR_MAX(mx, Sc, Sa, Dc, Da) (Sc > Dc ? Sc : Dc)
#define CHROMA_MAX(mx, Sc, Sa, Dc, Da) (mabs(Sc) > mabs(Dc) ? Sc : Dc)

// MIN          [min(Sa, Da), MIN(Sc, Dc)]
// chroma picks the component with the smaller magnitude
#define ALPHA_MIN(mx, Sa, Da) (Sa < Da ? Sa : Da)
#define COLOR_MIN(mx, Sc, Sa, Dc, Da) (Sc < Dc ? Sc : Dc)
#define CHROMA_MIN(mx, Sc, Sa, Dc, Da) (mabs(Sc) < mabs(Dc) ? Sc : Dc)

// AVERAGE      [(Sa + Da) * 0.5, (Sc + Dc) * 0.5]
#define ALPHA_AVERAGE(mx, Sa, Da) ((Sa + Da) / 2)
#define COLOR_AVERAGE(mx, Sc, Sa, Dc, Da) ((Sc + Dc) / 2)
#define CHROMA_AVERAGE COLOR_AVERAGE

// DARKEN       [Sa + Da - Sa*Da, Sc*(1 - Da) + Dc*(1 - Sa) + min(Sc, Dc)]
#define ALPHA_DARKEN(mx, Sa, Da) (Sa + Da - (Sa * Da) / mx)
#define COLOR_DARKEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (Sc < Dc ? Sc : Dc))
#define CHROMA_DARKEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (mabs(Sc) < mabs(Dc) ? Sc : Dc))

// LIGHTEN      [Sa + Da - Sa*Da, Sc*(1 - Da) + Dc*(1 - Sa) + max(Sc, Dc)]
#define ALPHA_LIGHTEN(mx, Sa, Da) (Sa + Da - Sa * Da / mx)
#define COLOR_LIGHTEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (Sc > Dc ? Sc : Dc))
#define CHROMA_LIGHTEN(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx + (mabs(Sc) > mabs(Dc) ? Sc : Dc))

// DST          [Da, Dc]
#define ALPHA_DST(mx, Sa, Da) Da
#define COLOR_DST(mx, Sc, Sa, Dc, Da) Dc
#define CHROMA_DST COLOR_DST

// DST_ATOP     [Sa, Sc * (1 - Da) + Dc * Sa]
#define ALPHA_DST_ATOP(mx, Sa, Da) Sa
#define COLOR_DST_ATOP(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * Sa) / mx)
#define CHROMA_DST_ATOP COLOR_DST_ATOP

// DST_IN       [Da * Sa, Dc * Sa]
#define ALPHA_DST_IN(mx, Sa, Da) ((Da * Sa) / mx)
#define COLOR_DST_IN(mx, Sc, Sa, Dc, Da) ((Dc * Sa) / mx)
#define CHROMA_DST_IN COLOR_DST_IN

// DST_OUT      [Da * (1 - Sa), Dc * (1 - Sa)]
#define ALPHA_DST_OUT(mx, Sa, Da) (Da * (mx - Sa) / mx)
#define COLOR_DST_OUT(mx, Sc, Sa, Dc, Da) (Dc * (mx - Sa) / mx)
#define CHROMA_DST_OUT COLOR_DST_OUT

// DST_OVER     [Sa * (1 - Da) + Da, Sc * (1 - Da) + Dc]
#define ALPHA_DST_OVER(mx, Sa, Da) ((Sa * (mx - Da)) / mx + Da)
#define COLOR_DST_OVER(mx, Sc, Sa, Dc, Da) (Sc * (mx - Da)/ mx + Dc)
#define CHROMA_DST_OVER COLOR_DST_OVER

// SRC                  [Sa, Sc]
#define ALPHA_SRC(mx, Sa, Da) Sa
#define COLOR_SRC(mx, Sc, Sa, Dc, Da) Sc
#define CHROMA_SRC COLOR_SRC

// SRC_ATOP     [Da, Sc * Da + Dc * (1 - Sa)]
#define ALPHA_SRC_ATOP(mx, Sa, Da) Da
#define COLOR_SRC_ATOP(mx, Sc, Sa, Dc, Da) ((Sc * Da + Dc * (mx - Sa)) / mx)
#define CHROMA_SRC_ATOP COLOR_SRC_ATOP

// SRC_IN       [Sa * Da, Sc * Da]
#define ALPHA_SRC_IN(mx, Sa, Da) ((Sa * Da) / mx)
#define COLOR_SRC_IN(mx, Sc, Sa, Dc, Da) (Sc * Da / mx)
#define CHROMA_SRC_IN COLOR_SRC_IN

// SRC_OUT      [Sa * (1 - Da), Sc * (1 - Da)]
#define ALPHA_SRC_OUT(mx, Sa, Da) (Sa * (mx - Da) / mx)
#define COLOR_SRC_OUT(mx, Sc, Sa, Dc, Da) (Sc * (mx - Da) / mx)
#define CHROMA_SRC_OUT COLOR_SRC_OUT

// SRC_OVER     [Sa + Da * (1 - Sa), Sc + (1 - Sa) * Dc]
#define ALPHA_SRC_OVER(mx, Sa, Da) (Sa + Da * (mx - Sa) / mx)
#define COLOR_SRC_OVER(mx, Sc, Sa, Dc, Da) (Sc + Dc * (mx - Sa) / mx)
#define CHROMA_SRC_OVER COLOR_SRC_OVER

// OR   [Sa + Da - Sa * Da, Sc + Dc - Sc * Dc]
#define ALPHA_OR(mx, Sa, Da) (Sa + Da - (Sa * Da) / mx)
#define COLOR_OR(mx, Sc, Sa, Dc, Da) (Sc + Dc - (Sc * Dc) / mx)
#define CHROMA_OR COLOR_OR

// XOR          [Sa * (1 - Da) + Da * (1 - Sa), Sc * (1 - Da) + Dc * (1 - Sa)]
#define ALPHA_XOR(mx, Sa, Da) ((Sa * (mx - Da) + Da * (mx - Sa)) / mx)
#define COLOR_XOR(mx, Sc, Sa, Dc, Da) ((Sc * (mx - Da) + Dc * (mx - Sa)) / mx)
#define CHROMA_XOR COLOR_XOR
 522
// ZTYP(ty) declares z_<ty> as an alias of ty.  The colour-model tables
// (XBLEND_ONLY, XBLEND_NN) name their pixel and accumulator types through
// these aliases.  __unused__ marks each alias as possibly unreferenced.
#define ZTYP(ty) typedef ty z_##ty __attribute__ ((__unused__))
ZTYP(int8_t);   ZTYP(uint8_t);
ZTYP(int16_t);  ZTYP(uint16_t);
ZTYP(int32_t);  ZTYP(uint32_t);
ZTYP(int64_t);  ZTYP(uint64_t);
ZTYP(float);    ZTYP(double);
 529
/*
 * ALPHA3_BLEND: blend one 3-component pixel (no alpha channel) with FN.
 *   FN          blend mode suffix, pasted onto COLOR_ / CHROMA_
 *   typ         accumulator type used for the arithmetic
 *   inp, out    pointers to the source and destination pixel
 *   mx          full-scale value; also used as the alpha of both pixels
 *   iofs, oofs  offset subtracted from components 1 and 2 of inp / out
 *   rnd         not referenced by the macro body
 * Declares inp0..inp3 and out0..out3 in the current scope and assigns
 * r, g and b, which the caller must already have declared.  When oofs is
 * non-zero components 1 and 2 go through CHROMA_<FN>, otherwise COLOR_<FN>.
 */
#define ALPHA3_BLEND(FN, typ, inp, out, mx, iofs, oofs, rnd) \
  typ inp0 = (typ)inp[0], inp1 = (typ)inp[1] - iofs; \
  typ inp2 = (typ)inp[2] - iofs, inp3 = mx; \
  typ out0 = (typ)out[0], out1 = (typ)out[1] - oofs; \
  typ out2 = (typ)out[2] - oofs, out3 = mx; \
  r = COLOR_##FN(mx, inp0, inp3, out0, out3); \
  if( oofs ) { \
    g = CHROMA_##FN(mx, inp1, inp3, out1, out3); \
    b = CHROMA_##FN(mx, inp2, inp3, out2, out3); \
  } \
  else { \
    g = COLOR_##FN(mx, inp1, inp3, out1, out3); \
    b = COLOR_##FN(mx, inp2, inp3, out2, out3); \
  }

/*
 * ALPHA4_BLEND: as ALPHA3_BLEND, but for 4-component pixels.  The alphas
 * are read from inp[3] and out[3] instead of being fixed at mx, and the
 * blended alpha is assigned to a (also declared by the caller).  The
 * expansion ends without a semicolon; the caller supplies it.
 */
#define ALPHA4_BLEND(FN, typ, inp, out, mx, iofs, oofs, rnd) \
  typ inp0 = (typ)inp[0], inp1 = (typ)inp[1] - iofs; \
  typ inp2 = (typ)inp[2] - iofs, inp3 = inp[3]; \
  typ out0 = (typ)out[0], out1 = (typ)out[1] - oofs; \
  typ out2 = (typ)out[2] - oofs, out3 = out[3]; \
  r = COLOR_##FN(mx, inp0, inp3, out0, out3); \
  if( oofs ) { \
    g = CHROMA_##FN(mx, inp1, inp3, out1, out3); \
    b = CHROMA_##FN(mx, inp2, inp3, out2, out3); \
  } \
  else { \
    g = COLOR_##FN(mx, inp1, inp3, out1, out3); \
    b = COLOR_##FN(mx, inp2, inp3, out2, out3); \
  } \
  a = ALPHA_##FN(mx, inp3, out3)
 560
/*
 * ALPHA_STORE: write r, g, b to out[0..2], adding ofs back onto
 * components 1 and 2.  mx is not referenced.
 */
#define ALPHA_STORE(out, ofs, mx) \
  out[0] = r; \
  out[1] = g + ofs; \
  out[2] = b + ofs

/*
 * ALPHA3_STORE: clip and store a blended 3-component pixel.
 * r is clipped to [0, mx]; g and b are clipped with cclip() when ofs is
 * non-zero and with aclip() otherwise.  If trnsp is non-zero the result
 * is then mixed with the original destination values:
 * (blended * opcty + original * trnsp) / mx.
 * Uses r, g, b, out0..out2, opcty and trnsp from the enclosing scope
 * (out0..out2 are the ones declared by ALPHA3_BLEND / ALPHA4_BLEND).
 */
#define ALPHA3_STORE(out, ofs, mx) \
  r = aclip(r, mx); \
  g = ofs ? cclip(g, mx) : aclip(g, mx); \
  b = ofs ? cclip(b, mx) : aclip(b, mx); \
  if( trnsp ) { \
    r = (r * opcty + out0 * trnsp) / mx; \
    g = (g * opcty + out1 * trnsp) / mx; \
    b = (b * opcty + out2 * trnsp) / mx; \
  } \
  ALPHA_STORE(out, ofs, mx)

/*
 * ALPHA4_STORE: as ALPHA3_STORE, plus the alpha channel.  a is mixed with
 * out3 in the same way and written to out[3] after clipping to [0, mx].
 */
#define ALPHA4_STORE(out, ofs, mx) \
  r = aclip(r, mx); \
  g = ofs ? cclip(g, mx) : aclip(g, mx); \
  b = ofs ? cclip(b, mx) : aclip(b, mx); \
  if( trnsp ) { \
    r = (r * opcty + out0 * trnsp) / mx; \
    g = (g * opcty + out1 * trnsp) / mx; \
    b = (b * opcty + out2 * trnsp) / mx; \
    a = (a * opcty + out3 * trnsp) / mx; \
  } \
  ALPHA_STORE(out, ofs, mx); \
  out[3] = aclip(a, mx)
 589
/*
 * XBLEND: body of one colour-model case of the direct (unscaled) engine.
 *   FN          blend mode suffix
 *   temp_type   accumulator type
 *   type        stored component type
 *   max         full-scale component value
 *   components  components per pixel (3, or 4 with alpha)
 *   ofs         chroma offset (0 for the RGB models)
 *   round       rounding term added when fade is scaled to opcty
 * Expects fade, input, output, pkg, ix, iy, ox and ow in the enclosing
 * scope.  ix and ox are multiplied by components in place, turning pixel
 * columns into component offsets.  Output rows pkg->out_row1 up to (not
 * including) pkg->out_row2 are blended; input row i + iy feeds output
 * row i.  Inside the row loop the name output is redeclared as the
 * current output pixel pointer.  The trailing break leaves the enclosing
 * switch.
 */
#define XBLEND(FN, temp_type, type, max, components, ofs, round) { \
        temp_type opcty = fade * max + round, trnsp = max - opcty; \
        type** output_rows = (type**)output->get_rows(); \
        type** input_rows = (type**)input->get_rows(); \
        ix *= components;  ox *= components; \
 \
        for(int i = pkg->out_row1; i < pkg->out_row2; i++) { \
                type* in_row = input_rows[i + iy] + ix; \
                type* output = output_rows[i] + ox; \
                for(int j = 0; j < ow; j++) { \
                        if( components == 4 ) { \
                                temp_type r, g, b, a; \
                                ALPHA4_BLEND(FN, temp_type, in_row, output, max, ofs, ofs, round); \
                                ALPHA4_STORE(output, ofs, max); \
                        } \
                        else { \
                                temp_type r, g, b; \
                                ALPHA3_BLEND(FN, temp_type, in_row, output, max, ofs, ofs, round); \
                                ALPHA3_STORE(output, ofs, max); \
                        } \
                        in_row += components;  output += components; \
                } \
        } \
        break; \
}
 615
/*
 * XBLEND_ONLY: run blend mode FN through XBLEND for the colour model of
 * input.  Each case supplies the accumulator type, stored type,
 * full-scale value, component count, chroma offset and rounding term for
 * that model; a colour model not listed here does nothing.  The trailing
 * break leaves the switch the macro is used in.
 */
#define XBLEND_ONLY(FN) { \
        switch(input->get_color_model()) { \
        case BC_RGB_FLOAT:      XBLEND(FN, z_float,   z_float,    1.f,    3, 0,      0.f); \
        case BC_RGBA_FLOAT:     XBLEND(FN, z_float,   z_float,    1.f,    4, 0,      0.f); \
        case BC_RGB888:         XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
        case BC_YUV888:         XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
        case BC_RGBA8888:       XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
        case BC_YUVA8888:       XBLEND(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
        case BC_RGB161616:      XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
        case BC_YUV161616:      XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
        case BC_RGBA16161616:   XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
        case BC_YUVA16161616:   XBLEND(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
        } \
        break; \
}
 631
 632 /* Direct translate / blend **********************************************/
 633
// Nothing is set here; DirectEngine::init_packages() assigns out_row1 and
// out_row2 when it hands out row ranges.
DirectPackage::DirectPackage()
{
}
 637
// Worker for the direct engine; keeps the owning engine so that
// process_package() can read its frames, coordinates, alpha and mode.
DirectUnit::DirectUnit(DirectEngine *server)
 : LoadClient(server)
{
        this->engine = server;
}
 643
// Nothing to release; engine is not deleted here.
DirectUnit::~DirectUnit()
{
}
 647
/*
 * Blend this package's output rows (pkg->out_row1 up to, not including,
 * pkg->out_row2) from engine->input into engine->output without scaling.
 * The blend mode selects the formula and the input colour model selects
 * the pixel layout; both dispatches are generated by XBLEND_ONLY.  A mode
 * or colour model that is not listed leaves the output untouched.
 */
void DirectUnit::process_package(LoadPackage *package)
{
        DirectPackage *pkg = (DirectPackage*)package;

        VFrame *output = engine->output;
        VFrame *input = engine->input;
        int mode = engine->mode;
        // TRANSFER_REPLACE from a colour model with alpha uses full
        // opacity; every other case uses the engine's alpha
        float fade =
                BC_CModels::has_alpha(input->get_color_model()) &&
                mode == TRANSFER_REPLACE ? 1.f : engine->alpha;

        // ix/ox: first input/output column, ow: columns to blend,
        // iy: added to an output row number to get the input row number
        int ix = engine->in_x1;
        int ox = engine->out_x1;
        int ow = engine->out_x2 - ox;
        int iy = engine->in_y1 - engine->out_y1;

        // every XBLEND_ONLY expansion ends in its own break
        switch( mode ) {
        case TRANSFER_NORMAL:           XBLEND_ONLY(NORMAL);
        case TRANSFER_ADDITION:         XBLEND_ONLY(ADDITION);
        case TRANSFER_SUBTRACT:         XBLEND_ONLY(SUBTRACT);
        case TRANSFER_MULTIPLY:         XBLEND_ONLY(MULTIPLY);
        case TRANSFER_DIVIDE:           XBLEND_ONLY(DIVIDE);
        case TRANSFER_REPLACE:          XBLEND_ONLY(REPLACE);
        case TRANSFER_MAX:              XBLEND_ONLY(MAX);
        case TRANSFER_MIN:              XBLEND_ONLY(MIN);
        case TRANSFER_AVERAGE:          XBLEND_ONLY(AVERAGE);
        case TRANSFER_DARKEN:           XBLEND_ONLY(DARKEN);
        case TRANSFER_LIGHTEN:          XBLEND_ONLY(LIGHTEN);
        case TRANSFER_DST:              XBLEND_ONLY(DST);
        case TRANSFER_DST_ATOP:         XBLEND_ONLY(DST_ATOP);
        case TRANSFER_DST_IN:           XBLEND_ONLY(DST_IN);
        case TRANSFER_DST_OUT:          XBLEND_ONLY(DST_OUT);
        case TRANSFER_DST_OVER:         XBLEND_ONLY(DST_OVER);
        case TRANSFER_SRC:              XBLEND_ONLY(SRC);
        case TRANSFER_SRC_ATOP:         XBLEND_ONLY(SRC_ATOP);
        case TRANSFER_SRC_IN:           XBLEND_ONLY(SRC_IN);
        case TRANSFER_SRC_OUT:          XBLEND_ONLY(SRC_OUT);
        case TRANSFER_SRC_OVER:         XBLEND_ONLY(SRC_OVER);
        case TRANSFER_OR:               XBLEND_ONLY(OR);
        case TRANSFER_XOR:              XBLEND_ONLY(XOR);
        }
}
 690
// Passes cpus for both LoadServer arguments.
// NOTE(review): presumably these are the client and package counts -
// confirm against LoadServer.
DirectEngine::DirectEngine(int cpus)
 : LoadServer(cpus, cpus)
{
}
 695
// Nothing to release here; input and output are not deleted.
DirectEngine::~DirectEngine()
{
}
 699
 700 void DirectEngine::init_packages()
 701 {
 702         if(in_x1 < 0) { out_x1 -= in_x1; in_x1 = 0; }
 703         if(in_y1 < 0) { out_y1 -= in_y1; in_y1 = 0; }
 704         if(out_x1 < 0) { in_x1 -= out_x1; out_x1 = 0; }
 705         if(out_y1 < 0) { in_y1 -= out_y1; out_y1 = 0; }
 706         if(out_x2 > output->get_w()) out_x2 = output->get_w();
 707         if(out_y2 > output->get_h()) out_y2 = output->get_h();
 708         int out_w = out_x2 - out_x1;
 709         int out_h = out_y2 - out_y1;
 710         if( !out_w || !out_h ) return;
 711
 712         int rows = out_h;
 713         int pkgs = get_total_packages();
 714         int row1 = out_y1, row2 = row1;
 715         for(int i = 0; i < pkgs; row1=row2 ) {
 716                 DirectPackage *package = (DirectPackage*)get_package(i);
 717                 row2 = ++i * rows / pkgs + out_y1;
 718                 package->out_row1 = row1;
 719                 package->out_row2 = row2;
 720         }
 721 }
 722
 723 LoadClient* DirectEngine::new_client()
 724 {
 725         return new DirectUnit(this);
 726 }
 727
 728 LoadPackage* DirectEngine::new_package()
 729 {
 730         return new DirectPackage;
 731 }
 732
 733 /* Nearest Neighbor scale / translate / blend ********************/
 734
/*
 * XBLEND_3NN: body of one colour-model case of the nearest neighbor engine.
 * Parameters are the same as for XBLEND.  Expects fade, input, output,
 * pkg, engine, ox, ow and ly in the enclosing scope.
 *   ly                   advances once per output row; *ly is the input
 *                        row index to read
 *   engine->in_lookup_x  re-read at the start of every row; each entry is
 *                        added to the input pointer before the matching
 *                        output pixel is blended, so entries are relative
 *                        steps in units of stored components
 * ox is multiplied by components in place.  Inside the row loop the name
 * output is redeclared as the current output pixel pointer.  The trailing
 * break leaves the enclosing switch.
 */
#define XBLEND_3NN(FN, temp_type, type, max, components, ofs, round) { \
        temp_type opcty = fade * max + round, trnsp = max - opcty; \
        type** output_rows = (type**)output->get_rows(); \
        type** input_rows = (type**)input->get_rows(); \
        ox *= components; \
 \
        for(int i = pkg->out_row1; i < pkg->out_row2; i++) { \
                int *lx = engine->in_lookup_x; \
                type* in_row = input_rows[*ly++]; \
                type* output = output_rows[i] + ox; \
                for(int j = 0; j < ow; j++) { \
                        in_row += *lx++; \
                        if( components == 4 ) { \
                                temp_type r, g, b, a; \
                                ALPHA4_BLEND(FN, temp_type, in_row, output, max, ofs, ofs, round); \
                                ALPHA4_STORE(output, ofs, max); \
                        } \
                        else { \
                                temp_type r, g, b; \
                                ALPHA3_BLEND(FN, temp_type, in_row, output, max, ofs, ofs, round); \
                                ALPHA3_STORE(output, ofs, max); \
                        } \
                        output += components; \
                } \
        } \
        break; \
}
 762
 /*
  * XBLEND_NN: pick the XBLEND_3NN instantiation that matches the input
  * frame's color model for blend function FN.  The columns are
  * (FN, temp_type, type, max, components, ofs, round).
  *
  * Each XBLEND_3NN expansion ends in its own `break`, so the cases do not
  * fall through; the trailing `break` then leaves the switch this macro
  * is expanded in (the mode switch in NNUnit::process_package).  A color
  * model that is not listed is silently ignored.
  */
 763 #define XBLEND_NN(FN) { \
 764         switch(input->get_color_model()) { \
 765         case BC_RGB_FLOAT:      XBLEND_3NN(FN, z_float,   z_float,    1.f,    3, 0,       0.f); \
 766         case BC_RGBA_FLOAT:     XBLEND_3NN(FN, z_float,   z_float,    1.f,    4, 0,       0.f); \
 767         case BC_RGB888:         XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
 768         case BC_YUV888:         XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
 769         case BC_RGBA8888:       XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
 770         case BC_YUVA8888:       XBLEND_3NN(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
 771         case BC_RGB161616:      XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
 772         case BC_YUV161616:      XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
 773         case BC_RGBA16161616:   XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
 774         case BC_YUVA16161616:   XBLEND_3NN(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
 775         } \
 776         break; \
 777 }
 778
 779 NNPackage::NNPackage()
 780 {
 781 }
 782
 783 NNUnit::NNUnit(NNEngine *server)
 784  : LoadClient(server)
 785 {
 786         this->engine = server;
 787 }
 788
 789 NNUnit::~NNUnit()
 790 {
 791 }
 792
 /*
  * Blend the output rows [out_row1, out_row2) of one package using the
  * nearest-neighbor lookup tables built by NNEngine::init_packages().
  *
  * The locals below are referenced by name from inside the XBLEND_NN /
  * XBLEND_3NN expansions, so their names and types are part of the
  * macros' contract.
  */
 793 void NNUnit::process_package(LoadPackage *package)
 794 {
 795         NNPackage *pkg = (NNPackage*)package;
 796         VFrame *output = engine->output;
 797         VFrame *input = engine->input;
 798         int mode = engine->mode;
             // fade is forced to 1 only when the input has an alpha channel AND
             // the mode is TRANSFER_REPLACE (&& binds tighter than ?:);
             // otherwise the engine's alpha is used.
 799         float fade =
 800                 BC_CModels::has_alpha(input->get_color_model()) &&
 801                 mode == TRANSFER_REPLACE ? 1.f : engine->alpha;
 802
             // ox: first output column (the macro rescales it to components),
             // ow: number of output columns,
             // ly: source-row lookup, positioned at this package's first row.
 803         int ox = engine->out_x1i;
 804         int ow = engine->out_x2i - ox;
 805         int *ly = engine->in_lookup_y + pkg->out_row1;
 806
             // Every XBLEND_NN expansion ends in `break`; there is no fall-through.
 807         switch( mode ) {
 808         case TRANSFER_NORMAL:           XBLEND_NN(NORMAL);
 809         case TRANSFER_ADDITION:         XBLEND_NN(ADDITION);
 810         case TRANSFER_SUBTRACT:         XBLEND_NN(SUBTRACT);
 811         case TRANSFER_MULTIPLY:         XBLEND_NN(MULTIPLY);
 812         case TRANSFER_DIVIDE:           XBLEND_NN(DIVIDE);
 813         case TRANSFER_REPLACE:          XBLEND_NN(REPLACE);
 814         case TRANSFER_MAX:              XBLEND_NN(MAX);
 815         case TRANSFER_MIN:              XBLEND_NN(MIN);
 816         case TRANSFER_AVERAGE:          XBLEND_NN(AVERAGE);
 817         case TRANSFER_DARKEN:           XBLEND_NN(DARKEN);
 818         case TRANSFER_LIGHTEN:          XBLEND_NN(LIGHTEN);
 819         case TRANSFER_DST:              XBLEND_NN(DST);
 820         case TRANSFER_DST_ATOP:         XBLEND_NN(DST_ATOP);
 821         case TRANSFER_DST_IN:           XBLEND_NN(DST_IN);
 822         case TRANSFER_DST_OUT:          XBLEND_NN(DST_OUT);
 823         case TRANSFER_DST_OVER:         XBLEND_NN(DST_OVER);
 824         case TRANSFER_SRC:              XBLEND_NN(SRC);
 825         case TRANSFER_SRC_ATOP:         XBLEND_NN(SRC_ATOP);
 826         case TRANSFER_SRC_IN:           XBLEND_NN(SRC_IN);
 827         case TRANSFER_SRC_OUT:          XBLEND_NN(SRC_OUT);
 828         case TRANSFER_SRC_OVER:         XBLEND_NN(SRC_OVER);
 829         case TRANSFER_OR:               XBLEND_NN(OR);
 830         case TRANSFER_XOR:              XBLEND_NN(XOR);
 831         }
 832 }
 833
 834 NNEngine::NNEngine(int cpus)
 835  : LoadServer(cpus, cpus)
 836 {
 837         in_lookup_x = 0;
 838         in_lookup_y = 0;
 839 }
 840
 841 NNEngine::~NNEngine()
 842 {
 843         if(in_lookup_x)
 844                 delete[] in_lookup_x;
 845         if(in_lookup_y)
 846                 delete[] in_lookup_y;
 847 }
 848
 849 void NNEngine::init_packages()
 850 {
 851         int in_w = input->get_w();
 852         int in_h = input->get_h();
 853         int out_w = output->get_w();
 854         int out_h = output->get_h();
 855
 856         float in_subw = in_x2 - in_x1;
 857         float in_subh = in_y2 - in_y1;
 858         float out_subw = out_x2 - out_x1;
 859         float out_subh = out_y2 - out_y1;
 860         int first, last, count, i;
 861         int components = 3;
 862
 863         out_x1i = rint(out_x1);
 864         out_x2i = rint(out_x2);
 865         if(out_x1i < 0) out_x1i = 0;
 866         if(out_x1i > out_w) out_x1i = out_w;
 867         if(out_x2i < 0) out_x2i = 0;
 868         if(out_x2i > out_w) out_x2i = out_w;
 869         int out_wi = out_x2i - out_x1i;
 870         if( !out_wi ) return;
 871
 872         delete[] in_lookup_x;
 873         in_lookup_x = new int[out_wi];
 874         delete[] in_lookup_y;
 875         in_lookup_y = new int[out_h];
 876
 877         switch(input->get_color_model()) {
 878         case BC_RGBA_FLOAT:
 879         case BC_RGBA8888:
 880         case BC_YUVA8888:
 881         case BC_RGBA16161616:
 882                 components = 4;
 883                 break;
 884         }
 885
 886         first = count = 0;
 887
 888         for(i = out_x1i; i < out_x2i; i++) {
 889                 int in = (i - out_x1 + .5) * in_subw / out_subw + in_x1;
 890                 if(in < in_x1)
 891                         in = in_x1;
 892                 if(in > in_x2)
 893                         in = in_x2;
 894
 895                 if(in >= 0 && in < in_w && in >= in_x1 && i >= 0 && i < out_w) {
 896                         if(count == 0) {
 897                                 first = i;
 898                                 in_lookup_x[0] = in * components;
 899                         }
 900                         else {
 901                                 in_lookup_x[count] = (in-last)*components;
 902                         }
 903                         last = in;
 904                         count++;
 905                 }
 906                 else if(count)
 907                         break;
 908         }
 909         out_x1i = first;
 910         out_x2i = first + count;
 911         first = count = 0;
 912
 913         for(i = out_y1; i < out_y2; i++) {
 914                 int in = (i - out_y1+.5) * in_subh / out_subh + in_y1;
 915                 if(in < in_y1) in = in_y1;
 916                 if(in > in_y2) in = in_y2;
 917                 if(in >= 0 && in < in_h && i >= 0 && i < out_h) {
 918                         if(count == 0) first = i;
 919                         in_lookup_y[i] = in;
 920                         count++;
 921                 }
 922                 else if(count)
 923                         break;
 924         }
 925         out_y1 = first;
 926         out_y2 = first + count;
 927
 928         int rows = count;
 929         int pkgs = get_total_packages();
 930         int row1 = out_y1, row2 = row1;
 931         for(int i = 0; i < pkgs; row1=row2 ) {
 932                 NNPackage *package = (NNPackage*)get_package(i);
 933                 row2 = ++i * rows / pkgs + out_y1;
 934                 package->out_row1 = row1;
 935                 package->out_row2 = row2;
 936         }
 937 }
 938
 939 LoadClient* NNEngine::new_client()
 940 {
 941         return new NNUnit(this);
 942 }
 943
 944 LoadPackage* NNEngine::new_package()
 945 {
 946         return new NNPackage;
 947 }
 948
 949 /* Fully resampled scale / translate / blend ******************************/
 950 /* resample into a temporary row vector, then blend */
 951
 /*
  * XSAMPLE: resample-then-blend for one color model.  Expands to a complete
  * case body (it ends in `break`).  Names taken from the expanding scope:
  * pkg, engine, vinput, voutput, fade, k, kd, oh, i1i, i2i, i1f, i2f, o1i,
  * o1f, o2f, lookup_sk, lookup_sx0 and lookup_sx1.
  *
  * For every i in [pkg->out_col1, pkg->out_col2):
  *  1. The source line is input_rows[i - engine->col_out1 + engine->row_in].
  *  2. temp[] receives oh pixels of `components` floats each.
  *     - If k is null, pixels are copied starting at source pixel i1i
  *       (no edge weighting is applied on the input side).
  *     - Otherwise sample j is a kernel-weighted sum over source pixels
  *       lookup_sx0[j] .. lookup_sx1[j]-1; the fixed-point kernel index
  *       starts at lookup_sk[j] and advances by kd per pixel, and
  *       k[] is indexed by |ki >> INDEX_FRACTION|.
  *     In both paths ofs is subtracted from the 2nd and 3rd component.
  *     With 4 components the color weights are additionally multiplied by
  *     the pixel's alpha and normalized by that alpha-weighted sum (wacc),
  *     while alpha itself is normalized by the plain kernel sum (awacc).
  *  3. The first and last temp pixel are scaled by o1f and o2f.
  *  4. temp pixel j is blended into output_rows[j] (which already starts
  *     at row o1i) at component offset i * components.  ALPHA*_BLEND gets
  *     0 in the position where XBLEND_3NN passes ofs, presumably because
  *     ofs has already been removed from temp -- confirm against the
  *     ALPHA*_BLEND definition outside this section.
  *
  * temp is a runtime-sized array on the stack, which is a compiler
  * extension in C++; its size grows with oh.
  *
  * NOTE(review): the input edge tests `x == i1i` and `x + 1 == i2i` run
  * after the loop condition has already incremented x, so they compare
  * (pixel index + 1).  SampleEngine::init_packages applies the same
  * weights with `j == i1i` / `j + 1 == i2i` on the pixel index itself;
  * confirm whether the one-pixel difference is intended.
  */
 952 #define XSAMPLE(FN, temp_type, type, max, components, ofs, round) { \
 953         float temp[oh*components]; \
 954         temp_type opcty = fade * max + round, trnsp = max - opcty; \
 955         type **output_rows = (type**)voutput->get_rows() + o1i; \
 956         type **input_rows = (type**)vinput->get_rows(); \
 957  \
 958         for(int i = pkg->out_col1; i < pkg->out_col2; i++) { \
 959                 type *input = input_rows[i - engine->col_out1 + engine->row_in]; \
 960                 float *tempp = temp; \
 961                 if( !k ) { /* direct copy case */ \
 962                         type *ip = input + i1i * components; \
 963                         for(int j = 0; j < oh; j++) { \
 964                                 *tempp++ = *ip++; \
 965                                 *tempp++ = *ip++ - ofs; \
 966                                 *tempp++ = *ip++ - ofs; \
 967                                 if( components == 4 ) *tempp++ = *ip++; \
 968                         } \
 969                 } \
 970                 else { /* resample */ \
 971                         for(int j = 0; j < oh; j++) { \
 972                                 float racc=0.f, gacc=0.f, bacc=0.f, aacc=0.f; \
 973                                 int ki = lookup_sk[j], x = lookup_sx0[j]; \
 974                                 type *ip = input + x * components; \
 975                                 float wacc = 0, awacc = 0; \
 976                                 while(x++ < lookup_sx1[j]) { \
 977                                         float kv = k[abs(ki >> INDEX_FRACTION)]; \
 978                                         /* handle fractional pixels on edges of input */ \
 979                                         if(x == i1i) kv *= i1f; \
 980                                         if(x + 1 == i2i) kv *= i2f; \
 981                                         if( components == 4 ) { awacc += kv;  kv *= ip[3]; } \
 982                                         wacc += kv; \
 983                                         racc += kv * *ip++; \
 984                                         gacc += kv * (*ip++ - ofs); \
 985                                         bacc += kv * (*ip++ - ofs); \
 986                                         if( components == 4 ) { aacc += kv;  ++ip; } \
 987                                         ki += kd; \
 988                                 } \
 989                                 if(wacc > 0.) wacc = 1. / wacc; \
 990                                 *tempp++ = racc * wacc; \
 991                                 *tempp++ = gacc * wacc; \
 992                                 *tempp++ = bacc * wacc; \
 993                                 if( components == 4 ) { \
 994                                         if(awacc > 0.) awacc = 1. / awacc; \
 995                                         *tempp++ = aacc * awacc; \
 996                                 } \
 997                         } \
 998                 } \
 999  \
1000                 /* handle fractional pixels on edges of output */ \
1001                 temp[0] *= o1f;   temp[1] *= o1f;   temp[2] *= o1f; \
1002                 if( components == 4 ) temp[3] *= o1f; \
1003                 tempp = temp + (oh-1)*components; \
1004                 tempp[0] *= o2f;  tempp[1] *= o2f;  tempp[2] *= o2f; \
1005                 if( components == 4 ) tempp[3] *= o2f; \
1006                 tempp = temp; \
1007                 /* blend output */ \
1008                 for(int j = 0; j < oh; j++) { \
1009                         type *output = output_rows[j] + i * components; \
1010                         if( components == 4 ) { \
1011                                 temp_type r, g, b, a; \
1012                                 ALPHA4_BLEND(FN, temp_type, tempp, output, max, 0, ofs, round); \
1013                                 ALPHA4_STORE(output, ofs, max); \
1014                         } \
1015                         else { \
1016                                 temp_type r, g, b; \
1017                                 ALPHA3_BLEND(FN, temp_type, tempp, output, max, 0, ofs, round); \
1018                                 ALPHA3_STORE(output, ofs, max); \
1019                         } \
1020                         tempp += components; \
1021                 } \
1022         } \
1023         break; \
1024 }
1025
/*
 * XBLEND_SAMPLE: pick the XSAMPLE instantiation that matches the input
 * frame's color model for blend function FN.  The columns are
 * (FN, temp_type, type, max, components, ofs, round).
 *
 * Each XSAMPLE expansion ends in its own `break`, so the cases do not fall
 * through; the trailing `break` then leaves the switch this macro is
 * expanded in (the mode switch in SampleUnit::process_package).  A color
 * model that is not listed is silently ignored.
 */
1026 #define XBLEND_SAMPLE(FN) { \
1027         switch(vinput->get_color_model()) { \
1028         case BC_RGB_FLOAT:      XSAMPLE(FN, z_float,   z_float,    1.f,    3, 0.f,    0.f); \
1029         case BC_RGBA_FLOAT:     XSAMPLE(FN, z_float,   z_float,    1.f,    4, 0.f,    0.f); \
1030         case BC_RGB888:         XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   3, 0,      .5f); \
1031         case BC_YUV888:         XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   3, 0x80,   .5f); \
1032         case BC_RGBA8888:       XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   4, 0,      .5f); \
1033         case BC_YUVA8888:       XSAMPLE(FN, z_int32_t, z_uint8_t,  0xff,   4, 0x80,   .5f); \
1034         case BC_RGB161616:      XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0,      .5f); \
1035         case BC_YUV161616:      XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 3, 0x8000, .5f); \
1036         case BC_RGBA16161616:   XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0,      .5f); \
1037         case BC_YUVA16161616:   XSAMPLE(FN, z_int64_t, z_uint16_t, 0xffff, 4, 0x8000, .5f); \
1038         } \
1039         break; \
1040 }
1041
1042
1043 SamplePackage::SamplePackage()
1044 {
1045 }
1046
1047 SampleUnit::SampleUnit(SampleEngine *server)
1048  : LoadClient(server)
1049 {
1050         this->engine = server;
1051 }
1052
1053 SampleUnit::~SampleUnit()
1054 {
1055 }
1056
/*
 * Resample and blend the output columns [out_col1, out_col2) of one
 * package, using the tables prepared by SampleEngine::init_packages().
 *
 * Most of the locals below are referenced by name from inside the
 * XBLEND_SAMPLE / XSAMPLE expansions, so their names and types are part of
 * the macros' contract.
 */
1057 void SampleUnit::process_package(LoadPackage *package)
1058 {
1059         SamplePackage *pkg = (SamplePackage*)package;
1060
1061         float i1  = engine->in1;
1062         float i2  = engine->in2;
1063         float o1  = engine->out1;
1064         float o2  = engine->out2;
1065
             // Same empty-span test as SampleEngine::init_packages(): in that
             // case no lookup tables were rebuilt, so there is nothing to do.
1066         if(i2 - i1 <= 0 || o2 - o1 <= 0)
1067                 return;
1068
1069         VFrame *voutput = engine->output;
1070         VFrame *vinput = engine->input;
1071         int mode = engine->mode;
             // fade is forced to 1 only when the input has an alpha channel AND
             // the mode is TRANSFER_REPLACE (&& binds tighter than ?:);
             // otherwise the engine's alpha is used.
1072         float fade =
1073                 BC_CModels::has_alpha(vinput->get_color_model()) &&
1074                 mode == TRANSFER_REPLACE ? 1.f : engine->alpha;
1075
1076         //int   iw  = vinput->get_w();
             // Whole-pixel bounds of the input span; i1f and i2f are the parts
             // of the first and last input pixel that lie inside [i1, i2].
1077         int   i1i = floor(i1);
1078         int   i2i = ceil(i2);
1079         float i1f = 1.f - i1 + i1i;
1080         float i2f = 1.f - i2i + i2;
1081
             // Same for the output span; oh is the number of output samples
             // produced per column.
1082         int   o1i = floor(o1);
1083         int   o2i = ceil(o2);
1084         float o1f = 1.f - o1 + o1i;
1085         float o2f = 1.f - o2i + o2;
1086         int   oh  = o2i - o1i;
1087
             // k may be null; XSAMPLE then copies pixels instead of convolving.
1088         float *k  = engine->kernel->lookup;
1089         //float kw  = engine->kernel->width;
1090         //int   kn  = engine->kernel->n;
1091         int   kd = engine->kd;
1092
1093         int *lookup_sx0 = engine->lookup_sx0;
1094         int *lookup_sx1 = engine->lookup_sx1;
1095         int *lookup_sk = engine->lookup_sk;
1096         //float *lookup_wacc = engine->lookup_wacc;
1097
             // Every XBLEND_SAMPLE expansion ends in `break`; there is no fall-through.
1098         switch( mode ) {
1099         case TRANSFER_NORMAL:           XBLEND_SAMPLE(NORMAL);
1100         case TRANSFER_ADDITION:         XBLEND_SAMPLE(ADDITION);
1101         case TRANSFER_SUBTRACT:         XBLEND_SAMPLE(SUBTRACT);
1102         case TRANSFER_MULTIPLY:         XBLEND_SAMPLE(MULTIPLY);
1103         case TRANSFER_DIVIDE:           XBLEND_SAMPLE(DIVIDE);
1104         case TRANSFER_REPLACE:          XBLEND_SAMPLE(REPLACE);
1105         case TRANSFER_MAX:              XBLEND_SAMPLE(MAX);
1106         case TRANSFER_MIN:              XBLEND_SAMPLE(MIN);
1107         case TRANSFER_AVERAGE:          XBLEND_SAMPLE(AVERAGE);
1108         case TRANSFER_DARKEN:           XBLEND_SAMPLE(DARKEN);
1109         case TRANSFER_LIGHTEN:          XBLEND_SAMPLE(LIGHTEN);
1110         case TRANSFER_DST:              XBLEND_SAMPLE(DST);
1111         case TRANSFER_DST_ATOP:         XBLEND_SAMPLE(DST_ATOP);
1112         case TRANSFER_DST_IN:           XBLEND_SAMPLE(DST_IN);
1113         case TRANSFER_DST_OUT:          XBLEND_SAMPLE(DST_OUT);
1114         case TRANSFER_DST_OVER:         XBLEND_SAMPLE(DST_OVER);
1115         case TRANSFER_SRC:              XBLEND_SAMPLE(SRC);
1116         case TRANSFER_SRC_ATOP:         XBLEND_SAMPLE(SRC_ATOP);
1117         case TRANSFER_SRC_IN:           XBLEND_SAMPLE(SRC_IN);
1118         case TRANSFER_SRC_OUT:          XBLEND_SAMPLE(SRC_OUT);
1119         case TRANSFER_SRC_OVER:         XBLEND_SAMPLE(SRC_OVER);
1120         case TRANSFER_OR:               XBLEND_SAMPLE(OR);
1121         case TRANSFER_XOR:              XBLEND_SAMPLE(XOR);
1122         }
1123 }
1124
1125
1126 SampleEngine::SampleEngine(int cpus)
1127  : LoadServer(cpus, cpus)
1128 {
1129         lookup_sx0 = 0;
1130         lookup_sx1 = 0;
1131         lookup_sk = 0;
1132         lookup_wacc = 0;
1133         kd = 0;
1134 }
1135
1136 SampleEngine::~SampleEngine()
1137 {
1138         if(lookup_sx0) delete [] lookup_sx0;
1139         if(lookup_sx1) delete [] lookup_sx1;
1140         if(lookup_sk) delete [] lookup_sk;
1141         if(lookup_wacc) delete [] lookup_wacc;
1142 }
1143
1144 /*
1145  * unlike the Direct and NN engines, the Sample engine works across
1146  * output columns (it makes for more economical memory addressing
1147  * during convolution)
1148  */
1149 void SampleEngine::init_packages()
1150 {
1151         int   iw  = input->get_w();
1152         int   i1i = floor(in1);
1153         int   i2i = ceil(in2);
1154         float i1f = 1.f - in1 + i1i;
1155         float i2f = 1.f - i2i + in2;
1156
1157         int   oy  = floor(out1);
1158         float oyf = out1 - oy;
1159         int   oh  = ceil(out2) - oy;
1160
1161         float *k  = kernel->lookup;
1162         float kw  = kernel->width;
1163         int   kn  = kernel->n;
1164
1165         if(in2 - in1 <= 0 || out2 - out1 <= 0)
1166                 return;
1167
1168         /* determine kernel spatial coverage */
1169         float scale = (out2 - out1) / (in2 - in1);
1170         float iscale = (in2 - in1) / (out2 - out1);
1171         float coverage = fabs(1.f / scale);
1172         float bound = (coverage < 1.f ? kw : kw * coverage) - (.5f / TRANSFORM_SPP);
1173         float coeff = (coverage < 1.f ? 1.f : scale) * TRANSFORM_SPP;
1174
1175         delete [] lookup_sx0;
1176         delete [] lookup_sx1;
1177         delete [] lookup_sk;
1178         delete [] lookup_wacc;
1179
1180         lookup_sx0 = new int[oh];
1181         lookup_sx1 = new int[oh];
1182         lookup_sk = new int[oh];
1183         lookup_wacc = new float[oh];
1184
1185         kd = (double)coeff * (1 << INDEX_FRACTION) + .5;
1186
1187         /* precompute kernel values and weight sums */
1188         for(int i = 0; i < oh; i++) {
1189                 /* map destination back to source */
1190                 double sx = (i - oyf + .5) * iscale + in1 - .5;
1191
1192                 /*
1193                  * clip iteration to source area but not source plane. Points
1194                  * outside the source plane count as transparent. Points outside
1195                  * the source area don't count at all.  The actual convolution
1196                  * later will be clipped to both, but we need to compute
1197                  * weights.
1198                  */
1199                 int sx0 = MAX((int)floor(sx - bound) + 1, i1i);
1200                 int sx1 = MIN((int)ceil(sx + bound), i2i);
1201                 int ki = (double)(sx0 - sx) * coeff * (1 << INDEX_FRACTION)
1202                                 + (1 << (INDEX_FRACTION - 1)) + .5;
1203                 float wacc=0.;
1204
1205                 lookup_sx0[i] = -1;
1206                 lookup_sx1[i] = -1;
1207
1208                 for(int j= sx0; j < sx1; j++) {
1209                         int kv = (ki >> INDEX_FRACTION);
1210                         if(kv > kn) break;
1211                         if(kv >= -kn) {
1212                                 /*
1213                                  * the contribution of the first and last input pixel (if
1214                                  * fractional) are linearly weighted by the fraction
1215                                  */
1216                                 if(j == i1i)
1217                                         wacc += k[abs(kv)] * i1f;
1218                                 else if(j + 1 == i2i)
1219                                         wacc += k[abs(kv)] * i2f;
1220                                 else
1221                                         wacc += k[abs(kv)];
1222
1223                                 /* this is where we clip the kernel convolution to the source plane */
1224                                 if(j >= 0 && j < iw) {
1225                                         if(lookup_sx0[i] == -1) {
1226                                                 lookup_sx0[i] = j;
1227                                                 lookup_sk[i] = ki;
1228                                         }
1229                                         lookup_sx1[i] = j + 1;
1230                                 }
1231                         }
1232                         ki += kd;
1233                 }
1234                 lookup_wacc[i] = wacc > 0. ? 1. / wacc : 0.;
1235         }
1236
1237         int cols = col_out2 - col_out1;
1238         int pkgs = get_total_packages();
1239         int col1 = col_out1, col2 = col1;
1240         for(int i = 0; i < pkgs; col1=col2 ) {
1241                 SamplePackage *package = (SamplePackage*)get_package(i);
1242                 col2 = ++i * cols / pkgs + col_out1;
1243                 package->out_col1 = col1;
1244                 package->out_col2 = col2;
1245         }
1246 }
1247
1248 LoadClient* SampleEngine::new_client()
1249 {
1250         return new SampleUnit(this);
1251 }
1252
1253 LoadPackage* SampleEngine::new_package()
1254 {
1255         return new SamplePackage;
1256 }