cinelerra-5.1/mpeg2enc/motion.c

   1 /* motion.c, motion estimation                                              */
   2
   3 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
   4
   5 /*
   6  * Disclaimer of Warranty
   7  *
   8  * These software programs are available to the user without any license fee or
   9  * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
  10  * any and all warranties, whether express, implied, or statuary, including any
  11  * implied warranties or merchantability or of fitness for a particular
  12  * purpose.  In no event shall the copyright-holder be liable for any
  13  * incidental, punitive, or consequential damages of any kind whatsoever
  14  * arising from the use of these programs.
  15  *
  16  * This disclaimer of warranty extends to the user of these programs and user's
  17  * customers, employees, agents, transferees, successors, and assigns.
  18  *
  19  * The MPEG Software Simulation Group does not represent or warrant that the
  20  * programs furnished hereunder are free of infringement of any third-party
  21  * patents.
  22  *
  23  * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
  24  * are subject to royalty fees to patent holders.  Many of these patents are
  25  * general enough such that they are unavoidable regardless of implementation
  26  * design.
  27  *
  28  */
  29
  30 #include <limits.h>
  31 #include <stdio.h>
  32 #include <string.h>
  33 #include "config.h"
  34 #include "global.h"
  35 #include "cpu_accel.h"
  36 #include "simd.h"
  37
  38 /* Macro-block Motion compensation results record */
  39
  40 typedef struct _blockcrd {
  41         int16_t x;
  42         int16_t y;
  43 } blockxy;
  44
  45 struct mb_motion
  46 {
  47         blockxy pos;        // Half-pel co-ordinates of source block
  48         int sad;                        // Sum of absolute difference
  49         int var;
  50         uint8_t *blk ;          // Source block data (in luminace data array)
  51         int hx, hy;                     // Half-pel offsets
  52         int fieldsel;           // 0 = top 1 = bottom
  53         int fieldoff;       // Offset from start of frame data to first line
  54                                                 // of field.    (top = 0, bottom = width );
  55 };
  56
  57 typedef struct mb_motion mb_motion_s;
  58
  59
  60 struct subsampled_mb
  61 {
  62         uint8_t *mb;            // One pel
  63         uint8_t *fmb;           // Two-pel subsampled
  64         uint8_t *qmb;           // Four-pel subsampled
  65         uint8_t *umb;           // U compoenent one-pel
  66         uint8_t *vmb;       // V component  one-pel
  67 };
  68
  69 typedef struct subsampled_mb subsampled_mb_s;
  70
  71
  72 static void frame_ME (motion_engine_t *engine,
  73                                                                         pict_data_s *picture,
  74                                                                   motion_comp_s *mc,
  75                                                                   int mboffset,
  76                                                                   int i, int j, struct mbinfo *mbi);
  77
  78 static void field_ME (motion_engine_t *engine,
  79                                                                         pict_data_s *picture,
  80                                                                   motion_comp_s *mc,
  81                                                                   int mboffset,
  82                                                                   int i, int j,
  83                                                                   struct mbinfo *mbi,
  84                                                                   int secondfield,
  85                                                                   int ipflag);
  86
  87 static void frame_estimate (motion_engine_t *engine,
  88         uint8_t *org,
  89          uint8_t *ref,
  90          subsampled_mb_s *ssmb,
  91          int i, int j,
  92          int sx, int sy,
  93           mb_motion_s *bestfr,
  94           mb_motion_s *besttop,
  95           mb_motion_s *bestbot,
  96          int imins[2][2], int jmins[2][2]);
  97
  98 static void field_estimate (motion_engine_t *engine,
  99                                                         pict_data_s *picture,
 100                                                         uint8_t *toporg,
 101                                                         uint8_t *topref,
 102                                                         uint8_t *botorg,
 103                                                         uint8_t *botref,
 104                                                         subsampled_mb_s *ssmb,
 105                                                         int i, int j, int sx, int sy, int ipflag,
 106                                                         mb_motion_s *bestfr,
 107                                                         mb_motion_s *best8u,
 108                                                         mb_motion_s *best8l,
 109                                                         mb_motion_s *bestsp);
 110
 111 static void dpframe_estimate (motion_engine_t *engine,
 112         pict_data_s *picture,
 113         uint8_t *ref,
 114         subsampled_mb_s *ssmb,
 115         int i, int j, int iminf[2][2], int jminf[2][2],
 116         mb_motion_s *dpbest,
 117         int *imindmvp, int *jmindmvp,
 118         int *vmcp);
 119
 120 static void dpfield_estimate (motion_engine_t *engine,
 121         pict_data_s *picture,
 122         uint8_t *topref,
 123         uint8_t *botref,
 124         uint8_t *mb,
 125         int i, int j,
 126         int imins, int jmins,
 127         mb_motion_s *dpbest,
 128         int *vmcp);
 129
 130 static void fullsearch (motion_engine_t *engine,
 131         uint8_t *org, uint8_t *ref,
 132         subsampled_mb_s *ssblk,
 133         int lx, int i0, int j0,
 134         int sx, int sy, int h,
 135         int xmax, int ymax,
 136         mb_motion_s *motion );
 137
 138 static void find_best_one_pel( motion_engine_t *engine,
 139                                                                 uint8_t *org, uint8_t *blk,
 140                                                            int searched_size,
 141                                                            int i0, int j0,
 142                                                            int ilow, int jlow,
 143                                                            int xmax, int ymax,
 144                                                            int lx, int h,
 145                                                            mb_motion_s *res
 146         );
 147 static int build_sub22_mcomps( motion_engine_t *engine,
 148                                                                 int i0,  int j0, int ihigh, int jhigh,
 149                                                                 int null_mc_sad,
 150                                                                 uint8_t *s22org,  uint8_t *s22blk,
 151                                                            int flx, int fh,  int searched_sub44_size );
 152
 153 #ifdef X86_CPU
 154 static void find_best_one_pel_mmxe( motion_engine_t *engine,
 155                                                                 uint8_t *org, uint8_t *blk,
 156                                                            int searched_size,
 157                                                            int i0, int j0,
 158                                                            int ilow, int jlow,
 159                                                            int xmax, int ymax,
 160                                                            int lx, int h,
 161                                                            mb_motion_s *res
 162         );
 163
 164 static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0,  int j0, int ihigh, int jhigh,
 165                                                          int null_mc_sad,
 166                                                          uint8_t *s22org,  uint8_t *s22blk,
 167                                                          int flx, int fh,  int searched_sub44_size );
 168 static int (*pmblock_sub44_dists)( uint8_t *blk,  uint8_t *ref,
 169                                                         int ilow, int jlow,
 170                                                         int ihigh, int jhigh,
 171                                                         int h, int rowstride,
 172                                                         int threshold,
 173                                                         mc_result_s *resvec);
 174 #endif
 175
 176 static int unidir_pred_var( const mb_motion_s *motion,
 177                                                         uint8_t *mb,  int lx, int h);
 178 static int bidir_pred_var( const mb_motion_s *motion_f,
 179                                                    const mb_motion_s *motion_b,
 180                                                    uint8_t *mb,  int lx, int h);
 181 static int bidir_pred_sad( const mb_motion_s *motion_f,
 182                                                    const mb_motion_s *motion_b,
 183                                                    uint8_t *mb,  int lx, int h);
 184
 185 static int variance(  uint8_t *mb, int size, int lx);
 186
 187 static int dist22 ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
 188
 189 static int dist44 ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
 190 static int dist2_22( uint8_t *blk1, uint8_t *blk2,
 191                                          int lx, int h);
 192 static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b,
 193                                           uint8_t *blk2,
 194                                           int lx, int h);
 195
 196
 197 static void (*pfind_best_one_pel)( motion_engine_t *engine,
 198                                                                 uint8_t *org, uint8_t *blk,
 199                                                            int searched_size,
 200                                                            int i0, int j0,
 201                                                            int ilow, int jlow,
 202                                                            int xmax, int ymax,
 203                                                            int lx, int h,
 204                                                            mb_motion_s *res
 205         );
 206 static int (*pbuild_sub22_mcomps)( motion_engine_t *engine,
 207                                                                         int i0,  int j0, int ihigh, int jhigh,
 208                                                                    int null_mc_sad,
 209                                                                    uint8_t *s22org,  uint8_t *s22blk,
 210                                                                    int flx, int fh,  int searched_sub44_size );
 211
 212 static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2,
 213                                                  int lx, int h);
 214 static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b,
 215                                                   uint8_t *blk2,
 216                                                   int lx, int h);
 217
 218 static int dist1_00( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
 219 static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h);
 220 static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h);
 221 static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h);
 222 static int dist2 (uint8_t *blk1, uint8_t *blk2,
 223                                                           int lx, int hx, int hy, int h);
 224 static int bdist2 (uint8_t *pf, uint8_t *pb,
 225         uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
 226 static int bdist1 (uint8_t *pf, uint8_t *pb,
 227                                    uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
 228
 229
 230 static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
 231 static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
 232 static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
 233 static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
 234 static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
 235 static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
 236
 237 static int (*pdist2) (uint8_t *blk1, uint8_t *blk2,
 238                                           int lx, int hx, int hy, int h);
 239
 240
 241 static int (*pbdist2) (uint8_t *pf, uint8_t *pb,
 242                                            uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
 243
 244 static int (*pbdist1) (uint8_t *pf, uint8_t *pb,
 245                                            uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
 246
 247
 248 /* DEBUGGER...
 249 static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
 250 {
 251         rx *= 2; ry *= 2;
 252         if( motion->sad < 0 || motion->sad > 0x10000 )
 253         {
 254                 printf( "SAD ooops %s\n", str );
 255                 exit(1);
 256         }
 257         if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
 258         {
 259                 printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
 260                 exit(1);
 261         }
 262         if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
 263         {
 264                 printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
 265                 exit(1);
 266         }
 267 }
 268
 269 static void init_mc( mb_motion_s *motion )
 270 {
 271         motion->sad = -123;
 272         motion->pos.x = -1000;
 273         motion->pos.y = -1000;
 274 }
 275 */
 276
 277 /* unidir_NI_var_sum
 278    Compute the combined variance of luminance and chrominance information
 279    for a particular non-intra macro block after unidirectional
 280    motion compensation...
 281
 282    Note: results are scaled to give chrominance equal weight to
 283    chrominance.  The variance of the luminance portion is computed
 284    at the time the motion compensation is computed.
 285
 286    TODO: Perhaps we should compute the whole thing in fullsearch not
 287    seperate it.  However, that would involve a lot of fiddling with
 288    field_* and until its thoroughly debugged and tested I think I'll
 289    leave that alone. Furthermore, it is unclear if its really worth
 290    doing these computations for B *and* P frames.
 291
 292    TODO: BUG: ONLY works for 420 video...
 293
 294 */
 295
 296 static int unidir_chrom_var_sum( mb_motion_s *lum_mc,
 297                                                           uint8_t **ref,
 298                                                           subsampled_mb_s *ssblk,
 299                                                           int lx, int h )
 300 {
 301         int uvlx = (lx>>1);
 302         int uvh = (h>>1);
 303         /* N.b. MC co-ordinates are computed in half-pel units! */
 304         int cblkoffset = (lum_mc->fieldoff>>1) +
 305                 (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx;
 306
 307         return  ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) +
 308                          (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2;
 309 }
 310
 311 /*
 312   bidir_NI_var_sum
 313    Compute the combined variance of luminance and chrominance information
 314    for a particular non-intra macro block after unidirectional
 315    motion compensation...
 316
 317    Note: results are scaled to give chrominance equal weight to
 318    chrominance.  The variance of the luminance portion is computed
 319    at the time the motion compensation is computed.
 320
 321    Note: results scaled to give chrominance equal weight to chrominance.
 322
 323   TODO: BUG: ONLY works for 420 video...
 324
 325   NOTE: Currently unused but may be required if it turns out that taking
 326   chrominance into account in B frames is needed.
 327
 328  */
 329
 330 int bidir_chrom_var_sum( mb_motion_s *lum_mc_f,
 331                                            mb_motion_s *lum_mc_b,
 332                                            uint8_t **ref_f,
 333                                            uint8_t **ref_b,
 334                                            subsampled_mb_s *ssblk,
 335                                            int lx, int h )
 336 {
 337         int uvlx = (lx>>1);
 338         int uvh = (h>>1);
 339         /* N.b. MC co-ordinates are computed in half-pel units! */
 340         int cblkoffset_f = (lum_mc_f->fieldoff>>1) +
 341                 (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
 342         int cblkoffset_b = (lum_mc_b->fieldoff>>1) +
 343                 (lum_mc_b->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
 344
 345         return  (
 346                 (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b,
 347                                            ssblk->umb, uvlx, uvh ) +
 348                 (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b,
 349                                            ssblk->vmb, uvlx, uvh ))*2;
 350
 351 }
 352
 353 static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx )
 354 {
 355         return (variance(ssblk->umb,(h>>1),(lx>>1)) +
 356                         variance(ssblk->vmb,(h>>1),(lx>>1))) * 2;
 357 }
 358
 359 /*
 360  * frame picture motion estimation
 361  *
 362  * org: top left pel of source reference frame
 363  * ref: top left pel of reconstructed reference frame
 364  * ssmb:  macroblock to be matched
 365  * i,j: location of mb relative to ref (=center of search window)
 366  * sx,sy: half widths of search window
 367  * besfr: location and SAD of best frame prediction
 368  * besttop: location of best field pred. for top field of mb
 369  * bestbo : location of best field pred. for bottom field of mb
 370  */
 371
 372 static void frame_estimate(motion_engine_t *engine,
 373         uint8_t *org,
 374         uint8_t *ref,
 375         subsampled_mb_s *ssmb,
 376         int i, int j, int sx, int sy,
 377         mb_motion_s *bestfr,
 378         mb_motion_s *besttop,
 379         mb_motion_s *bestbot,
 380         int imins[2][2],
 381         int jmins[2][2]
 382         )
 383 {
 384         subsampled_mb_s  botssmb;
 385         mb_motion_s topfld_mc;
 386         mb_motion_s botfld_mc;
 387
 388         botssmb.mb = ssmb->mb+width;
 389         botssmb.fmb = ssmb->mb+(width>>1);
 390         botssmb.qmb = ssmb->qmb+(width>>2);
 391         botssmb.umb = ssmb->umb+(width>>1);
 392         botssmb.vmb = ssmb->vmb+(width>>1);
 393
 394         /* frame prediction */
 395         fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height,
 396                                                   bestfr );
 397         bestfr->fieldsel = 0;
 398         bestfr->fieldoff = 0;
 399
 400         /* predict top field from top field */
 401         fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
 402                            &topfld_mc);
 403
 404         /* predict top field from bottom field */
 405         fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8,
 406                            width,height>>1, &botfld_mc);
 407
 408         /* set correct field selectors... */
 409         topfld_mc.fieldsel = 0;
 410         botfld_mc.fieldsel = 1;
 411         topfld_mc.fieldoff = 0;
 412         botfld_mc.fieldoff = width;
 413
 414         imins[0][0] = topfld_mc.pos.x;
 415         jmins[0][0] = topfld_mc.pos.y;
 416         imins[1][0] = botfld_mc.pos.x;
 417         jmins[1][0] = botfld_mc.pos.y;
 418
 419         /* select prediction for top field */
 420         if (topfld_mc.sad<=botfld_mc.sad)
 421         {
 422                 *besttop = topfld_mc;
 423         }
 424         else
 425         {
 426                 *besttop = botfld_mc;
 427         }
 428
 429         /* predict bottom field from top field */
 430         fullsearch(engine, org,ref,&botssmb,
 431                                         width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
 432                                         &topfld_mc);
 433
 434         /* predict bottom field from bottom field */
 435         fullsearch(engine, org+width,ref+width,&botssmb,
 436                                         width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
 437                                         &botfld_mc);
 438
 439         /* set correct field selectors... */
 440         topfld_mc.fieldsel = 0;
 441         botfld_mc.fieldsel = 1;
 442         topfld_mc.fieldoff = 0;
 443         botfld_mc.fieldoff = width;
 444
 445         imins[0][1] = topfld_mc.pos.x;
 446         jmins[0][1] = topfld_mc.pos.y;
 447         imins[1][1] = botfld_mc.pos.x;
 448         jmins[1][1] = botfld_mc.pos.y;
 449
 450         /* select prediction for bottom field */
 451         if (botfld_mc.sad<=topfld_mc.sad)
 452         {
 453                 *bestbot = botfld_mc;
 454         }
 455         else
 456         {
 457                 *bestbot = topfld_mc;
 458         }
 459 }
 460
 461 /*
 462  * field picture motion estimation subroutine
 463  *
 464  * toporg: address of original top reference field
 465  * topref: address of reconstructed top reference field
 466  * botorg: address of original bottom reference field
 467  * botref: address of reconstructed bottom reference field
 468  * ssmmb:  macroblock to be matched
 469  * i,j: location of mb (=center of search window)
 470  * sx,sy: half width/height of search window
 471  *
 472  * bestfld: location and distance of best field prediction
 473  * best8u: location of best 16x8 pred. for upper half of mb
 474  * best8lp: location of best 16x8 pred. for lower half of mb
 475  * bdestsp: location and distance of best same parity field
 476  *                    prediction (needed for dual prime, only valid if
 477  *                    ipflag==0)
 478  */
 479
 480 static void field_estimate (motion_engine_t *engine,
 481         pict_data_s *picture,
 482         uint8_t *toporg,
 483         uint8_t *topref,
 484         uint8_t *botorg,
 485         uint8_t *botref,
 486         subsampled_mb_s *ssmb,
 487         int i, int j, int sx, int sy, int ipflag,
 488         mb_motion_s *bestfld,
 489         mb_motion_s *best8u,
 490         mb_motion_s *best8l,
 491         mb_motion_s *bestsp)
 492
 493 {
 494         mb_motion_s topfld_mc;
 495         mb_motion_s botfld_mc;
 496         int dt, db;
 497         int notop, nobot;
 498         subsampled_mb_s botssmb;
 499
 500         botssmb.mb = ssmb->mb+width;
 501         botssmb.umb = ssmb->umb+(width>>1);
 502         botssmb.vmb = ssmb->vmb+(width>>1);
 503         botssmb.fmb = ssmb->fmb+(width>>1);
 504         botssmb.qmb = ssmb->qmb+(width>>2);
 505
 506         /* if ipflag is set, predict from field of opposite parity only */
 507         notop = ipflag && (picture->pict_struct==TOP_FIELD);
 508         nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD);
 509
 510         /* field prediction */
 511
 512         /* predict current field from top field */
 513         if (notop)
 514                 topfld_mc.sad = dt = 65536; /* infinity */
 515         else
 516                 fullsearch(engine, toporg,topref,ssmb,width<<1,
 517                                    i,j,sx,sy>>1,16,width,height>>1,
 518                                    &topfld_mc);
 519         dt = topfld_mc.sad;
 520         /* predict current field from bottom field */
 521         if (nobot)
 522                 botfld_mc.sad = db = 65536; /* infinity */
 523         else
 524                 fullsearch(engine, botorg,botref,ssmb,width<<1,
 525                                    i,j,sx,sy>>1,16,width,height>>1,
 526                                    &botfld_mc);
 527         db = botfld_mc.sad;
 528         /* Set correct field selectors */
 529         topfld_mc.fieldsel = 0;
 530         botfld_mc.fieldsel = 1;
 531         topfld_mc.fieldoff = 0;
 532         botfld_mc.fieldoff = width;
 533
 534         /* same parity prediction (only valid if ipflag==0) */
 535         if (picture->pict_struct==TOP_FIELD)
 536         {
 537                 *bestsp = topfld_mc;
 538         }
 539         else
 540         {
 541                 *bestsp = botfld_mc;
 542         }
 543
 544         /* select field prediction */
 545         if (dt<=db)
 546         {
 547                 *bestfld = topfld_mc;
 548         }
 549         else
 550         {
 551                 *bestfld = botfld_mc;
 552         }
 553
 554
 555         /* 16x8 motion compensation */
 556
 557         /* predict upper half field from top field */
 558         if (notop)
 559                 topfld_mc.sad = dt = 65536;
 560         else
 561                 fullsearch(engine, toporg,topref,ssmb,width<<1,
 562                                    i,j,sx,sy>>1,8,width,height>>1,
 563                                     &topfld_mc);
 564         dt = topfld_mc.sad;
 565         /* predict upper half field from bottom field */
 566         if (nobot)
 567                 botfld_mc.sad = db = 65536;
 568         else
 569                 fullsearch(engine, botorg,botref,ssmb,width<<1,
 570                                    i,j,sx,sy>>1,8,width,height>>1,
 571                                     &botfld_mc);
 572         db = botfld_mc.sad;
 573
 574         /* Set correct field selectors */
 575         topfld_mc.fieldsel = 0;
 576         botfld_mc.fieldsel = 1;
 577         topfld_mc.fieldoff = 0;
 578         botfld_mc.fieldoff = width;
 579
 580         /* select prediction for upper half field */
 581         if (dt<=db)
 582         {
 583                 *best8u = topfld_mc;
 584         }
 585         else
 586         {
 587                 *best8u = botfld_mc;
 588         }
 589
 590         /* predict lower half field from top field */
 591         /*
 592           N.b. For interlaced data width<<4 (width*16) takes us 8 rows
 593           down in the same field.
 594           Thus for the fast motion data (2*2
 595           sub-sampled) we need to go 4 rows down in the same field.
 596           This requires adding width*4 = (width<<2).
 597           For the 4*4 sub-sampled motion data we need to go down 2 rows.
 598           This requires adding width = width
 599
 600         */
 601         if (notop)
 602                 topfld_mc.sad = dt = 65536;
 603         else
 604                 fullsearch(engine, toporg,topref,&botssmb,
 605                                                 width<<1,
 606                                                 i,j+8,sx,sy>>1,8,width,height>>1,
 607                                    /* &imint,&jmint, &dt,*/ &topfld_mc);
 608         dt = topfld_mc.sad;
 609         /* predict lower half field from bottom field */
 610         if (nobot)
 611                 botfld_mc.sad = db = 65536;
 612         else
 613                 fullsearch(engine, botorg,botref,&botssmb,width<<1,
 614                                                 i,j+8,sx,sy>>1,8,width,height>>1,
 615                                    /* &iminb,&jminb, &db,*/ &botfld_mc);
 616         db = botfld_mc.sad;
 617         /* Set correct field selectors */
 618         topfld_mc.fieldsel = 0;
 619         botfld_mc.fieldsel = 1;
 620         topfld_mc.fieldoff = 0;
 621         botfld_mc.fieldoff = width;
 622
 623         /* select prediction for lower half field */
 624         if (dt<=db)
 625         {
 626                 *best8l = topfld_mc;
 627         }
 628         else
 629         {
 630                 *best8l = botfld_mc;
 631         }
 632 }
 633
 634 static void dpframe_estimate (motion_engine_t *engine,
 635         pict_data_s *picture,
 636         uint8_t *ref,
 637         subsampled_mb_s *ssmb,
 638
 639         int i, int j, int iminf[2][2], int jminf[2][2],
 640         mb_motion_s *best_mc,
 641         int *imindmvp, int *jmindmvp,
 642         int *vmcp)
 643 {
 644         int pref,ppred,delta_x,delta_y;
 645         int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0;
 646         int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv;
 647         int vmc,local_dist;
 648
 649         /* Calculate Dual Prime distortions for 9 delta candidates
 650          * for each of the four minimum field vectors
 651          * Note: only for P pictures!
 652          */
 653
 654         /* initialize minimum dual prime distortion to large value */
 655         vmc = INT_MAX;
 656
 657         for (pref=0; pref<2; pref++)
 658         {
 659                 for (ppred=0; ppred<2; ppred++)
 660                 {
 661                         /* convert Cartesian absolute to relative motion vector
 662                          * values (wrt current macroblock address (i,j)
 663                          */
 664                         is = iminf[pref][ppred] - (i<<1);
 665                         js = jminf[pref][ppred] - (j<<1);
 666
 667                         if (pref!=ppred)
 668                         {
 669                                 /* vertical field shift adjustment */
 670                                 if (ppred==0)
 671                                         js++;
 672                                 else
 673                                         js--;
 674
 675                                 /* mvxs and mvys scaling*/
 676                                 is<<=1;
 677                                 js<<=1;
 678                                 if (picture->topfirst == ppred)
 679                                 {
 680                                         /* second field: scale by 1/3 */
 681                                         is = (is>=0) ? (is+1)/3 : -((-is+1)/3);
 682                                         js = (js>=0) ? (js+1)/3 : -((-js+1)/3);
 683                                 }
 684                                 else
 685                                         continue;
 686                         }
 687
 688                         /* vector for prediction from field of opposite 'parity' */
 689                         if (picture->topfirst)
 690                         {
 691                                 /* vector for prediction of top field from bottom field */
 692                                 it0 = ((is+(is>0))>>1);
 693                                 jt0 = ((js+(js>0))>>1) - 1;
 694
 695                                 /* vector for prediction of bottom field from top field */
 696                                 ib0 = ((3*is+(is>0))>>1);
 697                                 jb0 = ((3*js+(js>0))>>1) + 1;
 698                         }
 699                         else
 700                         {
 701                                 /* vector for prediction of top field from bottom field */
 702                                 it0 = ((3*is+(is>0))>>1);
 703                                 jt0 = ((3*js+(js>0))>>1) - 1;
 704
 705                                 /* vector for prediction of bottom field from top field */
 706                                 ib0 = ((is+(is>0))>>1);
 707                                 jb0 = ((js+(js>0))>>1) + 1;
 708                         }
 709
 710                         /* convert back to absolute half-pel field picture coordinates */
 711                         is += i<<1;
 712                         js += j<<1;
 713                         it0 += i<<1;
 714                         jt0 += j<<1;
 715                         ib0 += i<<1;
 716                         jb0 += j<<1;
 717
 718                         if (is >= 0 && is <= (width-16)<<1 &&
 719                                 js >= 0 && js <= (height-16))
 720                         {
 721                                 for (delta_y=-1; delta_y<=1; delta_y++)
 722                                 {
 723                                         for (delta_x=-1; delta_x<=1; delta_x++)
 724                                         {
 725                                                 /* opposite field coordinates */
 726                                                 it = it0 + delta_x;
 727                                                 jt = jt0 + delta_y;
 728                                                 ib = ib0 + delta_x;
 729                                                 jb = jb0 + delta_y;
 730
 731                                                 if (it >= 0 && it <= (width-16)<<1 &&
 732                                                         jt >= 0 && jt <= (height-16) &&
 733                                                         ib >= 0 && ib <= (width-16)<<1 &&
 734                                                         jb >= 0 && jb <= (height-16))
 735                                                 {
 736                                                         /* compute prediction error */
 737                                                         local_dist = (*pbdist2)(
 738                                                                 ref + (is>>1) + (width<<1)*(js>>1),
 739                                                                 ref + width + (it>>1) + (width<<1)*(jt>>1),
 740                                                                 ssmb->mb,             /* current mb location */
 741                                                                 width<<1,       /* adjacent line distance */
 742                                                                 is&1, js&1, it&1, jt&1, /* half-pel flags */
 743                                                                 8);             /* block height */
 744                                                         local_dist += (*pbdist2)(
 745                                                                 ref + width + (is>>1) + (width<<1)*(js>>1),
 746                                                                 ref + (ib>>1) + (width<<1)*(jb>>1),
 747                                                                 ssmb->mb + width,     /* current mb location */
 748                                                                 width<<1,       /* adjacent line distance */
 749                                                                 is&1, js&1, ib&1, jb&1, /* half-pel flags */
 750                                                                 8);             /* block height */
 751
 752                                                         /* update delta with least distortion vector */
 753                                                         if (local_dist < vmc)
 754                                                         {
 755                                                                 imins = is;
 756                                                                 jmins = js;
 757                                                                 imint = it;
 758                                                                 jmint = jt;
 759                                                                 iminb = ib;
 760                                                                 jminb = jb;
 761                                                                 imindmv = delta_x;
 762                                                                 jmindmv = delta_y;
 763                                                                 vmc = local_dist;
 764                                                         }
 765                                                 }
 766                                         }  /* end delta x loop */
 767                                 } /* end delta y loop */
 768                         }
 769                 }
 770         }
 771
 772         /* TODO: This is now likely to be obsolete... */
 773         /* Compute L1 error for decision purposes */
 774         local_dist = (*pbdist1)(
 775                 ref + (imins>>1) + (width<<1)*(jmins>>1),
 776                 ref + width + (imint>>1) + (width<<1)*(jmint>>1),
 777                 ssmb->mb,
 778                 width<<1,
 779                 imins&1, jmins&1, imint&1, jmint&1,
 780                 8);
 781 //printf("motion 1 %p\n", pbdist1);
 782         local_dist += (*pbdist1)(
 783                 ref + width + (imins>>1) + (width<<1)*(jmins>>1),
 784                 ref + (iminb>>1) + (width<<1)*(jminb>>1),
 785                 ssmb->mb + width,
 786                 width<<1,
 787                 imins&1, jmins&1, iminb&1, jminb&1,
 788                 8);
 789 //printf("motion 2\n");
 790
 791         best_mc->sad = local_dist;
 792         best_mc->pos.x = imins;
 793         best_mc->pos.y = jmins;
 794         *vmcp = vmc;
 795         *imindmvp = imindmv;
 796         *jmindmvp = jmindmv;
 797 //printf("motion 2\n");
 798 }
 799
 800 static void dpfield_estimate(motion_engine_t *engine,
 801         pict_data_s *picture,
 802         uint8_t *topref,
 803         uint8_t *botref,
 804         uint8_t *mb,
 805         int i, int j, int imins, int jmins,
 806         mb_motion_s *bestdp_mc,
 807         int *vmcp
 808         )
 809
 810 {
 811         uint8_t *sameref, *oppref;
 812         int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0;
 813         int imino,jmino,imindmv,jmindmv,vmc_dp,local_dist;
 814
 815         /* Calculate Dual Prime distortions for 9 delta candidates */
 816         /* Note: only for P pictures! */
 817
 818         /* Assign opposite and same reference pointer */
 819         if (picture->pict_struct==TOP_FIELD)
 820         {
 821                 sameref = topref;
 822                 oppref = botref;
 823         }
 824         else
 825         {
 826                 sameref = botref;
 827                 oppref = topref;
 828         }
 829
 830         /* convert Cartesian absolute to relative motion vector
 831          * values (wrt current macroblock address (i,j)
 832          */
 833         mvxs = imins - (i<<1);
 834         mvys = jmins - (j<<1);
 835
 836         /* vector for prediction from field of opposite 'parity' */
 837         mvxo0 = (mvxs+(mvxs>0)) >> 1;  /* mvxs / / */
 838         mvyo0 = (mvys+(mvys>0)) >> 1;  /* mvys / / 2*/
 839
 840                         /* vertical field shift correction */
 841         if (picture->pict_struct==TOP_FIELD)
 842                 mvyo0--;
 843         else
 844                 mvyo0++;
 845
 846                         /* convert back to absolute coordinates */
 847         io0 = mvxo0 + (i<<1);
 848         jo0 = mvyo0 + (j<<1);
 849
 850                         /* initialize minimum dual prime distortion to large value */
 851         vmc_dp = 1 << 30;
 852
 853         for (delta_y = -1; delta_y <= 1; delta_y++)
 854         {
 855                 for (delta_x = -1; delta_x <=1; delta_x++)
 856                 {
 857                         /* opposite field coordinates */
 858                         io = io0 + delta_x;
 859                         jo = jo0 + delta_y;
 860
 861                         if (io >= 0 && io <= (width-16)<<1 &&
 862                                 jo >= 0 && jo <= (height2-16)<<1)
 863                         {
 864                                 /* compute prediction error */
 865                                 local_dist = (*pbdist2)(
 866                                         sameref + (imins>>1) + width2*(jmins>>1),
 867                                         oppref  + (io>>1)    + width2*(jo>>1),
 868                                         mb,             /* current mb location */
 869                                         width2,         /* adjacent line distance */
 870                                         imins&1, jmins&1, io&1, jo&1, /* half-pel flags */
 871                                         16);            /* block height */
 872
 873                                 /* update delta with least distortion vector */
 874                                 if (local_dist < vmc_dp)
 875                                 {
 876                                         imino = io;
 877                                         jmino = jo;
 878                                         imindmv = delta_x;
 879                                         jmindmv = delta_y;
 880                                         vmc_dp = local_dist;
 881                                 }
 882                         }
 883                 }  /* end delta x loop */
 884         } /* end delta y loop */
 885
 886         /* Compute L1 error for decision purposes */
 887         bestdp_mc->sad =
 888                 (*pbdist1)(
 889                         sameref + (imins>>1) + width2*(jmins>>1),
 890                         oppref  + (imino>>1) + width2*(jmino>>1),
 891                         mb,             /* current mb location */
 892                         width2,         /* adjacent line distance */
 893                         imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */
 894                         16);            /* block height */
 895
 896         bestdp_mc->pos.x = imindmv;
 897         bestdp_mc->pos.x = imindmv;
 898         *vmcp = vmc_dp;
 899 }
 900
 901
 902 /*
 903  *   Vectors of motion compensations
 904  */
 905
 906 /*
 907         Take a vector of motion compensations and repeatedly make passes
 908         discarding all elements whose sad "weight" is above the current mean weight.
 909 */
 910
 911 static void sub_mean_reduction( mc_result_s *matches, int len,
 912                                                                 int times,
 913                                                             int *newlen_res, int *minweight_res)
 914 {
 915         int i,j;
 916         int weight_sum;
 917         int mean_weight;
 918         int min_weight = 100000;
 919         if( len == 0 )
 920         {
 921                 *minweight_res = 100000;
 922                 *newlen_res = 0;
 923                 return;
 924         }
 925
 926         for(;;)
 927         {
 928                 weight_sum = 0;
 929                 for( i = 0; i < len ; ++i )
 930                         weight_sum += matches[i].weight;
 931                 mean_weight = weight_sum / len;
 932
 933                 if( times <= 0)
 934                         break;
 935
 936                 j = 0;
 937                 for( i =0; i < len; ++i )
 938                 {
 939                         if( matches[i].weight <= mean_weight )
 940                         {
 941                                 if( times == 1)
 942                                 {
 943                                         min_weight = matches[i].weight ;
 944                                 }
 945                                 matches[j] = matches[i];
 946                                 ++j;
 947                         }
 948                 }
 949                 len = j;
 950                 --times;
 951         }
 952         *newlen_res = len;
 953         *minweight_res = mean_weight;
 954 }
 955
 956 /*
 957   Build a vector of the top   4*4 sub-sampled motion compensations in the box
 958   (ilow,jlow) to (ihigh,jhigh).
 959
 960         The algorithm is as follows:
 961         1. coarse matches on an 8*8 grid of positions are collected that fall below
 962         a (very conservative) sad threshold (basically 50% more than moving average of
 963         the mean sad of such matches).
 964
 965         2. The worse than-average matches are discarded.
 966
 967         3. The remaining coarse matches are expanded with the left/lower neighbouring
 968         4*4 grid matches. Again only those better than a threshold (this time the mean
 969         of the 8*8 grid matches are retained.
 970
 971         4. Multiple passes are made discarding worse than-average matches.  The number of
 972         passes is specified by the user.  The default it 2 (leaving roughly 1/4 of the matches).
 973
 974         The net result is very fast and find good matches if they're to be found.  I.e. the
 975         penalty over exhaustive search is pretty low.
 976
 977         NOTE: The "discard below average" trick depends critically on having some variation in
 978         the matches.  The slight penalty imposed for distant matches (reasonably since the
 979         motion vectors have to be encoded) is *vital* as otherwise pathologically bad
 980         performance results on highly uniform images.
 981
 982         TODO: We should probably allow the user to eliminate the initial thinning of 8*8
 983         grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
 984
 985 */
 986
 987
 988 static int build_sub44_mcomps(motion_engine_t *engine,
 989          int ilow, int jlow, int ihigh, int jhigh,
 990                                                         int i0, int j0,
 991                                                                 int null_mc_sad,
 992                                                         uint8_t *s44org, uint8_t *s44blk, int qlx, int qh )
 993 {
 994         uint8_t *s44orgblk;
 995         int istrt = ilow-i0;
 996         int jstrt = jlow-j0;
 997         int iend = ihigh-i0;
 998         int jend = jhigh-j0;
 999         int mean_weight;
1000         int threshold;
1001
1002 #ifdef X86_CPU
1003
1004         /*int rangex, rangey;
1005         static int rough_num_mcomps;
1006         static mc_result_s rough_mcomps[MAX_44_MATCHES];
1007         int k;
1008         */
1009 #else
1010         int i,j;
1011         int s1;
1012         uint8_t *old_s44orgblk;
1013 #endif
1014         /* N.b. we may ignore the right hand block of the pair going over the
1015            right edge as we have carefully allocated the buffer oversize to ensure
1016            no memory faults.  The later motion compensation calculations
1017            performed on the results of this pass will filter out
1018            out-of-range blocks...
1019         */
1020
1021
1022         engine->sub44_num_mcomps = 0;
1023
1024         threshold = 6*null_mc_sad / (4*4*mc_44_red);
1025         s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1026
1027         /* Exhaustive search on 4*4 sub-sampled data.  This is affordable because
1028                 (a)     it is only 16th of the size of the real 1-pel data
1029                 (b) we ignore those matches with an sad above our threshold.
1030         */
1031 #ifndef X86_CPU
1032
1033                 /* Invariant:  s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
1034                 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1035                 for( j = jstrt; j <= jend; j += 4 )
1036                 {
1037                         old_s44orgblk = s44orgblk;
1038                         for( i = istrt; i <= iend; i += 4 )
1039                         {
1040                                 s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0);
1041                                 if( s1 < threshold )
1042                                 {
1043                                         engine->sub44_mcomps[engine->sub44_num_mcomps].x = i;
1044                                         engine->sub44_mcomps[engine->sub44_num_mcomps].y = j;
1045                                         engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ;
1046                                         ++engine->sub44_num_mcomps;
1047                                 }
1048                                 s44orgblk += 1;
1049                         }
1050                         s44orgblk = old_s44orgblk + qlx;
1051                 }
1052
1053 #else
1054
1055                 engine->sub44_num_mcomps
1056                         = (*pmblock_sub44_dists)( s44orgblk, s44blk,
1057                                                                   istrt, jstrt,
1058                                                                   iend, jend,
1059                                                                   qh, qlx,
1060                                                                   threshold,
1061                                                                   engine->sub44_mcomps);
1062
1063 #endif
1064                 /* If we're really pushing quality we reduce once otherwise twice. */
1065
1066                 sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1),
1067                                                     &engine->sub44_num_mcomps, &mean_weight);
1068
1069
1070         return engine->sub44_num_mcomps;
1071 }
1072
1073
1074 /* Build a vector of the best 2*2 sub-sampled motion
1075   compensations using the best 4*4 matches as starting points.  As
1076   with with the 4*4 matches We don't collect them densely as they're
1077   just search starting points for 1-pel search and ones that are 1 out
1078   should still give better than average matches...
1079
1080 */
1081
1082
1083 static int build_sub22_mcomps(motion_engine_t *engine,
1084                 int i0,  int j0, int ihigh, int jhigh,
1085                 int null_mc_sad,
1086                 uint8_t *s22org,  uint8_t *s22blk,
1087                 int flx, int fh,  int searched_sub44_size )
1088 {
1089         int i,k,s;
1090         int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1091
1092         int min_weight;
1093         int ilim = ihigh-i0;
1094         int jlim = jhigh-j0;
1095         blockxy matchrec;
1096         uint8_t *s22orgblk;
1097
1098         engine->sub22_num_mcomps = 0;
1099         for( k = 0; k < searched_sub44_size; ++k )
1100         {
1101
1102                 matchrec.x = engine->sub44_mcomps[k].x;
1103                 matchrec.y = engine->sub44_mcomps[k].y;
1104
1105                 s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1106                 for( i = 0; i < 4; ++i )
1107                 {
1108                         if( matchrec.x <= ilim && matchrec.y <= jlim )
1109                         {
1110                                 s = (*pdist22)( s22orgblk,s22blk,flx,fh);
1111                                 if( s < threshold )
1112                                 {
1113                                         engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1114                                         engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1115                                         engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1116                                         ++engine->sub22_num_mcomps;
1117                                 }
1118                         }
1119
1120                         if( i == 1 )
1121                         {
1122                                 s22orgblk += flx-1;
1123                                 matchrec.x -= 2;
1124                                 matchrec.y += 2;
1125                         }
1126                         else
1127                         {
1128                                 s22orgblk += 1;
1129                                 matchrec.x += 2;
1130
1131                         }
1132                 }
1133
1134         }
1135
1136
1137         sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1138                                                 &engine->sub22_num_mcomps, &min_weight );
1139         return engine->sub22_num_mcomps;
1140 }
1141
1142 #ifdef X86_CPU
1143 int build_sub22_mcomps_mmxe(motion_engine_t *engine,
1144         int i0,  int j0, int ihigh, int jhigh,
1145         int null_mc_sad,
1146         uint8_t *s22org,  uint8_t *s22blk,
1147         int flx, int fh,  int searched_sub44_size )
1148 {
1149         int i,k,s;
1150         int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1151
1152         int min_weight;
1153         int ilim = ihigh-i0;
1154         int jlim = jhigh-j0;
1155         blockxy matchrec;
1156         uint8_t *s22orgblk;
1157         int resvec[4];
1158
1159         /* TODO: The calculation of the lstrow offset really belongs in
1160        asm code... */
1161         int lstrow=(fh-1)*flx;
1162
1163         engine->sub22_num_mcomps = 0;
1164         for( k = 0; k < searched_sub44_size; ++k )
1165         {
1166
1167                 matchrec.x = engine->sub44_mcomps[k].x;
1168                 matchrec.y = engine->sub44_mcomps[k].y;
1169
1170                 s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1171                 mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec);
1172                 for( i = 0; i < 4; ++i )
1173                 {
1174                         if( matchrec.x <= ilim && matchrec.y <= jlim )
1175                         {
1176                                 s =resvec[i];
1177                                 if( s < threshold )
1178                                 {
1179                                         engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1180                                         engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1181                                         engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1182                                         ++engine->sub22_num_mcomps;
1183                                 }
1184                         }
1185
1186                         if( i == 1 )
1187                         {
1188                                 matchrec.x -= 2;
1189                                 matchrec.y += 2;
1190                         }
1191                         else
1192                         {
1193                                 matchrec.x += 2;
1194                         }
1195                 }
1196
1197         }
1198
1199
1200         sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1201                                                 &engine->sub22_num_mcomps, &min_weight );
1202         return engine->sub22_num_mcomps;
1203 }
1204
1205 #endif
1206
1207 /*
1208   Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
1209         TODO: Its a bit silly to cart around absolute M/C co-ordinates that
1210         eventually get turned into relative ones anyway...
1211 */
1212
1213
1214 static void find_best_one_pel(motion_engine_t *engine,
1215          uint8_t *org, uint8_t *blk,
1216          int searched_size,
1217          int i0, int j0,
1218          int ilow, int jlow,
1219          int xmax, int ymax,
1220          int lx, int h,
1221          mb_motion_s *res
1222         )
1223
1224 {
1225         int i,k;
1226         int d;
1227         blockxy minpos = res->pos;
1228         int dmin = INT_MAX;
1229         uint8_t *orgblk;
1230         int penalty;
1231         int init_search;
1232         int init_size;
1233         blockxy matchrec;
1234
1235         init_search = searched_size;
1236         init_size = engine->sub22_num_mcomps;
1237         for( k = 0; k < searched_size; ++k )
1238         {
1239                 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1240                 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1241                 orgblk = org + matchrec.x+lx*matchrec.y;
1242                 penalty = abs(matchrec.x)+abs(matchrec.y);
1243
1244                 for( i = 0; i < 4; ++i )
1245                 {
1246                         if( matchrec.x <= xmax && matchrec.y <= ymax )
1247                         {
1248
1249                                 d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin);
1250                                 if (d<dmin)
1251                                 {
1252                                         dmin = d;
1253                                         minpos = matchrec;
1254                                 }
1255                         }
1256                         if( i == 1 )
1257                         {
1258                                 orgblk += lx-1;
1259                                 matchrec.x -= 1;
1260                                 matchrec.y += 1;
1261                         }
1262                         else
1263                         {
1264                                 orgblk += 1;
1265                                 matchrec.x += 1;
1266                         }
1267                 }
1268         }
1269
1270         res->pos = minpos;
1271         res->blk = org + minpos.x+lx*minpos.y;
1272         res->sad = dmin;
1273
1274 }
1275
1276 #ifdef X86_CPU
1277 void find_best_one_pel_mmxe(motion_engine_t *engine,
1278         uint8_t *org, uint8_t *blk,
1279         int searched_size,
1280         int i0, int j0,
1281         int ilow, int jlow,
1282         int xmax, int ymax,
1283         int lx, int h,
1284         mb_motion_s *res
1285         )
1286
1287 {
1288         int i,k;
1289         int d;
1290         blockxy minpos = res->pos;
1291         int dmin = INT_MAX;
1292         uint8_t *orgblk;
1293         int penalty;
1294         int init_search;
1295         int init_size;
1296         blockxy matchrec;
1297         int resvec[4];
1298
1299
1300         init_search = searched_size;
1301         init_size = engine->sub22_num_mcomps;
1302         for( k = 0; k < searched_size; ++k )
1303         {
1304                 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1305                 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1306                 orgblk = org + matchrec.x+lx*matchrec.y;
1307                 penalty = abs(matchrec.x)+abs(matchrec.y);
1308
1309
1310                 mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec);
1311
1312                 for( i = 0; i < 4; ++i )
1313                 {
1314                         if( matchrec.x <= xmax && matchrec.y <= ymax )
1315                         {
1316
1317                                 d = penalty+resvec[i];
1318                                 if (d<dmin)
1319                                 {
1320                                         dmin = d;
1321                                         minpos = matchrec;
1322                                 }
1323                         }
1324                         if( i == 1 )
1325                         {
1326                                 orgblk += lx-1;
1327                                 matchrec.x -= 1;
1328                                 matchrec.y += 1;
1329                         }
1330                         else
1331                         {
1332                                 orgblk += 1;
1333                                 matchrec.x += 1;
1334                         }
1335                 }
1336         }
1337
1338         res->pos = minpos;
1339         res->blk = org + minpos.x+lx*minpos.y;
1340         res->sad = dmin;
1341
1342 }
1343 #endif
1344
1345 /*
1346  * full search block matching
1347  *
1348  * A.Stevens 2000: This is now a big misnomer.  The search is now a hierarchical/sub-sampling
1349  * search not a full search.  However, experiments have shown it is always close to
1350  * optimal and almost always very close or optimal.
1351  *
1352  * blk: top left pel of (16*h) block
1353  * s22blk: top element of fast motion compensation block corresponding to blk
1354  * h: height of block
1355  * lx: distance (in bytes) of vertically adjacent pels in ref,blk
1356  * org: top left pel of source reference picture
1357  * ref: top left pel of reconstructed reference picture
1358  * i0,j0: center of search window
1359  * sx,sy: half widths of search window
1360  * xmax,ymax: right/bottom limits of search area
1361  * iminp,jminp: pointers to where the result is stored
1362  *              result is given as half pel offset from ref(0,0)
1363  *              i.e. NOT relative to (i0,j0)
1364  */
1365
1366
1367 static void fullsearch(motion_engine_t *engine,
1368         uint8_t *org,
1369         uint8_t *ref,
1370         subsampled_mb_s *ssblk,
1371         int lx, int i0, int j0,
1372         int sx, int sy, int h,
1373         int xmax, int ymax,
1374         /* int *iminp, int *jminp, int *sadminp, */
1375         mb_motion_s *res
1376         )
1377 {
1378         mb_motion_s best;
1379         /* int imin, jmin, dmin */
1380         int i,j,ilow,ihigh,jlow,jhigh;
1381         int d;
1382
1383         /* NOTE: Surprisingly, the initial motion compensation search
1384            works better when the original image not the reference (reconstructed)
1385            image is used.
1386         */
1387         uint8_t *s22org = (uint8_t*)(org+fsubsample_offset);
1388         uint8_t *s44org = (uint8_t*)(org+qsubsample_offset);
1389         uint8_t *orgblk;
1390         int flx = lx >> 1;
1391         int qlx = lx >> 2;
1392         int fh = h >> 1;
1393         int qh = h >> 2;
1394
1395         /* xmax and ymax into more useful form... */
1396         xmax -= 16;
1397         ymax -= h;
1398
1399
1400         /* The search radii are *always* multiples of 4 to avoid messiness
1401            in the initial 4*4 pel search.  This is handled by the
1402            parameter checking/processing code in readparmfile() */
1403
1404         /* Create a distance-order mcomps of possible motion compensations
1405           based on the fast estimation data - 4*4 pel sums (4*4
1406           sub-sampled) rather than actual pel's.  1/16 the size...  */
1407         jlow = j0-sy;
1408         jlow = jlow < 0 ? 0 : jlow;
1409         jhigh =  j0+(sy-1);
1410         jhigh = jhigh > ymax ? ymax : jhigh;
1411         ilow = i0-sx;
1412         ilow = ilow < 0 ? 0 : ilow;
1413         ihigh =  i0+(sx-1);
1414         ihigh = ihigh > xmax ? xmax : ihigh;
1415
1416         /*
1417            Very rarely this may fail to find matchs due to all the good
1418            looking ones being over threshold. hence we make sure we
1419            fall back to a 0 motion compensation in this case.
1420
1421                  The sad for the 0 motion compensation is also very useful as
1422                  a basis for setting thresholds for rejecting really dud 4*4
1423                  and 2*2 sub-sampled matches.
1424         */
1425         memset(&best, 0, sizeof(best));
1426         best.sad = (*pdist1_00)(ref + i0 + j0 * lx,
1427                 ssblk->mb,
1428                 lx,
1429                 h,
1430                 best.sad);
1431         best.pos.x = i0;
1432         best.pos.y = j0;
1433
1434         engine->sub44_num_mcomps = build_sub44_mcomps(engine,
1435                                                                          ilow, jlow, ihigh, jhigh,
1436                                                                           i0, j0,
1437                                                                           best.sad,
1438                                                                           s44org,
1439                                                                           ssblk->qmb, qlx, qh );
1440
1441
1442         /* Now create a distance-ordered mcomps of possible motion
1443            compensations based on the fast estimation data - 2*2 pel sums
1444            using the best fraction of the 4*4 estimates However we cover
1445            only coarsely... on 4-pel boundaries...  */
1446
1447         engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh,  jhigh,
1448                                                                                            best.sad,
1449                                                                                            s22org, ssblk->fmb, flx, fh,
1450                                                                                            engine->sub44_num_mcomps );
1451
1452
1453     /* Now choose best 1-pel match from what approximates (not exact
1454            due to the pre-processing trick with the mean) the top 1/2 of
1455            the 2*2 matches
1456         */
1457
1458
1459         (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps,
1460                                            i0, j0,
1461                                            ilow, jlow, xmax, ymax,
1462                                            lx, h, &best );
1463
1464         /* Final polish: half-pel search of best candidate against
1465            reconstructed image.
1466         */
1467
1468         best.pos.x <<= 1;
1469         best.pos.y <<= 1;
1470         best.hx = 0;
1471         best.hy = 0;
1472
1473         ilow = best.pos.x - (best.pos.x>(ilow<<1));
1474         ihigh = best.pos.x + (best.pos.x<((ihigh)<<1));
1475         jlow = best.pos.y - (best.pos.y>(jlow<<1));
1476         jhigh =  best.pos.y+ (best.pos.y<((jhigh)<<1));
1477
1478         for (j=jlow; j<=jhigh; j++)
1479         {
1480                 for (i=ilow; i<=ihigh; i++)
1481                 {
1482                         orgblk = ref+(i>>1)+((j>>1)*lx);
1483                         if( i&1 )
1484                         {
1485                                 if( j & 1 )
1486                                         d = (*pdist1_11)(orgblk,ssblk->mb,lx,h);
1487                                 else
1488                                         d = (*pdist1_01)(orgblk,ssblk->mb,lx,h);
1489                         }
1490                         else
1491                         {
1492                                 if( j & 1 )
1493                                         d = (*pdist1_10)(orgblk,ssblk->mb,lx,h);
1494                                 else
1495                                         d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad);
1496                         }
1497                         if (d<best.sad)
1498                         {
1499                                 best.sad = d;
1500                                 best.pos.x = i;
1501                                 best.pos.y = j;
1502                                 best.blk = orgblk;
1503                                 best.hx = i&1;
1504                                 best.hy = j&1;
1505                         }
1506                 }
1507         }
1508         best.var = (*pdist2)(best.blk, ssblk->mb, lx, best.hx, best.hy, h);
1509         *res = best;
1510 }
1511
1512 /*
1513  * total absolute difference between two (16*h) blocks
1514  * including optional half pel interpolation of blk1 (hx,hy)
1515  * blk1,blk2: addresses of top left pels of both blocks
1516  * lx:        distance (in bytes) of vertically adjacent pels
1517  * hx,hy:     flags for horizontal and/or vertical interpolation
1518  * h:         height of block (usually 8 or 16)
1519  * distlim:   bail out if sum exceeds this value
1520  */
1521
1522 /* A.Stevens 2000: New version for highly pipelined CPUs where branching is
1523    costly.  Really it sucks that C doesn't define a stdlib abs that could
1524    be realised as a compiler intrinsic using appropriate CPU instructions.
1525    That 1970's heritage...
1526 */
1527
1528
1529 static int dist1_00(uint8_t *blk1,uint8_t *blk2,
1530                                         int lx, int h,int distlim)
1531 {
1532         uint8_t *p1,*p2;
1533         int j;
1534         int s;
1535         register int v;
1536
1537         s = 0;
1538         p1 = blk1;
1539         p2 = blk2;
1540         for (j=0; j<h; j++)
1541         {
1542
1543 #define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
1544                 pipestep(0);  pipestep(1);  pipestep(2);  pipestep(3);
1545                 pipestep(4);  pipestep(5);  pipestep(6);  pipestep(7);
1546                 pipestep(8);  pipestep(9);  pipestep(10); pipestep(11);
1547                 pipestep(12); pipestep(13); pipestep(14); pipestep(15);
1548 #undef pipestep
1549
1550                 if (s >= distlim)
1551                         break;
1552
1553                 p1+= lx;
1554                 p2+= lx;
1555         }
1556         return s;
1557 }
1558
1559 static int dist1_01(uint8_t *blk1,uint8_t *blk2,
1560                                         int lx, int h)
1561 {
1562         uint8_t *p1,*p2;
1563         int i,j;
1564         int s;
1565         register int v;
1566
1567         s = 0;
1568         p1 = blk1;
1569         p2 = blk2;
1570         for (j=0; j<h; j++)
1571         {
1572                 for (i=0; i<16; i++)
1573                 {
1574
1575                         v = ((unsigned int)(p1[i]+p1[i+1])>>1) - p2[i];
1576                         /*
1577                           v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
1578                         */
1579                         s+=abs(v);
1580                 }
1581                 p1+= lx;
1582                 p2+= lx;
1583         }
1584         return s;
1585 }
1586
1587 static int dist1_10(uint8_t *blk1,uint8_t *blk2,
1588                                         int lx, int h)
1589 {
1590         uint8_t *p1,*p1a,*p2;
1591         int i,j;
1592         int s;
1593         register int v;
1594
1595         s = 0;
1596         p1 = blk1;
1597         p2 = blk2;
1598         p1a = p1 + lx;
1599         for (j=0; j<h; j++)
1600         {
1601                 for (i=0; i<16; i++)
1602                 {
1603                         v = ((unsigned int)(p1[i]+p1a[i])>>1) - p2[i];
1604                         s+=abs(v);
1605                 }
1606                 p1 = p1a;
1607                 p1a+= lx;
1608                 p2+= lx;
1609         }
1610
1611         return s;
1612 }
1613
1614 static int dist1_11(uint8_t *blk1,uint8_t *blk2,
1615                                         int lx, int h)
1616 {
1617         uint8_t *p1,*p1a,*p2;
1618         int i,j;
1619         int s;
1620         register int v;
1621
1622         s = 0;
1623         p1 = blk1;
1624         p2 = blk2;
1625         p1a = p1 + lx;
1626
1627         for (j=0; j<h; j++)
1628         {
1629                 for (i=0; i<16; i++)
1630                 {
1631                         v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1])>>2) - p2[i];
1632                         s+=abs(v);
1633                 }
1634                 p1 = p1a;
1635                 p1a+= lx;
1636                 p2+= lx;
1637         }
1638         return s;
1639 }
1640
1641 /* USED only during debugging...
1642 void check_fast_motion_data(uint8_t *blk, char *label )
1643 {
1644   uint8_t *b, *nb;
1645   uint8_t *pb,*p;
1646   uint8_t *qb;
1647   uint8_t *start_s22blk, *start_s44blk;
1648   int i;
1649   unsigned int mismatch;
1650   int nextfieldline;
1651   start_s22blk = (blk+height*width);
1652   start_s44blk = (blk+height*width+(height>>1)*(width>>1));
1653
1654   if (pict_struct==FRAME_PICTURE)
1655         {
1656           nextfieldline = width;
1657         }
1658   else
1659         {
1660           nextfieldline = 2*width;
1661         }
1662
1663         mismatch = 0;
1664         pb = start_s22blk;
1665         qb = start_s44blk;
1666         p = blk;
1667         while( pb < qb )
1668         {
1669                 for( i = 0; i < nextfieldline/2; ++i )
1670                 {
1671                         mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
1672                         p += 2;
1673                         ++pb;
1674                 }
1675                 p += nextfieldline;
1676         }
1677                 if( mismatch )
1678                         {
1679                                 printf( "Mismatch detected check %s for buffer %08x\n", label,  blk );
1680                                         exit(1);
1681                         }
1682 }
1683 */
1684
1685 /*
1686    Append fast motion estimation data to original luminance
1687    data.  N.b. memory allocation for luminance data allows space
1688    for this information...
1689  */
1690
1691 void fast_motion_data(uint8_t *blk, int picture_struct )
1692 {
1693         uint8_t *b, *nb;
1694         uint8_t *pb;
1695         uint8_t *qb;
1696         uint8_t *start_s22blk, *start_s44blk;
1697         uint16_t *start_rowblk, *start_colblk;
1698         int i;
1699         int nextfieldline;
1700 #ifdef TEST_RCSEARCH
1701         uint16_t *pc, *pr,*p;
1702         int rowsum;
1703         int j,s;
1704         int down16 = width*16;
1705         uint16_t sums[32];
1706         uint16_t rowsums[2048];
1707         uint16_t colsums[2048];  /* TODO: BUG: should resize with width */
1708 #endif
1709
1710
1711         /* In an interlaced field the "next" line is 2 width's down
1712            rather than 1 width down                                 */
1713
1714         if (picture_struct==FRAME_PICTURE)
1715         {
1716                 nextfieldline = width;
1717         }
1718         else
1719         {
1720                 nextfieldline = 2*width;
1721         }
1722
1723         start_s22blk   = blk+fsubsample_offset;
1724         start_s44blk   = blk+qsubsample_offset;
1725         start_rowblk = (uint16_t *)blk+rowsums_offset;
1726         start_colblk = (uint16_t *)blk+colsums_offset;
1727         b = blk;
1728         nb = (blk+nextfieldline);
1729         /* Sneaky stuff here... we can do lines in both fields at once */
1730         pb = (uint8_t *) start_s22blk;
1731
1732         while( nb < start_s22blk )
1733         {
1734                 for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
1735                 {
1736                         /* TODO: A.Stevens this has to be the most word-length dependent
1737                            code in the world.  Better than MMX assembler though I guess... */
1738                         pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1739                         pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1740                         pb += 2;
1741                         b += 4;
1742                         nb += 4;
1743                 }
1744                 b += nextfieldline;
1745                 nb = b + nextfieldline;
1746         }
1747
1748
1749         /* Now create the 4*4 sub-sampled data from the 2*2
1750            N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
1751            original.  Albeit half as many lines and pixels...
1752         */
1753
1754         nextfieldline = nextfieldline >> 1;
1755
1756         qb = start_s44blk;
1757         b  = start_s22blk;
1758         nb = (start_s22blk+nextfieldline);
1759
1760         while( nb < start_s44blk )
1761         {
1762                 for( i = 0; i < nextfieldline/4; ++i )
1763                 {
1764                         /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */
1765                         qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1766                         qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1767                         qb += 2;
1768                         b += 4;
1769                         nb += 4;
1770                 }
1771                 b += nextfieldline;
1772                 nb = b + nextfieldline;
1773         }
1774
1775 #ifdef TEST_RCSEARCH
1776         /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
1777
1778         /*
1779           Initial row sums....
1780         */
1781         pb = blk;
1782         for(j = 0; j < height; ++j )
1783         {
1784                 rowsum = 0;
1785                 for( i = 0; i < 16; ++ i )
1786                 {
1787                         rowsum += pb[i];
1788                 }
1789                 rowsums[j] = rowsum;
1790                 pb += width;
1791         }
1792
1793         /*
1794           Initial column sums
1795         */
1796         for( i = 0; i < width; ++i )
1797         {
1798                 colsums[i] = 0;
1799         }
1800         pb = blk;
1801         for( j = 0; j < 16; ++j )
1802         {
1803                 for( i = 0; i < width; ++i )
1804                 {
1805                         colsums[i] += *pb;
1806                         ++pb;
1807                 }
1808         }
1809
1810         /* Now fill in the row/column sum tables...
1811            Note: to allow efficient construction of sum/col differences for a
1812            given position row sums are held in a *column major* array
1813            (changing y co-ordinate makes for small index changes)
1814            the col sums are held in a *row major* array
1815         */
1816
1817         pb = blk;
1818         pc = start_colblk;
1819         for(j = 0; j <32; ++j )
1820         {
1821                 pr = start_rowblk;
1822                 rowsum = rowsums[j];
1823                 for( i = 0; i < width-16; ++i )
1824                 {
1825                         pc[i] = colsums[i];
1826                         pr[j] = rowsum;
1827                         colsums[i] = (colsums[i] + pb[down16] )-pb[0];
1828                         rowsum = (rowsum + pb[16]) - pb[0];
1829                         ++pb;
1830                         pr += height;
1831                 }
1832                 pb += 16;   /* move pb on to next row... rememember we only did width-16! */
1833                 pc += width;
1834         }
1835 #endif
1836 }
1837
1838
1839 static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh)
1840 {
1841         uint8_t *p1 = s22blk1;
1842         uint8_t *p2 = s22blk2;
1843         int s = 0;
1844         int j;
1845
1846         for( j = 0; j < fh; ++j )
1847         {
1848                 register int diff;
1849 #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
1850                 pipestep(0); pipestep(1);
1851                 pipestep(2); pipestep(3);
1852                 pipestep(4); pipestep(5);
1853                 pipestep(6); pipestep(7);
1854                 p1 += flx;
1855                 p2 += flx;
1856 #undef pipestep
1857         }
1858
1859         return s;
1860 }
1861
1862
1863
1864 /*
1865   Sum absolute differences for 4*4 sub-sampled data.
1866
1867   TODO: currently assumes  only 16*16 or 16*8 motion compensation will be used...
1868   I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
1869  */
1870
1871
1872 static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh)
1873 {
1874         register uint8_t *p1 = s44blk1;
1875         register uint8_t *p2 = s44blk2;
1876         int s = 0;
1877         register int diff;
1878
1879         /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
1880 #define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
1881         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1882         if( qh > 1 )
1883         {
1884                 p1 += qlx; p2 += qlx;
1885                 pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1886                 if( qh > 2 )
1887                 {
1888                         p1 += qlx; p2 += qlx;
1889                         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1890                         p1 += qlx; p2 += qlx;
1891                         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1892                 }
1893         }
1894
1895
1896         return s;
1897 }
1898
1899 /*
1900  * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
1901  * blk1,blk2: addresses of top left pels of both blocks
1902  * lx:        distance (in bytes) of vertically adjacent pels
1903  * h:         height of block (usually 8 or 16)
1904  */
1905
1906 static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h)
1907 {
1908         uint8_t *p1 = blk1, *p2 = blk2;
1909         int i,j,v;
1910         int s = 0;
1911         for (j=0; j<h; j++)
1912         {
1913                 for (i=0; i<8; i++)
1914                 {
1915                         v = p1[i] - p2[i];
1916                         s+= v*v;
1917                 }
1918                 p1+= lx;
1919                 p2+= lx;
1920         }
1921         return s;
1922 }
1923
1924 /* total squared difference between bidirection prediction of (8*h)
1925  * blocks of 2*2 sub-sampled pels and reference
1926  * blk1f, blk1b,blk2: addresses of top left
1927  * pels of blocks
1928  * lx: distance (in bytes) of vertically adjacent
1929  * pels
1930  * h: height of block (usually 4 or 8)
1931  */
1932
1933 static int bdist2_22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2,
1934                                          int lx, int h)
1935 {
1936         uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
1937         int i,j,v;
1938         int s = 0;
1939         for (j=0; j<h; j++)
1940         {
1941                 for (i=0; i<8; i++)
1942                 {
1943                         v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
1944                         s+= v*v;
1945                 }
1946                 p1f+= lx;
1947                 p1b+= lx;
1948                 p2+= lx;
1949         }
1950         return s;
1951 }
1952
1953 /*
1954  * total squared difference between two (16*h) blocks
1955  * including optional half pel interpolation of blk1 (hx,hy)
1956  * blk1,blk2: addresses of top left pels of both blocks
1957  * lx:        distance (in bytes) of vertically adjacent pels
1958  * hx,hy:     flags for horizontal and/or vertical interpolation
1959  * h:         height of block (usually 8 or 16)
1960  */
1961
1962
1963 static int dist2(blk1,blk2,lx,hx,hy,h)
1964         uint8_t *blk1,*blk2;
1965         int lx,hx,hy,h;
1966 {
1967         uint8_t *p1,*p1a,*p2;
1968         int i,j;
1969         int s,v;
1970
1971         s = 0;
1972         p1 = blk1;
1973         p2 = blk2;
1974         if (!hx && !hy)
1975                 for (j=0; j<h; j++)
1976                 {
1977                         for (i=0; i<16; i++)
1978                         {
1979                                 v = p1[i] - p2[i];
1980                                 s+= v*v;
1981                         }
1982                         p1+= lx;
1983                         p2+= lx;
1984                 }
1985         else if (hx && !hy)
1986                 for (j=0; j<h; j++)
1987                 {
1988                         for (i=0; i<16; i++)
1989                         {
1990                                 v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
1991                                 s+= v*v;
1992                         }
1993                         p1+= lx;
1994                         p2+= lx;
1995                 }
1996         else if (!hx && hy)
1997         {
1998                 p1a = p1 + lx;
1999                 for (j=0; j<h; j++)
2000                 {
2001                         for (i=0; i<16; i++)
2002                         {
2003                                 v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
2004                                 s+= v*v;
2005                         }
2006                         p1 = p1a;
2007                         p1a+= lx;
2008                         p2+= lx;
2009                 }
2010         }
2011         else /* if (hx && hy) */
2012         {
2013                 p1a = p1 + lx;
2014                 for (j=0; j<h; j++)
2015                 {
2016                         for (i=0; i<16; i++)
2017                         {
2018                                 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
2019                                 s+= v*v;
2020                         }
2021                         p1 = p1a;
2022                         p1a+= lx;
2023                         p2+= lx;
2024                 }
2025         }
2026
2027         return s;
2028 }
2029
2030
2031 /*
2032  * absolute difference error between a (16*h) block and a bidirectional
2033  * prediction
2034  *
2035  * p2: address of top left pel of block
2036  * pf,hxf,hyf: address and half pel flags of forward ref. block
2037  * pb,hxb,hyb: address and half pel flags of backward ref. block
2038  * h: height of block
2039  * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2040  */
2041
2042
2043 static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2044         uint8_t *pf,*pb,*p2;
2045         int lx,hxf,hyf,hxb,hyb,h;
2046 {
2047         uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2048         int i,j;
2049         int s,v;
2050
2051         pfa = pf + hxf;
2052         pfb = pf + lx*hyf;
2053         pfc = pfb + hxf;
2054
2055         pba = pb + hxb;
2056         pbb = pb + lx*hyb;
2057         pbc = pbb + hxb;
2058
2059         s = 0;
2060
2061         for (j=0; j<h; j++)
2062         {
2063                 for (i=0; i<16; i++)
2064                 {
2065                         v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2066                                   ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2067                                 - *p2++;
2068                         s += abs(v);
2069                 }
2070                 p2+= lx-16;
2071                 pf+= lx-16;
2072                 pfa+= lx-16;
2073                 pfb+= lx-16;
2074                 pfc+= lx-16;
2075                 pb+= lx-16;
2076                 pba+= lx-16;
2077                 pbb+= lx-16;
2078                 pbc+= lx-16;
2079         }
2080
2081         return s;
2082 }
2083
2084 /*
2085  * squared error between a (16*h) block and a bidirectional
2086  * prediction
2087  *
2088  * p2: address of top left pel of block
2089  * pf,hxf,hyf: address and half pel flags of forward ref. block
2090  * pb,hxb,hyb: address and half pel flags of backward ref. block
2091  * h: height of block
2092  * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2093  */
2094
2095
2096 static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2097         uint8_t *pf,*pb,*p2;
2098         int lx,hxf,hyf,hxb,hyb,h;
2099 {
2100         uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2101         int i,j;
2102         int s,v;
2103
2104         pfa = pf + hxf;
2105         pfb = pf + lx*hyf;
2106         pfc = pfb + hxf;
2107
2108         pba = pb + hxb;
2109         pbb = pb + lx*hyb;
2110         pbc = pbb + hxb;
2111
2112         s = 0;
2113
2114         for (j=0; j<h; j++)
2115         {
2116                 for (i=0; i<16; i++)
2117                 {
2118                         v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2119                                   ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2120                                 - *p2++;
2121                         s+=v*v;
2122                 }
2123                 p2+= lx-16;
2124                 pf+= lx-16;
2125                 pfa+= lx-16;
2126                 pfb+= lx-16;
2127                 pfc+= lx-16;
2128                 pb+= lx-16;
2129                 pba+= lx-16;
2130                 pbb+= lx-16;
2131                 pbc+= lx-16;
2132         }
2133
2134         return s;
2135 }
2136
2137
2138 /*
2139  * variance of a (size*size) block, multiplied by 256
2140  * p:  address of top left pel of block
2141  * lx: distance (in bytes) of vertically adjacent pels
2142  */
2143 static int variance(uint8_t *p, int size,       int lx)
2144 {
2145         int i,j;
2146         unsigned int v,s,s2;
2147
2148         s = s2 = 0;
2149
2150         for (j=0; j<size; j++)
2151         {
2152                 for (i=0; i<size; i++)
2153                 {
2154                         v = *p++;
2155                         s+= v;
2156                         s2+= v*v;
2157                 }
2158                 p+= lx-size;
2159         }
2160         return s2 - (s*s)/(size*size);
2161 }
2162
2163 /*
2164   Compute the variance of the residual of uni-directionally motion
2165   compensated block.
2166  */
2167
2168 static int unidir_pred_var( const mb_motion_s *motion,
2169                                                         uint8_t *mb,
2170                                                         int lx,
2171                                                         int h)
2172 {
2173         return (*pdist2)(motion->blk, mb, lx, motion->hx, motion->hy, h);
2174 }
2175
2176
2177 /*
2178   Compute the variance of the residual of bi-directionally motion
2179   compensated block.
2180  */
2181
2182 static int bidir_pred_var( const mb_motion_s *motion_f,
2183                                                    const mb_motion_s *motion_b,
2184                                                    uint8_t *mb,
2185                                                    int lx, int h)
2186 {
2187         return (*pbdist2)( motion_f->blk, motion_b->blk,
2188                                            mb, lx,
2189                                            motion_f->hx, motion_f->hy,
2190                                            motion_b->hx, motion_b->hy,
2191                                            h);
2192 }
2193
2194 /*
2195   Compute SAD for bi-directionally motion compensated blocks...
2196  */
2197
2198 static int bidir_pred_sad( const mb_motion_s *motion_f,
2199                                                    const mb_motion_s *motion_b,
2200                                                    uint8_t *mb,
2201                                                    int lx, int h)
2202 {
2203         return (*pbdist1)(motion_f->blk, motion_b->blk,
2204                                          mb, lx,
2205                                          motion_f->hx, motion_f->hy,
2206                                          motion_b->hx, motion_b->hy,
2207                                          h);
2208 }
2209
2210 static void frame_ME(motion_engine_t *engine,
2211                                         pict_data_s *picture,
2212                                          motion_comp_s *mc,
2213                                          int mb_row_start,
2214                                          int i, int j,
2215                                          mbinfo_s *mbi)
2216 {
2217         mb_motion_s framef_mc;
2218         mb_motion_s frameb_mc;
2219         mb_motion_s dualpf_mc;
2220         mb_motion_s topfldf_mc;
2221         mb_motion_s botfldf_mc;
2222         mb_motion_s topfldb_mc;
2223         mb_motion_s botfldb_mc;
2224
2225         int var,v0;
2226         int vmc,vmcf,vmcr,vmci;
2227         int vmcfieldf,vmcfieldr,vmcfieldi;
2228         subsampled_mb_s ssmb;
2229         int imins[2][2],jmins[2][2];
2230         int imindmv,jmindmv,vmc_dp;
2231 //printf("frame_ME 1\n");
2232
2233
2234         /* A.Stevens fast motion estimation data is appended to actual
2235            luminance information
2236         */
2237         ssmb.mb = mc->cur[0] + mb_row_start + i;
2238         ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2));
2239         ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2));
2240         ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset +
2241                                                   ((i>>1) + (mb_row_start>>2)));
2242         ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset +
2243                                                   (i>>2) + (mb_row_start>>4));
2244
2245         /* Compute variance MB as a measure of Intra-coding complexity
2246            We include chrominance information here, scaled to compensate
2247            for sub-sampling.  Silly MPEG forcing chrom/lum to have same
2248            quantisations...
2249          */
2250         var = variance(ssmb.mb,16,width);
2251
2252 //printf("motion %d\n", picture->pict_type);
2253         if (picture->pict_type==I_TYPE)
2254         {
2255                 mbi->mb_type = MB_INTRA;
2256         }
2257         else if (picture->pict_type==P_TYPE)
2258         {
2259                 /* For P pictures we take into account chrominance. This
2260                    provides much better performance at scene changes */
2261                 var += chrom_var_sum(&ssmb,16,width);
2262
2263                 if (picture->frame_pred_dct)
2264                 {
2265                         fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2266                                            width,i,j,mc->sxf,mc->syf,16,width,height,
2267                                             &framef_mc);
2268                         framef_mc.fieldoff = 0;
2269                         vmc = framef_mc.var +
2270                                 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2271                         mbi->motion_type = MC_FRAME;
2272                 }
2273                 else
2274                 {
2275 //printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
2276                         frame_estimate(engine, mc->oldorg[0],
2277                                 mc->oldref[0],
2278                                 &ssmb,
2279                                 i,
2280                                 j,
2281                                 mc->sxf,
2282                                 mc->syf,
2283                                 &framef_mc,
2284                                 &topfldf_mc,
2285                                 &botfldf_mc,
2286                                 imins,
2287                                 jmins);
2288 //printf("frame_ME 3\n");
2289                         vmcf = framef_mc.var +
2290                                 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2291                         vmcfieldf =
2292                                 topfldf_mc.var +
2293                                 unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) +
2294                                 botfldf_mc.var +
2295                                 unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 );
2296                         /* DEBUG DP currently disabled... */
2297 //                      if ( M==1)
2298 //                      {
2299 //                              dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
2300 //                                                               i,j>>1,imins,jmins,
2301 //                                                               &dualpf_mc,
2302 //                                                               &imindmv,&jmindmv, &vmc_dp);
2303 //                      }
2304
2305                         /* NOTE: Typically M =3 so DP actually disabled... */
2306                         /* select between dual prime, frame and field prediction */
2307 //                      if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
2308 //                      {
2309 //                              mbi->motion_type = MC_DMV;
2310 //                              /* No chrominance squared difference  measure yet.
2311 //                                 Assume identical to luminance */
2312 //                              vmc = vmc_dp + vmc_dp;
2313 //                      }
2314 //                      else
2315                         if ( vmcf < vmcfieldf)
2316                         {
2317                                 mbi->motion_type = MC_FRAME;
2318                                 vmc = vmcf;
2319
2320                         }
2321                         else
2322                         {
2323                                 mbi->motion_type = MC_FIELD;
2324                                 vmc = vmcfieldf;
2325                         }
2326                 }
2327
2328
2329                 /* select between intra or non-intra coding:
2330                  *
2331                  * selection is based on intra block variance (var) vs.
2332                  * prediction error variance (vmc)
2333                  *
2334                  * Used to be: blocks with small prediction error are always
2335                  * coded non-intra even if variance is smaller (is this reasonable?
2336                  *
2337                  * TODO: A.Stevens Jul 2000
2338                  * The bbmpeg guys have found this to be *unreasonable*.
2339                  * I'm not sure I buy their solution using vmc*2.  It is probabably
2340                  * the vmc>= 9*256 test that is suspect.
2341                  *
2342                  */
2343
2344
2345                 if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ )
2346                 {
2347                         mbi->mb_type = MB_INTRA;
2348                         mbi->var = var;
2349                 }
2350
2351                 else
2352                 {
2353                         /* select between MC / No-MC
2354                          *
2355                          * use No-MC if var(No-MC) <= 1.25*var(MC)
2356                          * (i.e slightly biased towards No-MC)
2357                          *
2358                          * blocks with small prediction error are always coded as No-MC
2359                          * (requires no motion vectors, allows skipping)
2360                          */
2361                         v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16);
2362
2363                         if (4*v0>5*vmc )
2364                         {
2365                                 /* use MC */
2366                                 var = vmc;
2367                                 mbi->mb_type = MB_FORWARD;
2368                                 if (mbi->motion_type==MC_FRAME)
2369                                 {
2370                                         mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2371                                         mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2372                                 }
2373                                 else if (mbi->motion_type==MC_DMV)
2374                                 {
2375                                         /* these are FRAME vectors */
2376                                         /* same parity vector */
2377                                         mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1);
2378                                         mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1);
2379
2380                                         /* opposite parity vector */
2381                                         mbi->dmvector[0] = imindmv;
2382                                         mbi->dmvector[1] = jmindmv;
2383                                 }
2384                                 else
2385                                 {
2386                                         /* these are FRAME vectors */
2387                                         mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2388                                         mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2389                                         mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2390                                         mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2391                                         mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2392                                         mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2393                                 }
2394                         }
2395                         else
2396                         {
2397
2398                                 /* No-MC */
2399                                 var = v0;
2400                                 mbi->mb_type = 0;
2401                                 mbi->motion_type = MC_FRAME;
2402                                 mbi->MV[0][0][0] = 0;
2403                                 mbi->MV[0][0][1] = 0;
2404                         }
2405                 }
2406         }
2407         else /* if (pict_type==B_TYPE) */
2408         {
2409                 if (picture->frame_pred_dct)
2410                 {
2411                         var = variance(ssmb.mb,16,width);
2412                         /* forward */
2413                         fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2414                                            width,i,j,mc->sxf,mc->syf,
2415                                            16,width,height,
2416                                            &framef_mc
2417                                            );
2418                         framef_mc.fieldoff = 0;
2419                         vmcf = framef_mc.var;
2420
2421                         /* backward */
2422                         fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb,
2423                                            width,i,j,mc->sxb,mc->syb,
2424                                            16,width,height,
2425                                            &frameb_mc);
2426                         frameb_mc.fieldoff = 0;
2427                         vmcr = frameb_mc.var;
2428
2429                         /* interpolated (bidirectional) */
2430
2431                         vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2432
2433                         /* decisions */
2434
2435                         /* select between forward/backward/interpolated prediction:
2436                          * use the one with smallest mean sqaured prediction error
2437                          */
2438                         if (vmcf<=vmcr && vmcf<=vmci)
2439                         {
2440                                 vmc = vmcf;
2441                                 mbi->mb_type = MB_FORWARD;
2442                         }
2443                         else if (vmcr<=vmci)
2444                         {
2445                                 vmc = vmcr;
2446                                 mbi->mb_type = MB_BACKWARD;
2447                         }
2448                         else
2449                         {
2450                                 vmc = vmci;
2451                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2452                         }
2453
2454                         mbi->motion_type = MC_FRAME;
2455                 }
2456                 else
2457                 {
2458                         /* forward prediction */
2459                         frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2460                                                    i,j,mc->sxf,mc->syf,
2461                                                    &framef_mc,
2462                                                    &topfldf_mc,
2463                                                    &botfldf_mc,
2464                                                    imins,jmins);
2465
2466
2467                         /* backward prediction */
2468                         frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb,
2469                                                    i,j,mc->sxb,mc->syb,
2470                                                    &frameb_mc,
2471                                                    &topfldb_mc,
2472                                                    &botfldb_mc,
2473                                                    imins,jmins);
2474
2475                         vmcf = framef_mc.var;
2476                         vmcr = frameb_mc.var;
2477                         vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2478
2479                         vmcfieldf = topfldf_mc.var + botfldf_mc.var;
2480                         vmcfieldr = topfldb_mc.var + botfldb_mc.var;
2481                         vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb,
2482                                                                                 width<<1, 8) +
2483                                         bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb,
2484                                                                                 width<<1, 8);
2485
2486                         /* select prediction type of minimum distance from the
2487                          * six candidates (field/frame * forward/backward/interpolated)
2488                          */
2489                         if (vmci<vmcfieldi && vmci<vmcf && vmci<vmcfieldf
2490                                   && vmci<vmcr && vmci<vmcfieldr)
2491                         {
2492                                 /* frame, interpolated */
2493                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2494                                 mbi->motion_type = MC_FRAME;
2495                                 vmc = vmci;
2496                         }
2497                         else if ( vmcfieldi<vmcf && vmcfieldi<vmcfieldf
2498                                            && vmcfieldi<vmcr && vmcfieldi<vmcfieldr)
2499                         {
2500                                 /* field, interpolated */
2501                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2502                                 mbi->motion_type = MC_FIELD;
2503                                 vmc = vmcfieldi;
2504                         }
2505                         else if (vmcf<vmcfieldf && vmcf<vmcr && vmcf<vmcfieldr)
2506                         {
2507                                 /* frame, forward */
2508                                 mbi->mb_type = MB_FORWARD;
2509                                 mbi->motion_type = MC_FRAME;
2510                                 vmc = vmcf;
2511
2512                         }
2513                         else if ( vmcfieldf<vmcr && vmcfieldf<vmcfieldr)
2514                         {
2515                                 /* field, forward */
2516                                 mbi->mb_type = MB_FORWARD;
2517                                 mbi->motion_type = MC_FIELD;
2518                                 vmc = vmcfieldf;
2519                         }
2520                         else if (vmcr<vmcfieldr)
2521                         {
2522                                 /* frame, backward */
2523                                 mbi->mb_type = MB_BACKWARD;
2524                                 mbi->motion_type = MC_FRAME;
2525                                 vmc = vmcr;
2526                         }
2527                         else
2528                         {
2529                                 /* field, backward */
2530                                 mbi->mb_type = MB_BACKWARD;
2531                                 mbi->motion_type = MC_FIELD;
2532                                 vmc = vmcfieldr;
2533
2534                         }
2535                 }
2536
2537                 /* select between intra or non-intra coding:
2538                  *
2539                  * selection is based on intra block variance (var) vs.
2540                  * prediction error variance (vmc)
2541                  *
2542                  * Used to be: blocks with small prediction error are always
2543                  * coded non-intra even if variance is smaller (is this reasonable?
2544                  *
2545                  * TODO: A.Stevens Jul 2000
2546                  * The bbmpeg guys have found this to be *unreasonable*.
2547                  * I'm not sure I buy their solution using vmc*2 in the first comparison.
2548                  * It is probabably the vmc>= 9*256 test that is suspect.
2549                  *
2550                  */
2551                 if (vmc>var && vmc>=9*256)
2552                         mbi->mb_type = MB_INTRA;
2553                 else
2554                 {
2555                         var = vmc;
2556                         if (mbi->motion_type==MC_FRAME)
2557                         {
2558                                 /* forward */
2559                                 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2560                                 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2561                                 /* backward */
2562                                 mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1);
2563                                 mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1);
2564                         }
2565                         else
2566                         {
2567                                 /* these are FRAME vectors */
2568                                 /* forward */
2569                                 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2570                                 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2571                                 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2572                                 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2573                                 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2574                                 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2575                                 /* backward */
2576                                 mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1);
2577                                 mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1);
2578                                 mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1);
2579                                 mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1);
2580                                 mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel;
2581                                 mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel;
2582                         }
2583                 }
2584         }
2585 }
2586
2587
2588
2589 /*
2590  * motion estimation for field pictures
2591  *
2592  * mbi:    pointer to macroblock info structure
2593  * secondfield: indicates second field of a frame (in P fields this means
2594  *              that reference field of opposite parity is in curref instead
2595  *              of oldref)
2596  * ipflag: indicates a P type field which is the second field of a frame
2597  *         in which the first field is I type (this restricts predictions
2598  *         to be based only on the opposite parity (=I) field)
2599  *
2600  * results:
2601  * mbi->
2602  *  mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
2603  *  MV[][][]: motion vectors (field format)
2604  *  mv_field_sel: top/bottom field
2605  *  motion_type: MC_FIELD, MC_16X8
2606  *
2607  * uses global vars: pict_type, pict_struct
2608  */
2609 static void field_ME(motion_engine_t *engine,
2610         pict_data_s *picture,
2611         motion_comp_s *mc,
2612         int mb_row_start,
2613         int i, int j,
2614         mbinfo_s *mbi,
2615         int secondfield, int ipflag)
2616 {
2617         int w2;
2618         uint8_t *toporg, *topref, *botorg, *botref;
2619         subsampled_mb_s ssmb;
2620         mb_motion_s fields_mc, dualp_mc;
2621         mb_motion_s fieldf_mc, fieldb_mc;
2622         mb_motion_s field8uf_mc, field8lf_mc;
2623         mb_motion_s field8ub_mc, field8lb_mc;
2624         int var,vmc,v0,dmc,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i;
2625         int imins,jmins;
2626         int dmc8f,dmc8r;
2627         int vmc_dp,dmc_dp;
2628
2629         w2 = width<<1;
2630
2631         /* Fast motion data sits at the end of the luminance buffer */
2632         ssmb.mb = mc->cur[0] + i + w2*j;
2633         ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1));
2634         ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1));
2635         ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1));
2636         ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2);
2637
2638         if (picture->pict_struct==BOTTOM_FIELD)
2639         {
2640                 ssmb.mb += width;
2641                 ssmb.umb += (width >> 1);
2642                 ssmb.vmb += (width >> 1);
2643                 ssmb.fmb += (width >> 1);
2644                 ssmb.qmb += (width >> 2);
2645         }
2646
2647         var = variance(ssmb.mb,16,w2) +
2648                 ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2;
2649
2650         if (picture->pict_type==I_TYPE)
2651                 mbi->mb_type = MB_INTRA;
2652         else if (picture->pict_type==P_TYPE)
2653         {
2654                 toporg = mc->oldorg[0];
2655                 topref = mc->oldref[0];
2656                 botorg = mc->oldorg[0] + width;
2657                 botref = mc->oldref[0] + width;
2658
2659                 if (secondfield)
2660                 {
2661                         /* opposite parity field is in same frame */
2662                         if (picture->pict_struct==TOP_FIELD)
2663                         {
2664                                 /* current is top field */
2665                                 botorg = mc->cur[0] + width;
2666                                 botref = mc->curref[0] + width;
2667                         }
2668                         else
2669                         {
2670                                 /* current is bottom field */
2671                                 toporg = mc->cur[0];
2672                                 topref = mc->curref[0];
2673                         }
2674                 }
2675
2676                 field_estimate(engine,
2677                                                 picture,
2678                                            toporg,topref,botorg,botref,&ssmb,
2679                                            i,j,mc->sxf,mc->syf,ipflag,
2680                                            &fieldf_mc,
2681                                            &field8uf_mc,
2682                                            &field8lf_mc,
2683                                            &fields_mc);
2684                 dmcfield = fieldf_mc.sad;
2685                 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2686
2687 //              if (M==1 && !ipflag)  /* generic condition which permits Dual Prime */
2688 //              {
2689 //                      dpfield_estimate(engine,
2690 //                                                      picture,
2691 //                                                       topref,botref,ssmb.mb,i,j,imins,jmins,
2692 //                                                       &dualp_mc,
2693 //                                                       &vmc_dp);
2694 //                      dmc_dp = dualp_mc.sad;
2695 //              }
2696
2697 //              /* select between dual prime, field and 16x8 prediction */
2698 //              if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
2699 //              {
2700 //                      /* Dual Prime prediction */
2701 //                      mbi->motion_type = MC_DMV;
2702 //                      dmc = dualp_mc.sad;
2703 //                      vmc = vmc_dp;
2704 //
2705 //              }
2706 //              else
2707                 if (dmc8f<dmcfield)
2708                 {
2709                         /* 16x8 prediction */
2710                         mbi->motion_type = MC_16X8;
2711                         /* upper and lower half blocks */
2712                         vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2713                         vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2714                 }
2715                 else
2716                 {
2717                         /* field prediction */
2718                         mbi->motion_type = MC_FIELD;
2719                         vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 );
2720                 }
2721
2722                 /* select between intra and non-intra coding */
2723
2724                 if ( vmc>var && vmc>=9*256)
2725                         mbi->mb_type = MB_INTRA;
2726                 else
2727                 {
2728                         /* zero MV field prediction from same parity ref. field
2729                          * (not allowed if ipflag is set)
2730                          */
2731                         if (!ipflag)
2732                                 v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j,
2733                                                            ssmb.mb,w2,0,0,16);
2734                         if (ipflag  || (4*v0>5*vmc && v0>=9*256))
2735                         {
2736                                 var = vmc;
2737                                 mbi->mb_type = MB_FORWARD;
2738                                 if (mbi->motion_type==MC_FIELD)
2739                                 {
2740                                         mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2741                                         mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2742                                         mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2743                                 }
2744                                 else if (mbi->motion_type==MC_DMV)
2745                                 {
2746                                         /* same parity vector */
2747                                         mbi->MV[0][0][0] = imins - (i<<1);
2748                                         mbi->MV[0][0][1] = jmins - (j<<1);
2749
2750                                         /* opposite parity vector */
2751                                         mbi->dmvector[0] = dualp_mc.pos.x;
2752                                         mbi->dmvector[1] = dualp_mc.pos.y;
2753                                 }
2754                                 else
2755                                 {
2756                                         mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2757                                         mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2758                                         mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2759                                         mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2760                                         mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2761                                         mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2762                                 }
2763                         }
2764                         else
2765                         {
2766                                 /* No MC */
2767                                 var = v0;
2768                                 mbi->mb_type = 0;
2769                                 mbi->motion_type = MC_FIELD;
2770                                 mbi->MV[0][0][0] = 0;
2771                                 mbi->MV[0][0][1] = 0;
2772                                 mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD);
2773                         }
2774                 }
2775         }
2776         else /* if (pict_type==B_TYPE) */
2777         {
2778                 /* forward prediction */
2779                 field_estimate(engine,
2780                                                 picture,
2781                                            mc->oldorg[0],mc->oldref[0],
2782                                            mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb,
2783                                            i,j,mc->sxf,mc->syf,0,
2784                                            &fieldf_mc,
2785                                            &field8uf_mc,
2786                                            &field8lf_mc,
2787                                            &fields_mc);
2788                 dmcfieldf = fieldf_mc.sad;
2789                 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2790
2791                 /* backward prediction */
2792                 field_estimate(engine,
2793                                                 picture,
2794                                            mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width,
2795                                            &ssmb,
2796                                            i,j,mc->sxb,mc->syb,0,
2797                                            &fieldb_mc,
2798                                            &field8ub_mc,
2799                                            &field8lb_mc,
2800                                            &fields_mc);
2801                 dmcfieldr = fieldb_mc.sad;
2802                 dmc8r = field8ub_mc.sad + field8lb_mc.sad;
2803
2804                 /* calculate distances for bidirectional prediction */
2805                 /* field */
2806                 dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2807
2808                 /* 16x8 upper and lower half blocks */
2809                 dmc8i =  bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 );
2810                 dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 );
2811
2812                 /* select prediction type of minimum distance */
2813                 if (dmcfieldi<dmc8i && dmcfieldi<dmcfieldf && dmcfieldi<dmc8f
2814                         && dmcfieldi<dmcfieldr && dmcfieldi<dmc8r)
2815                 {
2816                         /* field, interpolated */
2817                         mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2818                         mbi->motion_type = MC_FIELD;
2819                         vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2820                 }
2821                 else if (dmc8i<dmcfieldf && dmc8i<dmc8f
2822                                  && dmc8i<dmcfieldr && dmc8i<dmc8r)
2823                 {
2824                         /* 16x8, interpolated */
2825                         mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2826                         mbi->motion_type = MC_16X8;
2827
2828                         /* upper and lower half blocks */
2829                         vmc =  bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8);
2830                         vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8);
2831                 }
2832                 else if (dmcfieldf<dmc8f && dmcfieldf<dmcfieldr && dmcfieldf<dmc8r)
2833                 {
2834                         /* field, forward */
2835                         mbi->mb_type = MB_FORWARD;
2836                         mbi->motion_type = MC_FIELD;
2837                         vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16);
2838                 }
2839                 else if (dmc8f<dmcfieldr && dmc8f<dmc8r)
2840                 {
2841                         /* 16x8, forward */
2842                         mbi->mb_type = MB_FORWARD;
2843                         mbi->motion_type = MC_16X8;
2844
2845                         /* upper and lower half blocks */
2846                         vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2847                         vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2848                 }
2849                 else if (dmcfieldr<dmc8r)
2850                 {
2851                         /* field, backward */
2852                         mbi->mb_type = MB_BACKWARD;
2853                         mbi->motion_type = MC_FIELD;
2854                         vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 );
2855                 }
2856                 else
2857                 {
2858                         /* 16x8, backward */
2859                         mbi->mb_type = MB_BACKWARD;
2860                         mbi->motion_type = MC_16X8;
2861
2862                         /* upper and lower half blocks */
2863                         vmc =  unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8);
2864                         vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8);
2865
2866                 }
2867
2868                 /* select between intra and non-intra coding */
2869                 if (vmc>var && vmc>=9*256)
2870                         mbi->mb_type = MB_INTRA;
2871                 else
2872                 {
2873                         var = vmc;
2874                         if (mbi->motion_type==MC_FIELD)
2875                         {
2876                                 /* forward */
2877                                 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2878                                 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2879                                 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2880                                 /* backward */
2881                                 mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1);
2882                                 mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1);
2883                                 mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel;
2884                         }
2885                         else /* MC_16X8 */
2886                         {
2887                                 /* forward */
2888                                 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2889                                 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2890                                 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2891                                 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2892                                 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2893                                 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2894                                 /* backward */
2895                                 mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1);
2896                                 mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1);
2897                                 mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel;
2898                                 mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1);
2899                                 mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1);
2900                                 mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel;
2901                         }
2902                 }
2903         }
2904
2905 }
2906
2907
2908
2909 /*
2910   Initialise motion compensation - currently purely selection of which
2911   versions of the various low level computation routines to use
2912
2913   */
2914
2915 void init_motion()
2916 {
2917         int cpucap = cpu_accel();
2918
2919         if( cpucap  == 0 )      /* No MMX/SSE etc support available */
2920         {
2921                 pdist22 = dist22;
2922                 pdist44 = dist44;
2923                 pdist1_00 = dist1_00;
2924                 pdist1_01 = dist1_01;
2925                 pdist1_10 = dist1_10;
2926                 pdist1_11 = dist1_11;
2927                 pbdist1 = bdist1;
2928                 pdist2 = dist2;
2929                 pbdist2 = bdist2;
2930                 pdist2_22 = dist2_22;
2931                 pbdist2_22 = bdist2_22;
2932                 pfind_best_one_pel = find_best_one_pel;
2933                 pbuild_sub22_mcomps     = build_sub22_mcomps;
2934          }
2935 #ifdef X86_CPU
2936         else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
2937         {
2938                 if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n");
2939                 pdist22 = dist22_mmxe;
2940                 pdist44 = dist44_mmxe;
2941                 pdist1_00 = dist1_00_mmxe;
2942                 pdist1_01 = dist1_01_mmxe;
2943                 pdist1_10 = dist1_10_mmxe;
2944                 pdist1_11 = dist1_11_mmxe;
2945                 pbdist1 = bdist1_mmx;
2946                 pdist2 = dist2_mmx;
2947                 pbdist2 = bdist2_mmx;
2948                 pdist2_22 = dist2_22_mmx;
2949                 pbdist2_22 = bdist2_22_mmx;
2950                 pfind_best_one_pel = find_best_one_pel_mmxe;
2951                 pbuild_sub22_mcomps     = build_sub22_mcomps_mmxe;
2952                 pmblock_sub44_dists = mblock_sub44_dists_mmxe;
2953
2954         }
2955         else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
2956         {
2957                 if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n");
2958                 pdist22 = dist22_mmx;
2959                 pdist44 = dist44_mmx;
2960                 pdist1_00 = dist1_00_mmx;
2961                 pdist1_01 = dist1_01_mmx;
2962                 pdist1_10 = dist1_10_mmx;
2963                 pdist1_11 = dist1_11_mmx;
2964                 pbdist1 = bdist1_mmx;
2965                 pdist2 = dist2_mmx;
2966                 pbdist2 = bdist2_mmx;
2967                 pdist2_22 = dist2_22_mmx;
2968                 pbdist2_22 = bdist2_22_mmx;
2969                 pfind_best_one_pel = find_best_one_pel;
2970                 pbuild_sub22_mcomps     = build_sub22_mcomps;
2971                 pmblock_sub44_dists = mblock_sub44_dists_mmx;
2972         }
2973 #endif
2974 }
2975
2976
2977 void motion_engine_loop(motion_engine_t *engine)
2978 {
2979         while(!engine->done)
2980         {
2981                 pthread_mutex_lock(&(engine->input_lock));
2982
2983                 if(!engine->done)
2984                 {
2985                         pict_data_s *picture = engine->pict_data;
2986                         motion_comp_s *mc_data = engine->motion_comp;
2987                         int secondfield = engine->secondfield;
2988                         int ipflag = engine->ipflag;
2989                         mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16);
2990                         int i, j;
2991                         int mb_row_incr;                        /* Offset increment to go down 1 row of mb's */
2992                         int mb_row_start;
2993
2994                         if (picture->pict_struct == FRAME_PICTURE)
2995                         {
2996                                 mb_row_incr = 16*width;
2997                                 mb_row_start = engine->start_row / 16 * mb_row_incr;
2998 /* loop through all macroblocks of a frame picture */
2999                                 for (j=engine->start_row; j < engine->end_row; j+=16)
3000                                 {
3001                                         for (i=0; i<width; i+=16)
3002                                         {
3003                                                 frame_ME(engine, picture, mc_data, mb_row_start, i,j, mbi);
3004                                                 mbi++;
3005                                         }
3006                                         mb_row_start += mb_row_incr;
3007                                 }
3008                         }
3009                         else
3010                         {
3011                                 mb_row_incr = (16 * 2) * width;
3012                                 mb_row_start = engine->start_row / 16 * mb_row_incr;
3013 /* loop through all macroblocks of a field picture */
3014                                 for (j=engine->start_row; j < engine->end_row; j+=16)
3015                                 {
3016                                         for (i=0; i<width; i+=16)
3017                                         {
3018                                                 field_ME(engine, picture, mc_data, mb_row_start, i,j,
3019                                                                  mbi,secondfield,ipflag);
3020                                                 mbi++;
3021                                         }
3022                                         mb_row_start += mb_row_incr;
3023                                 }
3024                         }
3025                 }
3026
3027                 pthread_mutex_unlock(&(engine->output_lock));
3028         }
3029 }
3030
3031
3032 void motion_estimation(pict_data_s *picture,
3033         motion_comp_s *mc_data,
3034         int secondfield, int ipflag)
3035 {
3036         int i;
3037 /* Start loop */
3038         for(i = 0; i < processors; i++)
3039         {
3040                 motion_engines[i].motion_comp = mc_data;
3041
3042                 motion_engines[i].pict_data = picture;
3043
3044                 motion_engines[i].secondfield = secondfield;
3045                 motion_engines[i].ipflag = ipflag;
3046                 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3047         }
3048
3049 /* Wait for completion */
3050         for(i = 0; i < processors; i++)
3051         {
3052                 pthread_mutex_lock(&(motion_engines[i].output_lock));
3053         }
3054 }
3055
3056 void start_motion_engines()
3057 {
3058         int i;
3059         int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
3060         int current_row = 0;
3061         pthread_attr_t  attr;
3062         pthread_mutexattr_t mutex_attr;
3063
3064         pthread_mutexattr_init(&mutex_attr);
3065         pthread_attr_init(&attr);
3066         motion_engines = calloc(1, sizeof(motion_engine_t) * processors);
3067         for(i = 0; i < processors; i++)
3068         {
3069                 motion_engines[i].start_row = current_row * 16;
3070                 current_row += rows_per_processor;
3071                 if(current_row > height2 / 16) current_row = height2 / 16;
3072                 motion_engines[i].end_row = current_row * 16;
3073                 pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr);
3074                 pthread_mutex_lock(&(motion_engines[i].input_lock));
3075                 pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr);
3076                 pthread_mutex_lock(&(motion_engines[i].output_lock));
3077                 motion_engines[i].done = 0;
3078                 pthread_create(&(motion_engines[i].tid),
3079                         &attr,
3080                         (void*)motion_engine_loop,
3081                         &motion_engines[i]);
3082         }
3083 }
3084
3085 void stop_motion_engines()
3086 {
3087         int i;
3088         for(i = 0; i < processors; i++)
3089         {
3090                 motion_engines[i].done = 1;
3091                 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3092                 pthread_join(motion_engines[i].tid, 0);
3093                 pthread_mutex_destroy(&(motion_engines[i].input_lock));
3094                 pthread_mutex_destroy(&(motion_engines[i].output_lock));
3095         }
3096         free(motion_engines);
3097 }
3098
3099