Merge CV, ver=5.1; ops/methods from HV, and interface from CV where possible
diff --git a/cinelerra-5.1/mpeg2enc/motion.c b/cinelerra-5.1/mpeg2enc/motion.c
new file mode 100644
index 0000000..7431bd9
--- /dev/null
@@ -0,0 +1,3099 @@
+/* motion.c, motion estimation                                              */
+
+/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
+
+/*
+ * Disclaimer of Warranty
+ *
+ * These software programs are available to the user without any license fee or
+ * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
+ * any and all warranties, whether express, implied, or statutory, including any
+ * implied warranties of merchantability or of fitness for a particular
+ * purpose.  In no event shall the copyright-holder be liable for any
+ * incidental, punitive, or consequential damages of any kind whatsoever
+ * arising from the use of these programs.
+ *
+ * This disclaimer of warranty extends to the user of these programs and user's
+ * customers, employees, agents, transferees, successors, and assigns.
+ *
+ * The MPEG Software Simulation Group does not represent or warrant that the
+ * programs furnished hereunder are free of infringement of any third-party
+ * patents.
+ *
+ * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
+ * are subject to royalty fees to patent holders.  Many of these patents are
+ * general enough such that they are unavoidable regardless of implementation
+ * design.
+ *
+ */
+
+#include <limits.h>
+#include <stdio.h>
+#include <string.h>
+#include "config.h"
+#include "global.h"
+#include "cpu_accel.h"
+#include "simd.h"
+
+/* Macro-block Motion compensation results record */
+
+typedef struct _blockcrd {
+       int16_t x;
+       int16_t y;
+} blockxy;
+
+struct mb_motion
+{
+       blockxy pos;        // Half-pel co-ordinates of source block
+       int sad;                        // Sum of absolute difference
+       int var;
+       uint8_t *blk ;          // Source block data (in luminance data array)
+       int hx, hy;                     // Half-pel offsets
+       int fieldsel;           // 0 = top 1 = bottom
+       int fieldoff;       // Offset from start of frame data to first line
+                                               // of field.    (top = 0, bottom = width );
+};
+
+typedef struct mb_motion mb_motion_s;
+
+
+struct subsampled_mb
+{
+       uint8_t *mb;            // One pel
+       uint8_t *fmb;           // Two-pel subsampled
+       uint8_t *qmb;           // Four-pel subsampled
+       uint8_t *umb;           // U component one-pel
+       uint8_t *vmb;       // V component one-pel
+};
+
+typedef struct subsampled_mb subsampled_mb_s;
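+
+/* A brief note on the fields above (derived from how fullsearch uses them):
+   mb is the full-resolution luma block, fmb and qmb point into the 2*2- and
+   4*4-sub-sampled luma copies (row strides width>>1 and width>>2 for frame
+   data, cf. the flx/qlx computation in fullsearch), and umb/vmb are the
+   corresponding chroma blocks, which are not further sub-sampled. */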
+
+
+static void frame_ME (motion_engine_t *engine,
+                                                                       pict_data_s *picture,
+                                                                 motion_comp_s *mc,
+                                                                 int mboffset,
+                                                                 int i, int j, struct mbinfo *mbi);
+
+static void field_ME (motion_engine_t *engine,
+                                                                       pict_data_s *picture,
+                                                                 motion_comp_s *mc,
+                                                                 int mboffset,
+                                                                 int i, int j, 
+                                                                 struct mbinfo *mbi, 
+                                                                 int secondfield, 
+                                                                 int ipflag);
+
+static void frame_estimate (motion_engine_t *engine,
+       uint8_t *org,
+        uint8_t *ref, 
+        subsampled_mb_s *ssmb,
+        int i, int j,
+        int sx, int sy, 
+         mb_motion_s *bestfr,
+         mb_motion_s *besttop,
+         mb_motion_s *bestbot,
+        int imins[2][2], int jmins[2][2]);
+
+static void field_estimate (motion_engine_t *engine,
+                                                       pict_data_s *picture,
+                                                       uint8_t *toporg,
+                                                       uint8_t *topref, 
+                                                       uint8_t *botorg, 
+                                                       uint8_t *botref,
+                                                       subsampled_mb_s *ssmb,
+                                                       int i, int j, int sx, int sy, int ipflag,
+                                                       mb_motion_s *bestfr,
+                                                       mb_motion_s *best8u,
+                                                       mb_motion_s *best8l,
+                                                       mb_motion_s *bestsp);
+
+static void dpframe_estimate (motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *ref,
+       subsampled_mb_s *ssmb,
+       int i, int j, int iminf[2][2], int jminf[2][2],
+       mb_motion_s *dpbest,
+       int *imindmvp, int *jmindmvp, 
+       int *vmcp);
+
+static void dpfield_estimate (motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *topref,
+       uint8_t *botref, 
+       uint8_t *mb,
+       int i, int j, 
+       int imins, int jmins, 
+       mb_motion_s *dpbest,
+       int *vmcp);
+
+static void fullsearch (motion_engine_t *engine, 
+       uint8_t *org, uint8_t *ref,
+       subsampled_mb_s *ssblk,
+       int lx, int i0, int j0, 
+       int sx, int sy, int h, 
+       int xmax, int ymax,
+       mb_motion_s *motion );
+
+static void find_best_one_pel( motion_engine_t *engine, 
+                                                               uint8_t *org, uint8_t *blk,
+                                                          int searched_size,
+                                                          int i0, int j0,
+                                                          int ilow, int jlow,
+                                                          int xmax, int ymax,
+                                                          int lx, int h, 
+                                                          mb_motion_s *res
+       );
+static int build_sub22_mcomps( motion_engine_t *engine, 
+                                                               int i0,  int j0, int ihigh, int jhigh, 
+                                                               int null_mc_sad,
+                                                               uint8_t *s22org,  uint8_t *s22blk, 
+                                                          int flx, int fh,  int searched_sub44_size );
+
+#ifdef X86_CPU
+static void find_best_one_pel_mmxe( motion_engine_t *engine, 
+                                                               uint8_t *org, uint8_t *blk,
+                                                          int searched_size,
+                                                          int i0, int j0,
+                                                          int ilow, int jlow,
+                                                          int xmax, int ymax,
+                                                          int lx, int h, 
+                                                          mb_motion_s *res
+       );
+
+static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0,  int j0, int ihigh, int jhigh, 
+                                                        int null_mc_sad,
+                                                        uint8_t *s22org,  uint8_t *s22blk, 
+                                                        int flx, int fh,  int searched_sub44_size );
+static int (*pmblock_sub44_dists)( uint8_t *blk,  uint8_t *ref,
+                                                       int ilow, int jlow,
+                                                       int ihigh, int jhigh, 
+                                                       int h, int rowstride, 
+                                                       int threshold,
+                                                       mc_result_s *resvec);
+#endif
+
+static int unidir_pred_var( const mb_motion_s *motion, 
+                                                       uint8_t *mb,  int lx, int h);
+static int bidir_pred_var( const mb_motion_s *motion_f,  
+                                                  const mb_motion_s *motion_b, 
+                                                  uint8_t *mb,  int lx, int h);
+static int bidir_pred_sad( const mb_motion_s *motion_f,  
+                                                  const mb_motion_s *motion_b, 
+                                                  uint8_t *mb,  int lx, int h);
+
+static int variance(  uint8_t *mb, int size, int lx);
+
+static int dist22 ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
+
+static int dist44 ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
+static int dist2_22( uint8_t *blk1, uint8_t *blk2,
+                                        int lx, int h);
+static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b, 
+                                         uint8_t *blk2,
+                                         int lx, int h);
+
+
+static void (*pfind_best_one_pel)( motion_engine_t *engine,
+                                                               uint8_t *org, uint8_t *blk,
+                                                          int searched_size,
+                                                          int i0, int j0,
+                                                          int ilow, int jlow,
+                                                          int xmax, int ymax,
+                                                          int lx, int h, 
+                                                          mb_motion_s *res
+       );
+static int (*pbuild_sub22_mcomps)( motion_engine_t *engine,
+                                                                       int i0,  int j0, int ihigh, int jhigh, 
+                                                                  int null_mc_sad,
+                                                                  uint8_t *s22org,  uint8_t *s22blk, 
+                                                                  int flx, int fh,  int searched_sub44_size );
+
+static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2,
+                                                int lx, int h);
+static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b, 
+                                                 uint8_t *blk2,
+                                                 int lx, int h);
+
+static int dist1_00( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
+static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int dist2 (uint8_t *blk1, uint8_t *blk2,
+                                                         int lx, int hx, int hy, int h);
+static int bdist2 (uint8_t *pf, uint8_t *pb,
+       uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
+static int bdist1 (uint8_t *pf, uint8_t *pb,
+                                  uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
+
+
+static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
+static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
+static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
+static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
+static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
+
+static int (*pdist2) (uint8_t *blk1, uint8_t *blk2,
+                                         int lx, int hx, int hy, int h);
+  
+  
+static int (*pbdist2) (uint8_t *pf, uint8_t *pb,
+                                          uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
+
+static int (*pbdist1) (uint8_t *pf, uint8_t *pb,
+                                          uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
+
+
+/* DEBUGGER...
+static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
+{
+       rx *= 2; ry *= 2;
+       if( motion->sad < 0 || motion->sad > 0x10000 )
+       {
+               printf( "SAD ooops %s\n", str );
+               exit(1);
+       }
+       if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
+       {
+               printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
+               exit(1);
+       }
+       if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
+       {
+               printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
+               exit(1);
+       }
+}
+
+static void init_mc( mb_motion_s *motion )
+{
+       motion->sad = -123;
+       motion->pos.x = -1000;
+       motion->pos.y = -1000;
+}
+*/
+
+/* unidir_chrom_var_sum
+   Compute the chrominance component of the combined variance of luminance
+   and chrominance information for a particular non-intra macro block after
+   unidirectional motion compensation...
+
+   Note: results are scaled to give chrominance equal weight to
+   luminance.  The variance of the luminance portion is computed
+   at the time the motion compensation is computed.
+
+   TODO: Perhaps we should compute the whole thing in fullsearch rather than
+   separately.  However, that would involve a lot of fiddling with
+   field_* and until it's thoroughly debugged and tested I think I'll
+   leave that alone. Furthermore, it is unclear if it's really worth
+   doing these computations for B *and* P frames.
+
+   TODO: BUG: ONLY works for 420 video...
+
+*/
+
+static int unidir_chrom_var_sum( mb_motion_s *lum_mc, 
+                                                         uint8_t **ref, 
+                                                         subsampled_mb_s *ssblk,
+                                                         int lx, int h )
+{
+       int uvlx = (lx>>1);
+       int uvh = (h>>1);
+       /* N.b. MC co-ordinates are computed in half-pel units! */
+       int cblkoffset = (lum_mc->fieldoff>>1) +
+               (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx;
+       
+       return  ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) +
+                        (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2;
+}
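+
+/* Worked example (illustrative numbers only) of the chroma offset arithmetic
+   above: for a luma MC position of pos.x = 38, pos.y = 22 (half-pel units)
+   in the bottom field (fieldoff = width),
+       cblkoffset = (width>>1) + (38>>2) + (22>>2)*uvlx
+                  = (width>>1) + 9 + 5*uvlx
+   i.e. the 4:2:0 chroma block starts 9 chroma pels across and 5 chroma
+   lines down from the start of the bottom chroma field; the half-pel
+   fraction is simply truncated rather than interpolated. */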
+
+/*
+  bidir_chrom_var_sum
+   Compute the chrominance component of the combined variance of luminance
+   and chrominance information for a particular non-intra macro block after
+   bidirectional motion compensation...
+
+   Note: results are scaled to give chrominance equal weight to
+   luminance.  The variance of the luminance portion is computed
+   at the time the motion compensation is computed.
+
+  TODO: BUG: ONLY works for 420 video...
+
+  NOTE: Currently unused but may be required if it turns out that taking
+  chrominance into account in B frames is needed.
+
+ */
+
+int bidir_chrom_var_sum( mb_motion_s *lum_mc_f, 
+                                          mb_motion_s *lum_mc_b, 
+                                          uint8_t **ref_f, 
+                                          uint8_t **ref_b,
+                                          subsampled_mb_s *ssblk,
+                                          int lx, int h )
+{
+       int uvlx = (lx>>1);
+       int uvh = (h>>1);
+       /* N.b. MC co-ordinates are computed in half-pel units! */
+       int cblkoffset_f = (lum_mc_f->fieldoff>>1) + 
+               (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
+       int cblkoffset_b = (lum_mc_b->fieldoff>>1) + 
+               (lum_mc_b->pos.x>>2) + (lum_mc_b->pos.y>>2)*uvlx;
+       
+       return  (
+               (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b,
+                                          ssblk->umb, uvlx, uvh ) +
+               (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b,
+                                          ssblk->vmb, uvlx, uvh ))*2;
+
+}
+
+static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx )
+{
+       return (variance(ssblk->umb,(h>>1),(lx>>1)) + 
+                       variance(ssblk->vmb,(h>>1),(lx>>1))) * 2;
+}
+
+/*
+ * frame picture motion estimation
+ *
+ * org: top left pel of source reference frame
+ * ref: top left pel of reconstructed reference frame
+ * ssmb:  macroblock to be matched
+ * i,j: location of mb relative to ref (=center of search window)
+ * sx,sy: half widths of search window
+ * bestfr: location and SAD of best frame prediction
+ * besttop: location of best field pred. for top field of mb
+ * bestbot: location of best field pred. for bottom field of mb
+ */
+
+static void frame_estimate(motion_engine_t *engine, 
+       uint8_t *org,
+       uint8_t *ref,
+       subsampled_mb_s *ssmb,
+       int i, int j, int sx, int sy,
+       mb_motion_s *bestfr,
+       mb_motion_s *besttop,
+       mb_motion_s *bestbot,
+       int imins[2][2],
+       int jmins[2][2]
+       )
+{
+       subsampled_mb_s  botssmb;
+       mb_motion_s topfld_mc;
+       mb_motion_s botfld_mc;
+
+       botssmb.mb = ssmb->mb+width;
+       botssmb.fmb = ssmb->fmb+(width>>1);
+       botssmb.qmb = ssmb->qmb+(width>>2);
+       botssmb.umb = ssmb->umb+(width>>1);
+       botssmb.vmb = ssmb->vmb+(width>>1);
+
+       /* frame prediction */
+       fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height,
+                                                 bestfr );
+       bestfr->fieldsel = 0;
+       bestfr->fieldoff = 0;
+
+       /* predict top field from top field */
+       fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
+                          &topfld_mc);
+
+       /* predict top field from bottom field */
+       fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8,
+                          width,height>>1, &botfld_mc);
+
+       /* set correct field selectors... */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
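+       /* imins/jmins are indexed [reference field parity][predicted field]:
+          row 0 = predicted from the top reference field, row 1 = from the
+          bottom reference field; column 0 = top field of the macroblock,
+          column 1 = bottom field (filled in by the second block below). */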
+       imins[0][0] = topfld_mc.pos.x;
+       jmins[0][0] = topfld_mc.pos.y;
+       imins[1][0] = botfld_mc.pos.x;
+       jmins[1][0] = botfld_mc.pos.y;
+
+       /* select prediction for top field */
+       if (topfld_mc.sad<=botfld_mc.sad)
+       {
+               *besttop = topfld_mc;
+       }
+       else
+       {
+               *besttop = botfld_mc;
+       }
+
+       /* predict bottom field from top field */
+       fullsearch(engine, org,ref,&botssmb,
+                                       width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
+                                       &topfld_mc);
+
+       /* predict bottom field from bottom field */
+       fullsearch(engine, org+width,ref+width,&botssmb,
+                                       width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
+                                       &botfld_mc);
+
+       /* set correct field selectors... */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       imins[0][1] = topfld_mc.pos.x;
+       jmins[0][1] = topfld_mc.pos.y;
+       imins[1][1] = botfld_mc.pos.x;
+       jmins[1][1] = botfld_mc.pos.y;
+
+       /* select prediction for bottom field */
+       if (botfld_mc.sad<=topfld_mc.sad)
+       {
+               *bestbot = botfld_mc;
+       }
+       else
+       {
+               *bestbot = topfld_mc;
+       }
+}
+
+/*
+ * field picture motion estimation subroutine
+ *
+ * toporg: address of original top reference field
+ * topref: address of reconstructed top reference field
+ * botorg: address of original bottom reference field
+ * botref: address of reconstructed bottom reference field
+ * ssmb:  macroblock to be matched
+ * i,j: location of mb (=center of search window)
+ * sx,sy: half width/height of search window
+ *
+ * bestfld: location and distance of best field prediction
+ * best8u: location of best 16x8 pred. for upper half of mb
+ * best8l: location of best 16x8 pred. for lower half of mb
+ * bestsp: location and distance of best same parity field
+ *                    prediction (needed for dual prime, only valid if
+ *                    ipflag==0)
+ */
+
+static void field_estimate (motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *toporg,
+       uint8_t *topref, 
+       uint8_t *botorg, 
+       uint8_t *botref,
+       subsampled_mb_s *ssmb,
+       int i, int j, int sx, int sy, int ipflag,
+       mb_motion_s *bestfld,
+       mb_motion_s *best8u,
+       mb_motion_s *best8l,
+       mb_motion_s *bestsp)
+
+{
+       mb_motion_s topfld_mc;
+       mb_motion_s botfld_mc;
+       int dt, db;
+       int notop, nobot;
+       subsampled_mb_s botssmb;
+
+       botssmb.mb = ssmb->mb+width;
+       botssmb.umb = ssmb->umb+(width>>1);
+       botssmb.vmb = ssmb->vmb+(width>>1);
+       botssmb.fmb = ssmb->fmb+(width>>1);
+       botssmb.qmb = ssmb->qmb+(width>>2);
+
+       /* if ipflag is set, predict from field of opposite parity only */
+       notop = ipflag && (picture->pict_struct==TOP_FIELD);
+       nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD);
+
+       /* field prediction */
+
+       /* predict current field from top field */
+       if (notop)
+               topfld_mc.sad = dt = 65536; /* infinity */
+       else
+               fullsearch(engine, toporg,topref,ssmb,width<<1,
+                                  i,j,sx,sy>>1,16,width,height>>1,
+                                  &topfld_mc);
+       dt = topfld_mc.sad;
+       /* predict current field from bottom field */
+       if (nobot)
+               botfld_mc.sad = db = 65536; /* infinity */
+       else
+               fullsearch(engine, botorg,botref,ssmb,width<<1,
+                                  i,j,sx,sy>>1,16,width,height>>1,
+                                  &botfld_mc);
+       db = botfld_mc.sad;
+       /* Set correct field selectors */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       /* same parity prediction (only valid if ipflag==0) */
+       if (picture->pict_struct==TOP_FIELD)
+       {
+               *bestsp = topfld_mc;
+       }
+       else
+       {
+               *bestsp = botfld_mc;
+       }
+
+       /* select field prediction */
+       if (dt<=db)
+       {
+               *bestfld = topfld_mc;
+       }
+       else
+       {
+               *bestfld = botfld_mc;
+       }
+
+
+       /* 16x8 motion compensation */
+
+       /* predict upper half field from top field */
+       if (notop)
+               topfld_mc.sad = dt = 65536;
+       else
+               fullsearch(engine, toporg,topref,ssmb,width<<1,
+                                  i,j,sx,sy>>1,8,width,height>>1,
+                                   &topfld_mc);
+       dt = topfld_mc.sad;
+       /* predict upper half field from bottom field */
+       if (nobot)
+               botfld_mc.sad = db = 65536;
+       else
+               fullsearch(engine, botorg,botref,ssmb,width<<1,
+                                  i,j,sx,sy>>1,8,width,height>>1,
+                                   &botfld_mc);
+       db = botfld_mc.sad;
+
+       /* Set correct field selectors */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       /* select prediction for upper half field */
+       if (dt<=db)
+       {
+               *best8u = topfld_mc;
+       }
+       else
+       {
+               *best8u = botfld_mc;
+       }
+
+       /* predict lower half field from top field */
+       /*
+         N.b. For interlaced data width<<4 (width*16) takes us 8 rows
+         down in the same field.  
+         Thus for the fast motion data (2*2
+         sub-sampled) we need to go 4 rows down in the same field.
+         This requires adding width*4 = (width<<2).
+         For the 4*4 sub-sampled motion data we need to go down 2 rows.
+         This requires adding width = width
+        
+       */
+       if (notop)
+               topfld_mc.sad = dt = 65536;
+       else
+               fullsearch(engine, toporg,topref,&botssmb,
+                                               width<<1,
+                                               i,j+8,sx,sy>>1,8,width,height>>1,
+                                  /* &imint,&jmint, &dt,*/ &topfld_mc);
+       dt = topfld_mc.sad;
+       /* predict lower half field from bottom field */
+       if (nobot)
+               botfld_mc.sad = db = 65536;
+       else
+               fullsearch(engine, botorg,botref,&botssmb,width<<1,
+                                               i,j+8,sx,sy>>1,8,width,height>>1,
+                                  /* &iminb,&jminb, &db,*/ &botfld_mc);
+       db = botfld_mc.sad;
+       /* Set correct field selectors */
+       topfld_mc.fieldsel = 0;
+       botfld_mc.fieldsel = 1;
+       topfld_mc.fieldoff = 0;
+       botfld_mc.fieldoff = width;
+
+       /* select prediction for lower half field */
+       if (dt<=db)
+       {
+               *best8l = topfld_mc;
+       }
+       else
+       {
+               *best8l = botfld_mc;
+       }
+}
+
+static void dpframe_estimate (motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *ref,
+       subsampled_mb_s *ssmb,
+       
+       int i, int j, int iminf[2][2], int jminf[2][2],
+       mb_motion_s *best_mc,
+       int *imindmvp, int *jmindmvp,
+       int *vmcp)
+{
+       int pref,ppred,delta_x,delta_y;
+       int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0;
+       int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv;
+       int vmc,local_dist;
+
+       /* Calculate Dual Prime distortions for 9 delta candidates
+        * for each of the four minimum field vectors
+        * Note: only for P pictures!
+        */
+
+       /* initialize minimum dual prime distortion to large value */
+       vmc = INT_MAX;
+
+       for (pref=0; pref<2; pref++)
+       {
+               for (ppred=0; ppred<2; ppred++)
+               {
+                       /* convert Cartesian absolute to relative motion vector
+                        * values (wrt current macroblock address (i,j)
+                        */
+                       is = iminf[pref][ppred] - (i<<1);
+                       js = jminf[pref][ppred] - (j<<1);
+
+                       if (pref!=ppred)
+                       {
+                               /* vertical field shift adjustment */
+                               if (ppred==0)
+                                       js++;
+                               else
+                                       js--;
+
+                               /* mvxs and mvys scaling*/
+                               is<<=1;
+                               js<<=1;
+                               if (picture->topfirst == ppred)
+                               {
+                                       /* second field: scale by 1/3 */
+                                       is = (is>=0) ? (is+1)/3 : -((-is+1)/3);
+                                       js = (js>=0) ? (js+1)/3 : -((-js+1)/3);
+                               }
+                               else
+                                       continue;
+                       }
+
+                       /* vector for prediction from field of opposite 'parity' */
+                       if (picture->topfirst)
+                       {
+                               /* vector for prediction of top field from bottom field */
+                               it0 = ((is+(is>0))>>1);
+                               jt0 = ((js+(js>0))>>1) - 1;
+
+                               /* vector for prediction of bottom field from top field */
+                               ib0 = ((3*is+(is>0))>>1);
+                               jb0 = ((3*js+(js>0))>>1) + 1;
+                       }
+                       else
+                       {
+                               /* vector for prediction of top field from bottom field */
+                               it0 = ((3*is+(is>0))>>1);
+                               jt0 = ((3*js+(js>0))>>1) - 1;
+
+                               /* vector for prediction of bottom field from top field */
+                               ib0 = ((is+(is>0))>>1);
+                               jb0 = ((js+(js>0))>>1) + 1;
+                       }
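+
+                       /* Worked example (illustrative numbers) of the scaling
+                          above: with topfirst set and a same-parity component
+                          is = +5 (half-pel units),
+                              it0 = (5+1)>>1  = 3   (roughly is*1/2, rounded)
+                              ib0 = (15+1)>>1 = 8   (roughly is*3/2, rounded)
+                          The 1/2 and 3/2 factors reflect the relative field
+                          distances used by dual-prime prediction; the extra
+                          -1/+1 applied to jt0/jb0 corrects for the half-line
+                          vertical offset between fields. */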
+
+                       /* convert back to absolute half-pel field picture coordinates */
+                       is += i<<1;
+                       js += j<<1;
+                       it0 += i<<1;
+                       jt0 += j<<1;
+                       ib0 += i<<1;
+                       jb0 += j<<1;
+
+                       if (is >= 0 && is <= (width-16)<<1 &&
+                               js >= 0 && js <= (height-16))
+                       {
+                               for (delta_y=-1; delta_y<=1; delta_y++)
+                               {
+                                       for (delta_x=-1; delta_x<=1; delta_x++)
+                                       {
+                                               /* opposite field coordinates */
+                                               it = it0 + delta_x;
+                                               jt = jt0 + delta_y;
+                                               ib = ib0 + delta_x;
+                                               jb = jb0 + delta_y;
+
+                                               if (it >= 0 && it <= (width-16)<<1 &&
+                                                       jt >= 0 && jt <= (height-16) &&
+                                                       ib >= 0 && ib <= (width-16)<<1 &&
+                                                       jb >= 0 && jb <= (height-16))
+                                               {
+                                                       /* compute prediction error */
+                                                       local_dist = (*pbdist2)(
+                                                               ref + (is>>1) + (width<<1)*(js>>1),
+                                                               ref + width + (it>>1) + (width<<1)*(jt>>1),
+                                                               ssmb->mb,             /* current mb location */
+                                                               width<<1,       /* adjacent line distance */
+                                                               is&1, js&1, it&1, jt&1, /* half-pel flags */
+                                                               8);             /* block height */
+                                                       local_dist += (*pbdist2)(
+                                                               ref + width + (is>>1) + (width<<1)*(js>>1),
+                                                               ref + (ib>>1) + (width<<1)*(jb>>1),
+                                                               ssmb->mb + width,     /* current mb location */
+                                                               width<<1,       /* adjacent line distance */
+                                                               is&1, js&1, ib&1, jb&1, /* half-pel flags */
+                                                               8);             /* block height */
+
+                                                       /* update delta with least distortion vector */
+                                                       if (local_dist < vmc)
+                                                       {
+                                                               imins = is;
+                                                               jmins = js;
+                                                               imint = it;
+                                                               jmint = jt;
+                                                               iminb = ib;
+                                                               jminb = jb;
+                                                               imindmv = delta_x;
+                                                               jmindmv = delta_y;
+                                                               vmc = local_dist;
+                                                       }
+                                               }
+                                       }  /* end delta x loop */
+                               } /* end delta y loop */
+                       }
+               }
+       }
+
+       /* TODO: This is now likely to be obsolete... */
+       /* Compute L1 error for decision purposes */
+       local_dist = (*pbdist1)(
+               ref + (imins>>1) + (width<<1)*(jmins>>1),
+               ref + width + (imint>>1) + (width<<1)*(jmint>>1),
+               ssmb->mb,
+               width<<1,
+               imins&1, jmins&1, imint&1, jmint&1,
+               8);
+//printf("motion 1 %p\n", pbdist1);
+       local_dist += (*pbdist1)(
+               ref + width + (imins>>1) + (width<<1)*(jmins>>1),
+               ref + (iminb>>1) + (width<<1)*(jminb>>1),
+               ssmb->mb + width,
+               width<<1,
+               imins&1, jmins&1, iminb&1, jminb&1,
+               8);
+//printf("motion 2\n");
+
+       best_mc->sad = local_dist;
+       best_mc->pos.x = imins;
+       best_mc->pos.y = jmins;
+       *vmcp = vmc;
+       *imindmvp = imindmv;
+       *jmindmvp = jmindmv;
+//printf("motion 2\n");
+}
+
+static void dpfield_estimate(motion_engine_t *engine,
+       pict_data_s *picture,
+       uint8_t *topref,
+       uint8_t *botref, 
+       uint8_t *mb,
+       int i, int j, int imins, int jmins, 
+       mb_motion_s *bestdp_mc,
+       int *vmcp
+       )
+
+{
+       uint8_t *sameref, *oppref;
+       int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0;
+       int imino,jmino,imindmv,jmindmv,vmc_dp,local_dist;
+
+       /* Calculate Dual Prime distortions for 9 delta candidates */
+       /* Note: only for P pictures! */
+
+       /* Assign opposite and same reference pointer */
+       if (picture->pict_struct==TOP_FIELD)
+       {
+               sameref = topref;    
+               oppref = botref;
+       }
+       else 
+       {
+               sameref = botref;
+               oppref = topref;
+       }
+
+       /* convert Cartesian absolute to relative motion vector
+        * values (wrt current macroblock address (i,j)
+        */
+       mvxs = imins - (i<<1);
+       mvys = jmins - (j<<1);
+
+       /* vector for prediction from field of opposite 'parity' */
+       mvxo0 = (mvxs+(mvxs>0)) >> 1;  /* mvxs / 2 */
+       mvyo0 = (mvys+(mvys>0)) >> 1;  /* mvys / 2 */
+
+                       /* vertical field shift correction */
+       if (picture->pict_struct==TOP_FIELD)
+               mvyo0--;
+       else
+               mvyo0++;
+
+                       /* convert back to absolute coordinates */
+       io0 = mvxo0 + (i<<1);
+       jo0 = mvyo0 + (j<<1);
+
+                       /* initialize minimum dual prime distortion to large value */
+       vmc_dp = 1 << 30;
+
+       for (delta_y = -1; delta_y <= 1; delta_y++)
+       {
+               for (delta_x = -1; delta_x <=1; delta_x++)
+               {
+                       /* opposite field coordinates */
+                       io = io0 + delta_x;
+                       jo = jo0 + delta_y;
+
+                       if (io >= 0 && io <= (width-16)<<1 &&
+                               jo >= 0 && jo <= (height2-16)<<1)
+                       {
+                               /* compute prediction error */
+                               local_dist = (*pbdist2)(
+                                       sameref + (imins>>1) + width2*(jmins>>1),
+                                       oppref  + (io>>1)    + width2*(jo>>1),
+                                       mb,             /* current mb location */
+                                       width2,         /* adjacent line distance */
+                                       imins&1, jmins&1, io&1, jo&1, /* half-pel flags */
+                                       16);            /* block height */
+
+                               /* update delta with least distortion vector */
+                               if (local_dist < vmc_dp)
+                               {
+                                       imino = io;
+                                       jmino = jo;
+                                       imindmv = delta_x;
+                                       jmindmv = delta_y;
+                                       vmc_dp = local_dist;
+                               }
+                       }
+               }  /* end delta x loop */
+       } /* end delta y loop */
+
+       /* Compute L1 error for decision purposes */
+       bestdp_mc->sad =
+               (*pbdist1)(
+                       sameref + (imins>>1) + width2*(jmins>>1),
+                       oppref  + (imino>>1) + width2*(jmino>>1),
+                       mb,             /* current mb location */
+                       width2,         /* adjacent line distance */
+                       imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */
+                       16);            /* block height */
+
+       bestdp_mc->pos.x = imindmv;
+       bestdp_mc->pos.y = jmindmv;
+       *vmcp = vmc_dp;
+}
+
+
+/* 
+ *   Vectors of motion compensations 
+ */
+
+/*
+       Take a vector of motion compensations and repeatedly make passes
+       discarding all elements whose sad "weight" is above the current mean weight.
+*/
+
+static void sub_mean_reduction( mc_result_s *matches, int len, 
+                                                               int times,
+                                                           int *newlen_res, int *minweight_res)
+{
+       int i,j;
+       int weight_sum;
+       int mean_weight;
+       int min_weight = 100000;
+       if( len == 0 )
+       {
+               *minweight_res = 100000;
+               *newlen_res = 0;
+               return;
+       }
+
+       for(;;)
+       {
+               weight_sum = 0;
+               for( i = 0; i < len ; ++i )
+                       weight_sum += matches[i].weight;
+               mean_weight = weight_sum / len;
+               
+               if( times <= 0)
+                       break;
+                       
+               j = 0;
+               for( i =0; i < len; ++i )
+               {
+                       if( matches[i].weight <= mean_weight )
+                       {
+                               if( times == 1)
+                               {
+                                       min_weight = matches[i].weight ;
+                               }
+                               matches[j] = matches[i];
+                               ++j;
+                       }
+               }
+               len = j;
+               --times;
+       }
+       *newlen_res = len;
+       *minweight_res = mean_weight;
+}
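+
+/* Worked example (illustrative numbers): given weights {3, 9, 5, 7, 20} and
+   times = 2, the first pass computes mean 8 and keeps {3, 5, 7}; the second
+   pass computes mean 5 and keeps {3, 5}; the loop then exits reporting
+   *newlen_res = 2 and *minweight_res = 4 (the integer mean of the survivors
+   rather than their true minimum, despite the name). */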
+
+/*
+  Build a vector of the top 4*4 sub-sampled motion compensations in the box
+  (ilow,jlow) to (ihigh,jhigh).
+
+       The algorithm is as follows:
+       1. Coarse matches on an 8*8 grid of positions are collected that fall below
+       a (very conservative) sad threshold (basically 50% more than a moving average of
+       the mean sad of such matches).
+       
+       2. The worse-than-average matches are discarded.
+       
+       3. The remaining coarse matches are expanded with the left/lower neighbouring
+       4*4 grid matches. Again, only those better than a threshold (this time the mean
+       of the 8*8 grid matches) are retained.
+       
+       4. Multiple passes are made discarding worse-than-average matches.  The number of
+       passes is specified by the user.  The default is 2 (leaving roughly 1/4 of the matches).
+       
+       The net result is very fast and finds good matches if they're to be found.  I.e. the
+       penalty over exhaustive search is pretty low.
+       
+       NOTE: The "discard below average" trick depends critically on having some variation in
+       the matches.  The slight penalty imposed for distant matches (reasonable, since the 
+       motion vectors have to be encoded) is *vital* as otherwise pathologically bad
+       performance results on highly uniform images.
+       
+       TODO: We should probably allow the user to eliminate the initial thinning of 8*8
+       grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
+
+*/
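+
+/* A worked instance (illustrative numbers only) of the candidate threshold
+   used below: a coarse match is kept only if its weight,
+   sad + |i-i0| + |j-j0|, is below
+       threshold = 6*null_mc_sad / (4*4*mc_44_red)
+   so with null_mc_sad = 3200 and mc_44_red = 2 the cut-off is
+   6*3200/32 = 600.  Raising mc_44_red therefore tightens the initial
+   thinning, and the closing sub_mean_reduction call reduces once at the
+   highest quality setting and twice otherwise (1+(mc_44_red>1) passes). */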
+
+
+static int build_sub44_mcomps(motion_engine_t *engine,
+        int ilow, int jlow, int ihigh, int jhigh, 
+                                                       int i0, int j0,
+                                                               int null_mc_sad,
+                                                       uint8_t *s44org, uint8_t *s44blk, int qlx, int qh )
+{
+       uint8_t *s44orgblk;
+       int istrt = ilow-i0;
+       int jstrt = jlow-j0;
+       int iend = ihigh-i0;
+       int jend = jhigh-j0;
+       int mean_weight;
+       int threshold;
+
+#ifdef X86_CPU
+
+       /*int rangex, rangey;
+       static int rough_num_mcomps;
+       static mc_result_s rough_mcomps[MAX_44_MATCHES];
+       int k;
+       */
+#else
+       int i,j;
+       int s1;
+       uint8_t *old_s44orgblk;
+#endif
+       /* N.b. we may ignore the right hand block of the pair going over the
+          right edge as we have carefully allocated the buffer oversize to ensure
+          no memory faults.  The later motion compensation calculations
+          performed on the results of this pass will filter out
+          out-of-range blocks...
+       */
+
+
+       engine->sub44_num_mcomps = 0;
+       
+       threshold = 6*null_mc_sad / (4*4*mc_44_red);
+       s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
+       
+       /* Exhaustive search on 4*4 sub-sampled data.  This is affordable because
+               (a)     it is only 16th of the size of the real 1-pel data 
+               (b) we ignore those matches with an sad above our threshold.    
+       */
+#ifndef X86_CPU
+
+               /* Invariant:  s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
+               s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
+               for( j = jstrt; j <= jend; j += 4 )
+               {
+                       old_s44orgblk = s44orgblk;
+                       for( i = istrt; i <= iend; i += 4 )
+                       {
+                               s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0);
+                               if( s1 < threshold )
+                               {
+                                       engine->sub44_mcomps[engine->sub44_num_mcomps].x = i;
+                                       engine->sub44_mcomps[engine->sub44_num_mcomps].y = j;
+                                       engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ;
+                                       ++engine->sub44_num_mcomps;
+                               }
+                               s44orgblk += 1;
+                       }
+                       s44orgblk = old_s44orgblk + qlx;
+               }
+                       
+#else
+
+               engine->sub44_num_mcomps
+                       = (*pmblock_sub44_dists)( s44orgblk, s44blk,
+                                                                 istrt, jstrt,
+                                                                 iend, jend, 
+                                                                 qh, qlx, 
+                                                                 threshold,
+                                                                 engine->sub44_mcomps);
+
+#endif 
+               /* If we're really pushing quality we reduce once otherwise twice. */
+                       
+               sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1),
+                                                   &engine->sub44_num_mcomps, &mean_weight);
+
+
+       return engine->sub44_num_mcomps;
+}
+
+
+/* Build a vector of the best 2*2 sub-sampled motion
+  compensations using the best 4*4 matches as starting points.  As
+  with the 4*4 matches, we don't collect them densely as they're
+  just search starting points for 1-pel search and ones that are 1 out
+  should still give better than average matches...
+
+*/
+
+
+static int build_sub22_mcomps(motion_engine_t *engine,
+               int i0,  int j0, int ihigh, int jhigh, 
+               int null_mc_sad,
+               uint8_t *s22org,  uint8_t *s22blk, 
+               int flx, int fh,  int searched_sub44_size )
+{
+       int i,k,s;
+       int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
+
+       int min_weight;
+       int ilim = ihigh-i0;
+       int jlim = jhigh-j0;
+       blockxy matchrec;
+       uint8_t *s22orgblk;
+
+       engine->sub22_num_mcomps = 0;
+       for( k = 0; k < searched_sub44_size; ++k )
+       {
+
+               matchrec.x = engine->sub44_mcomps[k].x;
+               matchrec.y = engine->sub44_mcomps[k].y;
+
+               s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
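+               /* The four iterations below visit the 2-pel-spaced positions
+                  (x,y), (x+2,y), (x,y+2), (x+2,y+2) around each surviving 4*4
+                  match: one sub-sampled pel right, then down and back left,
+                  then right again. */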
+               for( i = 0; i < 4; ++i )
+               {
+                       if( matchrec.x <= ilim && matchrec.y <= jlim )
+                       {       
+                               s = (*pdist22)( s22orgblk,s22blk,flx,fh);
+                               if( s < threshold )
+                               {
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
+                                       ++engine->sub22_num_mcomps;
+                               }
+                       }
+
+                       if( i == 1 )
+                       {
+                               s22orgblk += flx-1;
+                               matchrec.x -= 2;
+                               matchrec.y += 2;
+                       }
+                       else
+                       {
+                               s22orgblk += 1;
+                               matchrec.x += 2;
+                               
+                       }
+               }
+
+       }
+
+       
+       sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
+                                               &engine->sub22_num_mcomps, &min_weight );
+       return engine->sub22_num_mcomps;
+}
+
+#ifdef X86_CPU
+int build_sub22_mcomps_mmxe(motion_engine_t *engine,
+       int i0,  int j0, int ihigh, int jhigh, 
+       int null_mc_sad,
+       uint8_t *s22org,  uint8_t *s22blk, 
+       int flx, int fh,  int searched_sub44_size )
+{
+       int i,k,s;
+       int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
+
+       int min_weight;
+       int ilim = ihigh-i0;
+       int jlim = jhigh-j0;
+       blockxy matchrec;
+       uint8_t *s22orgblk;
+       int resvec[4];
+
+       /* TODO: The calculation of the lstrow offset really belongs in
+       asm code... */
+       int lstrow=(fh-1)*flx;
+
+       engine->sub22_num_mcomps = 0;
+       for( k = 0; k < searched_sub44_size; ++k )
+       {
+
+               matchrec.x = engine->sub44_mcomps[k].x;
+               matchrec.y = engine->sub44_mcomps[k].y;
+
+               s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
+               mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec);
+               for( i = 0; i < 4; ++i )
+               {
+                       if( matchrec.x <= ilim && matchrec.y <= jlim )
+                       {       
+                               s =resvec[i];
+                               if( s < threshold )
+                               {
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
+                                       engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
+                                       ++engine->sub22_num_mcomps;
+                               }
+                       }
+
+                       if( i == 1 )
+                       {
+                               matchrec.x -= 2;
+                               matchrec.y += 2;
+                       }
+                       else
+                       {
+                               matchrec.x += 2;
+                       }
+               }
+
+       }
+
+       
+       sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
+                                               &engine->sub22_num_mcomps, &min_weight );
+       return engine->sub22_num_mcomps;
+}
+
+#endif
+
+/*
+  Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
+       TODO: It's a bit silly to cart around absolute M/C co-ordinates that
+       eventually get turned into relative ones anyway...
+*/
+
+
+static void find_best_one_pel(motion_engine_t *engine,
+        uint8_t *org, uint8_t *blk,
+        int searched_size,
+        int i0, int j0,
+        int ilow, int jlow,
+        int xmax, int ymax,
+        int lx, int h, 
+        mb_motion_s *res
+       )
+
+{
+       int i,k;
+       int d;
+       blockxy minpos = res->pos;
+       int dmin = INT_MAX;
+       uint8_t *orgblk;
+       int penalty;
+       int init_search;
+       int init_size;
+       blockxy matchrec;
+       init_search = searched_size;
+       init_size = engine->sub22_num_mcomps;
+       for( k = 0; k < searched_size; ++k )
+       {       
+               matchrec.x = i0 + engine->sub22_mcomps[k].x;
+               matchrec.y = j0 + engine->sub22_mcomps[k].y;
+               orgblk = org + matchrec.x+lx*matchrec.y;
+               penalty = abs(matchrec.x)+abs(matchrec.y);
+
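+               /* As in build_sub22_mcomps, the four iterations below visit the
+                  1-pel positions (x,y), (x+1,y), (x,y+1), (x+1,y+1) around
+                  each surviving 2*2 match. */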
+               for( i = 0; i < 4; ++i )
+               {
+                       if( matchrec.x <= xmax && matchrec.y <= ymax )
+                       {
+               
+                               d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin);
+                               if (d<dmin)
+                               {
+                                       dmin = d;
+                                       minpos = matchrec;
+                               }
+                       }
+                       if( i == 1 )
+                       {
+                               orgblk += lx-1;
+                               matchrec.x -= 1;
+                               matchrec.y += 1;
+                       }
+                       else
+                       {
+                               orgblk += 1;
+                               matchrec.x += 1;
+                       }
+               }
+       }
+
+       res->pos = minpos;
+       res->blk = org + minpos.x+lx*minpos.y;
+       res->sad = dmin;
+
+}
+
+#ifdef X86_CPU
+void find_best_one_pel_mmxe(motion_engine_t *engine,
+       uint8_t *org, uint8_t *blk,
+       int searched_size,
+       int i0, int j0,
+       int ilow, int jlow,
+       int xmax, int ymax,
+       int lx, int h, 
+       mb_motion_s *res
+       )
+
+{
+       int i,k;
+       int d;
+       blockxy minpos = res->pos;
+       int dmin = INT_MAX;
+       uint8_t *orgblk;
+       int penalty;
+       int init_search;
+       int init_size;
+       blockxy matchrec;
+       int resvec[4];
+
+       init_search = searched_size;
+       init_size = engine->sub22_num_mcomps;
+       for( k = 0; k < searched_size; ++k )
+       {       
+               matchrec.x = i0 + engine->sub22_mcomps[k].x;
+               matchrec.y = j0 + engine->sub22_mcomps[k].y;
+               orgblk = org + matchrec.x+lx*matchrec.y;
+               penalty = abs(matchrec.x)+abs(matchrec.y);
+               
+
+               mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec);
+
+               for( i = 0; i < 4; ++i )
+               {
+                       if( matchrec.x <= xmax && matchrec.y <= ymax )
+                       {
+               
+                               d = penalty+resvec[i];
+                               if (d<dmin)
+                               {
+                                       dmin = d;
+                                       minpos = matchrec;
+                               }
+                       }
+                       if( i == 1 )
+                       {
+                               orgblk += lx-1;
+                               matchrec.x -= 1;
+                               matchrec.y += 1;
+                       }
+                       else
+                       {
+                               orgblk += 1;
+                               matchrec.x += 1;
+                       }
+               }
+       }
+
+       res->pos = minpos;
+       res->blk = org + minpos.x+lx*minpos.y;
+       res->sad = dmin;
+
+}
+#endif 
+/*
+ * full search block matching
+ *
+ * A.Stevens 2000: This is now a big misnomer.  The search is now a hierarchical/sub-sampling
+ * search, not a full search.  However, experiments have shown it is almost always optimal
+ * or very close to optimal.
+ *
+ * blk: top left pel of (16*h) block
+ * s22blk: top element of fast motion compensation block corresponding to blk
+ * h: height of block
+ * lx: distance (in bytes) of vertically adjacent pels in ref,blk
+ * org: top left pel of source reference picture
+ * ref: top left pel of reconstructed reference picture
+ * i0,j0: center of search window
+ * sx,sy: half widths of search window
+ * xmax,ymax: right/bottom limits of search area
+ * res: motion record in which the result is stored;
+ *              the position is given as a half pel offset from ref(0,0)
+ *              i.e. NOT relative to (i0,j0)
+ */
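+
+/* In outline, the search below runs in four narrowing stages:
+     1. exhaustive matching on the 4*4 sub-sampled copy of the original
+        image over the whole search window (build_sub44_mcomps);
+     2. 2*2 sub-sampled refinement around the surviving 4*4 candidates
+        (pbuild_sub22_mcomps);
+     3. 1-pel search around the surviving 2*2 candidates against the
+        reconstructed reference (pfind_best_one_pel);
+     4. half-pel polish of the single best candidate, also against the
+        reconstructed reference.
+*/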
+
+
+static void fullsearch(motion_engine_t *engine,
+       uint8_t *org,
+       uint8_t *ref,
+       subsampled_mb_s *ssblk,
+       int lx, int i0, int j0, 
+       int sx, int sy, int h,
+       int xmax, int ymax,
+       /* int *iminp, int *jminp, int *sadminp, */
+       mb_motion_s *res
+       )
+{
+       mb_motion_s best;
+       /* int imin, jmin, dmin */
+       int i,j,ilow,ihigh,jlow,jhigh;
+       int d;
+
+       /* NOTE: Surprisingly, the initial motion compensation search
+          works better when the original image, not the reference (reconstructed)
+          image, is used. 
+       */
+       uint8_t *s22org = (uint8_t*)(org+fsubsample_offset);
+       uint8_t *s44org = (uint8_t*)(org+qsubsample_offset);
+       uint8_t *orgblk;
+       int flx = lx >> 1;
+       int qlx = lx >> 2;
+       int fh = h >> 1;
+       int qh = h >> 2;
+
+       /* xmax and ymax into more useful form... */
+       xmax -= 16;
+       ymax -= h;
+  
+  
+       /* The search radii are *always* multiples of 4 to avoid messiness
+          in the initial 4*4 pel search.  This is handled by the
+          parameter checking/processing code in readparmfile() */
+  
+       /* Create a distance-order mcomps of possible motion compensations
+         based on the fast estimation data - 4*4 pel sums (4*4
+         sub-sampled) rather than actual pel's.  1/16 the size...  */
+       jlow = j0-sy;
+       jlow = jlow < 0 ? 0 : jlow;
+       jhigh =  j0+(sy-1);
+       jhigh = jhigh > ymax ? ymax : jhigh;
+       ilow = i0-sx;
+       ilow = ilow < 0 ? 0 : ilow;
+       ihigh =  i0+(sx-1);
+       ihigh = ihigh > xmax ? xmax : ihigh;
+
+       /*
+          Very rarely this may fail to find matches due to all the good
+          looking ones being over threshold. Hence we make sure we
+          fall back to a 0 motion compensation in this case.
+          
+                The sad for the 0 motion compensation is also very useful as
+                a basis for setting thresholds for rejecting really dud 4*4
+                and 2*2 sub-sampled matches.
+       */
+       memset(&best, 0, sizeof(best));
+       /* The zero-MV SAD must be computed in full (no early-out) since it
+          seeds the pruning thresholds and the fallback result below. */
+       best.sad = (*pdist1_00)(ref + i0 + j0 * lx,
+               ssblk->mb,
+               lx,
+               h,
+               INT_MAX);
+       best.pos.x = i0;
+       best.pos.y = j0;
+
+       engine->sub44_num_mcomps = build_sub44_mcomps(engine,
+                                                                        ilow, jlow, ihigh, jhigh,
+                                                                         i0, j0,
+                                                                         best.sad,
+                                                                         s44org, 
+                                                                         ssblk->qmb, qlx, qh );
+
+       
+       /* Now create a distance-ordered list of candidate motion
+          compensations based on the fast estimation data - 2*2 pel sums -
+          using the best fraction of the 4*4 estimates.  However, we cover
+          the area only coarsely... on 4-pel boundaries...  */
+
+       engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh,  jhigh, 
+                                                                                          best.sad,
+                                                                                          s22org, ssblk->fmb, flx, fh, 
+                                                                                          engine->sub44_num_mcomps );
+
+               
+       /* Now choose the best 1-pel match from what approximates (not exactly,
+          due to the pre-processing trick with the mean) the top half of
+          the 2*2 matches.
+       */
+       
+
+       (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps,
+                                          i0, j0,
+                                          ilow, jlow, xmax, ymax, 
+                                          lx, h, &best );
+
+       /* Final polish: half-pel search of best candidate against
+          reconstructed image. 
+       */
+
+       best.pos.x <<= 1; 
+       best.pos.y <<= 1;
+       best.hx = 0;
+       best.hy = 0;
+
+       ilow = best.pos.x - (best.pos.x>(ilow<<1));
+       ihigh = best.pos.x + (best.pos.x<((ihigh)<<1));
+       jlow = best.pos.y - (best.pos.y>(jlow<<1));
+       jhigh =  best.pos.y+ (best.pos.y<((jhigh)<<1));
+
+       for (j=jlow; j<=jhigh; j++)
+       {
+               for (i=ilow; i<=ihigh; i++)
+               {
+                       orgblk = ref+(i>>1)+((j>>1)*lx);
+                       if( i&1 )
+                       {
+                               if( j & 1 )
+                                       d = (*pdist1_11)(orgblk,ssblk->mb,lx,h);
+                               else
+                                       d = (*pdist1_01)(orgblk,ssblk->mb,lx,h);
+                       }
+                       else
+                       {
+                               if( j & 1 )
+                                       d = (*pdist1_10)(orgblk,ssblk->mb,lx,h);
+                               else
+                                       d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad);
+                       }
+                       if (d<best.sad)
+                       {
+                               best.sad = d;
+                               best.pos.x = i;
+                               best.pos.y = j;
+                               best.blk = orgblk;
+                               best.hx = i&1;
+                               best.hy = j&1;
+                       }
+               }
+       }
+       best.var = (*pdist2)(best.blk, ssblk->mb, lx, best.hx, best.hy, h);
+       *res = best;
+}
+
+/*
+ * total absolute difference between two (16*h) blocks
+ * including optional half pel interpolation of blk1 (hx,hy)
+ * blk1,blk2: addresses of top left pels of both blocks
+ * lx:        distance (in bytes) of vertically adjacent pels
+ * hx,hy:     flags for horizontal and/or vertical interpolation
+ * h:         height of block (usually 8 or 16)
+ * distlim:   bail out if sum exceeds this value
+ */
+
+/* A.Stevens 2000: New version for highly pipelined CPUs where branching is
+   costly.  Really it sucks that C doesn't define a stdlib abs that could
+   be realised as a compiler intrinsic using appropriate CPU instructions.
+   That 1970's heritage...
+*/
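+/* Quick reference (observational summary of the four variants below): they
+   differ only in how blk1 is sampled before the absolute difference is taken:
+     dist1_00: p1[i]                                    (no interpolation)
+     dist1_01: (p1[i]+p1[i+1])>>1                       (horizontal half-pel)
+     dist1_10: (p1[i]+p1[i+lx])>>1                      (vertical half-pel)
+     dist1_11: (p1[i]+p1[i+1]+p1[i+lx]+p1[i+lx+1])>>2   (both)
+   Only dist1_00 honours distlim; the interpolating variants always sum the
+   full 16*h block. */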
+
+
+static int dist1_00(uint8_t *blk1,uint8_t *blk2,
+                                       int lx, int h,int distlim)
+{
+       uint8_t *p1,*p2;
+       int j;
+       int s;
+       register int v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       for (j=0; j<h; j++)
+       {
+
+#define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
+               pipestep(0);  pipestep(1);  pipestep(2);  pipestep(3);
+               pipestep(4);  pipestep(5);  pipestep(6);  pipestep(7);
+               pipestep(8);  pipestep(9);  pipestep(10); pipestep(11);
+               pipestep(12); pipestep(13); pipestep(14); pipestep(15);
+#undef pipestep
+
+               if (s >= distlim)
+                       break;
+                       
+               p1+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+static int dist1_01(uint8_t *blk1,uint8_t *blk2,
+                                       int lx, int h)
+{
+       uint8_t *p1,*p2;
+       int i,j;
+       int s;
+       register int v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+
+                       v = ((unsigned int)(p1[i]+p1[i+1])>>1) - p2[i];
+                       /*
+                         v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
+                       */
+                       s+=abs(v);
+               }
+               p1+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+static int dist1_10(uint8_t *blk1,uint8_t *blk2,
+                                       int lx, int h)
+{
+       uint8_t *p1,*p1a,*p2;
+       int i,j;
+       int s;
+       register int v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       p1a = p1 + lx;
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+                       v = ((unsigned int)(p1[i]+p1a[i])>>1) - p2[i];
+                       s+=abs(v);
+               }
+               p1 = p1a;
+               p1a+= lx;
+               p2+= lx;
+       }
+
+       return s;
+}
+
+static int dist1_11(uint8_t *blk1,uint8_t *blk2,
+                                       int lx, int h)
+{
+       uint8_t *p1,*p1a,*p2;
+       int i,j;
+       int s;
+       register int v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       p1a = p1 + lx;
+         
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+                       v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1])>>2) - p2[i];
+                       s+=abs(v);
+               }
+               p1 = p1a;
+               p1a+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+/* USED only during debugging...
+void check_fast_motion_data(uint8_t *blk, char *label )
+{
+  uint8_t *b, *nb;
+  uint8_t *pb,*p;
+  uint8_t *qb;
+  uint8_t *start_s22blk, *start_s44blk;
+  int i;
+  unsigned int mismatch;
+  int nextfieldline;
+  start_s22blk = (blk+height*width);
+  start_s44blk = (blk+height*width+(height>>1)*(width>>1));
+
+  if (pict_struct==FRAME_PICTURE)
+       {
+         nextfieldline = width;
+       }
+  else
+       {
+         nextfieldline = 2*width;
+       }
+
+       mismatch = 0;
+       pb = start_s22blk;
+       qb = start_s44blk;
+       p = blk;
+       while( pb < qb )
+       {
+               for( i = 0; i < nextfieldline/2; ++i )
+               {
+                       mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
+                       p += 2;
+                       ++pb;                   
+               }
+               p += nextfieldline;
+       }
+               if( mismatch )
+                       {
+                               printf( "Mismatch detected check %s for buffer %08x\n", label,  blk );
+                                       exit(1);
+                       }
+}
+*/
+
+/* 
+   Append fast motion estimation data to original luminance
+   data.  N.b. memory allocation for luminance data allows space
+   for this information...
+ */
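+/*
+ * Rough sketch of the extended luminance buffer this routine fills in
+ * (the offsets are the globals used throughout this file):
+ *   blk + 0                 : full-resolution luminance
+ *   blk + fsubsample_offset : 2*2 sub-sampled copy (half width, half height)
+ *   blk + qsubsample_offset : 4*4 sub-sampled copy (quarter width/height)
+ * The row/column sum tables at rowsums_offset/colsums_offset are only
+ * filled in when TEST_RCSEARCH is defined.
+ */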
+
+void fast_motion_data(uint8_t *blk, int picture_struct )
+{
+       uint8_t *b, *nb;
+       uint8_t *pb;
+       uint8_t *qb;
+       uint8_t *start_s22blk, *start_s44blk;
+       uint16_t *start_rowblk, *start_colblk;
+       int i;
+       int nextfieldline;
+#ifdef TEST_RCSEARCH
+       uint16_t *pc, *pr,*p;
+       int rowsum;
+       int j,s;
+       int down16 = width*16;
+       uint16_t sums[32];
+       uint16_t rowsums[2048];
+       uint16_t colsums[2048];  /* TODO: BUG: should resize with width */
+#endif 
+  
+
+       /* In an interlaced field the "next" line is 2 widths down 
+          rather than 1 width down                                 */
+
+       if (picture_struct==FRAME_PICTURE)
+       {
+               nextfieldline = width;
+       }
+       else
+       {
+               nextfieldline = 2*width;
+       }
+
+       start_s22blk   = blk+fsubsample_offset;
+       start_s44blk   = blk+qsubsample_offset;
+       start_rowblk = (uint16_t *)blk+rowsums_offset;
+       start_colblk = (uint16_t *)blk+colsums_offset;
+       b = blk;
+       nb = (blk+nextfieldline);
+       /* Sneaky stuff here... we can do lines in both fields at once */
+       pb = (uint8_t *) start_s22blk;
+
+       while( nb < start_s22blk )
+       {
+               for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
+               {
+                       /* TODO: A.Stevens this has to be the most word-length dependent
+                          code in the world.  Better than MMX assembler though I guess... */
+                       pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
+                       pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;     
+                       pb += 2;
+                       b += 4;
+                       nb += 4;
+               }
+               b += nextfieldline;
+               nb = b + nextfieldline;
+       }
+
+
+       /* Now create the 4*4 sub-sampled data from the 2*2.
+          N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
+          original, albeit with half as many lines and pixels...
+       */
+
+       nextfieldline = nextfieldline >> 1;
+
+       qb = start_s44blk;
+       b  = start_s22blk;
+       nb = (start_s22blk+nextfieldline);
+
+       while( nb < start_s44blk )
+       {
+               for( i = 0; i < nextfieldline/4; ++i )
+               {
+                       /* TODO: BRITTLE: A.Stevens - this only works because pels are 8-bit (uint8_t) */
+                       qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
+                       qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
+                       qb += 2;
+                       b += 4;
+                       nb += 4;
+               }
+               b += nextfieldline;
+               nb = b + nextfieldline;
+       }
+
+#ifdef TEST_RCSEARCH
+       /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
+  
+       /*
+         Initial row sums....
+       */
+       pb = blk;
+       for(j = 0; j < height; ++j )
+       {
+               rowsum = 0;
+               for( i = 0; i < 16; ++ i )
+               {
+                       rowsum += pb[i];
+               }
+               rowsums[j] = rowsum;
+               pb += width;
+       }
+  
+       /*
+         Initial column sums
+       */
+       for( i = 0; i < width; ++i )
+       {
+               colsums[i] = 0;
+       }
+       pb = blk;
+       for( j = 0; j < 16; ++j )
+       {
+               for( i = 0; i < width; ++i )
+               {
+                       colsums[i] += *pb;
+                       ++pb;
+               }
+       }
+  
+       /* Now fill in the row/column sum tables...
+          Note: to allow efficient construction of row/column sum differences for a
+          given position, row sums are held in a *column major* array
+          (changing the y co-ordinate makes for small index changes), and
+          the column sums are held in a *row major* array.
+       */
+  
+       pb = blk;
+       pc = start_colblk;
+       for(j = 0; j <32; ++j )
+       {
+               pr = start_rowblk;
+               rowsum = rowsums[j];
+               for( i = 0; i < width-16; ++i )
+               {
+                       pc[i] = colsums[i];
+                       pr[j] = rowsum;
+                       colsums[i] = (colsums[i] + pb[down16] )-pb[0];
+                       rowsum = (rowsum + pb[16]) - pb[0];
+                       ++pb;
+                       pr += height;
+               }
+               pb += 16;   /* move pb on to next row... remember we only did width-16! */
+               pc += width;
+       }
+#endif                 
+}
+
+
+static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh)
+{
+       uint8_t *p1 = s22blk1;
+       uint8_t *p2 = s22blk2;
+       int s = 0;
+       int j;
+
+       for( j = 0; j < fh; ++j )
+       {
+               register int diff;
+#define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
+               pipestep(0); pipestep(1);
+               pipestep(2); pipestep(3);
+               pipestep(4); pipestep(5);
+               pipestep(6); pipestep(7);
+               p1 += flx;
+               p2 += flx;
+#undef pipestep
+       }
+
+       return s;
+}
+
+
+
+/*
+  Sum absolute differences for 4*4 sub-sampled data.  
+
+  TODO: currently assumes  only 16*16 or 16*8 motion compensation will be used...
+  I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
+ */
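+/* Observational note: the hand-unrolled body below covers qh = 1, 2 or 4
+   rows, i.e. the 4*2 and 4*4 sub-sampled blocks corresponding to the 16*8
+   and 16*16 motion compensation sizes mentioned above. */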
+
+
+static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh)
+{
+       register uint8_t *p1 = s44blk1;
+       register uint8_t *p2 = s44blk2;
+       int s = 0;
+       register int diff;
+
+       /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
+#define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
+       pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
+       if( qh > 1 )
+       {
+               p1 += qlx; p2 += qlx;
+               pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
+               if( qh > 2 )
+               {
+                       p1 += qlx; p2 += qlx;
+                       pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
+                       p1 += qlx; p2 += qlx;
+                       pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
+               }
+       }
+
+
+       return s;
+}
+
+/*
+ * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
+ * blk1,blk2: addresses of top left pels of both blocks
+ * lx:        distance (in bytes) of vertically adjacent pels
+ * h:         height of block (usually 8 or 16)
+ */
+static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h)
+{
+       uint8_t *p1 = blk1, *p2 = blk2;
+       int i,j,v;
+       int s = 0;
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<8; i++)
+               {
+                       v = p1[i] - p2[i];
+                       s+= v*v;
+               }
+               p1+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+/* total squared difference between bidirection prediction of (8*h)
+ * blocks of 2*2 sub-sampled pels and reference 
+ * blk1f, blk1b,blk2: addresses of top left
+ * pels of blocks 
+ * lx: distance (in bytes) of vertically adjacent
+ * pels 
+ * h: height of block (usually 4 or 8)
+ */
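+/* Illustrative: the bidirectional predictor is the upward-rounded average of
+   the forward and backward pels, e.g. forward 100 and backward 103 give
+   ((100+103+1)>>1) = 102, and the residual against the source pel is then
+   squared and accumulated. */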
+static int bdist2_22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2, 
+                                        int lx, int h)
+{
+       uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
+       int i,j,v;
+       int s = 0;
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<8; i++)
+               {
+                       v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
+                       s+= v*v;
+               }
+               p1f+= lx;
+               p1b+= lx;
+               p2+= lx;
+       }
+       return s;
+}
+
+/*
+ * total squared difference between two (16*h) blocks
+ * including optional half pel interpolation of blk1 (hx,hy)
+ * blk1,blk2: addresses of top left pels of both blocks
+ * lx:        distance (in bytes) of vertically adjacent pels
+ * hx,hy:     flags for horizontal and/or vertical interpolation
+ * h:         height of block (usually 8 or 16)
+ */
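+/* Observational note: unlike the dist1_xx SAD routines above, the half-pel
+   interpolation here rounds, e.g. (p1[i]+p1[i+1]+1)>>1 horizontally and
+   (p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2 diagonally - presumably because dist2
+   is evaluated against the reconstructed reference, whose predictions are
+   formed with rounded averages. */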
+
+static int dist2(blk1,blk2,lx,hx,hy,h)
+       uint8_t *blk1,*blk2;
+       int lx,hx,hy,h;
+{
+       uint8_t *p1,*p1a,*p2;
+       int i,j;
+       int s,v;
+
+       s = 0;
+       p1 = blk1;
+       p2 = blk2;
+       if (!hx && !hy)
+               for (j=0; j<h; j++)
+               {
+                       for (i=0; i<16; i++)
+                       {
+                               v = p1[i] - p2[i];
+                               s+= v*v;
+                       }
+                       p1+= lx;
+                       p2+= lx;
+               }
+       else if (hx && !hy)
+               for (j=0; j<h; j++)
+               {
+                       for (i=0; i<16; i++)
+                       {
+                               v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
+                               s+= v*v;
+                       }
+                       p1+= lx;
+                       p2+= lx;
+               }
+       else if (!hx && hy)
+       {
+               p1a = p1 + lx;
+               for (j=0; j<h; j++)
+               {
+                       for (i=0; i<16; i++)
+                       {
+                               v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
+                               s+= v*v;
+                       }
+                       p1 = p1a;
+                       p1a+= lx;
+                       p2+= lx;
+               }
+       }
+       else /* if (hx && hy) */
+       {
+               p1a = p1 + lx;
+               for (j=0; j<h; j++)
+               {
+                       for (i=0; i<16; i++)
+                       {
+                               v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
+                               s+= v*v;
+                       }
+                       p1 = p1a;
+                       p1a+= lx;
+                       p2+= lx;
+               }
+       }
+       return s;
+}
+
+
+/*
+ * absolute difference error between a (16*h) block and a bidirectional
+ * prediction
+ *
+ * p2: address of top left pel of block
+ * pf,hxf,hyf: address and half pel flags of forward ref. block
+ * pb,hxb,hyb: address and half pel flags of backward ref. block
+ * h: height of block
+ * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+ */
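+/* Sketch of the addressing scheme below: pfa/pfb/pfc (and pba/pbb/pbc) are
+   the right, lower and lower-right neighbours selected by the half-pel flags
+   (they simply alias pf/pb when a flag is 0), so the inner expression is the
+   rounded half-pel interpolation of each reference followed by the rounded
+   average of the forward and backward predictions. */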
+
+static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
+       uint8_t *pf,*pb,*p2;
+       int lx,hxf,hyf,hxb,hyb,h;
+{
+       uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+       int i,j;
+       int s,v;
+
+       pfa = pf + hxf;
+       pfb = pf + lx*hyf;
+       pfc = pfb + hxf;
+
+       pba = pb + hxb;
+       pbb = pb + lx*hyb;
+       pbc = pbb + hxb;
+
+       s = 0;
+
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+                       v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
+                                 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
+                               - *p2++;
+                       s += abs(v);
+               }
+               p2+= lx-16;
+               pf+= lx-16;
+               pfa+= lx-16;
+               pfb+= lx-16;
+               pfc+= lx-16;
+               pb+= lx-16;
+               pba+= lx-16;
+               pbb+= lx-16;
+               pbc+= lx-16;
+       }
+
+       return s;
+}
+
+/*
+ * squared error between a (16*h) block and a bidirectional
+ * prediction
+ *
+ * p2: address of top left pel of block
+ * pf,hxf,hyf: address and half pel flags of forward ref. block
+ * pb,hxb,hyb: address and half pel flags of backward ref. block
+ * h: height of block
+ * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+ */
+
+static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
+       uint8_t *pf,*pb,*p2;
+       int lx,hxf,hyf,hxb,hyb,h;
+{
+       uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+       int i,j;
+       int s,v;
+
+       pfa = pf + hxf;
+       pfb = pf + lx*hyf;
+       pfc = pfb + hxf;
+
+       pba = pb + hxb;
+       pbb = pb + lx*hyb;
+       pbc = pbb + hxb;
+
+       s = 0;
+
+       for (j=0; j<h; j++)
+       {
+               for (i=0; i<16; i++)
+               {
+                       v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
+                                 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
+                               - *p2++;
+                       s+=v*v;
+               }
+               p2+= lx-16;
+               pf+= lx-16;
+               pfa+= lx-16;
+               pfb+= lx-16;
+               pfc+= lx-16;
+               pb+= lx-16;
+               pba+= lx-16;
+               pbb+= lx-16;
+               pbc+= lx-16;
+       }
+
+       return s;
+}
+
+
+/*
+ * variance of a (size*size) block, multiplied by the pel count size*size
+ * (i.e. by 256 for a 16*16 block)
+ * p:  address of top left pel of block
+ * lx: distance (in bytes) of vertically adjacent pels
+ */
+static int variance(uint8_t *p, int size,      int lx)
+{
+       int i,j;
+       unsigned int v,s,s2;
+
+       s = s2 = 0;
+
+       for (j=0; j<size; j++)
+       {
+               for (i=0; i<size; i++)
+               {
+                       v = *p++;
+                       s+= v;
+                       s2+= v*v;
+               }
+               p+= lx-size;
+       }
+       return s2 - (s*s)/(size*size);
+}
+
+/*
+  Compute the variance of the residual of a uni-directionally
+  motion-compensated block.
+ */
+
+static int unidir_pred_var( const mb_motion_s *motion,
+                                                       uint8_t *mb,  
+                                                       int lx, 
+                                                       int h)
+{
+       return (*pdist2)(motion->blk, mb, lx, motion->hx, motion->hy, h);
+}
+
+
+/*
+  Compute the variance of the residual of a bi-directionally
+  motion-compensated block.
+ */
+
+static int bidir_pred_var( const mb_motion_s *motion_f, 
+                                                  const mb_motion_s *motion_b,
+                                                  uint8_t *mb,  
+                                                  int lx, int h)
+{
+       return (*pbdist2)( motion_f->blk, motion_b->blk,
+                                          mb, lx, 
+                                          motion_f->hx, motion_f->hy,
+                                          motion_b->hx, motion_b->hy,
+                                          h);
+}
+
+/*
+  Compute SAD for bi-directionally motion compensated blocks...
+ */
+
+static int bidir_pred_sad( const mb_motion_s *motion_f, 
+                                                  const mb_motion_s *motion_b,
+                                                  uint8_t *mb,  
+                                                  int lx, int h)
+{
+       return (*pbdist1)(motion_f->blk, motion_b->blk, 
+                                        mb, lx, 
+                                        motion_f->hx, motion_f->hy,
+                                        motion_b->hx, motion_b->hy,
+                                        h);
+}
+
+static void frame_ME(motion_engine_t *engine,
+                                       pict_data_s *picture,
+                                        motion_comp_s *mc,
+                                        int mb_row_start,
+                                        int i, int j, 
+                                        mbinfo_s *mbi)
+{
+       mb_motion_s framef_mc;
+       mb_motion_s frameb_mc;
+       mb_motion_s dualpf_mc;
+       mb_motion_s topfldf_mc;
+       mb_motion_s botfldf_mc;
+       mb_motion_s topfldb_mc;
+       mb_motion_s botfldb_mc;
+
+       int var,v0;
+       int vmc,vmcf,vmcr,vmci;
+       int vmcfieldf,vmcfieldr,vmcfieldi;
+       subsampled_mb_s ssmb;
+       int imins[2][2],jmins[2][2];
+       int imindmv,jmindmv,vmc_dp;
+//printf("frame_ME 1\n");
+
+
+       /* A.Stevens fast motion estimation data is appended to actual
+          luminance information 
+       */
+       ssmb.mb = mc->cur[0] + mb_row_start + i;
+       ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2));
+       ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2));
+       ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset + 
+                                                 ((i>>1) + (mb_row_start>>2)));
+       ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset + 
+                                                 (i>>2) + (mb_row_start>>4));
+
+       /* Compute the variance of the MB as a measure of intra-coding complexity.
+          We include chrominance information here, scaled to compensate
+          for sub-sampling.  Silly MPEG forcing chrom/lum to have the same
+          quantisations...
+        */
+       var = variance(ssmb.mb,16,width);
+
+//printf("motion %d\n", picture->pict_type);
+       if (picture->pict_type==I_TYPE)
+       {
+               mbi->mb_type = MB_INTRA;
+       }
+       else if (picture->pict_type==P_TYPE)
+       {
+               /* For P pictures we take into account chrominance. This
+                  provides much better performance at scene changes */
+               var += chrom_var_sum(&ssmb,16,width);
+
+               if (picture->frame_pred_dct)
+               {
+                       fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
+                                          width,i,j,mc->sxf,mc->syf,16,width,height,
+                                           &framef_mc);
+                       framef_mc.fieldoff = 0;
+                       vmc = framef_mc.var +
+                               unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
+                       mbi->motion_type = MC_FRAME;
+               }
+               else
+               {
+//printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
+                       frame_estimate(engine, mc->oldorg[0],
+                               mc->oldref[0],
+                               &ssmb,
+                               i,
+                               j,
+                               mc->sxf,
+                               mc->syf,
+                               &framef_mc,
+                               &topfldf_mc,
+                               &botfldf_mc,
+                               imins,
+                               jmins);
+//printf("frame_ME 3\n");
+                       vmcf = framef_mc.var + 
+                               unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
+                       vmcfieldf = 
+                               topfldf_mc.var + 
+                               unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) +
+                               botfldf_mc.var + 
+                               unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 );
+                       /* DEBUG DP currently disabled... */
+//                     if ( M==1)
+//                     {
+//                             dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
+//                                                              i,j>>1,imins,jmins,
+//                                                              &dualpf_mc,
+//                                                              &imindmv,&jmindmv, &vmc_dp);
+//                     }
+
+                       /* NOTE: Typically M = 3 so DP is actually disabled... */
+                       /* select between dual prime, frame and field prediction */
+//                     if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
+//                     {
+//                             mbi->motion_type = MC_DMV;
+//                             /* No chrominance squared difference  measure yet.
+//                                Assume identical to luminance */
+//                             vmc = vmc_dp + vmc_dp;
+//                     }
+//                     else 
+                       if ( vmcf < vmcfieldf)
+                       {
+                               mbi->motion_type = MC_FRAME;
+                               vmc = vmcf;
+                                       
+                       }
+                       else
+                       {
+                               mbi->motion_type = MC_FIELD;
+                               vmc = vmcfieldf;
+                       }
+               }
+
+
+               /* select between intra or non-intra coding:
+                *
+                * selection is based on intra block variance (var) vs.
+                * prediction error variance (vmc)
+                *
+                * It used to be: blocks with small prediction error are always
+                * coded non-intra even if the variance is smaller (is this reasonable?)
+                *
+                * TODO: A.Stevens Jul 2000
+                * The bbmpeg guys have found this to be *unreasonable*.
+                * I'm not sure I buy their solution using vmc*2.  It is probably
+                * the vmc >= 9*256 test that is suspect.
+                * 
+                */
+
+
+               if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ )
+               {
+                       mbi->mb_type = MB_INTRA;
+                       mbi->var = var;
+               }
+               
+               else
+               {
+                       /* select between MC / No-MC
+                        *
+                        * use No-MC if var(No-MC) <= 1.25*var(MC)
+                        * (i.e slightly biased towards No-MC)
+                        *
+                        * blocks with small prediction error are always coded as No-MC
+                        * (requires no motion vectors, allows skipping)
+                        */
+                       v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16);
+
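+                       /* Worked out: the integer test below, 4*v0 > 5*vmc, is
+                          equivalent to v0 > 1.25*vmc, i.e. MC is only chosen when
+                          it beats the zero-MV prediction by more than 25%. */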
+                       if (4*v0>5*vmc )
+                       {
+                               /* use MC */
+                               var = vmc;
+                               mbi->mb_type = MB_FORWARD;
+                               if (mbi->motion_type==MC_FRAME)
+                               {
+                                       mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
+                               }
+                               else if (mbi->motion_type==MC_DMV)
+                               {
+                                       /* these are FRAME vectors */
+                                       /* same parity vector */
+                                       mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1);
+
+                                       /* opposite parity vector */
+                                       mbi->dmvector[0] = imindmv;
+                                       mbi->dmvector[1] = jmindmv;
+                               }
+                               else
+                               {
+                                       /* these are FRAME vectors */
+                                       mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
+                                       mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
+                                       mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
+                                       mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
+                                       mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
+                               }
+                       }
+                       else
+                       {
+
+                               /* No-MC */
+                               var = v0;
+                               mbi->mb_type = 0;
+                               mbi->motion_type = MC_FRAME;
+                               mbi->MV[0][0][0] = 0;
+                               mbi->MV[0][0][1] = 0;
+                       }
+               }
+       }
+       else /* if (pict_type==B_TYPE) */
+       {
+               if (picture->frame_pred_dct)
+               {
+                       var = variance(ssmb.mb,16,width);
+                       /* forward */
+                       fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
+                                          width,i,j,mc->sxf,mc->syf,
+                                          16,width,height,
+                                          &framef_mc
+                                          );
+                       framef_mc.fieldoff = 0;
+                       vmcf = framef_mc.var;
+
+                       /* backward */
+                       fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb,
+                                          width,i,j,mc->sxb,mc->syb,
+                                          16,width,height,
+                                          &frameb_mc);
+                       frameb_mc.fieldoff = 0;
+                       vmcr = frameb_mc.var;
+
+                       /* interpolated (bidirectional) */
+
+                       vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
+
+                       /* decisions */
+
+                       /* select between forward/backward/interpolated prediction:
+                        * use the one with the smallest mean squared prediction error
+                        */
+                       if (vmcf<=vmcr && vmcf<=vmci)
+                       {
+                               vmc = vmcf;
+                               mbi->mb_type = MB_FORWARD;
+                       }
+                       else if (vmcr<=vmci)
+                       {
+                               vmc = vmcr;
+                               mbi->mb_type = MB_BACKWARD;
+                       }
+                       else
+                       {
+                               vmc = vmci;
+                               mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                       }
+
+                       mbi->motion_type = MC_FRAME;
+               }
+               else
+               {
+                       /* forward prediction */
+                       frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
+                                                  i,j,mc->sxf,mc->syf,
+                                                  &framef_mc,
+                                                  &topfldf_mc,
+                                                  &botfldf_mc,
+                                                  imins,jmins);
+
+
+                       /* backward prediction */
+                       frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb,
+                                                  i,j,mc->sxb,mc->syb,
+                                                  &frameb_mc,
+                                                  &topfldb_mc,
+                                                  &botfldb_mc,
+                                                  imins,jmins);
+
+                       vmcf = framef_mc.var;
+                       vmcr = frameb_mc.var;
+                       vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
+
+                       vmcfieldf = topfldf_mc.var + botfldf_mc.var;
+                       vmcfieldr = topfldb_mc.var + botfldb_mc.var;
+                       vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb, 
+                                                                               width<<1, 8) +
+                                       bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb, 
+                                                                               width<<1, 8);
+
+                       /* select prediction type of minimum distance from the
+                        * six candidates (field/frame * forward/backward/interpolated)
+                        */
+                       if (vmci<vmcfieldi && vmci<vmcf && vmci<vmcfieldf
+                                 && vmci<vmcr && vmci<vmcfieldr)
+                       {
+                               /* frame, interpolated */
+                               mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                               mbi->motion_type = MC_FRAME;
+                               vmc = vmci;
+                       }
+                       else if ( vmcfieldi<vmcf && vmcfieldi<vmcfieldf
+                                          && vmcfieldi<vmcr && vmcfieldi<vmcfieldr)
+                       {
+                               /* field, interpolated */
+                               mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                               mbi->motion_type = MC_FIELD;
+                               vmc = vmcfieldi;
+                       }
+                       else if (vmcf<vmcfieldf && vmcf<vmcr && vmcf<vmcfieldr)
+                       {
+                               /* frame, forward */
+                               mbi->mb_type = MB_FORWARD;
+                               mbi->motion_type = MC_FRAME;
+                               vmc = vmcf;
+
+                       }
+                       else if ( vmcfieldf<vmcr && vmcfieldf<vmcfieldr)
+                       {
+                               /* field, forward */
+                               mbi->mb_type = MB_FORWARD;
+                               mbi->motion_type = MC_FIELD;
+                               vmc = vmcfieldf;
+                       }
+                       else if (vmcr<vmcfieldr)
+                       {
+                               /* frame, backward */
+                               mbi->mb_type = MB_BACKWARD;
+                               mbi->motion_type = MC_FRAME;
+                               vmc = vmcr;
+                       }
+                       else
+                       {
+                               /* field, backward */
+                               mbi->mb_type = MB_BACKWARD;
+                               mbi->motion_type = MC_FIELD;
+                               vmc = vmcfieldr;
+
+                       }
+               }
+
+               /* select between intra or non-intra coding:
+                *
+                * selection is based on intra block variance (var) vs.
+                * prediction error variance (vmc)
+                *
+                * It used to be: blocks with small prediction error are always
+                * coded non-intra even if the variance is smaller (is this reasonable?)
+                *
+                * TODO: A.Stevens Jul 2000
+                * The bbmpeg guys have found this to be *unreasonable*.
+                * I'm not sure I buy their solution using vmc*2 in the first comparison.
+                * It is probably the vmc >= 9*256 test that is suspect.
+                *
+                */
+               if (vmc>var && vmc>=9*256)
+                       mbi->mb_type = MB_INTRA;
+               else
+               {
+                       var = vmc;
+                       if (mbi->motion_type==MC_FRAME)
+                       {
+                               /* forward */
+                               mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
+                               mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
+                               /* backward */
+                               mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1);
+                               mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1);
+                       }
+                       else
+                       {
+                               /* these are FRAME vectors */
+                               /* forward */
+                               mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
+                               mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
+                               mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
+                               mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
+                               mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
+                               mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
+                               /* backward */
+                               mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1);
+                               mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1);
+                               mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1);
+                               mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1);
+                               mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel;
+                               mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel;
+                       }
+               }
+       }
+}
+
+
+
+/*
+ * motion estimation for field pictures
+ *
+ * mbi:    pointer to macroblock info structure
+ * secondfield: indicates second field of a frame (in P fields this means
+ *              that reference field of opposite parity is in curref instead
+ *              of oldref)
+ * ipflag: indicates a P type field which is the second field of a frame
+ *         in which the first field is I type (this restricts predictions
+ *         to be based only on the opposite parity (=I) field)
+ *
+ * results:
+ * mbi->
+ *  mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
+ *  MV[][][]: motion vectors (field format)
+ *  mv_field_sel: top/bottom field
+ *  motion_type: MC_FIELD, MC_16X8
+ *
+ * uses: picture->pict_type, picture->pict_struct
+ */
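+/*
+ * Illustrative note: for MC_16X8 the lower half-block's vertical vector is
+ * stored relative to its own origin eight lines further down, hence the
+ * "- ((j+8)<<1)" terms below, while the upper half-block (and plain MC_FIELD
+ * vectors) use "- (j<<1)".
+ */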
+static void field_ME(motion_engine_t *engine,
+       pict_data_s *picture,
+       motion_comp_s *mc,
+       int mb_row_start,
+       int i, int j, 
+       mbinfo_s *mbi, 
+       int secondfield, int ipflag)
+{
+       int w2;
+       uint8_t *toporg, *topref, *botorg, *botref;
+       subsampled_mb_s ssmb;
+       mb_motion_s fields_mc, dualp_mc;
+       mb_motion_s fieldf_mc, fieldb_mc;
+       mb_motion_s field8uf_mc, field8lf_mc;
+       mb_motion_s field8ub_mc, field8lb_mc;
+       int var,vmc,v0,dmc,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i;
+       int imins,jmins;
+       int dmc8f,dmc8r;
+       int vmc_dp,dmc_dp;
+
+       w2 = width<<1;
+
+       /* Fast motion data sits at the end of the luminance buffer */
+       ssmb.mb = mc->cur[0] + i + w2*j;
+       ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1));
+       ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1));
+       ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1));
+       ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2);
+
+       if (picture->pict_struct==BOTTOM_FIELD)
+       {
+               ssmb.mb += width;
+               ssmb.umb += (width >> 1);
+               ssmb.vmb += (width >> 1);
+               ssmb.fmb += (width >> 1);
+               ssmb.qmb += (width >> 2);
+       }
+
+       var = variance(ssmb.mb,16,w2) + 
+               ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2;
+
+       if (picture->pict_type==I_TYPE)
+               mbi->mb_type = MB_INTRA;
+       else if (picture->pict_type==P_TYPE)
+       {
+               toporg = mc->oldorg[0];
+               topref = mc->oldref[0];
+               botorg = mc->oldorg[0] + width;
+               botref = mc->oldref[0] + width;
+
+               if (secondfield)
+               {
+                       /* opposite parity field is in same frame */
+                       if (picture->pict_struct==TOP_FIELD)
+                       {
+                               /* current is top field */
+                               botorg = mc->cur[0] + width;
+                               botref = mc->curref[0] + width;
+                       }
+                       else
+                       {
+                               /* current is bottom field */
+                               toporg = mc->cur[0];
+                               topref = mc->curref[0];
+                       }
+               }
+
+               field_estimate(engine,
+                                               picture,
+                                          toporg,topref,botorg,botref,&ssmb,
+                                          i,j,mc->sxf,mc->syf,ipflag,
+                                          &fieldf_mc,
+                                          &field8uf_mc,
+                                          &field8lf_mc,
+                                          &fields_mc);
+               dmcfield = fieldf_mc.sad;
+               dmc8f = field8uf_mc.sad + field8lf_mc.sad;
+
+//             if (M==1 && !ipflag)  /* generic condition which permits Dual Prime */
+//             {
+//                     dpfield_estimate(engine,
+//                                                     picture,
+//                                                      topref,botref,ssmb.mb,i,j,imins,jmins,
+//                                                      &dualp_mc,
+//                                                      &vmc_dp);
+//                     dmc_dp = dualp_mc.sad;
+//             }
+               
+//             /* select between dual prime, field and 16x8 prediction */
+//             if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
+//             {
+//                     /* Dual Prime prediction */
+//                     mbi->motion_type = MC_DMV;
+//                     dmc = dualp_mc.sad;
+//                     vmc = vmc_dp;
+//
+//             }
+//             else 
+               if (dmc8f<dmcfield)
+               {
+                       /* 16x8 prediction */
+                       mbi->motion_type = MC_16X8;
+                       /* upper and lower half blocks */
+                       vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
+                       vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
+               }
+               else
+               {
+                       /* field prediction */
+                       mbi->motion_type = MC_FIELD;
+                       vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 );
+               }
+
+               /* select between intra and non-intra coding */
+
+               if ( vmc>var && vmc>=9*256)
+                       mbi->mb_type = MB_INTRA;
+               else
+               {
+                       /* zero MV field prediction from same parity ref. field
+                        * (not allowed if ipflag is set)
+                        */
+                       if (!ipflag)
+                               v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j,
+                                                          ssmb.mb,w2,0,0,16);
+                       if (ipflag  || (4*v0>5*vmc && v0>=9*256))
+                       {
+                               var = vmc;
+                               mbi->mb_type = MB_FORWARD;
+                               if (mbi->motion_type==MC_FIELD)
+                               {
+                                       mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
+                                       mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
+                               }
+                               else if (mbi->motion_type==MC_DMV)
+                               {
+                                       /* same parity vector */
+                                       mbi->MV[0][0][0] = imins - (i<<1);
+                                       mbi->MV[0][0][1] = jmins - (j<<1);
+
+                                       /* opposite parity vector */
+                                       mbi->dmvector[0] = dualp_mc.pos.x;
+                                       mbi->dmvector[1] = dualp_mc.pos.y;
+                               }
+                               else
+                               {
+                                       mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
+                                       mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
+                                       mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
+                                       mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
+                                       mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
+                                       mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
+                               }
+                       }
+                       else
+                       {
+                               /* No MC */
+                               var = v0;
+                               mbi->mb_type = 0;
+                               mbi->motion_type = MC_FIELD;
+                               mbi->MV[0][0][0] = 0;
+                               mbi->MV[0][0][1] = 0;
+                               mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD);
+                       }
+               }
+       }
+       else /* if (pict_type==B_TYPE) */
+       {
+               /* forward prediction */
+               field_estimate(engine, 
+                                               picture,
+                                          mc->oldorg[0],mc->oldref[0],
+                                          mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb,
+                                          i,j,mc->sxf,mc->syf,0,
+                                          &fieldf_mc,
+                                          &field8uf_mc,
+                                          &field8lf_mc,
+                                          &fields_mc);
+               dmcfieldf = fieldf_mc.sad;
+               dmc8f = field8uf_mc.sad + field8lf_mc.sad;
+
+               /* backward prediction */
+               field_estimate(engine,
+                                               picture,
+                                          mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width,
+                                          &ssmb,
+                                          i,j,mc->sxb,mc->syb,0,
+                                          &fieldb_mc,
+                                          &field8ub_mc,
+                                          &field8lb_mc,
+                                          &fields_mc);
+               dmcfieldr = fieldb_mc.sad;
+               dmc8r = field8ub_mc.sad + field8lb_mc.sad;
+
+               /* calculate distances for bidirectional prediction */
+               /* field */
+               dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
+
+               /* 16x8 upper and lower half blocks */
+               dmc8i =  bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 );
+               dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 );
+
+               /* select prediction type of minimum distance */
+               if (dmcfieldi<dmc8i && dmcfieldi<dmcfieldf && dmcfieldi<dmc8f
+                       && dmcfieldi<dmcfieldr && dmcfieldi<dmc8r)
+               {
+                       /* field, interpolated */
+                       mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                       mbi->motion_type = MC_FIELD;
+                       vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
+               }
+               else if (dmc8i<dmcfieldf && dmc8i<dmc8f
+                                && dmc8i<dmcfieldr && dmc8i<dmc8r)
+               {
+                       /* 16x8, interpolated */
+                       mbi->mb_type = MB_FORWARD|MB_BACKWARD;
+                       mbi->motion_type = MC_16X8;
+
+                       /* upper and lower half blocks */
+                       vmc =  bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8);
+                       vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8);
+               }
+               else if (dmcfieldf<dmc8f && dmcfieldf<dmcfieldr && dmcfieldf<dmc8r)
+               {
+                       /* field, forward */
+                       mbi->mb_type = MB_FORWARD;
+                       mbi->motion_type = MC_FIELD;
+                       vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16);
+               }
+               else if (dmc8f<dmcfieldr && dmc8f<dmc8r)
+               {
+                       /* 16x8, forward */
+                       mbi->mb_type = MB_FORWARD;
+                       mbi->motion_type = MC_16X8;
+
+                       /* upper and lower half blocks */
+                       vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
+                       vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
+               }
+               else if (dmcfieldr<dmc8r)
+               {
+                       /* field, backward */
+                       mbi->mb_type = MB_BACKWARD;
+                       mbi->motion_type = MC_FIELD;
+                       vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 );
+               }
+               else
+               {
+                       /* 16x8, backward */
+                       mbi->mb_type = MB_BACKWARD;
+                       mbi->motion_type = MC_16X8;
+
+                       /* upper and lower half blocks */
+                       vmc =  unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8);
+                       vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8);
+
+               }
+
+               /* select between intra and non-intra coding */
+               if (vmc>var && vmc>=9*256)
+                       mbi->mb_type = MB_INTRA;
+               else
+               {
+                       var = vmc;
+                       if (mbi->motion_type==MC_FIELD)
+                       {
+                               /* forward */
+                               mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
+                               mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
+                               mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
+                               /* backward */
+                               mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1);
+                               mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1);
+                               mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel;
+                       }
+                       else /* MC_16X8 */
+                       {
+                               /* forward */
+                               mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
+                               mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
+                               mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
+                               mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
+                               mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
+                               mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
+                               /* backward */
+                               mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1);
+                               mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1);
+                               mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel;
+                               mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1);
+                               mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1);
+                               mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel;
+                       }
+               }
+       }
+
+}
+
+
+
+/*
+  Initialise motion compensation - currently this consists purely of selecting
+  which versions of the various low-level computation routines to use
+  
+  */
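+/*
+ * Minimal usage sketch (assumes the calling sequence used elsewhere in the
+ * encoder; the variable names here are illustrative only): call init_motion()
+ * once at start-up, after which all motion estimation goes through the p*
+ * function pointers, e.g.
+ *
+ *     init_motion();
+ *     sad = (*pdist1_00)(refblk, curblk, width, 16, INT_MAX);
+ *
+ * so the MMX/SSE variants are picked up transparently when cpu_accel()
+ * reports the corresponding capability bit.
+ */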
+
+void init_motion()
+{
+       int cpucap = cpu_accel();
+
+       if( cpucap  == 0 )      /* No MMX/SSE etc support available */
+       {
+               pdist22 = dist22;
+               pdist44 = dist44;
+               pdist1_00 = dist1_00;
+               pdist1_01 = dist1_01;
+               pdist1_10 = dist1_10;
+               pdist1_11 = dist1_11;
+               pbdist1 = bdist1;
+               pdist2 = dist2;
+               pbdist2 = bdist2;
+               pdist2_22 = dist2_22;
+               pbdist2_22 = bdist2_22;
+               pfind_best_one_pel = find_best_one_pel;
+               pbuild_sub22_mcomps     = build_sub22_mcomps;
+        }
+#ifdef X86_CPU
+       else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
+       {
+               if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n");
+               pdist22 = dist22_mmxe;
+               pdist44 = dist44_mmxe;
+               pdist1_00 = dist1_00_mmxe;
+               pdist1_01 = dist1_01_mmxe;
+               pdist1_10 = dist1_10_mmxe;
+               pdist1_11 = dist1_11_mmxe;
+               pbdist1 = bdist1_mmx;
+               pdist2 = dist2_mmx;
+               pbdist2 = bdist2_mmx;
+               pdist2_22 = dist2_22_mmx;
+               pbdist2_22 = bdist2_22_mmx;
+               pfind_best_one_pel = find_best_one_pel_mmxe;
+               pbuild_sub22_mcomps     = build_sub22_mcomps_mmxe;
+               pmblock_sub44_dists = mblock_sub44_dists_mmxe;
+
+       }
+       else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
+       {
+               if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n");
+               pdist22 = dist22_mmx;
+               pdist44 = dist44_mmx;
+               pdist1_00 = dist1_00_mmx;
+               pdist1_01 = dist1_01_mmx;
+               pdist1_10 = dist1_10_mmx;
+               pdist1_11 = dist1_11_mmx;
+               pbdist1 = bdist1_mmx;
+               pdist2 = dist2_mmx;
+               pbdist2 = bdist2_mmx;
+               pdist2_22 = dist2_22_mmx;
+               pbdist2_22 = bdist2_22_mmx;
+               pfind_best_one_pel = find_best_one_pel;
+               pbuild_sub22_mcomps     = build_sub22_mcomps;
+               pmblock_sub44_dists = mblock_sub44_dists_mmx;
+       }
+#endif
+}
+
+
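+/*
+  Worker thread body.  Each engine blocks on its input_lock until
+  motion_estimation() releases it, runs motion estimation over its
+  assigned strip of macroblock rows, then releases output_lock to
+  signal completion.  The loop terminates once stop_motion_engines()
+  sets the done flag.
+ */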
+void motion_engine_loop(motion_engine_t *engine)
+{
+       while(!engine->done)
+       {
+               pthread_mutex_lock(&(engine->input_lock));
+
+               if(!engine->done)
+               {
+                       pict_data_s *picture = engine->pict_data;
+                       motion_comp_s *mc_data = engine->motion_comp;
+                       int secondfield = engine->secondfield;
+                       int ipflag = engine->ipflag;
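+                       /* One mbinfo record per 16x16 macroblock, stored in
+                          raster order; index to the first block of this
+                          engine's strip. */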
+                       mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16);
+                       int i, j;
+                       int mb_row_incr;                        /* Offset increment to go down 1 row of mb's */
+                       int mb_row_start;
+
+                       if (picture->pict_struct == FRAME_PICTURE)
+                       {                       
+                               mb_row_incr = 16*width;
+                               mb_row_start = engine->start_row / 16 * mb_row_incr;
+/* loop through all macroblocks of a frame picture */
+                               for (j=engine->start_row; j < engine->end_row; j+=16)
+                               {
+                                       for (i=0; i<width; i+=16)
+                                       {
+                                               frame_ME(engine, picture, mc_data, mb_row_start, i,j, mbi);
+                                               mbi++;
+                                       }
+                                       mb_row_start += mb_row_incr;
+                               }
+                       }
+                       else
+                       {               
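+                               /* In a field picture the two fields are
+                                  interleaved line-by-line in the frame buffer
+                                  (top field offset 0, bottom field offset
+                                  width), so 16 macroblock lines of one field
+                                  span 32 frame lines. */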
+                               mb_row_incr = (16 * 2) * width;
+                               mb_row_start = engine->start_row / 16 * mb_row_incr;
+/* loop through all macroblocks of a field picture */
+                               for (j=engine->start_row; j < engine->end_row; j+=16)
+                               {
+                                       for (i=0; i<width; i+=16)
+                                       {
+                                               field_ME(engine, picture, mc_data, mb_row_start, i,j,
+                                                                mbi,secondfield,ipflag);
+                                               mbi++;
+                                       }
+                                       mb_row_start += mb_row_incr;
+                               }
+                       }
+               }
+
+               pthread_mutex_unlock(&(engine->output_lock));
+       }
+}
+
+
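+/*
+  Dispatch one picture to the worker engines: hand every engine the
+  picture and motion-compensation data, release its input_lock to set
+  it running, then block on each output_lock until all strips have
+  been processed.
+ */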
+void motion_estimation(pict_data_s *picture,
+       motion_comp_s *mc_data,
+       int secondfield, int ipflag)
+{
+       int i;
+/* Start every engine on this picture */
+       for(i = 0; i < processors; i++)
+       {
+               motion_engines[i].motion_comp = mc_data;
+
+               motion_engines[i].pict_data = picture;
+
+               motion_engines[i].secondfield = secondfield;
+               motion_engines[i].ipflag = ipflag;
+               pthread_mutex_unlock(&(motion_engines[i].input_lock));
+       }
+
+/* Wait for completion */
+       for(i = 0; i < processors; i++)
+       {
+               pthread_mutex_lock(&(motion_engines[i].output_lock));
+       }
+}
+
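+/*
+  Create one motion engine per processor.  Macroblock rows are divided
+  as evenly as possible between the engines, and both locks are taken
+  before the thread is created so that each engine immediately blocks
+  in motion_engine_loop() waiting for work.
+ */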
+void start_motion_engines()
+{
+       int i;
+       int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
+       int current_row = 0;
+       pthread_attr_t  attr;
+       pthread_mutexattr_t mutex_attr;
+
+       pthread_mutexattr_init(&mutex_attr);
+       pthread_attr_init(&attr);
+       motion_engines = calloc(1, sizeof(motion_engine_t) * processors);
+       for(i = 0; i < processors; i++)
+       {
+               motion_engines[i].start_row = current_row * 16;
+               current_row += rows_per_processor;
+               if(current_row > height2 / 16) current_row = height2 / 16;
+               motion_engines[i].end_row = current_row * 16;
+               pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr);
+               pthread_mutex_lock(&(motion_engines[i].input_lock));
+               pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr);
+               pthread_mutex_lock(&(motion_engines[i].output_lock));
+               motion_engines[i].done = 0;
+               pthread_create(&(motion_engines[i].tid), 
+                       &attr, 
+                       (void*)motion_engine_loop, 
+                       &motion_engines[i]);
+       }
+}
+
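+/*
+  Ask every engine to exit, wake it through its input_lock, join the
+  thread and release its synchronisation objects.
+ */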
+void stop_motion_engines()
+{
+       int i;
+       for(i = 0; i < processors; i++)
+       {
+               motion_engines[i].done = 1;
+               pthread_mutex_unlock(&(motion_engines[i].input_lock));
+               pthread_join(motion_engines[i].tid, 0);
+               pthread_mutex_destroy(&(motion_engines[i].input_lock));
+               pthread_mutex_destroy(&(motion_engines[i].output_lock));
+       }
+       free(motion_engines);
+}
+
+