/* motion.c, motion estimation */ /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */ /* * Disclaimer of Warranty * * These software programs are available to the user without any license fee or * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims * any and all warranties, whether express, implied, or statuary, including any * implied warranties or merchantability or of fitness for a particular * purpose. In no event shall the copyright-holder be liable for any * incidental, punitive, or consequential damages of any kind whatsoever * arising from the use of these programs. * * This disclaimer of warranty extends to the user of these programs and user's * customers, employees, agents, transferees, successors, and assigns. * * The MPEG Software Simulation Group does not represent or warrant that the * programs furnished hereunder are free of infringement of any third-party * patents. * * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware, * are subject to royalty fees to patent holders. Many of these patents are * general enough such that they are unavoidable regardless of implementation * design. * */ #include #include #include #include #include "config.h" #include "global.h" #include "cpu_accel.h" #include "simd.h" /* Macro-block Motion compensation results record */ typedef struct _blockcrd { int16_t x; int16_t y; } blockxy; struct mb_motion { blockxy pos; // Half-pel co-ordinates of source block int sad; // Sum of absolute difference int var; uint8_t *blk ; // Source block data (in luminace data array) int hx, hy; // Half-pel offsets int fieldsel; // 0 = top 1 = bottom int fieldoff; // Offset from start of frame data to first line // of field. (top = 0, bottom = width ); }; typedef struct mb_motion mb_motion_s; struct subsampled_mb { uint8_t *mb; // One pel uint8_t *fmb; // Two-pel subsampled uint8_t *qmb; // Four-pel subsampled uint8_t *umb; // U compoenent one-pel uint8_t *vmb; // V component one-pel }; typedef struct subsampled_mb subsampled_mb_s; static void frame_ME (motion_engine_t *engine, pict_data_s *picture, motion_comp_s *mc, int mboffset, int i, int j, struct mbinfo *mbi); static void field_ME (motion_engine_t *engine, pict_data_s *picture, motion_comp_s *mc, int mboffset, int i, int j, struct mbinfo *mbi, int secondfield, int ipflag); static void frame_estimate (motion_engine_t *engine, uint8_t *org, uint8_t *ref, subsampled_mb_s *ssmb, int i, int j, int sx, int sy, mb_motion_s *bestfr, mb_motion_s *besttop, mb_motion_s *bestbot, int imins[2][2], int jmins[2][2]); static void field_estimate (motion_engine_t *engine, pict_data_s *picture, uint8_t *toporg, uint8_t *topref, uint8_t *botorg, uint8_t *botref, subsampled_mb_s *ssmb, int i, int j, int sx, int sy, int ipflag, mb_motion_s *bestfr, mb_motion_s *best8u, mb_motion_s *best8l, mb_motion_s *bestsp); #if 0 static void dpframe_estimate (motion_engine_t *engine, pict_data_s *picture, uint8_t *ref, subsampled_mb_s *ssmb, int i, int j, int iminf[2][2], int jminf[2][2], mb_motion_s *dpbest, int *imindmvp, int *jmindmvp, int *vmcp); static void dpfield_estimate (motion_engine_t *engine, pict_data_s *picture, uint8_t *topref, uint8_t *botref, uint8_t *mb, int i, int j, int imins, int jmins, mb_motion_s *dpbest, int *vmcp); #endif static void fullsearch (motion_engine_t *engine, uint8_t *org, uint8_t *ref, subsampled_mb_s *ssblk, int lx, int i0, int j0, int sx, int sy, int h, int xmax, int ymax, mb_motion_s *motion ); static void find_best_one_pel( motion_engine_t *engine, uint8_t *org, uint8_t *blk, int searched_size, int i0, int j0, int ilow, int jlow, int xmax, int ymax, int lx, int h, mb_motion_s *res ); static int build_sub22_mcomps( motion_engine_t *engine, int i0, int j0, int ihigh, int jhigh, int null_mc_sad, uint8_t *s22org, uint8_t *s22blk, int flx, int fh, int searched_sub44_size ); #ifdef X86_CPU static void find_best_one_pel_mmxe( motion_engine_t *engine, uint8_t *org, uint8_t *blk, int searched_size, int i0, int j0, int ilow, int jlow, int xmax, int ymax, int lx, int h, mb_motion_s *res ); static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0, int j0, int ihigh, int jhigh, int null_mc_sad, uint8_t *s22org, uint8_t *s22blk, int flx, int fh, int searched_sub44_size ); static int (*pmblock_sub44_dists)( uint8_t *blk, uint8_t *ref, int ilow, int jlow, int ihigh, int jhigh, int h, int rowstride, int threshold, mc_result_s *resvec); #endif static int unidir_pred_var( const mb_motion_s *motion, uint8_t *mb, int lx, int h); static int bidir_pred_var( const mb_motion_s *motion_f, const mb_motion_s *motion_b, uint8_t *mb, int lx, int h); static int bidir_pred_sad( const mb_motion_s *motion_f, const mb_motion_s *motion_b, uint8_t *mb, int lx, int h); static int variance( uint8_t *mb, int size, int lx); static int dist22 ( uint8_t *blk1, uint8_t *blk2, int qlx, int qh); static int dist44 ( uint8_t *blk1, uint8_t *blk2, int flx, int fh); static int dist2_22( uint8_t *blk1, uint8_t *blk2, int lx, int h); static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2, int lx, int h); static void (*pfind_best_one_pel)( motion_engine_t *engine, uint8_t *org, uint8_t *blk, int searched_size, int i0, int j0, int ilow, int jlow, int xmax, int ymax, int lx, int h, mb_motion_s *res ); static int (*pbuild_sub22_mcomps)( motion_engine_t *engine, int i0, int j0, int ihigh, int jhigh, int null_mc_sad, uint8_t *s22org, uint8_t *s22blk, int flx, int fh, int searched_sub44_size ); static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2, int lx, int h); static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2, int lx, int h); static int dist1_00( uint8_t *blk1, uint8_t *blk2, int lx, int h, int distlim); static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h); static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h); static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h); static int dist2 (uint8_t *blk1, uint8_t *blk2, int lx, int hx, int hy, int h); static int bdist2 (uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h); static int bdist1 (uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h); static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2, int flx, int fh); static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2, int qlx, int qh); static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2, int lx, int h, int distlim); static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h); static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h); static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h); static int (*pdist2) (uint8_t *blk1, uint8_t *blk2, int lx, int hx, int hy, int h); static int (*pbdist2) (uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h); static int (*pbdist1) (uint8_t *pf, uint8_t *pb, uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h); /* DEBUGGER... static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str ) { rx *= 2; ry *= 2; if( motion->sad < 0 || motion->sad > 0x10000 ) { printf( "SAD ooops %s\n", str ); exit(1); } if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx ) { printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx ); exit(1); } if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry ) { printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry ); exit(1); } } static void init_mc( mb_motion_s *motion ) { motion->sad = -123; motion->pos.x = -1000; motion->pos.y = -1000; } */ /* unidir_NI_var_sum Compute the combined variance of luminance and chrominance information for a particular non-intra macro block after unidirectional motion compensation... Note: results are scaled to give chrominance equal weight to chrominance. The variance of the luminance portion is computed at the time the motion compensation is computed. TODO: Perhaps we should compute the whole thing in fullsearch not seperate it. However, that would involve a lot of fiddling with field_* and until its thoroughly debugged and tested I think I'll leave that alone. Furthermore, it is unclear if its really worth doing these computations for B *and* P frames. TODO: BUG: ONLY works for 420 video... */ static int unidir_chrom_var_sum( mb_motion_s *lum_mc, uint8_t **ref, subsampled_mb_s *ssblk, int lx, int h ) { int uvlx = (lx>>1); int uvh = (h>>1); /* N.b. MC co-ordinates are computed in half-pel units! */ int cblkoffset = (lum_mc->fieldoff>>1) + (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx; return ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) + (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2; } /* bidir_NI_var_sum Compute the combined variance of luminance and chrominance information for a particular non-intra macro block after unidirectional motion compensation... Note: results are scaled to give chrominance equal weight to chrominance. The variance of the luminance portion is computed at the time the motion compensation is computed. Note: results scaled to give chrominance equal weight to chrominance. TODO: BUG: ONLY works for 420 video... NOTE: Currently unused but may be required if it turns out that taking chrominance into account in B frames is needed. */ int bidir_chrom_var_sum( mb_motion_s *lum_mc_f, mb_motion_s *lum_mc_b, uint8_t **ref_f, uint8_t **ref_b, subsampled_mb_s *ssblk, int lx, int h ) { int uvlx = (lx>>1); int uvh = (h>>1); /* N.b. MC co-ordinates are computed in half-pel units! */ int cblkoffset_f = (lum_mc_f->fieldoff>>1) + (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx; int cblkoffset_b = (lum_mc_b->fieldoff>>1) + (lum_mc_b->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx; return ( (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b, ssblk->umb, uvlx, uvh ) + (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b, ssblk->vmb, uvlx, uvh ))*2; } static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx ) { return (variance(ssblk->umb,(h>>1),(lx>>1)) + variance(ssblk->vmb,(h>>1),(lx>>1))) * 2; } /* * frame picture motion estimation * * org: top left pel of source reference frame * ref: top left pel of reconstructed reference frame * ssmb: macroblock to be matched * i,j: location of mb relative to ref (=center of search window) * sx,sy: half widths of search window * besfr: location and SAD of best frame prediction * besttop: location of best field pred. for top field of mb * bestbo : location of best field pred. for bottom field of mb */ static void frame_estimate(motion_engine_t *engine, uint8_t *org, uint8_t *ref, subsampled_mb_s *ssmb, int i, int j, int sx, int sy, mb_motion_s *bestfr, mb_motion_s *besttop, mb_motion_s *bestbot, int imins[2][2], int jmins[2][2] ) { subsampled_mb_s botssmb; mb_motion_s topfld_mc; mb_motion_s botfld_mc; botssmb.mb = ssmb->mb+width; botssmb.fmb = ssmb->mb+(width>>1); botssmb.qmb = ssmb->qmb+(width>>2); botssmb.umb = ssmb->umb+(width>>1); botssmb.vmb = ssmb->vmb+(width>>1); /* frame prediction */ fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height, bestfr ); bestfr->fieldsel = 0; bestfr->fieldoff = 0; /* predict top field from top field */ fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1, &topfld_mc); /* predict top field from bottom field */ fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8, width,height>>1, &botfld_mc); /* set correct field selectors... */ topfld_mc.fieldsel = 0; botfld_mc.fieldsel = 1; topfld_mc.fieldoff = 0; botfld_mc.fieldoff = width; imins[0][0] = topfld_mc.pos.x; jmins[0][0] = topfld_mc.pos.y; imins[1][0] = botfld_mc.pos.x; jmins[1][0] = botfld_mc.pos.y; /* select prediction for top field */ if (topfld_mc.sad<=botfld_mc.sad) { *besttop = topfld_mc; } else { *besttop = botfld_mc; } /* predict bottom field from top field */ fullsearch(engine, org,ref,&botssmb, width<<1,i,j>>1,sx,sy>>1,8,width,height>>1, &topfld_mc); /* predict bottom field from bottom field */ fullsearch(engine, org+width,ref+width,&botssmb, width<<1,i,j>>1,sx,sy>>1,8,width,height>>1, &botfld_mc); /* set correct field selectors... */ topfld_mc.fieldsel = 0; botfld_mc.fieldsel = 1; topfld_mc.fieldoff = 0; botfld_mc.fieldoff = width; imins[0][1] = topfld_mc.pos.x; jmins[0][1] = topfld_mc.pos.y; imins[1][1] = botfld_mc.pos.x; jmins[1][1] = botfld_mc.pos.y; /* select prediction for bottom field */ if (botfld_mc.sad<=topfld_mc.sad) { *bestbot = botfld_mc; } else { *bestbot = topfld_mc; } } /* * field picture motion estimation subroutine * * toporg: address of original top reference field * topref: address of reconstructed top reference field * botorg: address of original bottom reference field * botref: address of reconstructed bottom reference field * ssmmb: macroblock to be matched * i,j: location of mb (=center of search window) * sx,sy: half width/height of search window * * bestfld: location and distance of best field prediction * best8u: location of best 16x8 pred. for upper half of mb * best8lp: location of best 16x8 pred. for lower half of mb * bdestsp: location and distance of best same parity field * prediction (needed for dual prime, only valid if * ipflag==0) */ static void field_estimate (motion_engine_t *engine, pict_data_s *picture, uint8_t *toporg, uint8_t *topref, uint8_t *botorg, uint8_t *botref, subsampled_mb_s *ssmb, int i, int j, int sx, int sy, int ipflag, mb_motion_s *bestfld, mb_motion_s *best8u, mb_motion_s *best8l, mb_motion_s *bestsp) { mb_motion_s topfld_mc; mb_motion_s botfld_mc; int dt, db; int notop, nobot; subsampled_mb_s botssmb; botssmb.mb = ssmb->mb+width; botssmb.umb = ssmb->umb+(width>>1); botssmb.vmb = ssmb->vmb+(width>>1); botssmb.fmb = ssmb->fmb+(width>>1); botssmb.qmb = ssmb->qmb+(width>>2); /* if ipflag is set, predict from field of opposite parity only */ notop = ipflag && (picture->pict_struct==TOP_FIELD); nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD); /* field prediction */ /* predict current field from top field */ if (notop) topfld_mc.sad = dt = 65536; /* infinity */ else fullsearch(engine, toporg,topref,ssmb,width<<1, i,j,sx,sy>>1,16,width,height>>1, &topfld_mc); dt = topfld_mc.sad; /* predict current field from bottom field */ if (nobot) botfld_mc.sad = db = 65536; /* infinity */ else fullsearch(engine, botorg,botref,ssmb,width<<1, i,j,sx,sy>>1,16,width,height>>1, &botfld_mc); db = botfld_mc.sad; /* Set correct field selectors */ topfld_mc.fieldsel = 0; botfld_mc.fieldsel = 1; topfld_mc.fieldoff = 0; botfld_mc.fieldoff = width; /* same parity prediction (only valid if ipflag==0) */ if (picture->pict_struct==TOP_FIELD) { *bestsp = topfld_mc; } else { *bestsp = botfld_mc; } /* select field prediction */ if (dt<=db) { *bestfld = topfld_mc; } else { *bestfld = botfld_mc; } /* 16x8 motion compensation */ /* predict upper half field from top field */ if (notop) topfld_mc.sad = dt = 65536; else fullsearch(engine, toporg,topref,ssmb,width<<1, i,j,sx,sy>>1,8,width,height>>1, &topfld_mc); dt = topfld_mc.sad; /* predict upper half field from bottom field */ if (nobot) botfld_mc.sad = db = 65536; else fullsearch(engine, botorg,botref,ssmb,width<<1, i,j,sx,sy>>1,8,width,height>>1, &botfld_mc); db = botfld_mc.sad; /* Set correct field selectors */ topfld_mc.fieldsel = 0; botfld_mc.fieldsel = 1; topfld_mc.fieldoff = 0; botfld_mc.fieldoff = width; /* select prediction for upper half field */ if (dt<=db) { *best8u = topfld_mc; } else { *best8u = botfld_mc; } /* predict lower half field from top field */ /* N.b. For interlaced data width<<4 (width*16) takes us 8 rows down in the same field. Thus for the fast motion data (2*2 sub-sampled) we need to go 4 rows down in the same field. This requires adding width*4 = (width<<2). For the 4*4 sub-sampled motion data we need to go down 2 rows. This requires adding width = width */ if (notop) topfld_mc.sad = dt = 65536; else fullsearch(engine, toporg,topref,&botssmb, width<<1, i,j+8,sx,sy>>1,8,width,height>>1, /* &imint,&jmint, &dt,*/ &topfld_mc); dt = topfld_mc.sad; /* predict lower half field from bottom field */ if (nobot) botfld_mc.sad = db = 65536; else fullsearch(engine, botorg,botref,&botssmb,width<<1, i,j+8,sx,sy>>1,8,width,height>>1, /* &iminb,&jminb, &db,*/ &botfld_mc); db = botfld_mc.sad; /* Set correct field selectors */ topfld_mc.fieldsel = 0; botfld_mc.fieldsel = 1; topfld_mc.fieldoff = 0; botfld_mc.fieldoff = width; /* select prediction for lower half field */ if (dt<=db) { *best8l = topfld_mc; } else { *best8l = botfld_mc; } } #if 0 static void dpframe_estimate (motion_engine_t *engine, pict_data_s *picture, uint8_t *ref, subsampled_mb_s *ssmb, int i, int j, int iminf[2][2], int jminf[2][2], mb_motion_s *best_mc, int *imindmvp, int *jmindmvp, int *vmcp) { int pref,ppred,delta_x,delta_y; int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0; int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv; int vmc,local_dist; /* Calculate Dual Prime distortions for 9 delta candidates * for each of the four minimum field vectors * Note: only for P pictures! */ /* initialize minimum dual prime distortion to large value */ vmc = INT_MAX; for (pref=0; pref<2; pref++) { for (ppred=0; ppred<2; ppred++) { /* convert Cartesian absolute to relative motion vector * values (wrt current macroblock address (i,j) */ is = iminf[pref][ppred] - (i<<1); js = jminf[pref][ppred] - (j<<1); if (pref!=ppred) { /* vertical field shift adjustment */ if (ppred==0) js++; else js--; /* mvxs and mvys scaling*/ is<<=1; js<<=1; if (picture->topfirst == ppred) { /* second field: scale by 1/3 */ is = (is>=0) ? (is+1)/3 : -((-is+1)/3); js = (js>=0) ? (js+1)/3 : -((-js+1)/3); } else continue; } /* vector for prediction from field of opposite 'parity' */ if (picture->topfirst) { /* vector for prediction of top field from bottom field */ it0 = ((is+(is>0))>>1); jt0 = ((js+(js>0))>>1) - 1; /* vector for prediction of bottom field from top field */ ib0 = ((3*is+(is>0))>>1); jb0 = ((3*js+(js>0))>>1) + 1; } else { /* vector for prediction of top field from bottom field */ it0 = ((3*is+(is>0))>>1); jt0 = ((3*js+(js>0))>>1) - 1; /* vector for prediction of bottom field from top field */ ib0 = ((is+(is>0))>>1); jb0 = ((js+(js>0))>>1) + 1; } /* convert back to absolute half-pel field picture coordinates */ is += i<<1; js += j<<1; it0 += i<<1; jt0 += j<<1; ib0 += i<<1; jb0 += j<<1; if (is >= 0 && is <= (width-16)<<1 && js >= 0 && js <= (height-16)) { for (delta_y=-1; delta_y<=1; delta_y++) { for (delta_x=-1; delta_x<=1; delta_x++) { /* opposite field coordinates */ it = it0 + delta_x; jt = jt0 + delta_y; ib = ib0 + delta_x; jb = jb0 + delta_y; if (it >= 0 && it <= (width-16)<<1 && jt >= 0 && jt <= (height-16) && ib >= 0 && ib <= (width-16)<<1 && jb >= 0 && jb <= (height-16)) { /* compute prediction error */ local_dist = (*pbdist2)( ref + (is>>1) + (width<<1)*(js>>1), ref + width + (it>>1) + (width<<1)*(jt>>1), ssmb->mb, /* current mb location */ width<<1, /* adjacent line distance */ is&1, js&1, it&1, jt&1, /* half-pel flags */ 8); /* block height */ local_dist += (*pbdist2)( ref + width + (is>>1) + (width<<1)*(js>>1), ref + (ib>>1) + (width<<1)*(jb>>1), ssmb->mb + width, /* current mb location */ width<<1, /* adjacent line distance */ is&1, js&1, ib&1, jb&1, /* half-pel flags */ 8); /* block height */ /* update delta with least distortion vector */ if (local_dist < vmc) { imins = is; jmins = js; imint = it; jmint = jt; iminb = ib; jminb = jb; imindmv = delta_x; jmindmv = delta_y; vmc = local_dist; } } } /* end delta x loop */ } /* end delta y loop */ } } } /* TODO: This is now likely to be obsolete... */ /* Compute L1 error for decision purposes */ local_dist = (*pbdist1)( ref + (imins>>1) + (width<<1)*(jmins>>1), ref + width + (imint>>1) + (width<<1)*(jmint>>1), ssmb->mb, width<<1, imins&1, jmins&1, imint&1, jmint&1, 8); //printf("motion 1 %p\n", pbdist1); local_dist += (*pbdist1)( ref + width + (imins>>1) + (width<<1)*(jmins>>1), ref + (iminb>>1) + (width<<1)*(jminb>>1), ssmb->mb + width, width<<1, imins&1, jmins&1, iminb&1, jminb&1, 8); //printf("motion 2\n"); best_mc->sad = local_dist; best_mc->pos.x = imins; best_mc->pos.y = jmins; *vmcp = vmc; *imindmvp = imindmv; *jmindmvp = jmindmv; //printf("motion 2\n"); } static void dpfield_estimate(motion_engine_t *engine, pict_data_s *picture, uint8_t *topref, uint8_t *botref, uint8_t *mb, int i, int j, int imins, int jmins, mb_motion_s *bestdp_mc, int *vmcp ) { uint8_t *sameref, *oppref; int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0; int imino,jmino,imindmv,vmc_dp,local_dist; /* Calculate Dual Prime distortions for 9 delta candidates */ /* Note: only for P pictures! */ /* Assign opposite and same reference pointer */ if (picture->pict_struct==TOP_FIELD) { sameref = topref; oppref = botref; } else { sameref = botref; oppref = topref; } /* convert Cartesian absolute to relative motion vector * values (wrt current macroblock address (i,j) */ mvxs = imins - (i<<1); mvys = jmins - (j<<1); /* vector for prediction from field of opposite 'parity' */ mvxo0 = (mvxs+(mvxs>0)) >> 1; /* mvxs / / */ mvyo0 = (mvys+(mvys>0)) >> 1; /* mvys / / 2*/ /* vertical field shift correction */ if (picture->pict_struct==TOP_FIELD) mvyo0--; else mvyo0++; /* convert back to absolute coordinates */ io0 = mvxo0 + (i<<1); jo0 = mvyo0 + (j<<1); /* initialize minimum dual prime distortion to large value */ vmc_dp = 1 << 30; for (delta_y = -1; delta_y <= 1; delta_y++) { for (delta_x = -1; delta_x <=1; delta_x++) { /* opposite field coordinates */ io = io0 + delta_x; jo = jo0 + delta_y; if (io >= 0 && io <= (width-16)<<1 && jo >= 0 && jo <= (height2-16)<<1) { /* compute prediction error */ local_dist = (*pbdist2)( sameref + (imins>>1) + width2*(jmins>>1), oppref + (io>>1) + width2*(jo>>1), mb, /* current mb location */ width2, /* adjacent line distance */ imins&1, jmins&1, io&1, jo&1, /* half-pel flags */ 16); /* block height */ /* update delta with least distortion vector */ if (local_dist < vmc_dp) { imino = io; jmino = jo; imindmv = delta_x; vmc_dp = local_dist; } } } /* end delta x loop */ } /* end delta y loop */ /* Compute L1 error for decision purposes */ bestdp_mc->sad = (*pbdist1)( sameref + (imins>>1) + width2*(jmins>>1), oppref + (imino>>1) + width2*(jmino>>1), mb, /* current mb location */ width2, /* adjacent line distance */ imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */ 16); /* block height */ bestdp_mc->pos.x = imindmv; bestdp_mc->pos.x = imindmv; *vmcp = vmc_dp; } #endif /* * Vectors of motion compensations */ /* Take a vector of motion compensations and repeatedly make passes discarding all elements whose sad "weight" is above the current mean weight. */ static void sub_mean_reduction( mc_result_s *matches, int len, int times, int *newlen_res, int *minweight_res) { int i,j; int weight_sum; int mean_weight; if( len == 0 ) { *minweight_res = 100000; *newlen_res = 0; return; } for(;;) { weight_sum = 0; for( i = 0; i < len ; ++i ) weight_sum += matches[i].weight; mean_weight = weight_sum / len; if( times <= 0) break; j = 0; for( i =0; i < len; ++i ) { if( matches[i].weight <= mean_weight ) { matches[j] = matches[i]; ++j; } } len = j; --times; } *newlen_res = len; *minweight_res = mean_weight; } /* Build a vector of the top 4*4 sub-sampled motion compensations in the box (ilow,jlow) to (ihigh,jhigh). The algorithm is as follows: 1. coarse matches on an 8*8 grid of positions are collected that fall below a (very conservative) sad threshold (basically 50% more than moving average of the mean sad of such matches). 2. The worse than-average matches are discarded. 3. The remaining coarse matches are expanded with the left/lower neighbouring 4*4 grid matches. Again only those better than a threshold (this time the mean of the 8*8 grid matches are retained. 4. Multiple passes are made discarding worse than-average matches. The number of passes is specified by the user. The default it 2 (leaving roughly 1/4 of the matches). The net result is very fast and find good matches if they're to be found. I.e. the penalty over exhaustive search is pretty low. NOTE: The "discard below average" trick depends critically on having some variation in the matches. The slight penalty imposed for distant matches (reasonably since the motion vectors have to be encoded) is *vital* as otherwise pathologically bad performance results on highly uniform images. TODO: We should probably allow the user to eliminate the initial thinning of 8*8 grid matches if ultimate quality is demanded (e.g. for low bit-rate applications). */ static int build_sub44_mcomps(motion_engine_t *engine, int ilow, int jlow, int ihigh, int jhigh, int i0, int j0, int null_mc_sad, uint8_t *s44org, uint8_t *s44blk, int qlx, int qh ) { uint8_t *s44orgblk; int istrt = ilow-i0; int jstrt = jlow-j0; int iend = ihigh-i0; int jend = jhigh-j0; int mean_weight; int threshold; #ifdef X86_CPU /*int rangex, rangey; static int rough_num_mcomps; static mc_result_s rough_mcomps[MAX_44_MATCHES]; int k; */ #else int i,j; int s1; uint8_t *old_s44orgblk; #endif /* N.b. we may ignore the right hand block of the pair going over the right edge as we have carefully allocated the buffer oversize to ensure no memory faults. The later motion compensation calculations performed on the results of this pass will filter out out-of-range blocks... */ engine->sub44_num_mcomps = 0; threshold = 6*null_mc_sad / (4*4*mc_44_red); s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2); /* Exhaustive search on 4*4 sub-sampled data. This is affordable because (a) it is only 16th of the size of the real 1-pel data (b) we ignore those matches with an sad above our threshold. */ #ifndef X86_CPU /* Invariant: s44orgblk = s44org+(i>>2)+qlx*(j>>2) */ s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2); for( j = jstrt; j <= jend; j += 4 ) { old_s44orgblk = s44orgblk; for( i = istrt; i <= iend; i += 4 ) { s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0); if( s1 < threshold ) { engine->sub44_mcomps[engine->sub44_num_mcomps].x = i; engine->sub44_mcomps[engine->sub44_num_mcomps].y = j; engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ; ++engine->sub44_num_mcomps; } s44orgblk += 1; } s44orgblk = old_s44orgblk + qlx; } #else engine->sub44_num_mcomps = (*pmblock_sub44_dists)( s44orgblk, s44blk, istrt, jstrt, iend, jend, qh, qlx, threshold, engine->sub44_mcomps); #endif /* If we're really pushing quality we reduce once otherwise twice. */ sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1), &engine->sub44_num_mcomps, &mean_weight); return engine->sub44_num_mcomps; } /* Build a vector of the best 2*2 sub-sampled motion compensations using the best 4*4 matches as starting points. As with with the 4*4 matches We don't collect them densely as they're just search starting points for 1-pel search and ones that are 1 out should still give better than average matches... */ static int build_sub22_mcomps(motion_engine_t *engine, int i0, int j0, int ihigh, int jhigh, int null_mc_sad, uint8_t *s22org, uint8_t *s22blk, int flx, int fh, int searched_sub44_size ) { int i,k,s; int threshold = 6*null_mc_sad / (2 * 2*mc_22_red); int min_weight; int ilim = ihigh-i0; int jlim = jhigh-j0; blockxy matchrec; uint8_t *s22orgblk; engine->sub22_num_mcomps = 0; for( k = 0; k < searched_sub44_size; ++k ) { matchrec.x = engine->sub44_mcomps[k].x; matchrec.y = engine->sub44_mcomps[k].y; s22orgblk = s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1); for( i = 0; i < 4; ++i ) { if( matchrec.x <= ilim && matchrec.y <= jlim ) { s = (*pdist22)( s22orgblk,s22blk,flx,fh); if( s < threshold ) { engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x; engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y; engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s; ++engine->sub22_num_mcomps; } } if( i == 1 ) { s22orgblk += flx-1; matchrec.x -= 2; matchrec.y += 2; } else { s22orgblk += 1; matchrec.x += 2; } } } sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2, &engine->sub22_num_mcomps, &min_weight ); return engine->sub22_num_mcomps; } #ifdef X86_CPU int build_sub22_mcomps_mmxe(motion_engine_t *engine, int i0, int j0, int ihigh, int jhigh, int null_mc_sad, uint8_t *s22org, uint8_t *s22blk, int flx, int fh, int searched_sub44_size ) { int i,k,s; int threshold = 6*null_mc_sad / (2 * 2*mc_22_red); int min_weight; int ilim = ihigh-i0; int jlim = jhigh-j0; blockxy matchrec; uint8_t *s22orgblk; int resvec[4]; /* TODO: The calculation of the lstrow offset really belongs in asm code... */ int lstrow=(fh-1)*flx; engine->sub22_num_mcomps = 0; for( k = 0; k < searched_sub44_size; ++k ) { matchrec.x = engine->sub44_mcomps[k].x; matchrec.y = engine->sub44_mcomps[k].y; s22orgblk = s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1); mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec); for( i = 0; i < 4; ++i ) { if( matchrec.x <= ilim && matchrec.y <= jlim ) { s =resvec[i]; if( s < threshold ) { engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x; engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y; engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s; ++engine->sub22_num_mcomps; } } if( i == 1 ) { matchrec.x -= 2; matchrec.y += 2; } else { matchrec.x += 2; } } } sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2, &engine->sub22_num_mcomps, &min_weight ); return engine->sub22_num_mcomps; } #endif /* Search for the best 1-pel match within 1-pel of a good 2*2-pel match. TODO: Its a bit silly to cart around absolute M/C co-ordinates that eventually get turned into relative ones anyway... */ static void find_best_one_pel(motion_engine_t *engine, uint8_t *org, uint8_t *blk, int searched_size, int i0, int j0, int ilow, int jlow, int xmax, int ymax, int lx, int h, mb_motion_s *res ) { int i,k; int d; blockxy minpos = res->pos; int dmin = INT_MAX; uint8_t *orgblk; int penalty; blockxy matchrec; for( k = 0; k < searched_size; ++k ) { matchrec.x = i0 + engine->sub22_mcomps[k].x; matchrec.y = j0 + engine->sub22_mcomps[k].y; orgblk = org + matchrec.x+lx*matchrec.y; penalty = abs(matchrec.x)+abs(matchrec.y); for( i = 0; i < 4; ++i ) { if( matchrec.x <= xmax && matchrec.y <= ymax ) { d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin); if (dpos = minpos; res->blk = org + minpos.x+lx*minpos.y; res->sad = dmin; } #ifdef X86_CPU void find_best_one_pel_mmxe(motion_engine_t *engine, uint8_t *org, uint8_t *blk, int searched_size, int i0, int j0, int ilow, int jlow, int xmax, int ymax, int lx, int h, mb_motion_s *res ) { int i,k; int d; blockxy minpos = res->pos; int dmin = INT_MAX; uint8_t *orgblk; int penalty; blockxy matchrec; int resvec[4]; for( k = 0; k < searched_size; ++k ) { matchrec.x = i0 + engine->sub22_mcomps[k].x; matchrec.y = j0 + engine->sub22_mcomps[k].y; orgblk = org + matchrec.x+lx*matchrec.y; penalty = abs(matchrec.x)+abs(matchrec.y); mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec); for( i = 0; i < 4; ++i ) { if( matchrec.x <= xmax && matchrec.y <= ymax ) { d = penalty+resvec[i]; if (dpos = minpos; res->blk = org + minpos.x+lx*minpos.y; res->sad = dmin; } #endif /* * full search block matching * * A.Stevens 2000: This is now a big misnomer. The search is now a hierarchical/sub-sampling * search not a full search. However, experiments have shown it is always close to * optimal and almost always very close or optimal. * * blk: top left pel of (16*h) block * s22blk: top element of fast motion compensation block corresponding to blk * h: height of block * lx: distance (in bytes) of vertically adjacent pels in ref,blk * org: top left pel of source reference picture * ref: top left pel of reconstructed reference picture * i0,j0: center of search window * sx,sy: half widths of search window * xmax,ymax: right/bottom limits of search area * iminp,jminp: pointers to where the result is stored * result is given as half pel offset from ref(0,0) * i.e. NOT relative to (i0,j0) */ static void fullsearch(motion_engine_t *engine, uint8_t *org, uint8_t *ref, subsampled_mb_s *ssblk, int lx, int i0, int j0, int sx, int sy, int h, int xmax, int ymax, /* int *iminp, int *jminp, int *sadminp, */ mb_motion_s *res ) { mb_motion_s best; /* int imin, jmin, dmin */ int i,j,ilow,ihigh,jlow,jhigh; int d; /* NOTE: Surprisingly, the initial motion compensation search works better when the original image not the reference (reconstructed) image is used. */ uint8_t *s22org = (uint8_t*)(org+fsubsample_offset); uint8_t *s44org = (uint8_t*)(org+qsubsample_offset); uint8_t *orgblk; int flx = lx >> 1; int qlx = lx >> 2; int fh = h >> 1; int qh = h >> 2; /* xmax and ymax into more useful form... */ xmax -= 16; ymax -= h; /* The search radii are *always* multiples of 4 to avoid messiness in the initial 4*4 pel search. This is handled by the parameter checking/processing code in readparmfile() */ /* Create a distance-order mcomps of possible motion compensations based on the fast estimation data - 4*4 pel sums (4*4 sub-sampled) rather than actual pel's. 1/16 the size... */ jlow = j0-sy; jlow = jlow < 0 ? 0 : jlow; jhigh = j0+(sy-1); jhigh = jhigh > ymax ? ymax : jhigh; ilow = i0-sx; ilow = ilow < 0 ? 0 : ilow; ihigh = i0+(sx-1); ihigh = ihigh > xmax ? xmax : ihigh; /* Very rarely this may fail to find matchs due to all the good looking ones being over threshold. hence we make sure we fall back to a 0 motion compensation in this case. The sad for the 0 motion compensation is also very useful as a basis for setting thresholds for rejecting really dud 4*4 and 2*2 sub-sampled matches. */ memset(&best, 0, sizeof(best)); best.sad = (*pdist1_00)(ref + i0 + j0 * lx, ssblk->mb, lx, h, best.sad); best.pos.x = i0; best.pos.y = j0; engine->sub44_num_mcomps = build_sub44_mcomps(engine, ilow, jlow, ihigh, jhigh, i0, j0, best.sad, s44org, ssblk->qmb, qlx, qh ); /* Now create a distance-ordered mcomps of possible motion compensations based on the fast estimation data - 2*2 pel sums using the best fraction of the 4*4 estimates However we cover only coarsely... on 4-pel boundaries... */ engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh, jhigh, best.sad, s22org, ssblk->fmb, flx, fh, engine->sub44_num_mcomps ); /* Now choose best 1-pel match from what approximates (not exact due to the pre-processing trick with the mean) the top 1/2 of the 2*2 matches */ (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps, i0, j0, ilow, jlow, xmax, ymax, lx, h, &best ); /* Final polish: half-pel search of best candidate against reconstructed image. */ best.pos.x <<= 1; best.pos.y <<= 1; best.hx = 0; best.hy = 0; ilow = best.pos.x - (best.pos.x>(ilow<<1)); ihigh = best.pos.x + (best.pos.x<((ihigh)<<1)); jlow = best.pos.y - (best.pos.y>(jlow<<1)); jhigh = best.pos.y+ (best.pos.y<((jhigh)<<1)); for (j=jlow; j<=jhigh; j++) { for (i=ilow; i<=ihigh; i++) { orgblk = ref+(i>>1)+((j>>1)*lx); if( i&1 ) { if( j & 1 ) d = (*pdist1_11)(orgblk,ssblk->mb,lx,h); else d = (*pdist1_01)(orgblk,ssblk->mb,lx,h); } else { if( j & 1 ) d = (*pdist1_10)(orgblk,ssblk->mb,lx,h); else d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad); } if (dmb, lx, best.hx, best.hy, h); *res = best; } /* * total absolute difference between two (16*h) blocks * including optional half pel interpolation of blk1 (hx,hy) * blk1,blk2: addresses of top left pels of both blocks * lx: distance (in bytes) of vertically adjacent pels * hx,hy: flags for horizontal and/or vertical interpolation * h: height of block (usually 8 or 16) * distlim: bail out if sum exceeds this value */ /* A.Stevens 2000: New version for highly pipelined CPUs where branching is costly. Really it sucks that C doesn't define a stdlib abs that could be realised as a compiler intrinsic using appropriate CPU instructions. That 1970's heritage... */ static int dist1_00(uint8_t *blk1,uint8_t *blk2, int lx, int h,int distlim) { uint8_t *p1,*p2; int j; int s; int v; s = 0; p1 = blk1; p2 = blk2; for (j=0; j= distlim) break; p1+= lx; p2+= lx; } return s; } static int dist1_01(uint8_t *blk1,uint8_t *blk2, int lx, int h) { uint8_t *p1,*p2; int i,j; int s; int v; s = 0; p1 = blk1; p2 = blk2; for (j=0; j>1) - p2[i]; /* v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1); */ s+=abs(v); } p1+= lx; p2+= lx; } return s; } static int dist1_10(uint8_t *blk1,uint8_t *blk2, int lx, int h) { uint8_t *p1,*p1a,*p2; int i,j; int s; int v; s = 0; p1 = blk1; p2 = blk2; p1a = p1 + lx; for (j=0; j>1) - p2[i]; s+=abs(v); } p1 = p1a; p1a+= lx; p2+= lx; } return s; } static int dist1_11(uint8_t *blk1,uint8_t *blk2, int lx, int h) { uint8_t *p1,*p1a,*p2; int i,j; int s; int v; s = 0; p1 = blk1; p2 = blk2; p1a = p1 + lx; for (j=0; j>2) - p2[i]; s+=abs(v); } p1 = p1a; p1a+= lx; p2+= lx; } return s; } /* USED only during debugging... void check_fast_motion_data(uint8_t *blk, char *label ) { uint8_t *b, *nb; uint8_t *pb,*p; uint8_t *qb; uint8_t *start_s22blk, *start_s44blk; int i; unsigned int mismatch; int nextfieldline; start_s22blk = (blk+height*width); start_s44blk = (blk+height*width+(height>>1)*(width>>1)); if (pict_struct==FRAME_PICTURE) { nextfieldline = width; } else { nextfieldline = 2*width; } mismatch = 0; pb = start_s22blk; qb = start_s44blk; p = blk; while( pb < qb ) { for( i = 0; i < nextfieldline/2; ++i ) { mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb; p += 2; ++pb; } p += nextfieldline; } if( mismatch ) { printf( "Mismatch detected check %s for buffer %08x\n", label, blk ); exit(1); } } */ /* Append fast motion estimation data to original luminance data. N.b. memory allocation for luminance data allows space for this information... */ void fast_motion_data(uint8_t *blk, int picture_struct ) { uint8_t *b, *nb; uint8_t *pb; uint8_t *qb; uint8_t *start_s22blk, *start_s44blk; int i; int nextfieldline; #ifdef TEST_RCSEARCH uint16_t *start_rowblk, *start_colblk; uint16_t *pc, *pr,*p; int rowsum; int j,s; int down16 = width*16; uint16_t sums[32]; uint16_t rowsums[2048]; uint16_t colsums[2048]; /* TODO: BUG: should resize with width */ start_rowblk = (uint16_t *)blk+rowsums_offset; start_colblk = (uint16_t *)blk+colsums_offset; #endif /* In an interlaced field the "next" line is 2 width's down rather than 1 width down */ if (picture_struct==FRAME_PICTURE) { nextfieldline = width; } else { nextfieldline = 2*width; } start_s22blk = blk+fsubsample_offset; start_s44blk = blk+qsubsample_offset; b = blk; nb = (blk+nextfieldline); /* Sneaky stuff here... we can do lines in both fields at once */ pb = (uint8_t *) start_s22blk; while( nb < start_s22blk ) { for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */ { /* TODO: A.Stevens this has to be the most word-length dependent code in the world. Better than MMX assembler though I guess... */ pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2; pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2; pb += 2; b += 4; nb += 4; } b += nextfieldline; nb = b + nextfieldline; } /* Now create the 4*4 sub-sampled data from the 2*2 N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the original. Albeit half as many lines and pixels... */ nextfieldline = nextfieldline >> 1; qb = start_s44blk; b = start_s22blk; nb = (start_s22blk+nextfieldline); while( nb < start_s44blk ) { for( i = 0; i < nextfieldline/4; ++i ) { /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */ qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2; qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2; qb += 2; b += 4; nb += 4; } b += nextfieldline; nb = b + nextfieldline; } #ifdef TEST_RCSEARCH /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */ /* Initial row sums.... */ pb = blk; for(j = 0; j < height; ++j ) { rowsum = 0; for( i = 0; i < 16; ++ i ) { rowsum += pb[i]; } rowsums[j] = rowsum; pb += width; } /* Initial column sums */ for( i = 0; i < width; ++i ) { colsums[i] = 0; } pb = blk; for( j = 0; j < 16; ++j ) { for( i = 0; i < width; ++i ) { colsums[i] += *pb; ++pb; } } /* Now fill in the row/column sum tables... Note: to allow efficient construction of sum/col differences for a given position row sums are held in a *column major* array (changing y co-ordinate makes for small index changes) the col sums are held in a *row major* array */ pb = blk; pc = start_colblk; for(j = 0; j <32; ++j ) { pr = start_rowblk; rowsum = rowsums[j]; for( i = 0; i < width-16; ++i ) { pc[i] = colsums[i]; pr[j] = rowsum; colsums[i] = (colsums[i] + pb[down16] )-pb[0]; rowsum = (rowsum + pb[16]) - pb[0]; ++pb; pr += height; } pb += 16; /* move pb on to next row... rememember we only did width-16! */ pc += width; } #endif } static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh) { uint8_t *p1 = s22blk1; uint8_t *p2 = s22blk2; int s = 0; int j; for( j = 0; j < fh; ++j ) { int diff; #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) pipestep(0); pipestep(1); pipestep(2); pipestep(3); pipestep(4); pipestep(5); pipestep(6); pipestep(7); p1 += flx; p2 += flx; #undef pipestep } return s; } /* Sum absolute differences for 4*4 sub-sampled data. TODO: currently assumes only 16*16 or 16*8 motion compensation will be used... I.e. 4*4 or 4*2 sub-sampled blocks will be compared. */ static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh) { uint8_t *p1 = s44blk1; uint8_t *p2 = s44blk2; int s = 0; int diff; /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */ #define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff; pipestep(0); pipestep(1); pipestep(2); pipestep(3); if( qh > 1 ) { p1 += qlx; p2 += qlx; pipestep(0); pipestep(1); pipestep(2); pipestep(3); if( qh > 2 ) { p1 += qlx; p2 += qlx; pipestep(0); pipestep(1); pipestep(2); pipestep(3); p1 += qlx; p2 += qlx; pipestep(0); pipestep(1); pipestep(2); pipestep(3); } } return s; } /* * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels * blk1,blk2: addresses of top left pels of both blocks * lx: distance (in bytes) of vertically adjacent pels * h: height of block (usually 8 or 16) */ static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h) { uint8_t *p1 = blk1, *p2 = blk2; int i,j,v; int s = 0; for (j=0; j>1) - p2[i]; s+= v*v; } p1f+= lx; p1b+= lx; p2+= lx; } return s; } /* * total squared difference between two (16*h) blocks * including optional half pel interpolation of blk1 (hx,hy) * blk1,blk2: addresses of top left pels of both blocks * lx: distance (in bytes) of vertically adjacent pels * hx,hy: flags for horizontal and/or vertical interpolation * h: height of block (usually 8 or 16) */ static int dist2(blk1,blk2,lx,hx,hy,h) uint8_t *blk1,*blk2; int lx,hx,hy,h; { uint8_t *p1,*p1a,*p2; int i,j; int s,v; s = 0; p1 = blk1; p2 = blk2; if (!hx && !hy) for (j=0; j>1) - p2[i]; s+= v*v; } p1+= lx; p2+= lx; } else if (!hx && hy) { p1a = p1 + lx; for (j=0; j>1) - p2[i]; s+= v*v; } p1 = p1a; p1a+= lx; p2+= lx; } } else /* if (hx && hy) */ { p1a = p1 + lx; for (j=0; j>2) - p2[i]; s+= v*v; } p1 = p1a; p1a+= lx; p2+= lx; } } return s; } /* * absolute difference error between a (16*h) block and a bidirectional * prediction * * p2: address of top left pel of block * pf,hxf,hyf: address and half pel flags of forward ref. block * pb,hxb,hyb: address and half pel flags of backward ref. block * h: height of block * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb */ static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h) uint8_t *pf,*pb,*p2; int lx,hxf,hyf,hxb,hyb,h; { uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc; int i,j; int s,v; pfa = pf + hxf; pfb = pf + lx*hyf; pfc = pfb + hxf; pba = pb + hxb; pbb = pb + lx*hyb; pbc = pbb + hxb; s = 0; for (j=0; j>2) + ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1) - *p2++; s += abs(v); } p2+= lx-16; pf+= lx-16; pfa+= lx-16; pfb+= lx-16; pfc+= lx-16; pb+= lx-16; pba+= lx-16; pbb+= lx-16; pbc+= lx-16; } return s; } /* * squared error between a (16*h) block and a bidirectional * prediction * * p2: address of top left pel of block * pf,hxf,hyf: address and half pel flags of forward ref. block * pb,hxb,hyb: address and half pel flags of backward ref. block * h: height of block * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb */ static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h) uint8_t *pf,*pb,*p2; int lx,hxf,hyf,hxb,hyb,h; { uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc; int i,j; int s,v; pfa = pf + hxf; pfb = pf + lx*hyf; pfc = pfb + hxf; pba = pb + hxb; pbb = pb + lx*hyb; pbc = pbb + hxb; s = 0; for (j=0; j>2) + ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1) - *p2++; s+=v*v; } p2+= lx-16; pf+= lx-16; pfa+= lx-16; pfb+= lx-16; pfc+= lx-16; pb+= lx-16; pba+= lx-16; pbb+= lx-16; pbc+= lx-16; } return s; } /* * variance of a (size*size) block, multiplied by 256 * p: address of top left pel of block * lx: distance (in bytes) of vertically adjacent pels */ static int variance(uint8_t *p, int size, int lx) { int i,j; unsigned int v,s,s2; s = s2 = 0; for (j=0; jblk, mb, lx, motion->hx, motion->hy, h); } /* Compute the variance of the residual of bi-directionally motion compensated block. */ static int bidir_pred_var( const mb_motion_s *motion_f, const mb_motion_s *motion_b, uint8_t *mb, int lx, int h) { return (*pbdist2)( motion_f->blk, motion_b->blk, mb, lx, motion_f->hx, motion_f->hy, motion_b->hx, motion_b->hy, h); } /* Compute SAD for bi-directionally motion compensated blocks... */ static int bidir_pred_sad( const mb_motion_s *motion_f, const mb_motion_s *motion_b, uint8_t *mb, int lx, int h) { return (*pbdist1)(motion_f->blk, motion_b->blk, mb, lx, motion_f->hx, motion_f->hy, motion_b->hx, motion_b->hy, h); } static void frame_ME(motion_engine_t *engine, pict_data_s *picture, motion_comp_s *mc, int mb_row_start, int i, int j, mbinfo_s *mbi) { mb_motion_s framef_mc; mb_motion_s frameb_mc; mb_motion_s dualpf_mc; mb_motion_s topfldf_mc; mb_motion_s botfldf_mc; mb_motion_s topfldb_mc; mb_motion_s botfldb_mc; int var,v0; int vmc,vmcf,vmcr,vmci; int vmcfieldf,vmcfieldr,vmcfieldi; subsampled_mb_s ssmb; int imins[2][2],jmins[2][2]; // int vmc_dp; int imindmv,jmindmv; imindmv = 0; jmindmv = 0; memset(&dualpf_mc,0,sizeof(dualpf_mc)); //printf("frame_ME 1\n"); /* A.Stevens fast motion estimation data is appended to actual luminance information */ ssmb.mb = mc->cur[0] + mb_row_start + i; ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2)); ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2)); ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset + ((i>>1) + (mb_row_start>>2))); ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset + (i>>2) + (mb_row_start>>4)); /* Compute variance MB as a measure of Intra-coding complexity We include chrominance information here, scaled to compensate for sub-sampling. Silly MPEG forcing chrom/lum to have same quantisations... */ var = variance(ssmb.mb,16,width); //printf("motion %d\n", picture->pict_type); if (picture->pict_type==I_TYPE) { mbi->mb_type = MB_INTRA; } else if (picture->pict_type==P_TYPE) { /* For P pictures we take into account chrominance. This provides much better performance at scene changes */ var += chrom_var_sum(&ssmb,16,width); if (picture->frame_pred_dct) { fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb, width,i,j,mc->sxf,mc->syf,16,width,height, &framef_mc); framef_mc.fieldoff = 0; vmc = framef_mc.var + unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 ); mbi->motion_type = MC_FRAME; } else { //printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]); frame_estimate(engine, mc->oldorg[0], mc->oldref[0], &ssmb, i, j, mc->sxf, mc->syf, &framef_mc, &topfldf_mc, &botfldf_mc, imins, jmins); //printf("frame_ME 3\n"); vmcf = framef_mc.var + unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 ); vmcfieldf = topfldf_mc.var + unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) + botfldf_mc.var + unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ); /* DEBUG DP currently disabled... */ // if ( M==1) // { // dpframe_estimate(engine, picture,mc->oldref[0],&ssmb, // i,j>>1,imins,jmins, // &dualpf_mc, // &imindmv,&jmindmv, &vmc_dp); // } /* NOTE: Typically M =3 so DP actually disabled... */ /* select between dual prime, frame and field prediction */ // if ( M==1 && vmc_dpmotion_type = MC_DMV; // /* No chrominance squared difference measure yet. // Assume identical to luminance */ // vmc = vmc_dp + vmc_dp; // } // else if ( vmcf < vmcfieldf) { mbi->motion_type = MC_FRAME; vmc = vmcf; } else { mbi->motion_type = MC_FIELD; vmc = vmcfieldf; } } /* select between intra or non-intra coding: * * selection is based on intra block variance (var) vs. * prediction error variance (vmc) * * Used to be: blocks with small prediction error are always * coded non-intra even if variance is smaller (is this reasonable? * * TODO: A.Stevens Jul 2000 * The bbmpeg guys have found this to be *unreasonable*. * I'm not sure I buy their solution using vmc*2. It is probabably * the vmc>= 9*256 test that is suspect. * */ if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ ) { mbi->mb_type = MB_INTRA; mbi->var = var; } else { /* select between MC / No-MC * * use No-MC if var(No-MC) <= 1.25*var(MC) * (i.e slightly biased towards No-MC) * * blocks with small prediction error are always coded as No-MC * (requires no motion vectors, allows skipping) */ v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16); if (4*v0>5*vmc ) { /* use MC */ var = vmc; mbi->mb_type = MB_FORWARD; if (mbi->motion_type==MC_FRAME) { mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1); mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1); } else if (mbi->motion_type==MC_DMV) { /* these are FRAME vectors */ /* same parity vector */ mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1); mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1); /* opposite parity vector */ mbi->dmvector[0] = imindmv; mbi->dmvector[1] = jmindmv; } else { /* these are FRAME vectors */ mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1); mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1); mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1); mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1); mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel; mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel; } } else { /* No-MC */ var = v0; mbi->mb_type = 0; mbi->motion_type = MC_FRAME; mbi->MV[0][0][0] = 0; mbi->MV[0][0][1] = 0; } } } else /* if (pict_type==B_TYPE) */ { if (picture->frame_pred_dct) { var = variance(ssmb.mb,16,width); /* forward */ fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb, width,i,j,mc->sxf,mc->syf, 16,width,height, &framef_mc ); framef_mc.fieldoff = 0; vmcf = framef_mc.var; /* backward */ fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb, width,i,j,mc->sxb,mc->syb, 16,width,height, &frameb_mc); frameb_mc.fieldoff = 0; vmcr = frameb_mc.var; /* interpolated (bidirectional) */ vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 ); /* decisions */ /* select between forward/backward/interpolated prediction: * use the one with smallest mean sqaured prediction error */ if (vmcf<=vmcr && vmcf<=vmci) { vmc = vmcf; mbi->mb_type = MB_FORWARD; } else if (vmcr<=vmci) { vmc = vmcr; mbi->mb_type = MB_BACKWARD; } else { vmc = vmci; mbi->mb_type = MB_FORWARD|MB_BACKWARD; } mbi->motion_type = MC_FRAME; } else { /* forward prediction */ frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb, i,j,mc->sxf,mc->syf, &framef_mc, &topfldf_mc, &botfldf_mc, imins,jmins); /* backward prediction */ frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb, i,j,mc->sxb,mc->syb, &frameb_mc, &topfldb_mc, &botfldb_mc, imins,jmins); vmcf = framef_mc.var; vmcr = frameb_mc.var; vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 ); vmcfieldf = topfldf_mc.var + botfldf_mc.var; vmcfieldr = topfldb_mc.var + botfldb_mc.var; vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb, width<<1, 8) + bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb, width<<1, 8); /* select prediction type of minimum distance from the * six candidates (field/frame * forward/backward/interpolated) */ if (vmcimb_type = MB_FORWARD|MB_BACKWARD; mbi->motion_type = MC_FRAME; vmc = vmci; } else if ( vmcfieldimb_type = MB_FORWARD|MB_BACKWARD; mbi->motion_type = MC_FIELD; vmc = vmcfieldi; } else if (vmcfmb_type = MB_FORWARD; mbi->motion_type = MC_FRAME; vmc = vmcf; } else if ( vmcfieldfmb_type = MB_FORWARD; mbi->motion_type = MC_FIELD; vmc = vmcfieldf; } else if (vmcrmb_type = MB_BACKWARD; mbi->motion_type = MC_FRAME; vmc = vmcr; } else { /* field, backward */ mbi->mb_type = MB_BACKWARD; mbi->motion_type = MC_FIELD; vmc = vmcfieldr; } } /* select between intra or non-intra coding: * * selection is based on intra block variance (var) vs. * prediction error variance (vmc) * * Used to be: blocks with small prediction error are always * coded non-intra even if variance is smaller (is this reasonable? * * TODO: A.Stevens Jul 2000 * The bbmpeg guys have found this to be *unreasonable*. * I'm not sure I buy their solution using vmc*2 in the first comparison. * It is probabably the vmc>= 9*256 test that is suspect. * */ if (vmc>var && vmc>=9*256) mbi->mb_type = MB_INTRA; else { var = vmc; if (mbi->motion_type==MC_FRAME) { /* forward */ mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1); mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1); /* backward */ mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1); mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1); } else { /* these are FRAME vectors */ /* forward */ mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1); mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1); mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1); mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1); mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel; mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel; /* backward */ mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1); mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1); mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1); mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1); mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel; mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel; } } } } /* * motion estimation for field pictures * * mbi: pointer to macroblock info structure * secondfield: indicates second field of a frame (in P fields this means * that reference field of opposite parity is in curref instead * of oldref) * ipflag: indicates a P type field which is the second field of a frame * in which the first field is I type (this restricts predictions * to be based only on the opposite parity (=I) field) * * results: * mbi-> * mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD * MV[][][]: motion vectors (field format) * mv_field_sel: top/bottom field * motion_type: MC_FIELD, MC_16X8 * * uses global vars: pict_type, pict_struct */ static void field_ME(motion_engine_t *engine, pict_data_s *picture, motion_comp_s *mc, int mb_row_start, int i, int j, mbinfo_s *mbi, int secondfield, int ipflag) { int w2; uint8_t *toporg, *topref, *botorg, *botref; subsampled_mb_s ssmb; mb_motion_s fields_mc, dualp_mc; mb_motion_s fieldf_mc, fieldb_mc; mb_motion_s field8uf_mc, field8lf_mc; mb_motion_s field8ub_mc, field8lb_mc; int var,vmc,v0,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i; int imins,jmins; int dmc8f,dmc8r; // int vmc_dp,dmc_dp,dmc; imins = 0; jmins = 0; memset(&dualp_mc,0,sizeof(dualp_mc)); w2 = width<<1; /* Fast motion data sits at the end of the luminance buffer */ ssmb.mb = mc->cur[0] + i + w2*j; ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1)); ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1)); ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1)); ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2); if (picture->pict_struct==BOTTOM_FIELD) { ssmb.mb += width; ssmb.umb += (width >> 1); ssmb.vmb += (width >> 1); ssmb.fmb += (width >> 1); ssmb.qmb += (width >> 2); } var = variance(ssmb.mb,16,w2) + ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2; if (picture->pict_type==I_TYPE) mbi->mb_type = MB_INTRA; else if (picture->pict_type==P_TYPE) { toporg = mc->oldorg[0]; topref = mc->oldref[0]; botorg = mc->oldorg[0] + width; botref = mc->oldref[0] + width; if (secondfield) { /* opposite parity field is in same frame */ if (picture->pict_struct==TOP_FIELD) { /* current is top field */ botorg = mc->cur[0] + width; botref = mc->curref[0] + width; } else { /* current is bottom field */ toporg = mc->cur[0]; topref = mc->curref[0]; } } field_estimate(engine, picture, toporg,topref,botorg,botref,&ssmb, i,j,mc->sxf,mc->syf,ipflag, &fieldf_mc, &field8uf_mc, &field8lf_mc, &fields_mc); dmcfield = fieldf_mc.sad; dmc8f = field8uf_mc.sad + field8lf_mc.sad; // if (M==1 && !ipflag) /* generic condition which permits Dual Prime */ // { // dpfield_estimate(engine, // picture, // topref,botref,ssmb.mb,i,j,imins,jmins, // &dualp_mc, // &vmc_dp); // dmc_dp = dualp_mc.sad; // } // /* select between dual prime, field and 16x8 prediction */ // if (M==1 && !ipflag && dmc_dpmotion_type = MC_DMV; // dmc = dualp_mc.sad; // vmc = vmc_dp; // // } // else if (dmc8fmotion_type = MC_16X8; /* upper and lower half blocks */ vmc = unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8); vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8); } else { /* field prediction */ mbi->motion_type = MC_FIELD; vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 ); } /* select between intra and non-intra coding */ if ( vmc>var && vmc>=9*256) mbi->mb_type = MB_INTRA; else { /* zero MV field prediction from same parity ref. field * (not allowed if ipflag is set) */ if (!ipflag) v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j, ssmb.mb,w2,0,0,16); if (ipflag || (4*v0>5*vmc && v0>=9*256)) { var = vmc; mbi->mb_type = MB_FORWARD; if (mbi->motion_type==MC_FIELD) { mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1); mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1); mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel; } else if (mbi->motion_type==MC_DMV) { /* same parity vector */ mbi->MV[0][0][0] = imins - (i<<1); mbi->MV[0][0][1] = jmins - (j<<1); /* opposite parity vector */ mbi->dmvector[0] = dualp_mc.pos.x; mbi->dmvector[1] = dualp_mc.pos.y; } else { mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1); mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1); mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1); mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1); mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel; mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel; } } else { /* No MC */ var = v0; mbi->mb_type = 0; mbi->motion_type = MC_FIELD; mbi->MV[0][0][0] = 0; mbi->MV[0][0][1] = 0; mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD); } } } else /* if (pict_type==B_TYPE) */ { /* forward prediction */ field_estimate(engine, picture, mc->oldorg[0],mc->oldref[0], mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb, i,j,mc->sxf,mc->syf,0, &fieldf_mc, &field8uf_mc, &field8lf_mc, &fields_mc); dmcfieldf = fieldf_mc.sad; dmc8f = field8uf_mc.sad + field8lf_mc.sad; /* backward prediction */ field_estimate(engine, picture, mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width, &ssmb, i,j,mc->sxb,mc->syb,0, &fieldb_mc, &field8ub_mc, &field8lb_mc, &fields_mc); dmcfieldr = fieldb_mc.sad; dmc8r = field8ub_mc.sad + field8lb_mc.sad; /* calculate distances for bidirectional prediction */ /* field */ dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16); /* 16x8 upper and lower half blocks */ dmc8i = bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 ); dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 ); /* select prediction type of minimum distance */ if (dmcfieldimb_type = MB_FORWARD|MB_BACKWARD; mbi->motion_type = MC_FIELD; vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16); } else if (dmc8imb_type = MB_FORWARD|MB_BACKWARD; mbi->motion_type = MC_16X8; /* upper and lower half blocks */ vmc = bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8); vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8); } else if (dmcfieldfmb_type = MB_FORWARD; mbi->motion_type = MC_FIELD; vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16); } else if (dmc8fmb_type = MB_FORWARD; mbi->motion_type = MC_16X8; /* upper and lower half blocks */ vmc = unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8); vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8); } else if (dmcfieldrmb_type = MB_BACKWARD; mbi->motion_type = MC_FIELD; vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 ); } else { /* 16x8, backward */ mbi->mb_type = MB_BACKWARD; mbi->motion_type = MC_16X8; /* upper and lower half blocks */ vmc = unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8); vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8); } /* select between intra and non-intra coding */ if (vmc>var && vmc>=9*256) mbi->mb_type = MB_INTRA; else { var = vmc; if (mbi->motion_type==MC_FIELD) { /* forward */ mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1); mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1); mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel; /* backward */ mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1); mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1); mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel; } else /* MC_16X8 */ { /* forward */ mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1); mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1); mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel; mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1); mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1); mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel; /* backward */ mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1); mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1); mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel; mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1); mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1); mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel; } } } } /* Initialise motion compensation - currently purely selection of which versions of the various low level computation routines to use */ void init_motion() { int cpucap = cpu_accel(); if( cpucap == 0 ) /* No MMX/SSE etc support available */ { pdist22 = dist22; pdist44 = dist44; pdist1_00 = dist1_00; pdist1_01 = dist1_01; pdist1_10 = dist1_10; pdist1_11 = dist1_11; pbdist1 = bdist1; pdist2 = dist2; pbdist2 = bdist2; pdist2_22 = dist2_22; pbdist2_22 = bdist2_22; pfind_best_one_pel = find_best_one_pel; pbuild_sub22_mcomps = build_sub22_mcomps; } #ifdef X86_CPU else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */ { if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n"); pdist22 = dist22_mmxe; pdist44 = dist44_mmxe; pdist1_00 = dist1_00_mmxe; pdist1_01 = dist1_01_mmxe; pdist1_10 = dist1_10_mmxe; pdist1_11 = dist1_11_mmxe; pbdist1 = bdist1_mmx; pdist2 = dist2_mmx; pbdist2 = bdist2_mmx; pdist2_22 = dist2_22_mmx; pbdist2_22 = bdist2_22_mmx; pfind_best_one_pel = find_best_one_pel_mmxe; pbuild_sub22_mcomps = build_sub22_mcomps_mmxe; pmblock_sub44_dists = mblock_sub44_dists_mmxe; } else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */ { if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n"); pdist22 = dist22_mmx; pdist44 = dist44_mmx; pdist1_00 = dist1_00_mmx; pdist1_01 = dist1_01_mmx; pdist1_10 = dist1_10_mmx; pdist1_11 = dist1_11_mmx; pbdist1 = bdist1_mmx; pdist2 = dist2_mmx; pbdist2 = bdist2_mmx; pdist2_22 = dist2_22_mmx; pbdist2_22 = bdist2_22_mmx; pfind_best_one_pel = find_best_one_pel; pbuild_sub22_mcomps = build_sub22_mcomps; pmblock_sub44_dists = mblock_sub44_dists_mmx; } #endif } void motion_engine_loop(motion_engine_t *engine) { while(!engine->done) { pthread_mutex_lock(&(engine->input_lock)); if(!engine->done) { pict_data_s *picture = engine->pict_data; motion_comp_s *mc_data = engine->motion_comp; int secondfield = engine->secondfield; int ipflag = engine->ipflag; mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16); int i, j; int mb_row_incr; /* Offset increment to go down 1 row of mb's */ int mb_row_start; if (picture->pict_struct == FRAME_PICTURE) { mb_row_incr = 16*width; mb_row_start = engine->start_row / 16 * mb_row_incr; /* loop through all macroblocks of a frame picture */ for (j=engine->start_row; j < engine->end_row; j+=16) { for (i=0; istart_row / 16 * mb_row_incr; /* loop through all macroblocks of a field picture */ for (j=engine->start_row; j < engine->end_row; j+=16) { for (i=0; ioutput_lock)); } } void motion_estimation(pict_data_s *picture, motion_comp_s *mc_data, int secondfield, int ipflag) { int i; /* Start loop */ for(i = 0; i < processors; i++) { motion_engines[i].motion_comp = mc_data; motion_engines[i].pict_data = picture; motion_engines[i].secondfield = secondfield; motion_engines[i].ipflag = ipflag; pthread_mutex_unlock(&(motion_engines[i].input_lock)); } /* Wait for completion */ for(i = 0; i < processors; i++) { pthread_mutex_lock(&(motion_engines[i].output_lock)); } } void start_motion_engines() { int i; int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5); int current_row = 0; pthread_attr_t attr; pthread_mutexattr_t mutex_attr; pthread_mutexattr_init(&mutex_attr); pthread_attr_init(&attr); motion_engines = calloc(1, sizeof(motion_engine_t) * processors); for(i = 0; i < processors; i++) { motion_engines[i].start_row = current_row * 16; current_row += rows_per_processor; if(current_row > height2 / 16) current_row = height2 / 16; motion_engines[i].end_row = current_row * 16; pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr); pthread_mutex_lock(&(motion_engines[i].input_lock)); pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr); pthread_mutex_lock(&(motion_engines[i].output_lock)); motion_engines[i].done = 0; pthread_create(&(motion_engines[i].tid), &attr, (void*)motion_engine_loop, &motion_engines[i]); } } void stop_motion_engines() { int i; for(i = 0; i < processors; i++) { motion_engines[i].done = 1; pthread_mutex_unlock(&(motion_engines[i].input_lock)); pthread_join(motion_engines[i].tid, 0); pthread_mutex_destroy(&(motion_engines[i].input_lock)); pthread_mutex_destroy(&(motion_engines[i].output_lock)); } free(motion_engines); }