1 /* motion.c, motion estimation */
3 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
6 * Disclaimer of Warranty
8 * These software programs are available to the user without any license fee or
9 * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
10 * any and all warranties, whether express, implied, or statuary, including any
11 * implied warranties or merchantability or of fitness for a particular
12 * purpose. In no event shall the copyright-holder be liable for any
13 * incidental, punitive, or consequential damages of any kind whatsoever
14 * arising from the use of these programs.
16 * This disclaimer of warranty extends to the user of these programs and user's
17 * customers, employees, agents, transferees, successors, and assigns.
19 * The MPEG Software Simulation Group does not represent or warrant that the
20 * programs furnished hereunder are free of infringement of any third-party
23 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
24 * are subject to royalty fees to patent holders. Many of these patents are
25 * general enough such that they are unavoidable regardless of implementation
36 #include "cpu_accel.h"
39 /* Macro-block Motion compensation results record */
41 typedef struct _blockcrd {
48 blockxy pos; // Half-pel co-ordinates of source block
49 int sad; // Sum of absolute difference
51 uint8_t *blk ; // Source block data (in luminace data array)
52 int hx, hy; // Half-pel offsets
53 int fieldsel; // 0 = top 1 = bottom
54 int fieldoff; // Offset from start of frame data to first line
55 // of field. (top = 0, bottom = width );
58 typedef struct mb_motion mb_motion_s;
63 uint8_t *mb; // One pel
64 uint8_t *fmb; // Two-pel subsampled
65 uint8_t *qmb; // Four-pel subsampled
66 uint8_t *umb; // U compoenent one-pel
67 uint8_t *vmb; // V component one-pel
70 typedef struct subsampled_mb subsampled_mb_s;
73 static void frame_ME (motion_engine_t *engine,
77 int i, int j, struct mbinfo *mbi);
79 static void field_ME (motion_engine_t *engine,
88 static void frame_estimate (motion_engine_t *engine,
91 subsampled_mb_s *ssmb,
97 int imins[2][2], int jmins[2][2]);
99 static void field_estimate (motion_engine_t *engine,
100 pict_data_s *picture,
105 subsampled_mb_s *ssmb,
106 int i, int j, int sx, int sy, int ipflag,
110 mb_motion_s *bestsp);
112 static void dpframe_estimate (motion_engine_t *engine,
113 pict_data_s *picture,
115 subsampled_mb_s *ssmb,
116 int i, int j, int iminf[2][2], int jminf[2][2],
118 int *imindmvp, int *jmindmvp,
121 static void dpfield_estimate (motion_engine_t *engine,
122 pict_data_s *picture,
127 int imins, int jmins,
131 static void fullsearch (motion_engine_t *engine,
132 uint8_t *org, uint8_t *ref,
133 subsampled_mb_s *ssblk,
134 int lx, int i0, int j0,
135 int sx, int sy, int h,
137 mb_motion_s *motion );
139 static void find_best_one_pel( motion_engine_t *engine,
140 uint8_t *org, uint8_t *blk,
148 static int build_sub22_mcomps( motion_engine_t *engine,
149 int i0, int j0, int ihigh, int jhigh,
151 uint8_t *s22org, uint8_t *s22blk,
152 int flx, int fh, int searched_sub44_size );
155 static void find_best_one_pel_mmxe( motion_engine_t *engine,
156 uint8_t *org, uint8_t *blk,
165 static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0, int j0, int ihigh, int jhigh,
167 uint8_t *s22org, uint8_t *s22blk,
168 int flx, int fh, int searched_sub44_size );
169 static int (*pmblock_sub44_dists)( uint8_t *blk, uint8_t *ref,
171 int ihigh, int jhigh,
172 int h, int rowstride,
174 mc_result_s *resvec);
177 static int unidir_pred_var( const mb_motion_s *motion,
178 uint8_t *mb, int lx, int h);
179 static int bidir_pred_var( const mb_motion_s *motion_f,
180 const mb_motion_s *motion_b,
181 uint8_t *mb, int lx, int h);
182 static int bidir_pred_sad( const mb_motion_s *motion_f,
183 const mb_motion_s *motion_b,
184 uint8_t *mb, int lx, int h);
186 static int variance( uint8_t *mb, int size, int lx);
188 static int dist22 ( uint8_t *blk1, uint8_t *blk2, int qlx, int qh);
190 static int dist44 ( uint8_t *blk1, uint8_t *blk2, int flx, int fh);
191 static int dist2_22( uint8_t *blk1, uint8_t *blk2,
193 static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b,
198 static void (*pfind_best_one_pel)( motion_engine_t *engine,
199 uint8_t *org, uint8_t *blk,
207 static int (*pbuild_sub22_mcomps)( motion_engine_t *engine,
208 int i0, int j0, int ihigh, int jhigh,
210 uint8_t *s22org, uint8_t *s22blk,
211 int flx, int fh, int searched_sub44_size );
213 static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2,
215 static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b,
219 static int dist1_00( uint8_t *blk1, uint8_t *blk2, int lx, int h, int distlim);
220 static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h);
221 static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h);
222 static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h);
223 static int dist2 (uint8_t *blk1, uint8_t *blk2,
224 int lx, int hx, int hy, int h);
225 static int bdist2 (uint8_t *pf, uint8_t *pb,
226 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
227 static int bdist1 (uint8_t *pf, uint8_t *pb,
228 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
231 static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2, int flx, int fh);
232 static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2, int qlx, int qh);
233 static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2, int lx, int h, int distlim);
234 static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
235 static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
236 static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
238 static int (*pdist2) (uint8_t *blk1, uint8_t *blk2,
239 int lx, int hx, int hy, int h);
242 static int (*pbdist2) (uint8_t *pf, uint8_t *pb,
243 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
245 static int (*pbdist1) (uint8_t *pf, uint8_t *pb,
246 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
250 static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
253 if( motion->sad < 0 || motion->sad > 0x10000 )
255 printf( "SAD ooops %s\n", str );
258 if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
260 printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
263 if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
265 printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
270 static void init_mc( mb_motion_s *motion )
273 motion->pos.x = -1000;
274 motion->pos.y = -1000;
279 Compute the combined variance of luminance and chrominance information
280 for a particular non-intra macro block after unidirectional
281 motion compensation...
283 Note: results are scaled to give chrominance equal weight to
284 chrominance. The variance of the luminance portion is computed
285 at the time the motion compensation is computed.
287 TODO: Perhaps we should compute the whole thing in fullsearch not
288 seperate it. However, that would involve a lot of fiddling with
289 field_* and until its thoroughly debugged and tested I think I'll
290 leave that alone. Furthermore, it is unclear if its really worth
291 doing these computations for B *and* P frames.
293 TODO: BUG: ONLY works for 420 video...
297 static int unidir_chrom_var_sum( mb_motion_s *lum_mc,
299 subsampled_mb_s *ssblk,
304 /* N.b. MC co-ordinates are computed in half-pel units! */
305 int cblkoffset = (lum_mc->fieldoff>>1) +
306 (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx;
308 return ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) +
309 (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2;
314 Compute the combined variance of luminance and chrominance information
315 for a particular non-intra macro block after unidirectional
316 motion compensation...
318 Note: results are scaled to give chrominance equal weight to
319 chrominance. The variance of the luminance portion is computed
320 at the time the motion compensation is computed.
322 Note: results scaled to give chrominance equal weight to chrominance.
324 TODO: BUG: ONLY works for 420 video...
326 NOTE: Currently unused but may be required if it turns out that taking
327 chrominance into account in B frames is needed.
331 int bidir_chrom_var_sum( mb_motion_s *lum_mc_f,
332 mb_motion_s *lum_mc_b,
335 subsampled_mb_s *ssblk,
340 /* N.b. MC co-ordinates are computed in half-pel units! */
341 int cblkoffset_f = (lum_mc_f->fieldoff>>1) +
342 (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
343 int cblkoffset_b = (lum_mc_b->fieldoff>>1) +
344 (lum_mc_b->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
347 (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b,
348 ssblk->umb, uvlx, uvh ) +
349 (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b,
350 ssblk->vmb, uvlx, uvh ))*2;
354 static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx )
356 return (variance(ssblk->umb,(h>>1),(lx>>1)) +
357 variance(ssblk->vmb,(h>>1),(lx>>1))) * 2;
361 * frame picture motion estimation
363 * org: top left pel of source reference frame
364 * ref: top left pel of reconstructed reference frame
365 * ssmb: macroblock to be matched
366 * i,j: location of mb relative to ref (=center of search window)
367 * sx,sy: half widths of search window
368 * besfr: location and SAD of best frame prediction
369 * besttop: location of best field pred. for top field of mb
370 * bestbo : location of best field pred. for bottom field of mb
373 static void frame_estimate(motion_engine_t *engine,
376 subsampled_mb_s *ssmb,
377 int i, int j, int sx, int sy,
379 mb_motion_s *besttop,
380 mb_motion_s *bestbot,
385 subsampled_mb_s botssmb;
386 mb_motion_s topfld_mc;
387 mb_motion_s botfld_mc;
389 botssmb.mb = ssmb->mb+width;
390 botssmb.fmb = ssmb->mb+(width>>1);
391 botssmb.qmb = ssmb->qmb+(width>>2);
392 botssmb.umb = ssmb->umb+(width>>1);
393 botssmb.vmb = ssmb->vmb+(width>>1);
395 /* frame prediction */
396 fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height,
398 bestfr->fieldsel = 0;
399 bestfr->fieldoff = 0;
401 /* predict top field from top field */
402 fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
405 /* predict top field from bottom field */
406 fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8,
407 width,height>>1, &botfld_mc);
409 /* set correct field selectors... */
410 topfld_mc.fieldsel = 0;
411 botfld_mc.fieldsel = 1;
412 topfld_mc.fieldoff = 0;
413 botfld_mc.fieldoff = width;
415 imins[0][0] = topfld_mc.pos.x;
416 jmins[0][0] = topfld_mc.pos.y;
417 imins[1][0] = botfld_mc.pos.x;
418 jmins[1][0] = botfld_mc.pos.y;
420 /* select prediction for top field */
421 if (topfld_mc.sad<=botfld_mc.sad)
423 *besttop = topfld_mc;
427 *besttop = botfld_mc;
430 /* predict bottom field from top field */
431 fullsearch(engine, org,ref,&botssmb,
432 width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
435 /* predict bottom field from bottom field */
436 fullsearch(engine, org+width,ref+width,&botssmb,
437 width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
440 /* set correct field selectors... */
441 topfld_mc.fieldsel = 0;
442 botfld_mc.fieldsel = 1;
443 topfld_mc.fieldoff = 0;
444 botfld_mc.fieldoff = width;
446 imins[0][1] = topfld_mc.pos.x;
447 jmins[0][1] = topfld_mc.pos.y;
448 imins[1][1] = botfld_mc.pos.x;
449 jmins[1][1] = botfld_mc.pos.y;
451 /* select prediction for bottom field */
452 if (botfld_mc.sad<=topfld_mc.sad)
454 *bestbot = botfld_mc;
458 *bestbot = topfld_mc;
463 * field picture motion estimation subroutine
465 * toporg: address of original top reference field
466 * topref: address of reconstructed top reference field
467 * botorg: address of original bottom reference field
468 * botref: address of reconstructed bottom reference field
469 * ssmmb: macroblock to be matched
470 * i,j: location of mb (=center of search window)
471 * sx,sy: half width/height of search window
473 * bestfld: location and distance of best field prediction
474 * best8u: location of best 16x8 pred. for upper half of mb
475 * best8lp: location of best 16x8 pred. for lower half of mb
476 * bdestsp: location and distance of best same parity field
477 * prediction (needed for dual prime, only valid if
481 static void field_estimate (motion_engine_t *engine,
482 pict_data_s *picture,
487 subsampled_mb_s *ssmb,
488 int i, int j, int sx, int sy, int ipflag,
489 mb_motion_s *bestfld,
495 mb_motion_s topfld_mc;
496 mb_motion_s botfld_mc;
499 subsampled_mb_s botssmb;
501 botssmb.mb = ssmb->mb+width;
502 botssmb.umb = ssmb->umb+(width>>1);
503 botssmb.vmb = ssmb->vmb+(width>>1);
504 botssmb.fmb = ssmb->fmb+(width>>1);
505 botssmb.qmb = ssmb->qmb+(width>>2);
507 /* if ipflag is set, predict from field of opposite parity only */
508 notop = ipflag && (picture->pict_struct==TOP_FIELD);
509 nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD);
511 /* field prediction */
513 /* predict current field from top field */
515 topfld_mc.sad = dt = 65536; /* infinity */
517 fullsearch(engine, toporg,topref,ssmb,width<<1,
518 i,j,sx,sy>>1,16,width,height>>1,
521 /* predict current field from bottom field */
523 botfld_mc.sad = db = 65536; /* infinity */
525 fullsearch(engine, botorg,botref,ssmb,width<<1,
526 i,j,sx,sy>>1,16,width,height>>1,
529 /* Set correct field selectors */
530 topfld_mc.fieldsel = 0;
531 botfld_mc.fieldsel = 1;
532 topfld_mc.fieldoff = 0;
533 botfld_mc.fieldoff = width;
535 /* same parity prediction (only valid if ipflag==0) */
536 if (picture->pict_struct==TOP_FIELD)
545 /* select field prediction */
548 *bestfld = topfld_mc;
552 *bestfld = botfld_mc;
556 /* 16x8 motion compensation */
558 /* predict upper half field from top field */
560 topfld_mc.sad = dt = 65536;
562 fullsearch(engine, toporg,topref,ssmb,width<<1,
563 i,j,sx,sy>>1,8,width,height>>1,
566 /* predict upper half field from bottom field */
568 botfld_mc.sad = db = 65536;
570 fullsearch(engine, botorg,botref,ssmb,width<<1,
571 i,j,sx,sy>>1,8,width,height>>1,
575 /* Set correct field selectors */
576 topfld_mc.fieldsel = 0;
577 botfld_mc.fieldsel = 1;
578 topfld_mc.fieldoff = 0;
579 botfld_mc.fieldoff = width;
581 /* select prediction for upper half field */
591 /* predict lower half field from top field */
593 N.b. For interlaced data width<<4 (width*16) takes us 8 rows
594 down in the same field.
595 Thus for the fast motion data (2*2
596 sub-sampled) we need to go 4 rows down in the same field.
597 This requires adding width*4 = (width<<2).
598 For the 4*4 sub-sampled motion data we need to go down 2 rows.
599 This requires adding width = width
603 topfld_mc.sad = dt = 65536;
605 fullsearch(engine, toporg,topref,&botssmb,
607 i,j+8,sx,sy>>1,8,width,height>>1,
608 /* &imint,&jmint, &dt,*/ &topfld_mc);
610 /* predict lower half field from bottom field */
612 botfld_mc.sad = db = 65536;
614 fullsearch(engine, botorg,botref,&botssmb,width<<1,
615 i,j+8,sx,sy>>1,8,width,height>>1,
616 /* &iminb,&jminb, &db,*/ &botfld_mc);
618 /* Set correct field selectors */
619 topfld_mc.fieldsel = 0;
620 botfld_mc.fieldsel = 1;
621 topfld_mc.fieldoff = 0;
622 botfld_mc.fieldoff = width;
624 /* select prediction for lower half field */
635 static void dpframe_estimate (motion_engine_t *engine,
636 pict_data_s *picture,
638 subsampled_mb_s *ssmb,
640 int i, int j, int iminf[2][2], int jminf[2][2],
641 mb_motion_s *best_mc,
642 int *imindmvp, int *jmindmvp,
645 int pref,ppred,delta_x,delta_y;
646 int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0;
647 int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv;
650 /* Calculate Dual Prime distortions for 9 delta candidates
651 * for each of the four minimum field vectors
652 * Note: only for P pictures!
655 /* initialize minimum dual prime distortion to large value */
658 for (pref=0; pref<2; pref++)
660 for (ppred=0; ppred<2; ppred++)
662 /* convert Cartesian absolute to relative motion vector
663 * values (wrt current macroblock address (i,j)
665 is = iminf[pref][ppred] - (i<<1);
666 js = jminf[pref][ppred] - (j<<1);
670 /* vertical field shift adjustment */
676 /* mvxs and mvys scaling*/
679 if (picture->topfirst == ppred)
681 /* second field: scale by 1/3 */
682 is = (is>=0) ? (is+1)/3 : -((-is+1)/3);
683 js = (js>=0) ? (js+1)/3 : -((-js+1)/3);
689 /* vector for prediction from field of opposite 'parity' */
690 if (picture->topfirst)
692 /* vector for prediction of top field from bottom field */
693 it0 = ((is+(is>0))>>1);
694 jt0 = ((js+(js>0))>>1) - 1;
696 /* vector for prediction of bottom field from top field */
697 ib0 = ((3*is+(is>0))>>1);
698 jb0 = ((3*js+(js>0))>>1) + 1;
702 /* vector for prediction of top field from bottom field */
703 it0 = ((3*is+(is>0))>>1);
704 jt0 = ((3*js+(js>0))>>1) - 1;
706 /* vector for prediction of bottom field from top field */
707 ib0 = ((is+(is>0))>>1);
708 jb0 = ((js+(js>0))>>1) + 1;
711 /* convert back to absolute half-pel field picture coordinates */
719 if (is >= 0 && is <= (width-16)<<1 &&
720 js >= 0 && js <= (height-16))
722 for (delta_y=-1; delta_y<=1; delta_y++)
724 for (delta_x=-1; delta_x<=1; delta_x++)
726 /* opposite field coordinates */
732 if (it >= 0 && it <= (width-16)<<1 &&
733 jt >= 0 && jt <= (height-16) &&
734 ib >= 0 && ib <= (width-16)<<1 &&
735 jb >= 0 && jb <= (height-16))
737 /* compute prediction error */
738 local_dist = (*pbdist2)(
739 ref + (is>>1) + (width<<1)*(js>>1),
740 ref + width + (it>>1) + (width<<1)*(jt>>1),
741 ssmb->mb, /* current mb location */
742 width<<1, /* adjacent line distance */
743 is&1, js&1, it&1, jt&1, /* half-pel flags */
744 8); /* block height */
745 local_dist += (*pbdist2)(
746 ref + width + (is>>1) + (width<<1)*(js>>1),
747 ref + (ib>>1) + (width<<1)*(jb>>1),
748 ssmb->mb + width, /* current mb location */
749 width<<1, /* adjacent line distance */
750 is&1, js&1, ib&1, jb&1, /* half-pel flags */
751 8); /* block height */
753 /* update delta with least distortion vector */
754 if (local_dist < vmc)
767 } /* end delta x loop */
768 } /* end delta y loop */
773 /* TODO: This is now likely to be obsolete... */
774 /* Compute L1 error for decision purposes */
775 local_dist = (*pbdist1)(
776 ref + (imins>>1) + (width<<1)*(jmins>>1),
777 ref + width + (imint>>1) + (width<<1)*(jmint>>1),
780 imins&1, jmins&1, imint&1, jmint&1,
782 //printf("motion 1 %p\n", pbdist1);
783 local_dist += (*pbdist1)(
784 ref + width + (imins>>1) + (width<<1)*(jmins>>1),
785 ref + (iminb>>1) + (width<<1)*(jminb>>1),
788 imins&1, jmins&1, iminb&1, jminb&1,
790 //printf("motion 2\n");
792 best_mc->sad = local_dist;
793 best_mc->pos.x = imins;
794 best_mc->pos.y = jmins;
798 //printf("motion 2\n");
801 static void dpfield_estimate(motion_engine_t *engine,
802 pict_data_s *picture,
806 int i, int j, int imins, int jmins,
807 mb_motion_s *bestdp_mc,
812 uint8_t *sameref, *oppref;
813 int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0;
814 int imino,jmino,imindmv,vmc_dp,local_dist;
816 /* Calculate Dual Prime distortions for 9 delta candidates */
817 /* Note: only for P pictures! */
819 /* Assign opposite and same reference pointer */
820 if (picture->pict_struct==TOP_FIELD)
831 /* convert Cartesian absolute to relative motion vector
832 * values (wrt current macroblock address (i,j)
834 mvxs = imins - (i<<1);
835 mvys = jmins - (j<<1);
837 /* vector for prediction from field of opposite 'parity' */
838 mvxo0 = (mvxs+(mvxs>0)) >> 1; /* mvxs / / */
839 mvyo0 = (mvys+(mvys>0)) >> 1; /* mvys / / 2*/
841 /* vertical field shift correction */
842 if (picture->pict_struct==TOP_FIELD)
847 /* convert back to absolute coordinates */
848 io0 = mvxo0 + (i<<1);
849 jo0 = mvyo0 + (j<<1);
851 /* initialize minimum dual prime distortion to large value */
854 for (delta_y = -1; delta_y <= 1; delta_y++)
856 for (delta_x = -1; delta_x <=1; delta_x++)
858 /* opposite field coordinates */
862 if (io >= 0 && io <= (width-16)<<1 &&
863 jo >= 0 && jo <= (height2-16)<<1)
865 /* compute prediction error */
866 local_dist = (*pbdist2)(
867 sameref + (imins>>1) + width2*(jmins>>1),
868 oppref + (io>>1) + width2*(jo>>1),
869 mb, /* current mb location */
870 width2, /* adjacent line distance */
871 imins&1, jmins&1, io&1, jo&1, /* half-pel flags */
872 16); /* block height */
874 /* update delta with least distortion vector */
875 if (local_dist < vmc_dp)
883 } /* end delta x loop */
884 } /* end delta y loop */
886 /* Compute L1 error for decision purposes */
889 sameref + (imins>>1) + width2*(jmins>>1),
890 oppref + (imino>>1) + width2*(jmino>>1),
891 mb, /* current mb location */
892 width2, /* adjacent line distance */
893 imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */
894 16); /* block height */
896 bestdp_mc->pos.x = imindmv;
897 bestdp_mc->pos.x = imindmv;
903 * Vectors of motion compensations
907 Take a vector of motion compensations and repeatedly make passes
908 discarding all elements whose sad "weight" is above the current mean weight.
911 static void sub_mean_reduction( mc_result_s *matches, int len,
913 int *newlen_res, int *minweight_res)
920 *minweight_res = 100000;
928 for( i = 0; i < len ; ++i )
929 weight_sum += matches[i].weight;
930 mean_weight = weight_sum / len;
936 for( i =0; i < len; ++i )
938 if( matches[i].weight <= mean_weight )
940 matches[j] = matches[i];
948 *minweight_res = mean_weight;
952 Build a vector of the top 4*4 sub-sampled motion compensations in the box
953 (ilow,jlow) to (ihigh,jhigh).
955 The algorithm is as follows:
956 1. coarse matches on an 8*8 grid of positions are collected that fall below
957 a (very conservative) sad threshold (basically 50% more than moving average of
958 the mean sad of such matches).
960 2. The worse than-average matches are discarded.
962 3. The remaining coarse matches are expanded with the left/lower neighbouring
963 4*4 grid matches. Again only those better than a threshold (this time the mean
964 of the 8*8 grid matches are retained.
966 4. Multiple passes are made discarding worse than-average matches. The number of
967 passes is specified by the user. The default it 2 (leaving roughly 1/4 of the matches).
969 The net result is very fast and find good matches if they're to be found. I.e. the
970 penalty over exhaustive search is pretty low.
972 NOTE: The "discard below average" trick depends critically on having some variation in
973 the matches. The slight penalty imposed for distant matches (reasonably since the
974 motion vectors have to be encoded) is *vital* as otherwise pathologically bad
975 performance results on highly uniform images.
977 TODO: We should probably allow the user to eliminate the initial thinning of 8*8
978 grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
983 static int build_sub44_mcomps(motion_engine_t *engine,
984 int ilow, int jlow, int ihigh, int jhigh,
987 uint8_t *s44org, uint8_t *s44blk, int qlx, int qh )
999 /*int rangex, rangey;
1000 static int rough_num_mcomps;
1001 static mc_result_s rough_mcomps[MAX_44_MATCHES];
1007 uint8_t *old_s44orgblk;
1009 /* N.b. we may ignore the right hand block of the pair going over the
1010 right edge as we have carefully allocated the buffer oversize to ensure
1011 no memory faults. The later motion compensation calculations
1012 performed on the results of this pass will filter out
1013 out-of-range blocks...
1017 engine->sub44_num_mcomps = 0;
1019 threshold = 6*null_mc_sad / (4*4*mc_44_red);
1020 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1022 /* Exhaustive search on 4*4 sub-sampled data. This is affordable because
1023 (a) it is only 16th of the size of the real 1-pel data
1024 (b) we ignore those matches with an sad above our threshold.
1028 /* Invariant: s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
1029 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1030 for( j = jstrt; j <= jend; j += 4 )
1032 old_s44orgblk = s44orgblk;
1033 for( i = istrt; i <= iend; i += 4 )
1035 s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0);
1036 if( s1 < threshold )
1038 engine->sub44_mcomps[engine->sub44_num_mcomps].x = i;
1039 engine->sub44_mcomps[engine->sub44_num_mcomps].y = j;
1040 engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ;
1041 ++engine->sub44_num_mcomps;
1045 s44orgblk = old_s44orgblk + qlx;
1050 engine->sub44_num_mcomps
1051 = (*pmblock_sub44_dists)( s44orgblk, s44blk,
1056 engine->sub44_mcomps);
1059 /* If we're really pushing quality we reduce once otherwise twice. */
1061 sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1),
1062 &engine->sub44_num_mcomps, &mean_weight);
1065 return engine->sub44_num_mcomps;
1069 /* Build a vector of the best 2*2 sub-sampled motion
1070 compensations using the best 4*4 matches as starting points. As
1071 with with the 4*4 matches We don't collect them densely as they're
1072 just search starting points for 1-pel search and ones that are 1 out
1073 should still give better than average matches...
1078 static int build_sub22_mcomps(motion_engine_t *engine,
1079 int i0, int j0, int ihigh, int jhigh,
1081 uint8_t *s22org, uint8_t *s22blk,
1082 int flx, int fh, int searched_sub44_size )
1085 int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1088 int ilim = ihigh-i0;
1089 int jlim = jhigh-j0;
1093 engine->sub22_num_mcomps = 0;
1094 for( k = 0; k < searched_sub44_size; ++k )
1097 matchrec.x = engine->sub44_mcomps[k].x;
1098 matchrec.y = engine->sub44_mcomps[k].y;
1100 s22orgblk = s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1101 for( i = 0; i < 4; ++i )
1103 if( matchrec.x <= ilim && matchrec.y <= jlim )
1105 s = (*pdist22)( s22orgblk,s22blk,flx,fh);
1108 engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1109 engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1110 engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1111 ++engine->sub22_num_mcomps;
1132 sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1133 &engine->sub22_num_mcomps, &min_weight );
1134 return engine->sub22_num_mcomps;
1138 int build_sub22_mcomps_mmxe(motion_engine_t *engine,
1139 int i0, int j0, int ihigh, int jhigh,
1141 uint8_t *s22org, uint8_t *s22blk,
1142 int flx, int fh, int searched_sub44_size )
1145 int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1148 int ilim = ihigh-i0;
1149 int jlim = jhigh-j0;
1154 /* TODO: The calculation of the lstrow offset really belongs in
1156 int lstrow=(fh-1)*flx;
1158 engine->sub22_num_mcomps = 0;
1159 for( k = 0; k < searched_sub44_size; ++k )
1162 matchrec.x = engine->sub44_mcomps[k].x;
1163 matchrec.y = engine->sub44_mcomps[k].y;
1165 s22orgblk = s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1166 mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec);
1167 for( i = 0; i < 4; ++i )
1169 if( matchrec.x <= ilim && matchrec.y <= jlim )
1174 engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1175 engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1176 engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1177 ++engine->sub22_num_mcomps;
1195 sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1196 &engine->sub22_num_mcomps, &min_weight );
1197 return engine->sub22_num_mcomps;
1203 Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
1204 TODO: Its a bit silly to cart around absolute M/C co-ordinates that
1205 eventually get turned into relative ones anyway...
1209 static void find_best_one_pel(motion_engine_t *engine,
1210 uint8_t *org, uint8_t *blk,
1222 blockxy minpos = res->pos;
1228 for( k = 0; k < searched_size; ++k )
1230 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1231 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1232 orgblk = org + matchrec.x+lx*matchrec.y;
1233 penalty = abs(matchrec.x)+abs(matchrec.y);
1235 for( i = 0; i < 4; ++i )
1237 if( matchrec.x <= xmax && matchrec.y <= ymax )
1240 d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin);
1262 res->blk = org + minpos.x+lx*minpos.y;
1268 void find_best_one_pel_mmxe(motion_engine_t *engine,
1269 uint8_t *org, uint8_t *blk,
1281 blockxy minpos = res->pos;
1289 for( k = 0; k < searched_size; ++k )
1291 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1292 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1293 orgblk = org + matchrec.x+lx*matchrec.y;
1294 penalty = abs(matchrec.x)+abs(matchrec.y);
1297 mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec);
1299 for( i = 0; i < 4; ++i )
1301 if( matchrec.x <= xmax && matchrec.y <= ymax )
1304 d = penalty+resvec[i];
1326 res->blk = org + minpos.x+lx*minpos.y;
1333 * full search block matching
1335 * A.Stevens 2000: This is now a big misnomer. The search is now a hierarchical/sub-sampling
1336 * search not a full search. However, experiments have shown it is always close to
1337 * optimal and almost always very close or optimal.
1339 * blk: top left pel of (16*h) block
1340 * s22blk: top element of fast motion compensation block corresponding to blk
1341 * h: height of block
1342 * lx: distance (in bytes) of vertically adjacent pels in ref,blk
1343 * org: top left pel of source reference picture
1344 * ref: top left pel of reconstructed reference picture
1345 * i0,j0: center of search window
1346 * sx,sy: half widths of search window
1347 * xmax,ymax: right/bottom limits of search area
1348 * iminp,jminp: pointers to where the result is stored
1349 * result is given as half pel offset from ref(0,0)
1350 * i.e. NOT relative to (i0,j0)
1354 static void fullsearch(motion_engine_t *engine,
1357 subsampled_mb_s *ssblk,
1358 int lx, int i0, int j0,
1359 int sx, int sy, int h,
1361 /* int *iminp, int *jminp, int *sadminp, */
1366 /* int imin, jmin, dmin */
1367 int i,j,ilow,ihigh,jlow,jhigh;
1370 /* NOTE: Surprisingly, the initial motion compensation search
1371 works better when the original image not the reference (reconstructed)
1374 uint8_t *s22org = (uint8_t*)(org+fsubsample_offset);
1375 uint8_t *s44org = (uint8_t*)(org+qsubsample_offset);
1382 /* xmax and ymax into more useful form... */
1387 /* The search radii are *always* multiples of 4 to avoid messiness
1388 in the initial 4*4 pel search. This is handled by the
1389 parameter checking/processing code in readparmfile() */
1391 /* Create a distance-order mcomps of possible motion compensations
1392 based on the fast estimation data - 4*4 pel sums (4*4
1393 sub-sampled) rather than actual pel's. 1/16 the size... */
1395 jlow = jlow < 0 ? 0 : jlow;
1397 jhigh = jhigh > ymax ? ymax : jhigh;
1399 ilow = ilow < 0 ? 0 : ilow;
1401 ihigh = ihigh > xmax ? xmax : ihigh;
1404 Very rarely this may fail to find matchs due to all the good
1405 looking ones being over threshold. hence we make sure we
1406 fall back to a 0 motion compensation in this case.
1408 The sad for the 0 motion compensation is also very useful as
1409 a basis for setting thresholds for rejecting really dud 4*4
1410 and 2*2 sub-sampled matches.
1412 memset(&best, 0, sizeof(best));
1413 best.sad = (*pdist1_00)(ref + i0 + j0 * lx,
1421 engine->sub44_num_mcomps = build_sub44_mcomps(engine,
1422 ilow, jlow, ihigh, jhigh,
1426 ssblk->qmb, qlx, qh );
1429 /* Now create a distance-ordered mcomps of possible motion
1430 compensations based on the fast estimation data - 2*2 pel sums
1431 using the best fraction of the 4*4 estimates However we cover
1432 only coarsely... on 4-pel boundaries... */
1434 engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh, jhigh,
1436 s22org, ssblk->fmb, flx, fh,
1437 engine->sub44_num_mcomps );
1440 /* Now choose best 1-pel match from what approximates (not exact
1441 due to the pre-processing trick with the mean) the top 1/2 of
1446 (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps,
1448 ilow, jlow, xmax, ymax,
1451 /* Final polish: half-pel search of best candidate against
1452 reconstructed image.
1460 ilow = best.pos.x - (best.pos.x>(ilow<<1));
1461 ihigh = best.pos.x + (best.pos.x<((ihigh)<<1));
1462 jlow = best.pos.y - (best.pos.y>(jlow<<1));
1463 jhigh = best.pos.y+ (best.pos.y<((jhigh)<<1));
1465 for (j=jlow; j<=jhigh; j++)
1467 for (i=ilow; i<=ihigh; i++)
1469 orgblk = ref+(i>>1)+((j>>1)*lx);
1473 d = (*pdist1_11)(orgblk,ssblk->mb,lx,h);
1475 d = (*pdist1_01)(orgblk,ssblk->mb,lx,h);
1480 d = (*pdist1_10)(orgblk,ssblk->mb,lx,h);
1482 d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad);
1495 best.var = (*pdist2)(best.blk, ssblk->mb, lx, best.hx, best.hy, h);
1500 * total absolute difference between two (16*h) blocks
1501 * including optional half pel interpolation of blk1 (hx,hy)
1502 * blk1,blk2: addresses of top left pels of both blocks
1503 * lx: distance (in bytes) of vertically adjacent pels
1504 * hx,hy: flags for horizontal and/or vertical interpolation
1505 * h: height of block (usually 8 or 16)
1506 * distlim: bail out if sum exceeds this value
1509 /* A.Stevens 2000: New version for highly pipelined CPUs where branching is
1510 costly. Really it sucks that C doesn't define a stdlib abs that could
1511 be realised as a compiler intrinsic using appropriate CPU instructions.
1512 That 1970's heritage...
1516 static int dist1_00(uint8_t *blk1,uint8_t *blk2,
1517 int lx, int h,int distlim)
1530 #define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
1531 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1532 pipestep(4); pipestep(5); pipestep(6); pipestep(7);
1533 pipestep(8); pipestep(9); pipestep(10); pipestep(11);
1534 pipestep(12); pipestep(13); pipestep(14); pipestep(15);
1546 static int dist1_01(uint8_t *blk1,uint8_t *blk2,
1559 for (i=0; i<16; i++)
1562 v = ((unsigned int)(p1[i]+p1[i+1])>>1) - p2[i];
1564 v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
1574 static int dist1_10(uint8_t *blk1,uint8_t *blk2,
1577 uint8_t *p1,*p1a,*p2;
1588 for (i=0; i<16; i++)
1590 v = ((unsigned int)(p1[i]+p1a[i])>>1) - p2[i];
1601 static int dist1_11(uint8_t *blk1,uint8_t *blk2,
1604 uint8_t *p1,*p1a,*p2;
1616 for (i=0; i<16; i++)
1618 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1])>>2) - p2[i];
1628 /* USED only during debugging...
1629 void check_fast_motion_data(uint8_t *blk, char *label )
1634 uint8_t *start_s22blk, *start_s44blk;
1636 unsigned int mismatch;
1638 start_s22blk = (blk+height*width);
1639 start_s44blk = (blk+height*width+(height>>1)*(width>>1));
1641 if (pict_struct==FRAME_PICTURE)
1643 nextfieldline = width;
1647 nextfieldline = 2*width;
1656 for( i = 0; i < nextfieldline/2; ++i )
1658 mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
1666 printf( "Mismatch detected check %s for buffer %08x\n", label, blk );
1673 Append fast motion estimation data to original luminance
1674 data. N.b. memory allocation for luminance data allows space
1675 for this information...
1678 void fast_motion_data(uint8_t *blk, int picture_struct )
1683 uint8_t *start_s22blk, *start_s44blk;
1686 #ifdef TEST_RCSEARCH
1687 uint16_t *start_rowblk, *start_colblk;
1688 uint16_t *pc, *pr,*p;
1691 int down16 = width*16;
1693 uint16_t rowsums[2048];
1694 uint16_t colsums[2048]; /* TODO: BUG: should resize with width */
1695 start_rowblk = (uint16_t *)blk+rowsums_offset;
1696 start_colblk = (uint16_t *)blk+colsums_offset;
1700 /* In an interlaced field the "next" line is 2 width's down
1701 rather than 1 width down */
1703 if (picture_struct==FRAME_PICTURE)
1705 nextfieldline = width;
1709 nextfieldline = 2*width;
1712 start_s22blk = blk+fsubsample_offset;
1713 start_s44blk = blk+qsubsample_offset;
1715 nb = (blk+nextfieldline);
1716 /* Sneaky stuff here... we can do lines in both fields at once */
1717 pb = (uint8_t *) start_s22blk;
1719 while( nb < start_s22blk )
1721 for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
1723 /* TODO: A.Stevens this has to be the most word-length dependent
1724 code in the world. Better than MMX assembler though I guess... */
1725 pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1726 pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1732 nb = b + nextfieldline;
1736 /* Now create the 4*4 sub-sampled data from the 2*2
1737 N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
1738 original. Albeit half as many lines and pixels...
1741 nextfieldline = nextfieldline >> 1;
1745 nb = (start_s22blk+nextfieldline);
1747 while( nb < start_s44blk )
1749 for( i = 0; i < nextfieldline/4; ++i )
1751 /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */
1752 qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1753 qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1759 nb = b + nextfieldline;
1762 #ifdef TEST_RCSEARCH
1763 /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
1766 Initial row sums....
1769 for(j = 0; j < height; ++j )
1772 for( i = 0; i < 16; ++ i )
1776 rowsums[j] = rowsum;
1783 for( i = 0; i < width; ++i )
1788 for( j = 0; j < 16; ++j )
1790 for( i = 0; i < width; ++i )
1797 /* Now fill in the row/column sum tables...
1798 Note: to allow efficient construction of sum/col differences for a
1799 given position row sums are held in a *column major* array
1800 (changing y co-ordinate makes for small index changes)
1801 the col sums are held in a *row major* array
1806 for(j = 0; j <32; ++j )
1809 rowsum = rowsums[j];
1810 for( i = 0; i < width-16; ++i )
1814 colsums[i] = (colsums[i] + pb[down16] )-pb[0];
1815 rowsum = (rowsum + pb[16]) - pb[0];
1819 pb += 16; /* move pb on to next row... rememember we only did width-16! */
1826 static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh)
1828 uint8_t *p1 = s22blk1;
1829 uint8_t *p2 = s22blk2;
1833 for( j = 0; j < fh; ++j )
1836 #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
1837 pipestep(0); pipestep(1);
1838 pipestep(2); pipestep(3);
1839 pipestep(4); pipestep(5);
1840 pipestep(6); pipestep(7);
1852 Sum absolute differences for 4*4 sub-sampled data.
1854 TODO: currently assumes only 16*16 or 16*8 motion compensation will be used...
1855 I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
1859 static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh)
1861 uint8_t *p1 = s44blk1;
1862 uint8_t *p2 = s44blk2;
1866 /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
1867 #define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
1868 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1871 p1 += qlx; p2 += qlx;
1872 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1875 p1 += qlx; p2 += qlx;
1876 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1877 p1 += qlx; p2 += qlx;
1878 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1887 * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
1888 * blk1,blk2: addresses of top left pels of both blocks
1889 * lx: distance (in bytes) of vertically adjacent pels
1890 * h: height of block (usually 8 or 16)
1893 static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h)
1895 uint8_t *p1 = blk1, *p2 = blk2;
1911 /* total squared difference between bidirection prediction of (8*h)
1912 * blocks of 2*2 sub-sampled pels and reference
1913 * blk1f, blk1b,blk2: addresses of top left
1915 * lx: distance (in bytes) of vertically adjacent
1917 * h: height of block (usually 4 or 8)
1920 static int bdist2_22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2,
1923 uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
1930 v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
1941 * total squared difference between two (16*h) blocks
1942 * including optional half pel interpolation of blk1 (hx,hy)
1943 * blk1,blk2: addresses of top left pels of both blocks
1944 * lx: distance (in bytes) of vertically adjacent pels
1945 * hx,hy: flags for horizontal and/or vertical interpolation
1946 * h: height of block (usually 8 or 16)
1950 static int dist2(blk1,blk2,lx,hx,hy,h)
1951 uint8_t *blk1,*blk2;
1954 uint8_t *p1,*p1a,*p2;
1964 for (i=0; i<16; i++)
1975 for (i=0; i<16; i++)
1977 v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
1988 for (i=0; i<16; i++)
1990 v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
1998 else /* if (hx && hy) */
2003 for (i=0; i<16; i++)
2005 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
2019 * absolute difference error between a (16*h) block and a bidirectional
2022 * p2: address of top left pel of block
2023 * pf,hxf,hyf: address and half pel flags of forward ref. block
2024 * pb,hxb,hyb: address and half pel flags of backward ref. block
2025 * h: height of block
2026 * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2030 static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2031 uint8_t *pf,*pb,*p2;
2032 int lx,hxf,hyf,hxb,hyb,h;
2034 uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2050 for (i=0; i<16; i++)
2052 v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2053 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2072 * squared error between a (16*h) block and a bidirectional
2075 * p2: address of top left pel of block
2076 * pf,hxf,hyf: address and half pel flags of forward ref. block
2077 * pb,hxb,hyb: address and half pel flags of backward ref. block
2078 * h: height of block
2079 * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2083 static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2084 uint8_t *pf,*pb,*p2;
2085 int lx,hxf,hyf,hxb,hyb,h;
2087 uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2103 for (i=0; i<16; i++)
2105 v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2106 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2126 * variance of a (size*size) block, multiplied by 256
2127 * p: address of top left pel of block
2128 * lx: distance (in bytes) of vertically adjacent pels
2130 static int variance(uint8_t *p, int size, int lx)
2133 unsigned int v,s,s2;
2137 for (j=0; j<size; j++)
2139 for (i=0; i<size; i++)
2147 return s2 - (s*s)/(size*size);
2151 Compute the variance of the residual of uni-directionally motion
2155 static int unidir_pred_var( const mb_motion_s *motion,
2160 return (*pdist2)(motion->blk, mb, lx, motion->hx, motion->hy, h);
2165 Compute the variance of the residual of bi-directionally motion
2169 static int bidir_pred_var( const mb_motion_s *motion_f,
2170 const mb_motion_s *motion_b,
2174 return (*pbdist2)( motion_f->blk, motion_b->blk,
2176 motion_f->hx, motion_f->hy,
2177 motion_b->hx, motion_b->hy,
2182 Compute SAD for bi-directionally motion compensated blocks...
2185 static int bidir_pred_sad( const mb_motion_s *motion_f,
2186 const mb_motion_s *motion_b,
2190 return (*pbdist1)(motion_f->blk, motion_b->blk,
2192 motion_f->hx, motion_f->hy,
2193 motion_b->hx, motion_b->hy,
2197 static void frame_ME(motion_engine_t *engine,
2198 pict_data_s *picture,
2204 mb_motion_s framef_mc;
2205 mb_motion_s frameb_mc;
2206 mb_motion_s dualpf_mc;
2207 mb_motion_s topfldf_mc;
2208 mb_motion_s botfldf_mc;
2209 mb_motion_s topfldb_mc;
2210 mb_motion_s botfldb_mc;
2213 int vmc,vmcf,vmcr,vmci;
2214 int vmcfieldf,vmcfieldr,vmcfieldi;
2215 subsampled_mb_s ssmb;
2216 int imins[2][2],jmins[2][2];
2218 int imindmv,jmindmv;
2219 imindmv = 0; jmindmv = 0;
2220 memset(&dualpf_mc,0,sizeof(dualpf_mc));
2222 //printf("frame_ME 1\n");
2225 /* A.Stevens fast motion estimation data is appended to actual
2226 luminance information
2228 ssmb.mb = mc->cur[0] + mb_row_start + i;
2229 ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2));
2230 ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2));
2231 ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset +
2232 ((i>>1) + (mb_row_start>>2)));
2233 ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset +
2234 (i>>2) + (mb_row_start>>4));
2236 /* Compute variance MB as a measure of Intra-coding complexity
2237 We include chrominance information here, scaled to compensate
2238 for sub-sampling. Silly MPEG forcing chrom/lum to have same
2241 var = variance(ssmb.mb,16,width);
2243 //printf("motion %d\n", picture->pict_type);
2244 if (picture->pict_type==I_TYPE)
2246 mbi->mb_type = MB_INTRA;
2248 else if (picture->pict_type==P_TYPE)
2250 /* For P pictures we take into account chrominance. This
2251 provides much better performance at scene changes */
2252 var += chrom_var_sum(&ssmb,16,width);
2254 if (picture->frame_pred_dct)
2256 fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2257 width,i,j,mc->sxf,mc->syf,16,width,height,
2259 framef_mc.fieldoff = 0;
2260 vmc = framef_mc.var +
2261 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2262 mbi->motion_type = MC_FRAME;
2266 //printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
2267 frame_estimate(engine, mc->oldorg[0],
2279 //printf("frame_ME 3\n");
2280 vmcf = framef_mc.var +
2281 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2284 unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) +
2286 unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 );
2287 /* DEBUG DP currently disabled... */
2290 // dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
2291 // i,j>>1,imins,jmins,
2293 // &imindmv,&jmindmv, &vmc_dp);
2296 /* NOTE: Typically M =3 so DP actually disabled... */
2297 /* select between dual prime, frame and field prediction */
2298 // if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
2300 // mbi->motion_type = MC_DMV;
2301 // /* No chrominance squared difference measure yet.
2302 // Assume identical to luminance */
2303 // vmc = vmc_dp + vmc_dp;
2306 if ( vmcf < vmcfieldf)
2308 mbi->motion_type = MC_FRAME;
2314 mbi->motion_type = MC_FIELD;
2320 /* select between intra or non-intra coding:
2322 * selection is based on intra block variance (var) vs.
2323 * prediction error variance (vmc)
2325 * Used to be: blocks with small prediction error are always
2326 * coded non-intra even if variance is smaller (is this reasonable?
2328 * TODO: A.Stevens Jul 2000
2329 * The bbmpeg guys have found this to be *unreasonable*.
2330 * I'm not sure I buy their solution using vmc*2. It is probabably
2331 * the vmc>= 9*256 test that is suspect.
2336 if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ )
2338 mbi->mb_type = MB_INTRA;
2344 /* select between MC / No-MC
2346 * use No-MC if var(No-MC) <= 1.25*var(MC)
2347 * (i.e slightly biased towards No-MC)
2349 * blocks with small prediction error are always coded as No-MC
2350 * (requires no motion vectors, allows skipping)
2352 v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16);
2358 mbi->mb_type = MB_FORWARD;
2359 if (mbi->motion_type==MC_FRAME)
2361 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2362 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2364 else if (mbi->motion_type==MC_DMV)
2366 /* these are FRAME vectors */
2367 /* same parity vector */
2368 mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1);
2369 mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1);
2371 /* opposite parity vector */
2372 mbi->dmvector[0] = imindmv;
2373 mbi->dmvector[1] = jmindmv;
2377 /* these are FRAME vectors */
2378 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2379 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2380 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2381 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2382 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2383 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2392 mbi->motion_type = MC_FRAME;
2393 mbi->MV[0][0][0] = 0;
2394 mbi->MV[0][0][1] = 0;
2398 else /* if (pict_type==B_TYPE) */
2400 if (picture->frame_pred_dct)
2402 var = variance(ssmb.mb,16,width);
2404 fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2405 width,i,j,mc->sxf,mc->syf,
2409 framef_mc.fieldoff = 0;
2410 vmcf = framef_mc.var;
2413 fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb,
2414 width,i,j,mc->sxb,mc->syb,
2417 frameb_mc.fieldoff = 0;
2418 vmcr = frameb_mc.var;
2420 /* interpolated (bidirectional) */
2422 vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2426 /* select between forward/backward/interpolated prediction:
2427 * use the one with smallest mean sqaured prediction error
2429 if (vmcf<=vmcr && vmcf<=vmci)
2432 mbi->mb_type = MB_FORWARD;
2434 else if (vmcr<=vmci)
2437 mbi->mb_type = MB_BACKWARD;
2442 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2445 mbi->motion_type = MC_FRAME;
2449 /* forward prediction */
2450 frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2451 i,j,mc->sxf,mc->syf,
2458 /* backward prediction */
2459 frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb,
2460 i,j,mc->sxb,mc->syb,
2466 vmcf = framef_mc.var;
2467 vmcr = frameb_mc.var;
2468 vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2470 vmcfieldf = topfldf_mc.var + botfldf_mc.var;
2471 vmcfieldr = topfldb_mc.var + botfldb_mc.var;
2472 vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb,
2474 bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb,
2477 /* select prediction type of minimum distance from the
2478 * six candidates (field/frame * forward/backward/interpolated)
2480 if (vmci<vmcfieldi && vmci<vmcf && vmci<vmcfieldf
2481 && vmci<vmcr && vmci<vmcfieldr)
2483 /* frame, interpolated */
2484 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2485 mbi->motion_type = MC_FRAME;
2488 else if ( vmcfieldi<vmcf && vmcfieldi<vmcfieldf
2489 && vmcfieldi<vmcr && vmcfieldi<vmcfieldr)
2491 /* field, interpolated */
2492 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2493 mbi->motion_type = MC_FIELD;
2496 else if (vmcf<vmcfieldf && vmcf<vmcr && vmcf<vmcfieldr)
2498 /* frame, forward */
2499 mbi->mb_type = MB_FORWARD;
2500 mbi->motion_type = MC_FRAME;
2504 else if ( vmcfieldf<vmcr && vmcfieldf<vmcfieldr)
2506 /* field, forward */
2507 mbi->mb_type = MB_FORWARD;
2508 mbi->motion_type = MC_FIELD;
2511 else if (vmcr<vmcfieldr)
2513 /* frame, backward */
2514 mbi->mb_type = MB_BACKWARD;
2515 mbi->motion_type = MC_FRAME;
2520 /* field, backward */
2521 mbi->mb_type = MB_BACKWARD;
2522 mbi->motion_type = MC_FIELD;
2528 /* select between intra or non-intra coding:
2530 * selection is based on intra block variance (var) vs.
2531 * prediction error variance (vmc)
2533 * Used to be: blocks with small prediction error are always
2534 * coded non-intra even if variance is smaller (is this reasonable?
2536 * TODO: A.Stevens Jul 2000
2537 * The bbmpeg guys have found this to be *unreasonable*.
2538 * I'm not sure I buy their solution using vmc*2 in the first comparison.
2539 * It is probabably the vmc>= 9*256 test that is suspect.
2542 if (vmc>var && vmc>=9*256)
2543 mbi->mb_type = MB_INTRA;
2547 if (mbi->motion_type==MC_FRAME)
2550 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2551 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2553 mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1);
2554 mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1);
2558 /* these are FRAME vectors */
2560 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2561 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2562 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2563 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2564 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2565 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2567 mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1);
2568 mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1);
2569 mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1);
2570 mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1);
2571 mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel;
2572 mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel;
2581 * motion estimation for field pictures
2583 * mbi: pointer to macroblock info structure
2584 * secondfield: indicates second field of a frame (in P fields this means
2585 * that reference field of opposite parity is in curref instead
2587 * ipflag: indicates a P type field which is the second field of a frame
2588 * in which the first field is I type (this restricts predictions
2589 * to be based only on the opposite parity (=I) field)
2593 * mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
2594 * MV[][][]: motion vectors (field format)
2595 * mv_field_sel: top/bottom field
2596 * motion_type: MC_FIELD, MC_16X8
2598 * uses global vars: pict_type, pict_struct
2600 static void field_ME(motion_engine_t *engine,
2601 pict_data_s *picture,
2606 int secondfield, int ipflag)
2609 uint8_t *toporg, *topref, *botorg, *botref;
2610 subsampled_mb_s ssmb;
2611 mb_motion_s fields_mc, dualp_mc;
2612 mb_motion_s fieldf_mc, fieldb_mc;
2613 mb_motion_s field8uf_mc, field8lf_mc;
2614 mb_motion_s field8ub_mc, field8lb_mc;
2615 int var,vmc,v0,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i;
2618 // int vmc_dp,dmc_dp,dmc;
2619 imins = 0; jmins = 0;
2620 memset(&dualp_mc,0,sizeof(dualp_mc));
2623 /* Fast motion data sits at the end of the luminance buffer */
2624 ssmb.mb = mc->cur[0] + i + w2*j;
2625 ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1));
2626 ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1));
2627 ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1));
2628 ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2);
2630 if (picture->pict_struct==BOTTOM_FIELD)
2633 ssmb.umb += (width >> 1);
2634 ssmb.vmb += (width >> 1);
2635 ssmb.fmb += (width >> 1);
2636 ssmb.qmb += (width >> 2);
2639 var = variance(ssmb.mb,16,w2) +
2640 ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2;
2642 if (picture->pict_type==I_TYPE)
2643 mbi->mb_type = MB_INTRA;
2644 else if (picture->pict_type==P_TYPE)
2646 toporg = mc->oldorg[0];
2647 topref = mc->oldref[0];
2648 botorg = mc->oldorg[0] + width;
2649 botref = mc->oldref[0] + width;
2653 /* opposite parity field is in same frame */
2654 if (picture->pict_struct==TOP_FIELD)
2656 /* current is top field */
2657 botorg = mc->cur[0] + width;
2658 botref = mc->curref[0] + width;
2662 /* current is bottom field */
2663 toporg = mc->cur[0];
2664 topref = mc->curref[0];
2668 field_estimate(engine,
2670 toporg,topref,botorg,botref,&ssmb,
2671 i,j,mc->sxf,mc->syf,ipflag,
2676 dmcfield = fieldf_mc.sad;
2677 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2679 // if (M==1 && !ipflag) /* generic condition which permits Dual Prime */
2681 // dpfield_estimate(engine,
2683 // topref,botref,ssmb.mb,i,j,imins,jmins,
2686 // dmc_dp = dualp_mc.sad;
2689 // /* select between dual prime, field and 16x8 prediction */
2690 // if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
2692 // /* Dual Prime prediction */
2693 // mbi->motion_type = MC_DMV;
2694 // dmc = dualp_mc.sad;
2701 /* 16x8 prediction */
2702 mbi->motion_type = MC_16X8;
2703 /* upper and lower half blocks */
2704 vmc = unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2705 vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2709 /* field prediction */
2710 mbi->motion_type = MC_FIELD;
2711 vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 );
2714 /* select between intra and non-intra coding */
2716 if ( vmc>var && vmc>=9*256)
2717 mbi->mb_type = MB_INTRA;
2720 /* zero MV field prediction from same parity ref. field
2721 * (not allowed if ipflag is set)
2724 v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j,
2726 if (ipflag || (4*v0>5*vmc && v0>=9*256))
2729 mbi->mb_type = MB_FORWARD;
2730 if (mbi->motion_type==MC_FIELD)
2732 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2733 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2734 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2736 else if (mbi->motion_type==MC_DMV)
2738 /* same parity vector */
2739 mbi->MV[0][0][0] = imins - (i<<1);
2740 mbi->MV[0][0][1] = jmins - (j<<1);
2742 /* opposite parity vector */
2743 mbi->dmvector[0] = dualp_mc.pos.x;
2744 mbi->dmvector[1] = dualp_mc.pos.y;
2748 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2749 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2750 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2751 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2752 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2753 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2761 mbi->motion_type = MC_FIELD;
2762 mbi->MV[0][0][0] = 0;
2763 mbi->MV[0][0][1] = 0;
2764 mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD);
2768 else /* if (pict_type==B_TYPE) */
2770 /* forward prediction */
2771 field_estimate(engine,
2773 mc->oldorg[0],mc->oldref[0],
2774 mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb,
2775 i,j,mc->sxf,mc->syf,0,
2780 dmcfieldf = fieldf_mc.sad;
2781 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2783 /* backward prediction */
2784 field_estimate(engine,
2786 mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width,
2788 i,j,mc->sxb,mc->syb,0,
2793 dmcfieldr = fieldb_mc.sad;
2794 dmc8r = field8ub_mc.sad + field8lb_mc.sad;
2796 /* calculate distances for bidirectional prediction */
2798 dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2800 /* 16x8 upper and lower half blocks */
2801 dmc8i = bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 );
2802 dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 );
2804 /* select prediction type of minimum distance */
2805 if (dmcfieldi<dmc8i && dmcfieldi<dmcfieldf && dmcfieldi<dmc8f
2806 && dmcfieldi<dmcfieldr && dmcfieldi<dmc8r)
2808 /* field, interpolated */
2809 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2810 mbi->motion_type = MC_FIELD;
2811 vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2813 else if (dmc8i<dmcfieldf && dmc8i<dmc8f
2814 && dmc8i<dmcfieldr && dmc8i<dmc8r)
2816 /* 16x8, interpolated */
2817 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2818 mbi->motion_type = MC_16X8;
2820 /* upper and lower half blocks */
2821 vmc = bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8);
2822 vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8);
2824 else if (dmcfieldf<dmc8f && dmcfieldf<dmcfieldr && dmcfieldf<dmc8r)
2826 /* field, forward */
2827 mbi->mb_type = MB_FORWARD;
2828 mbi->motion_type = MC_FIELD;
2829 vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16);
2831 else if (dmc8f<dmcfieldr && dmc8f<dmc8r)
2834 mbi->mb_type = MB_FORWARD;
2835 mbi->motion_type = MC_16X8;
2837 /* upper and lower half blocks */
2838 vmc = unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2839 vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2841 else if (dmcfieldr<dmc8r)
2843 /* field, backward */
2844 mbi->mb_type = MB_BACKWARD;
2845 mbi->motion_type = MC_FIELD;
2846 vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 );
2850 /* 16x8, backward */
2851 mbi->mb_type = MB_BACKWARD;
2852 mbi->motion_type = MC_16X8;
2854 /* upper and lower half blocks */
2855 vmc = unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8);
2856 vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8);
2860 /* select between intra and non-intra coding */
2861 if (vmc>var && vmc>=9*256)
2862 mbi->mb_type = MB_INTRA;
2866 if (mbi->motion_type==MC_FIELD)
2869 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2870 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2871 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2873 mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1);
2874 mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1);
2875 mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel;
2880 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2881 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2882 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2883 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2884 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2885 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2887 mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1);
2888 mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1);
2889 mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel;
2890 mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1);
2891 mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1);
2892 mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel;
2902 Initialise motion compensation - currently purely selection of which
2903 versions of the various low level computation routines to use
2909 int cpucap = cpu_accel();
2911 if( cpucap == 0 ) /* No MMX/SSE etc support available */
2915 pdist1_00 = dist1_00;
2916 pdist1_01 = dist1_01;
2917 pdist1_10 = dist1_10;
2918 pdist1_11 = dist1_11;
2922 pdist2_22 = dist2_22;
2923 pbdist2_22 = bdist2_22;
2924 pfind_best_one_pel = find_best_one_pel;
2925 pbuild_sub22_mcomps = build_sub22_mcomps;
2928 else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
2930 if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n");
2931 pdist22 = dist22_mmxe;
2932 pdist44 = dist44_mmxe;
2933 pdist1_00 = dist1_00_mmxe;
2934 pdist1_01 = dist1_01_mmxe;
2935 pdist1_10 = dist1_10_mmxe;
2936 pdist1_11 = dist1_11_mmxe;
2937 pbdist1 = bdist1_mmx;
2939 pbdist2 = bdist2_mmx;
2940 pdist2_22 = dist2_22_mmx;
2941 pbdist2_22 = bdist2_22_mmx;
2942 pfind_best_one_pel = find_best_one_pel_mmxe;
2943 pbuild_sub22_mcomps = build_sub22_mcomps_mmxe;
2944 pmblock_sub44_dists = mblock_sub44_dists_mmxe;
2947 else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
2949 if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n");
2950 pdist22 = dist22_mmx;
2951 pdist44 = dist44_mmx;
2952 pdist1_00 = dist1_00_mmx;
2953 pdist1_01 = dist1_01_mmx;
2954 pdist1_10 = dist1_10_mmx;
2955 pdist1_11 = dist1_11_mmx;
2956 pbdist1 = bdist1_mmx;
2958 pbdist2 = bdist2_mmx;
2959 pdist2_22 = dist2_22_mmx;
2960 pbdist2_22 = bdist2_22_mmx;
2961 pfind_best_one_pel = find_best_one_pel;
2962 pbuild_sub22_mcomps = build_sub22_mcomps;
2963 pmblock_sub44_dists = mblock_sub44_dists_mmx;
2969 void motion_engine_loop(motion_engine_t *engine)
2971 while(!engine->done)
2973 pthread_mutex_lock(&(engine->input_lock));
2977 pict_data_s *picture = engine->pict_data;
2978 motion_comp_s *mc_data = engine->motion_comp;
2979 int secondfield = engine->secondfield;
2980 int ipflag = engine->ipflag;
2981 mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16);
2983 int mb_row_incr; /* Offset increment to go down 1 row of mb's */
2986 if (picture->pict_struct == FRAME_PICTURE)
2988 mb_row_incr = 16*width;
2989 mb_row_start = engine->start_row / 16 * mb_row_incr;
2990 /* loop through all macroblocks of a frame picture */
2991 for (j=engine->start_row; j < engine->end_row; j+=16)
2993 for (i=0; i<width; i+=16)
2995 frame_ME(engine, picture, mc_data, mb_row_start, i,j, mbi);
2998 mb_row_start += mb_row_incr;
3003 mb_row_incr = (16 * 2) * width;
3004 mb_row_start = engine->start_row / 16 * mb_row_incr;
3005 /* loop through all macroblocks of a field picture */
3006 for (j=engine->start_row; j < engine->end_row; j+=16)
3008 for (i=0; i<width; i+=16)
3010 field_ME(engine, picture, mc_data, mb_row_start, i,j,
3011 mbi,secondfield,ipflag);
3014 mb_row_start += mb_row_incr;
3019 pthread_mutex_unlock(&(engine->output_lock));
3024 void motion_estimation(pict_data_s *picture,
3025 motion_comp_s *mc_data,
3026 int secondfield, int ipflag)
3030 for(i = 0; i < processors; i++)
3032 motion_engines[i].motion_comp = mc_data;
3034 motion_engines[i].pict_data = picture;
3036 motion_engines[i].secondfield = secondfield;
3037 motion_engines[i].ipflag = ipflag;
3038 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3041 /* Wait for completion */
3042 for(i = 0; i < processors; i++)
3044 pthread_mutex_lock(&(motion_engines[i].output_lock));
3048 void start_motion_engines()
3051 int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
3052 int current_row = 0;
3053 pthread_attr_t attr;
3054 pthread_mutexattr_t mutex_attr;
3056 pthread_mutexattr_init(&mutex_attr);
3057 pthread_attr_init(&attr);
3058 motion_engines = calloc(1, sizeof(motion_engine_t) * processors);
3059 for(i = 0; i < processors; i++)
3061 motion_engines[i].start_row = current_row * 16;
3062 current_row += rows_per_processor;
3063 if(current_row > height2 / 16) current_row = height2 / 16;
3064 motion_engines[i].end_row = current_row * 16;
3065 pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr);
3066 pthread_mutex_lock(&(motion_engines[i].input_lock));
3067 pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr);
3068 pthread_mutex_lock(&(motion_engines[i].output_lock));
3069 motion_engines[i].done = 0;
3070 pthread_create(&(motion_engines[i].tid),
3072 (void*)motion_engine_loop,
3073 &motion_engines[i]);
3077 void stop_motion_engines()
3080 for(i = 0; i < processors; i++)
3082 motion_engines[i].done = 1;
3083 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3084 pthread_join(motion_engines[i].tid, 0);
3085 pthread_mutex_destroy(&(motion_engines[i].input_lock));
3086 pthread_mutex_destroy(&(motion_engines[i].output_lock));
3088 free(motion_engines);