1 /* motion.c, motion estimation */
3 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
6 * Disclaimer of Warranty
8 * These software programs are available to the user without any license fee or
9 * royalty on an "as is" basis. The MPEG Software Simulation Group disclaims
10 * any and all warranties, whether express, implied, or statuary, including any
11 * implied warranties or merchantability or of fitness for a particular
12 * purpose. In no event shall the copyright-holder be liable for any
13 * incidental, punitive, or consequential damages of any kind whatsoever
14 * arising from the use of these programs.
16 * This disclaimer of warranty extends to the user of these programs and user's
17 * customers, employees, agents, transferees, successors, and assigns.
19 * The MPEG Software Simulation Group does not represent or warrant that the
20 * programs furnished hereunder are free of infringement of any third-party
23 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
24 * are subject to royalty fees to patent holders. Many of these patents are
25 * general enough such that they are unavoidable regardless of implementation
35 #include "cpu_accel.h"
38 /* Macro-block Motion compensation results record */
40 typedef struct _blockcrd {
47 blockxy pos; // Half-pel co-ordinates of source block
48 int sad; // Sum of absolute difference
50 uint8_t *blk ; // Source block data (in luminace data array)
51 int hx, hy; // Half-pel offsets
52 int fieldsel; // 0 = top 1 = bottom
53 int fieldoff; // Offset from start of frame data to first line
54 // of field. (top = 0, bottom = width );
57 typedef struct mb_motion mb_motion_s;
62 uint8_t *mb; // One pel
63 uint8_t *fmb; // Two-pel subsampled
64 uint8_t *qmb; // Four-pel subsampled
65 uint8_t *umb; // U compoenent one-pel
66 uint8_t *vmb; // V component one-pel
69 typedef struct subsampled_mb subsampled_mb_s;
72 static void frame_ME (motion_engine_t *engine,
76 int i, int j, struct mbinfo *mbi);
78 static void field_ME (motion_engine_t *engine,
87 static void frame_estimate (motion_engine_t *engine,
90 subsampled_mb_s *ssmb,
96 int imins[2][2], int jmins[2][2]);
98 static void field_estimate (motion_engine_t *engine,
104 subsampled_mb_s *ssmb,
105 int i, int j, int sx, int sy, int ipflag,
109 mb_motion_s *bestsp);
111 static void dpframe_estimate (motion_engine_t *engine,
112 pict_data_s *picture,
114 subsampled_mb_s *ssmb,
115 int i, int j, int iminf[2][2], int jminf[2][2],
117 int *imindmvp, int *jmindmvp,
120 static void dpfield_estimate (motion_engine_t *engine,
121 pict_data_s *picture,
126 int imins, int jmins,
130 static void fullsearch (motion_engine_t *engine,
131 uint8_t *org, uint8_t *ref,
132 subsampled_mb_s *ssblk,
133 int lx, int i0, int j0,
134 int sx, int sy, int h,
136 mb_motion_s *motion );
138 static void find_best_one_pel( motion_engine_t *engine,
139 uint8_t *org, uint8_t *blk,
147 static int build_sub22_mcomps( motion_engine_t *engine,
148 int i0, int j0, int ihigh, int jhigh,
150 uint8_t *s22org, uint8_t *s22blk,
151 int flx, int fh, int searched_sub44_size );
154 static void find_best_one_pel_mmxe( motion_engine_t *engine,
155 uint8_t *org, uint8_t *blk,
164 static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0, int j0, int ihigh, int jhigh,
166 uint8_t *s22org, uint8_t *s22blk,
167 int flx, int fh, int searched_sub44_size );
168 static int (*pmblock_sub44_dists)( uint8_t *blk, uint8_t *ref,
170 int ihigh, int jhigh,
171 int h, int rowstride,
173 mc_result_s *resvec);
176 static int unidir_pred_var( const mb_motion_s *motion,
177 uint8_t *mb, int lx, int h);
178 static int bidir_pred_var( const mb_motion_s *motion_f,
179 const mb_motion_s *motion_b,
180 uint8_t *mb, int lx, int h);
181 static int bidir_pred_sad( const mb_motion_s *motion_f,
182 const mb_motion_s *motion_b,
183 uint8_t *mb, int lx, int h);
185 static int variance( uint8_t *mb, int size, int lx);
187 static int dist22 ( uint8_t *blk1, uint8_t *blk2, int qlx, int qh);
189 static int dist44 ( uint8_t *blk1, uint8_t *blk2, int flx, int fh);
190 static int dist2_22( uint8_t *blk1, uint8_t *blk2,
192 static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b,
197 static void (*pfind_best_one_pel)( motion_engine_t *engine,
198 uint8_t *org, uint8_t *blk,
206 static int (*pbuild_sub22_mcomps)( motion_engine_t *engine,
207 int i0, int j0, int ihigh, int jhigh,
209 uint8_t *s22org, uint8_t *s22blk,
210 int flx, int fh, int searched_sub44_size );
212 static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2,
214 static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b,
218 static int dist1_00( uint8_t *blk1, uint8_t *blk2, int lx, int h, int distlim);
219 static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h);
220 static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h);
221 static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h);
222 static int dist2 (uint8_t *blk1, uint8_t *blk2,
223 int lx, int hx, int hy, int h);
224 static int bdist2 (uint8_t *pf, uint8_t *pb,
225 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
226 static int bdist1 (uint8_t *pf, uint8_t *pb,
227 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
230 static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2, int flx, int fh);
231 static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2, int qlx, int qh);
232 static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2, int lx, int h, int distlim);
233 static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
234 static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
235 static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
237 static int (*pdist2) (uint8_t *blk1, uint8_t *blk2,
238 int lx, int hx, int hy, int h);
241 static int (*pbdist2) (uint8_t *pf, uint8_t *pb,
242 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
244 static int (*pbdist1) (uint8_t *pf, uint8_t *pb,
245 uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
249 static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
252 if( motion->sad < 0 || motion->sad > 0x10000 )
254 printf( "SAD ooops %s\n", str );
257 if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
259 printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
262 if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
264 printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
269 static void init_mc( mb_motion_s *motion )
272 motion->pos.x = -1000;
273 motion->pos.y = -1000;
278 Compute the combined variance of luminance and chrominance information
279 for a particular non-intra macro block after unidirectional
280 motion compensation...
282 Note: results are scaled to give chrominance equal weight to
283 chrominance. The variance of the luminance portion is computed
284 at the time the motion compensation is computed.
286 TODO: Perhaps we should compute the whole thing in fullsearch not
287 seperate it. However, that would involve a lot of fiddling with
288 field_* and until its thoroughly debugged and tested I think I'll
289 leave that alone. Furthermore, it is unclear if its really worth
290 doing these computations for B *and* P frames.
292 TODO: BUG: ONLY works for 420 video...
296 static int unidir_chrom_var_sum( mb_motion_s *lum_mc,
298 subsampled_mb_s *ssblk,
303 /* N.b. MC co-ordinates are computed in half-pel units! */
304 int cblkoffset = (lum_mc->fieldoff>>1) +
305 (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx;
307 return ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) +
308 (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2;
313 Compute the combined variance of luminance and chrominance information
314 for a particular non-intra macro block after unidirectional
315 motion compensation...
317 Note: results are scaled to give chrominance equal weight to
318 chrominance. The variance of the luminance portion is computed
319 at the time the motion compensation is computed.
321 Note: results scaled to give chrominance equal weight to chrominance.
323 TODO: BUG: ONLY works for 420 video...
325 NOTE: Currently unused but may be required if it turns out that taking
326 chrominance into account in B frames is needed.
330 int bidir_chrom_var_sum( mb_motion_s *lum_mc_f,
331 mb_motion_s *lum_mc_b,
334 subsampled_mb_s *ssblk,
339 /* N.b. MC co-ordinates are computed in half-pel units! */
340 int cblkoffset_f = (lum_mc_f->fieldoff>>1) +
341 (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
342 int cblkoffset_b = (lum_mc_b->fieldoff>>1) +
343 (lum_mc_b->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
346 (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b,
347 ssblk->umb, uvlx, uvh ) +
348 (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b,
349 ssblk->vmb, uvlx, uvh ))*2;
353 static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx )
355 return (variance(ssblk->umb,(h>>1),(lx>>1)) +
356 variance(ssblk->vmb,(h>>1),(lx>>1))) * 2;
360 * frame picture motion estimation
362 * org: top left pel of source reference frame
363 * ref: top left pel of reconstructed reference frame
364 * ssmb: macroblock to be matched
365 * i,j: location of mb relative to ref (=center of search window)
366 * sx,sy: half widths of search window
367 * besfr: location and SAD of best frame prediction
368 * besttop: location of best field pred. for top field of mb
369 * bestbo : location of best field pred. for bottom field of mb
372 static void frame_estimate(motion_engine_t *engine,
375 subsampled_mb_s *ssmb,
376 int i, int j, int sx, int sy,
378 mb_motion_s *besttop,
379 mb_motion_s *bestbot,
384 subsampled_mb_s botssmb;
385 mb_motion_s topfld_mc;
386 mb_motion_s botfld_mc;
388 botssmb.mb = ssmb->mb+width;
389 botssmb.fmb = ssmb->mb+(width>>1);
390 botssmb.qmb = ssmb->qmb+(width>>2);
391 botssmb.umb = ssmb->umb+(width>>1);
392 botssmb.vmb = ssmb->vmb+(width>>1);
394 /* frame prediction */
395 fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height,
397 bestfr->fieldsel = 0;
398 bestfr->fieldoff = 0;
400 /* predict top field from top field */
401 fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
404 /* predict top field from bottom field */
405 fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8,
406 width,height>>1, &botfld_mc);
408 /* set correct field selectors... */
409 topfld_mc.fieldsel = 0;
410 botfld_mc.fieldsel = 1;
411 topfld_mc.fieldoff = 0;
412 botfld_mc.fieldoff = width;
414 imins[0][0] = topfld_mc.pos.x;
415 jmins[0][0] = topfld_mc.pos.y;
416 imins[1][0] = botfld_mc.pos.x;
417 jmins[1][0] = botfld_mc.pos.y;
419 /* select prediction for top field */
420 if (topfld_mc.sad<=botfld_mc.sad)
422 *besttop = topfld_mc;
426 *besttop = botfld_mc;
429 /* predict bottom field from top field */
430 fullsearch(engine, org,ref,&botssmb,
431 width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
434 /* predict bottom field from bottom field */
435 fullsearch(engine, org+width,ref+width,&botssmb,
436 width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
439 /* set correct field selectors... */
440 topfld_mc.fieldsel = 0;
441 botfld_mc.fieldsel = 1;
442 topfld_mc.fieldoff = 0;
443 botfld_mc.fieldoff = width;
445 imins[0][1] = topfld_mc.pos.x;
446 jmins[0][1] = topfld_mc.pos.y;
447 imins[1][1] = botfld_mc.pos.x;
448 jmins[1][1] = botfld_mc.pos.y;
450 /* select prediction for bottom field */
451 if (botfld_mc.sad<=topfld_mc.sad)
453 *bestbot = botfld_mc;
457 *bestbot = topfld_mc;
462 * field picture motion estimation subroutine
464 * toporg: address of original top reference field
465 * topref: address of reconstructed top reference field
466 * botorg: address of original bottom reference field
467 * botref: address of reconstructed bottom reference field
468 * ssmmb: macroblock to be matched
469 * i,j: location of mb (=center of search window)
470 * sx,sy: half width/height of search window
472 * bestfld: location and distance of best field prediction
473 * best8u: location of best 16x8 pred. for upper half of mb
474 * best8lp: location of best 16x8 pred. for lower half of mb
475 * bdestsp: location and distance of best same parity field
476 * prediction (needed for dual prime, only valid if
480 static void field_estimate (motion_engine_t *engine,
481 pict_data_s *picture,
486 subsampled_mb_s *ssmb,
487 int i, int j, int sx, int sy, int ipflag,
488 mb_motion_s *bestfld,
494 mb_motion_s topfld_mc;
495 mb_motion_s botfld_mc;
498 subsampled_mb_s botssmb;
500 botssmb.mb = ssmb->mb+width;
501 botssmb.umb = ssmb->umb+(width>>1);
502 botssmb.vmb = ssmb->vmb+(width>>1);
503 botssmb.fmb = ssmb->fmb+(width>>1);
504 botssmb.qmb = ssmb->qmb+(width>>2);
506 /* if ipflag is set, predict from field of opposite parity only */
507 notop = ipflag && (picture->pict_struct==TOP_FIELD);
508 nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD);
510 /* field prediction */
512 /* predict current field from top field */
514 topfld_mc.sad = dt = 65536; /* infinity */
516 fullsearch(engine, toporg,topref,ssmb,width<<1,
517 i,j,sx,sy>>1,16,width,height>>1,
520 /* predict current field from bottom field */
522 botfld_mc.sad = db = 65536; /* infinity */
524 fullsearch(engine, botorg,botref,ssmb,width<<1,
525 i,j,sx,sy>>1,16,width,height>>1,
528 /* Set correct field selectors */
529 topfld_mc.fieldsel = 0;
530 botfld_mc.fieldsel = 1;
531 topfld_mc.fieldoff = 0;
532 botfld_mc.fieldoff = width;
534 /* same parity prediction (only valid if ipflag==0) */
535 if (picture->pict_struct==TOP_FIELD)
544 /* select field prediction */
547 *bestfld = topfld_mc;
551 *bestfld = botfld_mc;
555 /* 16x8 motion compensation */
557 /* predict upper half field from top field */
559 topfld_mc.sad = dt = 65536;
561 fullsearch(engine, toporg,topref,ssmb,width<<1,
562 i,j,sx,sy>>1,8,width,height>>1,
565 /* predict upper half field from bottom field */
567 botfld_mc.sad = db = 65536;
569 fullsearch(engine, botorg,botref,ssmb,width<<1,
570 i,j,sx,sy>>1,8,width,height>>1,
574 /* Set correct field selectors */
575 topfld_mc.fieldsel = 0;
576 botfld_mc.fieldsel = 1;
577 topfld_mc.fieldoff = 0;
578 botfld_mc.fieldoff = width;
580 /* select prediction for upper half field */
590 /* predict lower half field from top field */
592 N.b. For interlaced data width<<4 (width*16) takes us 8 rows
593 down in the same field.
594 Thus for the fast motion data (2*2
595 sub-sampled) we need to go 4 rows down in the same field.
596 This requires adding width*4 = (width<<2).
597 For the 4*4 sub-sampled motion data we need to go down 2 rows.
598 This requires adding width = width
602 topfld_mc.sad = dt = 65536;
604 fullsearch(engine, toporg,topref,&botssmb,
606 i,j+8,sx,sy>>1,8,width,height>>1,
607 /* &imint,&jmint, &dt,*/ &topfld_mc);
609 /* predict lower half field from bottom field */
611 botfld_mc.sad = db = 65536;
613 fullsearch(engine, botorg,botref,&botssmb,width<<1,
614 i,j+8,sx,sy>>1,8,width,height>>1,
615 /* &iminb,&jminb, &db,*/ &botfld_mc);
617 /* Set correct field selectors */
618 topfld_mc.fieldsel = 0;
619 botfld_mc.fieldsel = 1;
620 topfld_mc.fieldoff = 0;
621 botfld_mc.fieldoff = width;
623 /* select prediction for lower half field */
634 static void dpframe_estimate (motion_engine_t *engine,
635 pict_data_s *picture,
637 subsampled_mb_s *ssmb,
639 int i, int j, int iminf[2][2], int jminf[2][2],
640 mb_motion_s *best_mc,
641 int *imindmvp, int *jmindmvp,
644 int pref,ppred,delta_x,delta_y;
645 int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0;
646 int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv;
649 /* Calculate Dual Prime distortions for 9 delta candidates
650 * for each of the four minimum field vectors
651 * Note: only for P pictures!
654 /* initialize minimum dual prime distortion to large value */
657 for (pref=0; pref<2; pref++)
659 for (ppred=0; ppred<2; ppred++)
661 /* convert Cartesian absolute to relative motion vector
662 * values (wrt current macroblock address (i,j)
664 is = iminf[pref][ppred] - (i<<1);
665 js = jminf[pref][ppred] - (j<<1);
669 /* vertical field shift adjustment */
675 /* mvxs and mvys scaling*/
678 if (picture->topfirst == ppred)
680 /* second field: scale by 1/3 */
681 is = (is>=0) ? (is+1)/3 : -((-is+1)/3);
682 js = (js>=0) ? (js+1)/3 : -((-js+1)/3);
688 /* vector for prediction from field of opposite 'parity' */
689 if (picture->topfirst)
691 /* vector for prediction of top field from bottom field */
692 it0 = ((is+(is>0))>>1);
693 jt0 = ((js+(js>0))>>1) - 1;
695 /* vector for prediction of bottom field from top field */
696 ib0 = ((3*is+(is>0))>>1);
697 jb0 = ((3*js+(js>0))>>1) + 1;
701 /* vector for prediction of top field from bottom field */
702 it0 = ((3*is+(is>0))>>1);
703 jt0 = ((3*js+(js>0))>>1) - 1;
705 /* vector for prediction of bottom field from top field */
706 ib0 = ((is+(is>0))>>1);
707 jb0 = ((js+(js>0))>>1) + 1;
710 /* convert back to absolute half-pel field picture coordinates */
718 if (is >= 0 && is <= (width-16)<<1 &&
719 js >= 0 && js <= (height-16))
721 for (delta_y=-1; delta_y<=1; delta_y++)
723 for (delta_x=-1; delta_x<=1; delta_x++)
725 /* opposite field coordinates */
731 if (it >= 0 && it <= (width-16)<<1 &&
732 jt >= 0 && jt <= (height-16) &&
733 ib >= 0 && ib <= (width-16)<<1 &&
734 jb >= 0 && jb <= (height-16))
736 /* compute prediction error */
737 local_dist = (*pbdist2)(
738 ref + (is>>1) + (width<<1)*(js>>1),
739 ref + width + (it>>1) + (width<<1)*(jt>>1),
740 ssmb->mb, /* current mb location */
741 width<<1, /* adjacent line distance */
742 is&1, js&1, it&1, jt&1, /* half-pel flags */
743 8); /* block height */
744 local_dist += (*pbdist2)(
745 ref + width + (is>>1) + (width<<1)*(js>>1),
746 ref + (ib>>1) + (width<<1)*(jb>>1),
747 ssmb->mb + width, /* current mb location */
748 width<<1, /* adjacent line distance */
749 is&1, js&1, ib&1, jb&1, /* half-pel flags */
750 8); /* block height */
752 /* update delta with least distortion vector */
753 if (local_dist < vmc)
766 } /* end delta x loop */
767 } /* end delta y loop */
772 /* TODO: This is now likely to be obsolete... */
773 /* Compute L1 error for decision purposes */
774 local_dist = (*pbdist1)(
775 ref + (imins>>1) + (width<<1)*(jmins>>1),
776 ref + width + (imint>>1) + (width<<1)*(jmint>>1),
779 imins&1, jmins&1, imint&1, jmint&1,
781 //printf("motion 1 %p\n", pbdist1);
782 local_dist += (*pbdist1)(
783 ref + width + (imins>>1) + (width<<1)*(jmins>>1),
784 ref + (iminb>>1) + (width<<1)*(jminb>>1),
787 imins&1, jmins&1, iminb&1, jminb&1,
789 //printf("motion 2\n");
791 best_mc->sad = local_dist;
792 best_mc->pos.x = imins;
793 best_mc->pos.y = jmins;
797 //printf("motion 2\n");
800 static void dpfield_estimate(motion_engine_t *engine,
801 pict_data_s *picture,
805 int i, int j, int imins, int jmins,
806 mb_motion_s *bestdp_mc,
811 uint8_t *sameref, *oppref;
812 int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0;
813 int imino,jmino,imindmv,jmindmv,vmc_dp,local_dist;
815 /* Calculate Dual Prime distortions for 9 delta candidates */
816 /* Note: only for P pictures! */
818 /* Assign opposite and same reference pointer */
819 if (picture->pict_struct==TOP_FIELD)
830 /* convert Cartesian absolute to relative motion vector
831 * values (wrt current macroblock address (i,j)
833 mvxs = imins - (i<<1);
834 mvys = jmins - (j<<1);
836 /* vector for prediction from field of opposite 'parity' */
837 mvxo0 = (mvxs+(mvxs>0)) >> 1; /* mvxs / / */
838 mvyo0 = (mvys+(mvys>0)) >> 1; /* mvys / / 2*/
840 /* vertical field shift correction */
841 if (picture->pict_struct==TOP_FIELD)
846 /* convert back to absolute coordinates */
847 io0 = mvxo0 + (i<<1);
848 jo0 = mvyo0 + (j<<1);
850 /* initialize minimum dual prime distortion to large value */
853 for (delta_y = -1; delta_y <= 1; delta_y++)
855 for (delta_x = -1; delta_x <=1; delta_x++)
857 /* opposite field coordinates */
861 if (io >= 0 && io <= (width-16)<<1 &&
862 jo >= 0 && jo <= (height2-16)<<1)
864 /* compute prediction error */
865 local_dist = (*pbdist2)(
866 sameref + (imins>>1) + width2*(jmins>>1),
867 oppref + (io>>1) + width2*(jo>>1),
868 mb, /* current mb location */
869 width2, /* adjacent line distance */
870 imins&1, jmins&1, io&1, jo&1, /* half-pel flags */
871 16); /* block height */
873 /* update delta with least distortion vector */
874 if (local_dist < vmc_dp)
883 } /* end delta x loop */
884 } /* end delta y loop */
886 /* Compute L1 error for decision purposes */
889 sameref + (imins>>1) + width2*(jmins>>1),
890 oppref + (imino>>1) + width2*(jmino>>1),
891 mb, /* current mb location */
892 width2, /* adjacent line distance */
893 imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */
894 16); /* block height */
896 bestdp_mc->pos.x = imindmv;
897 bestdp_mc->pos.x = imindmv;
903 * Vectors of motion compensations
907 Take a vector of motion compensations and repeatedly make passes
908 discarding all elements whose sad "weight" is above the current mean weight.
911 static void sub_mean_reduction( mc_result_s *matches, int len,
913 int *newlen_res, int *minweight_res)
918 int min_weight = 100000;
921 *minweight_res = 100000;
929 for( i = 0; i < len ; ++i )
930 weight_sum += matches[i].weight;
931 mean_weight = weight_sum / len;
937 for( i =0; i < len; ++i )
939 if( matches[i].weight <= mean_weight )
943 min_weight = matches[i].weight ;
945 matches[j] = matches[i];
953 *minweight_res = mean_weight;
957 Build a vector of the top 4*4 sub-sampled motion compensations in the box
958 (ilow,jlow) to (ihigh,jhigh).
960 The algorithm is as follows:
961 1. coarse matches on an 8*8 grid of positions are collected that fall below
962 a (very conservative) sad threshold (basically 50% more than moving average of
963 the mean sad of such matches).
965 2. The worse than-average matches are discarded.
967 3. The remaining coarse matches are expanded with the left/lower neighbouring
968 4*4 grid matches. Again only those better than a threshold (this time the mean
969 of the 8*8 grid matches are retained.
971 4. Multiple passes are made discarding worse than-average matches. The number of
972 passes is specified by the user. The default it 2 (leaving roughly 1/4 of the matches).
974 The net result is very fast and find good matches if they're to be found. I.e. the
975 penalty over exhaustive search is pretty low.
977 NOTE: The "discard below average" trick depends critically on having some variation in
978 the matches. The slight penalty imposed for distant matches (reasonably since the
979 motion vectors have to be encoded) is *vital* as otherwise pathologically bad
980 performance results on highly uniform images.
982 TODO: We should probably allow the user to eliminate the initial thinning of 8*8
983 grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
988 static int build_sub44_mcomps(motion_engine_t *engine,
989 int ilow, int jlow, int ihigh, int jhigh,
992 uint8_t *s44org, uint8_t *s44blk, int qlx, int qh )
1004 /*int rangex, rangey;
1005 static int rough_num_mcomps;
1006 static mc_result_s rough_mcomps[MAX_44_MATCHES];
1012 uint8_t *old_s44orgblk;
1014 /* N.b. we may ignore the right hand block of the pair going over the
1015 right edge as we have carefully allocated the buffer oversize to ensure
1016 no memory faults. The later motion compensation calculations
1017 performed on the results of this pass will filter out
1018 out-of-range blocks...
1022 engine->sub44_num_mcomps = 0;
1024 threshold = 6*null_mc_sad / (4*4*mc_44_red);
1025 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1027 /* Exhaustive search on 4*4 sub-sampled data. This is affordable because
1028 (a) it is only 16th of the size of the real 1-pel data
1029 (b) we ignore those matches with an sad above our threshold.
1033 /* Invariant: s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
1034 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1035 for( j = jstrt; j <= jend; j += 4 )
1037 old_s44orgblk = s44orgblk;
1038 for( i = istrt; i <= iend; i += 4 )
1040 s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0);
1041 if( s1 < threshold )
1043 engine->sub44_mcomps[engine->sub44_num_mcomps].x = i;
1044 engine->sub44_mcomps[engine->sub44_num_mcomps].y = j;
1045 engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ;
1046 ++engine->sub44_num_mcomps;
1050 s44orgblk = old_s44orgblk + qlx;
1055 engine->sub44_num_mcomps
1056 = (*pmblock_sub44_dists)( s44orgblk, s44blk,
1061 engine->sub44_mcomps);
1064 /* If we're really pushing quality we reduce once otherwise twice. */
1066 sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1),
1067 &engine->sub44_num_mcomps, &mean_weight);
1070 return engine->sub44_num_mcomps;
1074 /* Build a vector of the best 2*2 sub-sampled motion
1075 compensations using the best 4*4 matches as starting points. As
1076 with with the 4*4 matches We don't collect them densely as they're
1077 just search starting points for 1-pel search and ones that are 1 out
1078 should still give better than average matches...
1083 static int build_sub22_mcomps(motion_engine_t *engine,
1084 int i0, int j0, int ihigh, int jhigh,
1086 uint8_t *s22org, uint8_t *s22blk,
1087 int flx, int fh, int searched_sub44_size )
1090 int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1093 int ilim = ihigh-i0;
1094 int jlim = jhigh-j0;
1098 engine->sub22_num_mcomps = 0;
1099 for( k = 0; k < searched_sub44_size; ++k )
1102 matchrec.x = engine->sub44_mcomps[k].x;
1103 matchrec.y = engine->sub44_mcomps[k].y;
1105 s22orgblk = s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1106 for( i = 0; i < 4; ++i )
1108 if( matchrec.x <= ilim && matchrec.y <= jlim )
1110 s = (*pdist22)( s22orgblk,s22blk,flx,fh);
1113 engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1114 engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1115 engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1116 ++engine->sub22_num_mcomps;
1137 sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1138 &engine->sub22_num_mcomps, &min_weight );
1139 return engine->sub22_num_mcomps;
1143 int build_sub22_mcomps_mmxe(motion_engine_t *engine,
1144 int i0, int j0, int ihigh, int jhigh,
1146 uint8_t *s22org, uint8_t *s22blk,
1147 int flx, int fh, int searched_sub44_size )
1150 int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1153 int ilim = ihigh-i0;
1154 int jlim = jhigh-j0;
1159 /* TODO: The calculation of the lstrow offset really belongs in
1161 int lstrow=(fh-1)*flx;
1163 engine->sub22_num_mcomps = 0;
1164 for( k = 0; k < searched_sub44_size; ++k )
1167 matchrec.x = engine->sub44_mcomps[k].x;
1168 matchrec.y = engine->sub44_mcomps[k].y;
1170 s22orgblk = s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1171 mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec);
1172 for( i = 0; i < 4; ++i )
1174 if( matchrec.x <= ilim && matchrec.y <= jlim )
1179 engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1180 engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1181 engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1182 ++engine->sub22_num_mcomps;
1200 sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1201 &engine->sub22_num_mcomps, &min_weight );
1202 return engine->sub22_num_mcomps;
1208 Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
1209 TODO: Its a bit silly to cart around absolute M/C co-ordinates that
1210 eventually get turned into relative ones anyway...
1214 static void find_best_one_pel(motion_engine_t *engine,
1215 uint8_t *org, uint8_t *blk,
1227 blockxy minpos = res->pos;
1235 init_search = searched_size;
1236 init_size = engine->sub22_num_mcomps;
1237 for( k = 0; k < searched_size; ++k )
1239 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1240 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1241 orgblk = org + matchrec.x+lx*matchrec.y;
1242 penalty = abs(matchrec.x)+abs(matchrec.y);
1244 for( i = 0; i < 4; ++i )
1246 if( matchrec.x <= xmax && matchrec.y <= ymax )
1249 d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin);
1271 res->blk = org + minpos.x+lx*minpos.y;
1277 void find_best_one_pel_mmxe(motion_engine_t *engine,
1278 uint8_t *org, uint8_t *blk,
1290 blockxy minpos = res->pos;
1300 init_search = searched_size;
1301 init_size = engine->sub22_num_mcomps;
1302 for( k = 0; k < searched_size; ++k )
1304 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1305 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1306 orgblk = org + matchrec.x+lx*matchrec.y;
1307 penalty = abs(matchrec.x)+abs(matchrec.y);
1310 mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec);
1312 for( i = 0; i < 4; ++i )
1314 if( matchrec.x <= xmax && matchrec.y <= ymax )
1317 d = penalty+resvec[i];
1339 res->blk = org + minpos.x+lx*minpos.y;
1346 * full search block matching
1348 * A.Stevens 2000: This is now a big misnomer. The search is now a hierarchical/sub-sampling
1349 * search not a full search. However, experiments have shown it is always close to
1350 * optimal and almost always very close or optimal.
1352 * blk: top left pel of (16*h) block
1353 * s22blk: top element of fast motion compensation block corresponding to blk
1354 * h: height of block
1355 * lx: distance (in bytes) of vertically adjacent pels in ref,blk
1356 * org: top left pel of source reference picture
1357 * ref: top left pel of reconstructed reference picture
1358 * i0,j0: center of search window
1359 * sx,sy: half widths of search window
1360 * xmax,ymax: right/bottom limits of search area
1361 * iminp,jminp: pointers to where the result is stored
1362 * result is given as half pel offset from ref(0,0)
1363 * i.e. NOT relative to (i0,j0)
1367 static void fullsearch(motion_engine_t *engine,
1370 subsampled_mb_s *ssblk,
1371 int lx, int i0, int j0,
1372 int sx, int sy, int h,
1374 /* int *iminp, int *jminp, int *sadminp, */
1379 /* int imin, jmin, dmin */
1380 int i,j,ilow,ihigh,jlow,jhigh;
1383 /* NOTE: Surprisingly, the initial motion compensation search
1384 works better when the original image not the reference (reconstructed)
1387 uint8_t *s22org = (uint8_t*)(org+fsubsample_offset);
1388 uint8_t *s44org = (uint8_t*)(org+qsubsample_offset);
1395 /* xmax and ymax into more useful form... */
1400 /* The search radii are *always* multiples of 4 to avoid messiness
1401 in the initial 4*4 pel search. This is handled by the
1402 parameter checking/processing code in readparmfile() */
1404 /* Create a distance-order mcomps of possible motion compensations
1405 based on the fast estimation data - 4*4 pel sums (4*4
1406 sub-sampled) rather than actual pel's. 1/16 the size... */
1408 jlow = jlow < 0 ? 0 : jlow;
1410 jhigh = jhigh > ymax ? ymax : jhigh;
1412 ilow = ilow < 0 ? 0 : ilow;
1414 ihigh = ihigh > xmax ? xmax : ihigh;
1417 Very rarely this may fail to find matchs due to all the good
1418 looking ones being over threshold. hence we make sure we
1419 fall back to a 0 motion compensation in this case.
1421 The sad for the 0 motion compensation is also very useful as
1422 a basis for setting thresholds for rejecting really dud 4*4
1423 and 2*2 sub-sampled matches.
1425 memset(&best, 0, sizeof(best));
1426 best.sad = (*pdist1_00)(ref + i0 + j0 * lx,
1434 engine->sub44_num_mcomps = build_sub44_mcomps(engine,
1435 ilow, jlow, ihigh, jhigh,
1439 ssblk->qmb, qlx, qh );
1442 /* Now create a distance-ordered mcomps of possible motion
1443 compensations based on the fast estimation data - 2*2 pel sums
1444 using the best fraction of the 4*4 estimates However we cover
1445 only coarsely... on 4-pel boundaries... */
1447 engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh, jhigh,
1449 s22org, ssblk->fmb, flx, fh,
1450 engine->sub44_num_mcomps );
1453 /* Now choose best 1-pel match from what approximates (not exact
1454 due to the pre-processing trick with the mean) the top 1/2 of
1459 (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps,
1461 ilow, jlow, xmax, ymax,
1464 /* Final polish: half-pel search of best candidate against
1465 reconstructed image.
1473 ilow = best.pos.x - (best.pos.x>(ilow<<1));
1474 ihigh = best.pos.x + (best.pos.x<((ihigh)<<1));
1475 jlow = best.pos.y - (best.pos.y>(jlow<<1));
1476 jhigh = best.pos.y+ (best.pos.y<((jhigh)<<1));
1478 for (j=jlow; j<=jhigh; j++)
1480 for (i=ilow; i<=ihigh; i++)
1482 orgblk = ref+(i>>1)+((j>>1)*lx);
1486 d = (*pdist1_11)(orgblk,ssblk->mb,lx,h);
1488 d = (*pdist1_01)(orgblk,ssblk->mb,lx,h);
1493 d = (*pdist1_10)(orgblk,ssblk->mb,lx,h);
1495 d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad);
1508 best.var = (*pdist2)(best.blk, ssblk->mb, lx, best.hx, best.hy, h);
1513 * total absolute difference between two (16*h) blocks
1514 * including optional half pel interpolation of blk1 (hx,hy)
1515 * blk1,blk2: addresses of top left pels of both blocks
1516 * lx: distance (in bytes) of vertically adjacent pels
1517 * hx,hy: flags for horizontal and/or vertical interpolation
1518 * h: height of block (usually 8 or 16)
1519 * distlim: bail out if sum exceeds this value
1522 /* A.Stevens 2000: New version for highly pipelined CPUs where branching is
1523 costly. Really it sucks that C doesn't define a stdlib abs that could
1524 be realised as a compiler intrinsic using appropriate CPU instructions.
1525 That 1970's heritage...
1529 static int dist1_00(uint8_t *blk1,uint8_t *blk2,
1530 int lx, int h,int distlim)
1543 #define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
1544 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1545 pipestep(4); pipestep(5); pipestep(6); pipestep(7);
1546 pipestep(8); pipestep(9); pipestep(10); pipestep(11);
1547 pipestep(12); pipestep(13); pipestep(14); pipestep(15);
1559 static int dist1_01(uint8_t *blk1,uint8_t *blk2,
1572 for (i=0; i<16; i++)
1575 v = ((unsigned int)(p1[i]+p1[i+1])>>1) - p2[i];
1577 v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
1587 static int dist1_10(uint8_t *blk1,uint8_t *blk2,
1590 uint8_t *p1,*p1a,*p2;
1601 for (i=0; i<16; i++)
1603 v = ((unsigned int)(p1[i]+p1a[i])>>1) - p2[i];
1614 static int dist1_11(uint8_t *blk1,uint8_t *blk2,
1617 uint8_t *p1,*p1a,*p2;
1629 for (i=0; i<16; i++)
1631 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1])>>2) - p2[i];
1641 /* USED only during debugging...
1642 void check_fast_motion_data(uint8_t *blk, char *label )
1647 uint8_t *start_s22blk, *start_s44blk;
1649 unsigned int mismatch;
1651 start_s22blk = (blk+height*width);
1652 start_s44blk = (blk+height*width+(height>>1)*(width>>1));
1654 if (pict_struct==FRAME_PICTURE)
1656 nextfieldline = width;
1660 nextfieldline = 2*width;
1669 for( i = 0; i < nextfieldline/2; ++i )
1671 mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
1679 printf( "Mismatch detected check %s for buffer %08x\n", label, blk );
1686 Append fast motion estimation data to original luminance
1687 data. N.b. memory allocation for luminance data allows space
1688 for this information...
1691 void fast_motion_data(uint8_t *blk, int picture_struct )
1696 uint8_t *start_s22blk, *start_s44blk;
1697 uint16_t *start_rowblk, *start_colblk;
1700 #ifdef TEST_RCSEARCH
1701 uint16_t *pc, *pr,*p;
1704 int down16 = width*16;
1706 uint16_t rowsums[2048];
1707 uint16_t colsums[2048]; /* TODO: BUG: should resize with width */
1711 /* In an interlaced field the "next" line is 2 width's down
1712 rather than 1 width down */
1714 if (picture_struct==FRAME_PICTURE)
1716 nextfieldline = width;
1720 nextfieldline = 2*width;
1723 start_s22blk = blk+fsubsample_offset;
1724 start_s44blk = blk+qsubsample_offset;
1725 start_rowblk = (uint16_t *)blk+rowsums_offset;
1726 start_colblk = (uint16_t *)blk+colsums_offset;
1728 nb = (blk+nextfieldline);
1729 /* Sneaky stuff here... we can do lines in both fields at once */
1730 pb = (uint8_t *) start_s22blk;
1732 while( nb < start_s22blk )
1734 for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
1736 /* TODO: A.Stevens this has to be the most word-length dependent
1737 code in the world. Better than MMX assembler though I guess... */
1738 pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1739 pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1745 nb = b + nextfieldline;
1749 /* Now create the 4*4 sub-sampled data from the 2*2
1750 N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
1751 original. Albeit half as many lines and pixels...
1754 nextfieldline = nextfieldline >> 1;
1758 nb = (start_s22blk+nextfieldline);
1760 while( nb < start_s44blk )
1762 for( i = 0; i < nextfieldline/4; ++i )
1764 /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */
1765 qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1766 qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1772 nb = b + nextfieldline;
1775 #ifdef TEST_RCSEARCH
1776 /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
1779 Initial row sums....
1782 for(j = 0; j < height; ++j )
1785 for( i = 0; i < 16; ++ i )
1789 rowsums[j] = rowsum;
1796 for( i = 0; i < width; ++i )
1801 for( j = 0; j < 16; ++j )
1803 for( i = 0; i < width; ++i )
1810 /* Now fill in the row/column sum tables...
1811 Note: to allow efficient construction of sum/col differences for a
1812 given position row sums are held in a *column major* array
1813 (changing y co-ordinate makes for small index changes)
1814 the col sums are held in a *row major* array
1819 for(j = 0; j <32; ++j )
1822 rowsum = rowsums[j];
1823 for( i = 0; i < width-16; ++i )
1827 colsums[i] = (colsums[i] + pb[down16] )-pb[0];
1828 rowsum = (rowsum + pb[16]) - pb[0];
1832 pb += 16; /* move pb on to next row... rememember we only did width-16! */
1839 static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh)
1841 uint8_t *p1 = s22blk1;
1842 uint8_t *p2 = s22blk2;
1846 for( j = 0; j < fh; ++j )
1849 #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
1850 pipestep(0); pipestep(1);
1851 pipestep(2); pipestep(3);
1852 pipestep(4); pipestep(5);
1853 pipestep(6); pipestep(7);
1865 Sum absolute differences for 4*4 sub-sampled data.
1867 TODO: currently assumes only 16*16 or 16*8 motion compensation will be used...
1868 I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
1872 static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh)
1874 register uint8_t *p1 = s44blk1;
1875 register uint8_t *p2 = s44blk2;
1879 /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
1880 #define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
1881 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1884 p1 += qlx; p2 += qlx;
1885 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1888 p1 += qlx; p2 += qlx;
1889 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1890 p1 += qlx; p2 += qlx;
1891 pipestep(0); pipestep(1); pipestep(2); pipestep(3);
1900 * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
1901 * blk1,blk2: addresses of top left pels of both blocks
1902 * lx: distance (in bytes) of vertically adjacent pels
1903 * h: height of block (usually 8 or 16)
1906 static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h)
1908 uint8_t *p1 = blk1, *p2 = blk2;
1924 /* total squared difference between bidirection prediction of (8*h)
1925 * blocks of 2*2 sub-sampled pels and reference
1926 * blk1f, blk1b,blk2: addresses of top left
1928 * lx: distance (in bytes) of vertically adjacent
1930 * h: height of block (usually 4 or 8)
1933 static int bdist2_22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2,
1936 uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
1943 v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
1954 * total squared difference between two (16*h) blocks
1955 * including optional half pel interpolation of blk1 (hx,hy)
1956 * blk1,blk2: addresses of top left pels of both blocks
1957 * lx: distance (in bytes) of vertically adjacent pels
1958 * hx,hy: flags for horizontal and/or vertical interpolation
1959 * h: height of block (usually 8 or 16)
1963 static int dist2(blk1,blk2,lx,hx,hy,h)
1964 uint8_t *blk1,*blk2;
1967 uint8_t *p1,*p1a,*p2;
1977 for (i=0; i<16; i++)
1988 for (i=0; i<16; i++)
1990 v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
2001 for (i=0; i<16; i++)
2003 v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
2011 else /* if (hx && hy) */
2016 for (i=0; i<16; i++)
2018 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
2032 * absolute difference error between a (16*h) block and a bidirectional
2035 * p2: address of top left pel of block
2036 * pf,hxf,hyf: address and half pel flags of forward ref. block
2037 * pb,hxb,hyb: address and half pel flags of backward ref. block
2038 * h: height of block
2039 * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2043 static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2044 uint8_t *pf,*pb,*p2;
2045 int lx,hxf,hyf,hxb,hyb,h;
2047 uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2063 for (i=0; i<16; i++)
2065 v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2066 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2085 * squared error between a (16*h) block and a bidirectional
2088 * p2: address of top left pel of block
2089 * pf,hxf,hyf: address and half pel flags of forward ref. block
2090 * pb,hxb,hyb: address and half pel flags of backward ref. block
2091 * h: height of block
2092 * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2096 static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2097 uint8_t *pf,*pb,*p2;
2098 int lx,hxf,hyf,hxb,hyb,h;
2100 uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2116 for (i=0; i<16; i++)
2118 v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2119 ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2139 * variance of a (size*size) block, multiplied by 256
2140 * p: address of top left pel of block
2141 * lx: distance (in bytes) of vertically adjacent pels
2143 static int variance(uint8_t *p, int size, int lx)
2146 unsigned int v,s,s2;
2150 for (j=0; j<size; j++)
2152 for (i=0; i<size; i++)
2160 return s2 - (s*s)/(size*size);
2164 Compute the variance of the residual of uni-directionally motion
2168 static int unidir_pred_var( const mb_motion_s *motion,
2173 return (*pdist2)(motion->blk, mb, lx, motion->hx, motion->hy, h);
2178 Compute the variance of the residual of bi-directionally motion
2182 static int bidir_pred_var( const mb_motion_s *motion_f,
2183 const mb_motion_s *motion_b,
2187 return (*pbdist2)( motion_f->blk, motion_b->blk,
2189 motion_f->hx, motion_f->hy,
2190 motion_b->hx, motion_b->hy,
2195 Compute SAD for bi-directionally motion compensated blocks...
2198 static int bidir_pred_sad( const mb_motion_s *motion_f,
2199 const mb_motion_s *motion_b,
2203 return (*pbdist1)(motion_f->blk, motion_b->blk,
2205 motion_f->hx, motion_f->hy,
2206 motion_b->hx, motion_b->hy,
2210 static void frame_ME(motion_engine_t *engine,
2211 pict_data_s *picture,
2217 mb_motion_s framef_mc;
2218 mb_motion_s frameb_mc;
2219 mb_motion_s dualpf_mc;
2220 mb_motion_s topfldf_mc;
2221 mb_motion_s botfldf_mc;
2222 mb_motion_s topfldb_mc;
2223 mb_motion_s botfldb_mc;
2226 int vmc,vmcf,vmcr,vmci;
2227 int vmcfieldf,vmcfieldr,vmcfieldi;
2228 subsampled_mb_s ssmb;
2229 int imins[2][2],jmins[2][2];
2230 int imindmv,jmindmv,vmc_dp;
2231 //printf("frame_ME 1\n");
2234 /* A.Stevens fast motion estimation data is appended to actual
2235 luminance information
2237 ssmb.mb = mc->cur[0] + mb_row_start + i;
2238 ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2));
2239 ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2));
2240 ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset +
2241 ((i>>1) + (mb_row_start>>2)));
2242 ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset +
2243 (i>>2) + (mb_row_start>>4));
2245 /* Compute variance MB as a measure of Intra-coding complexity
2246 We include chrominance information here, scaled to compensate
2247 for sub-sampling. Silly MPEG forcing chrom/lum to have same
2250 var = variance(ssmb.mb,16,width);
2252 //printf("motion %d\n", picture->pict_type);
2253 if (picture->pict_type==I_TYPE)
2255 mbi->mb_type = MB_INTRA;
2257 else if (picture->pict_type==P_TYPE)
2259 /* For P pictures we take into account chrominance. This
2260 provides much better performance at scene changes */
2261 var += chrom_var_sum(&ssmb,16,width);
2263 if (picture->frame_pred_dct)
2265 fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2266 width,i,j,mc->sxf,mc->syf,16,width,height,
2268 framef_mc.fieldoff = 0;
2269 vmc = framef_mc.var +
2270 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2271 mbi->motion_type = MC_FRAME;
2275 //printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
2276 frame_estimate(engine, mc->oldorg[0],
2288 //printf("frame_ME 3\n");
2289 vmcf = framef_mc.var +
2290 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2293 unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) +
2295 unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 );
2296 /* DEBUG DP currently disabled... */
2299 // dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
2300 // i,j>>1,imins,jmins,
2302 // &imindmv,&jmindmv, &vmc_dp);
2305 /* NOTE: Typically M =3 so DP actually disabled... */
2306 /* select between dual prime, frame and field prediction */
2307 // if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
2309 // mbi->motion_type = MC_DMV;
2310 // /* No chrominance squared difference measure yet.
2311 // Assume identical to luminance */
2312 // vmc = vmc_dp + vmc_dp;
2315 if ( vmcf < vmcfieldf)
2317 mbi->motion_type = MC_FRAME;
2323 mbi->motion_type = MC_FIELD;
2329 /* select between intra or non-intra coding:
2331 * selection is based on intra block variance (var) vs.
2332 * prediction error variance (vmc)
2334 * Used to be: blocks with small prediction error are always
2335 * coded non-intra even if variance is smaller (is this reasonable?
2337 * TODO: A.Stevens Jul 2000
2338 * The bbmpeg guys have found this to be *unreasonable*.
2339 * I'm not sure I buy their solution using vmc*2. It is probabably
2340 * the vmc>= 9*256 test that is suspect.
2345 if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ )
2347 mbi->mb_type = MB_INTRA;
2353 /* select between MC / No-MC
2355 * use No-MC if var(No-MC) <= 1.25*var(MC)
2356 * (i.e slightly biased towards No-MC)
2358 * blocks with small prediction error are always coded as No-MC
2359 * (requires no motion vectors, allows skipping)
2361 v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16);
2367 mbi->mb_type = MB_FORWARD;
2368 if (mbi->motion_type==MC_FRAME)
2370 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2371 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2373 else if (mbi->motion_type==MC_DMV)
2375 /* these are FRAME vectors */
2376 /* same parity vector */
2377 mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1);
2378 mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1);
2380 /* opposite parity vector */
2381 mbi->dmvector[0] = imindmv;
2382 mbi->dmvector[1] = jmindmv;
2386 /* these are FRAME vectors */
2387 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2388 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2389 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2390 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2391 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2392 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2401 mbi->motion_type = MC_FRAME;
2402 mbi->MV[0][0][0] = 0;
2403 mbi->MV[0][0][1] = 0;
2407 else /* if (pict_type==B_TYPE) */
2409 if (picture->frame_pred_dct)
2411 var = variance(ssmb.mb,16,width);
2413 fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2414 width,i,j,mc->sxf,mc->syf,
2418 framef_mc.fieldoff = 0;
2419 vmcf = framef_mc.var;
2422 fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb,
2423 width,i,j,mc->sxb,mc->syb,
2426 frameb_mc.fieldoff = 0;
2427 vmcr = frameb_mc.var;
2429 /* interpolated (bidirectional) */
2431 vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2435 /* select between forward/backward/interpolated prediction:
2436 * use the one with smallest mean sqaured prediction error
2438 if (vmcf<=vmcr && vmcf<=vmci)
2441 mbi->mb_type = MB_FORWARD;
2443 else if (vmcr<=vmci)
2446 mbi->mb_type = MB_BACKWARD;
2451 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2454 mbi->motion_type = MC_FRAME;
2458 /* forward prediction */
2459 frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2460 i,j,mc->sxf,mc->syf,
2467 /* backward prediction */
2468 frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb,
2469 i,j,mc->sxb,mc->syb,
2475 vmcf = framef_mc.var;
2476 vmcr = frameb_mc.var;
2477 vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2479 vmcfieldf = topfldf_mc.var + botfldf_mc.var;
2480 vmcfieldr = topfldb_mc.var + botfldb_mc.var;
2481 vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb,
2483 bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb,
2486 /* select prediction type of minimum distance from the
2487 * six candidates (field/frame * forward/backward/interpolated)
2489 if (vmci<vmcfieldi && vmci<vmcf && vmci<vmcfieldf
2490 && vmci<vmcr && vmci<vmcfieldr)
2492 /* frame, interpolated */
2493 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2494 mbi->motion_type = MC_FRAME;
2497 else if ( vmcfieldi<vmcf && vmcfieldi<vmcfieldf
2498 && vmcfieldi<vmcr && vmcfieldi<vmcfieldr)
2500 /* field, interpolated */
2501 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2502 mbi->motion_type = MC_FIELD;
2505 else if (vmcf<vmcfieldf && vmcf<vmcr && vmcf<vmcfieldr)
2507 /* frame, forward */
2508 mbi->mb_type = MB_FORWARD;
2509 mbi->motion_type = MC_FRAME;
2513 else if ( vmcfieldf<vmcr && vmcfieldf<vmcfieldr)
2515 /* field, forward */
2516 mbi->mb_type = MB_FORWARD;
2517 mbi->motion_type = MC_FIELD;
2520 else if (vmcr<vmcfieldr)
2522 /* frame, backward */
2523 mbi->mb_type = MB_BACKWARD;
2524 mbi->motion_type = MC_FRAME;
2529 /* field, backward */
2530 mbi->mb_type = MB_BACKWARD;
2531 mbi->motion_type = MC_FIELD;
2537 /* select between intra or non-intra coding:
2539 * selection is based on intra block variance (var) vs.
2540 * prediction error variance (vmc)
2542 * Used to be: blocks with small prediction error are always
2543 * coded non-intra even if variance is smaller (is this reasonable?
2545 * TODO: A.Stevens Jul 2000
2546 * The bbmpeg guys have found this to be *unreasonable*.
2547 * I'm not sure I buy their solution using vmc*2 in the first comparison.
2548 * It is probabably the vmc>= 9*256 test that is suspect.
2551 if (vmc>var && vmc>=9*256)
2552 mbi->mb_type = MB_INTRA;
2556 if (mbi->motion_type==MC_FRAME)
2559 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2560 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2562 mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1);
2563 mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1);
2567 /* these are FRAME vectors */
2569 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2570 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2571 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2572 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2573 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2574 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2576 mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1);
2577 mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1);
2578 mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1);
2579 mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1);
2580 mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel;
2581 mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel;
2590 * motion estimation for field pictures
2592 * mbi: pointer to macroblock info structure
2593 * secondfield: indicates second field of a frame (in P fields this means
2594 * that reference field of opposite parity is in curref instead
2596 * ipflag: indicates a P type field which is the second field of a frame
2597 * in which the first field is I type (this restricts predictions
2598 * to be based only on the opposite parity (=I) field)
2602 * mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
2603 * MV[][][]: motion vectors (field format)
2604 * mv_field_sel: top/bottom field
2605 * motion_type: MC_FIELD, MC_16X8
2607 * uses global vars: pict_type, pict_struct
2609 static void field_ME(motion_engine_t *engine,
2610 pict_data_s *picture,
2615 int secondfield, int ipflag)
2618 uint8_t *toporg, *topref, *botorg, *botref;
2619 subsampled_mb_s ssmb;
2620 mb_motion_s fields_mc, dualp_mc;
2621 mb_motion_s fieldf_mc, fieldb_mc;
2622 mb_motion_s field8uf_mc, field8lf_mc;
2623 mb_motion_s field8ub_mc, field8lb_mc;
2624 int var,vmc,v0,dmc,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i;
2631 /* Fast motion data sits at the end of the luminance buffer */
2632 ssmb.mb = mc->cur[0] + i + w2*j;
2633 ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1));
2634 ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1));
2635 ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1));
2636 ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2);
2638 if (picture->pict_struct==BOTTOM_FIELD)
2641 ssmb.umb += (width >> 1);
2642 ssmb.vmb += (width >> 1);
2643 ssmb.fmb += (width >> 1);
2644 ssmb.qmb += (width >> 2);
2647 var = variance(ssmb.mb,16,w2) +
2648 ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2;
2650 if (picture->pict_type==I_TYPE)
2651 mbi->mb_type = MB_INTRA;
2652 else if (picture->pict_type==P_TYPE)
2654 toporg = mc->oldorg[0];
2655 topref = mc->oldref[0];
2656 botorg = mc->oldorg[0] + width;
2657 botref = mc->oldref[0] + width;
2661 /* opposite parity field is in same frame */
2662 if (picture->pict_struct==TOP_FIELD)
2664 /* current is top field */
2665 botorg = mc->cur[0] + width;
2666 botref = mc->curref[0] + width;
2670 /* current is bottom field */
2671 toporg = mc->cur[0];
2672 topref = mc->curref[0];
2676 field_estimate(engine,
2678 toporg,topref,botorg,botref,&ssmb,
2679 i,j,mc->sxf,mc->syf,ipflag,
2684 dmcfield = fieldf_mc.sad;
2685 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2687 // if (M==1 && !ipflag) /* generic condition which permits Dual Prime */
2689 // dpfield_estimate(engine,
2691 // topref,botref,ssmb.mb,i,j,imins,jmins,
2694 // dmc_dp = dualp_mc.sad;
2697 // /* select between dual prime, field and 16x8 prediction */
2698 // if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
2700 // /* Dual Prime prediction */
2701 // mbi->motion_type = MC_DMV;
2702 // dmc = dualp_mc.sad;
2709 /* 16x8 prediction */
2710 mbi->motion_type = MC_16X8;
2711 /* upper and lower half blocks */
2712 vmc = unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2713 vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2717 /* field prediction */
2718 mbi->motion_type = MC_FIELD;
2719 vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 );
2722 /* select between intra and non-intra coding */
2724 if ( vmc>var && vmc>=9*256)
2725 mbi->mb_type = MB_INTRA;
2728 /* zero MV field prediction from same parity ref. field
2729 * (not allowed if ipflag is set)
2732 v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j,
2734 if (ipflag || (4*v0>5*vmc && v0>=9*256))
2737 mbi->mb_type = MB_FORWARD;
2738 if (mbi->motion_type==MC_FIELD)
2740 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2741 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2742 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2744 else if (mbi->motion_type==MC_DMV)
2746 /* same parity vector */
2747 mbi->MV[0][0][0] = imins - (i<<1);
2748 mbi->MV[0][0][1] = jmins - (j<<1);
2750 /* opposite parity vector */
2751 mbi->dmvector[0] = dualp_mc.pos.x;
2752 mbi->dmvector[1] = dualp_mc.pos.y;
2756 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2757 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2758 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2759 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2760 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2761 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2769 mbi->motion_type = MC_FIELD;
2770 mbi->MV[0][0][0] = 0;
2771 mbi->MV[0][0][1] = 0;
2772 mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD);
2776 else /* if (pict_type==B_TYPE) */
2778 /* forward prediction */
2779 field_estimate(engine,
2781 mc->oldorg[0],mc->oldref[0],
2782 mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb,
2783 i,j,mc->sxf,mc->syf,0,
2788 dmcfieldf = fieldf_mc.sad;
2789 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2791 /* backward prediction */
2792 field_estimate(engine,
2794 mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width,
2796 i,j,mc->sxb,mc->syb,0,
2801 dmcfieldr = fieldb_mc.sad;
2802 dmc8r = field8ub_mc.sad + field8lb_mc.sad;
2804 /* calculate distances for bidirectional prediction */
2806 dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2808 /* 16x8 upper and lower half blocks */
2809 dmc8i = bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 );
2810 dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 );
2812 /* select prediction type of minimum distance */
2813 if (dmcfieldi<dmc8i && dmcfieldi<dmcfieldf && dmcfieldi<dmc8f
2814 && dmcfieldi<dmcfieldr && dmcfieldi<dmc8r)
2816 /* field, interpolated */
2817 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2818 mbi->motion_type = MC_FIELD;
2819 vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2821 else if (dmc8i<dmcfieldf && dmc8i<dmc8f
2822 && dmc8i<dmcfieldr && dmc8i<dmc8r)
2824 /* 16x8, interpolated */
2825 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2826 mbi->motion_type = MC_16X8;
2828 /* upper and lower half blocks */
2829 vmc = bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8);
2830 vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8);
2832 else if (dmcfieldf<dmc8f && dmcfieldf<dmcfieldr && dmcfieldf<dmc8r)
2834 /* field, forward */
2835 mbi->mb_type = MB_FORWARD;
2836 mbi->motion_type = MC_FIELD;
2837 vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16);
2839 else if (dmc8f<dmcfieldr && dmc8f<dmc8r)
2842 mbi->mb_type = MB_FORWARD;
2843 mbi->motion_type = MC_16X8;
2845 /* upper and lower half blocks */
2846 vmc = unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2847 vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2849 else if (dmcfieldr<dmc8r)
2851 /* field, backward */
2852 mbi->mb_type = MB_BACKWARD;
2853 mbi->motion_type = MC_FIELD;
2854 vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 );
2858 /* 16x8, backward */
2859 mbi->mb_type = MB_BACKWARD;
2860 mbi->motion_type = MC_16X8;
2862 /* upper and lower half blocks */
2863 vmc = unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8);
2864 vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8);
2868 /* select between intra and non-intra coding */
2869 if (vmc>var && vmc>=9*256)
2870 mbi->mb_type = MB_INTRA;
2874 if (mbi->motion_type==MC_FIELD)
2877 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2878 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2879 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2881 mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1);
2882 mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1);
2883 mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel;
2888 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2889 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2890 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2891 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2892 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2893 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2895 mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1);
2896 mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1);
2897 mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel;
2898 mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1);
2899 mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1);
2900 mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel;
2910 Initialise motion compensation - currently purely selection of which
2911 versions of the various low level computation routines to use
2917 int cpucap = cpu_accel();
2919 if( cpucap == 0 ) /* No MMX/SSE etc support available */
2923 pdist1_00 = dist1_00;
2924 pdist1_01 = dist1_01;
2925 pdist1_10 = dist1_10;
2926 pdist1_11 = dist1_11;
2930 pdist2_22 = dist2_22;
2931 pbdist2_22 = bdist2_22;
2932 pfind_best_one_pel = find_best_one_pel;
2933 pbuild_sub22_mcomps = build_sub22_mcomps;
2936 else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
2938 if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n");
2939 pdist22 = dist22_mmxe;
2940 pdist44 = dist44_mmxe;
2941 pdist1_00 = dist1_00_mmxe;
2942 pdist1_01 = dist1_01_mmxe;
2943 pdist1_10 = dist1_10_mmxe;
2944 pdist1_11 = dist1_11_mmxe;
2945 pbdist1 = bdist1_mmx;
2947 pbdist2 = bdist2_mmx;
2948 pdist2_22 = dist2_22_mmx;
2949 pbdist2_22 = bdist2_22_mmx;
2950 pfind_best_one_pel = find_best_one_pel_mmxe;
2951 pbuild_sub22_mcomps = build_sub22_mcomps_mmxe;
2952 pmblock_sub44_dists = mblock_sub44_dists_mmxe;
2955 else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
2957 if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n");
2958 pdist22 = dist22_mmx;
2959 pdist44 = dist44_mmx;
2960 pdist1_00 = dist1_00_mmx;
2961 pdist1_01 = dist1_01_mmx;
2962 pdist1_10 = dist1_10_mmx;
2963 pdist1_11 = dist1_11_mmx;
2964 pbdist1 = bdist1_mmx;
2966 pbdist2 = bdist2_mmx;
2967 pdist2_22 = dist2_22_mmx;
2968 pbdist2_22 = bdist2_22_mmx;
2969 pfind_best_one_pel = find_best_one_pel;
2970 pbuild_sub22_mcomps = build_sub22_mcomps;
2971 pmblock_sub44_dists = mblock_sub44_dists_mmx;
2977 void motion_engine_loop(motion_engine_t *engine)
2979 while(!engine->done)
2981 pthread_mutex_lock(&(engine->input_lock));
2985 pict_data_s *picture = engine->pict_data;
2986 motion_comp_s *mc_data = engine->motion_comp;
2987 int secondfield = engine->secondfield;
2988 int ipflag = engine->ipflag;
2989 mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16);
2991 int mb_row_incr; /* Offset increment to go down 1 row of mb's */
2994 if (picture->pict_struct == FRAME_PICTURE)
2996 mb_row_incr = 16*width;
2997 mb_row_start = engine->start_row / 16 * mb_row_incr;
2998 /* loop through all macroblocks of a frame picture */
2999 for (j=engine->start_row; j < engine->end_row; j+=16)
3001 for (i=0; i<width; i+=16)
3003 frame_ME(engine, picture, mc_data, mb_row_start, i,j, mbi);
3006 mb_row_start += mb_row_incr;
3011 mb_row_incr = (16 * 2) * width;
3012 mb_row_start = engine->start_row / 16 * mb_row_incr;
3013 /* loop through all macroblocks of a field picture */
3014 for (j=engine->start_row; j < engine->end_row; j+=16)
3016 for (i=0; i<width; i+=16)
3018 field_ME(engine, picture, mc_data, mb_row_start, i,j,
3019 mbi,secondfield,ipflag);
3022 mb_row_start += mb_row_incr;
3027 pthread_mutex_unlock(&(engine->output_lock));
3032 void motion_estimation(pict_data_s *picture,
3033 motion_comp_s *mc_data,
3034 int secondfield, int ipflag)
3038 for(i = 0; i < processors; i++)
3040 motion_engines[i].motion_comp = mc_data;
3042 motion_engines[i].pict_data = picture;
3044 motion_engines[i].secondfield = secondfield;
3045 motion_engines[i].ipflag = ipflag;
3046 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3049 /* Wait for completion */
3050 for(i = 0; i < processors; i++)
3052 pthread_mutex_lock(&(motion_engines[i].output_lock));
3056 void start_motion_engines()
3059 int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
3060 int current_row = 0;
3061 pthread_attr_t attr;
3062 pthread_mutexattr_t mutex_attr;
3064 pthread_mutexattr_init(&mutex_attr);
3065 pthread_attr_init(&attr);
3066 motion_engines = calloc(1, sizeof(motion_engine_t) * processors);
3067 for(i = 0; i < processors; i++)
3069 motion_engines[i].start_row = current_row * 16;
3070 current_row += rows_per_processor;
3071 if(current_row > height2 / 16) current_row = height2 / 16;
3072 motion_engines[i].end_row = current_row * 16;
3073 pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr);
3074 pthread_mutex_lock(&(motion_engines[i].input_lock));
3075 pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr);
3076 pthread_mutex_lock(&(motion_engines[i].output_lock));
3077 motion_engines[i].done = 0;
3078 pthread_create(&(motion_engines[i].tid),
3080 (void*)motion_engine_loop,
3081 &motion_engines[i]);
3085 void stop_motion_engines()
3088 for(i = 0; i < processors; i++)
3090 motion_engines[i].done = 1;
3091 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3092 pthread_join(motion_engines[i].tid, 0);
3093 pthread_mutex_destroy(&(motion_engines[i].input_lock));
3094 pthread_mutex_destroy(&(motion_engines[i].output_lock));
3096 free(motion_engines);