remove Features5, rework gradient plugin, fix paste_edl track title bug, gl_probe...
[goodguy/cinelerra.git] / cinelerra-5.1 / mpeg2enc / motion.c
1 /* motion.c, motion estimation                                              */
2
3 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
4
5 /*
6  * Disclaimer of Warranty
7  *
8  * These software programs are available to the user without any license fee or
9  * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
10  * any and all warranties, whether express, implied, or statuary, including any
11  * implied warranties or merchantability or of fitness for a particular
12  * purpose.  In no event shall the copyright-holder be liable for any
13  * incidental, punitive, or consequential damages of any kind whatsoever
14  * arising from the use of these programs.
15  *
16  * This disclaimer of warranty extends to the user of these programs and user's
17  * customers, employees, agents, transferees, successors, and assigns.
18  *
19  * The MPEG Software Simulation Group does not represent or warrant that the
20  * programs furnished hereunder are free of infringement of any third-party
21  * patents.
22  *
23  * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
24  * are subject to royalty fees to patent holders.  Many of these patents are
25  * general enough such that they are unavoidable regardless of implementation
26  * design.
27  *
28  */
29
30 #include <limits.h>
31 #include <stdio.h>
32 #include <string.h>
33 #include <stdlib.h>
34 #include "config.h"
35 #include "global.h"
36 #include "cpu_accel.h"
37 #include "simd.h"
38
39 /* Macro-block Motion compensation results record */
40
41 typedef struct _blockcrd {
42         int16_t x;
43         int16_t y;
44 } blockxy;
45
46 struct mb_motion
47 {
48         blockxy pos;        // Half-pel co-ordinates of source block
49         int sad;                        // Sum of absolute difference
50         int var;
51         uint8_t *blk ;          // Source block data (in luminace data array)
52         int hx, hy;                     // Half-pel offsets
53         int fieldsel;           // 0 = top 1 = bottom
54         int fieldoff;       // Offset from start of frame data to first line
55                                                 // of field.    (top = 0, bottom = width );
56 };
57
58 typedef struct mb_motion mb_motion_s;
59
60
61 struct subsampled_mb
62 {
63         uint8_t *mb;            // One pel
64         uint8_t *fmb;           // Two-pel subsampled
65         uint8_t *qmb;           // Four-pel subsampled
66         uint8_t *umb;           // U compoenent one-pel
67         uint8_t *vmb;       // V component  one-pel
68 };
69
70 typedef struct subsampled_mb subsampled_mb_s;
71
72
73 static void frame_ME (motion_engine_t *engine,
74                                                                         pict_data_s *picture,
75                                                                   motion_comp_s *mc,
76                                                                   int mboffset,
77                                                                   int i, int j, struct mbinfo *mbi);
78
79 static void field_ME (motion_engine_t *engine,
80                                                                         pict_data_s *picture,
81                                                                   motion_comp_s *mc,
82                                                                   int mboffset,
83                                                                   int i, int j, 
84                                                                   struct mbinfo *mbi, 
85                                                                   int secondfield, 
86                                                                   int ipflag);
87
88 static void frame_estimate (motion_engine_t *engine,
89         uint8_t *org,
90          uint8_t *ref, 
91          subsampled_mb_s *ssmb,
92          int i, int j,
93          int sx, int sy, 
94           mb_motion_s *bestfr,
95           mb_motion_s *besttop,
96           mb_motion_s *bestbot,
97          int imins[2][2], int jmins[2][2]);
98
99 static void field_estimate (motion_engine_t *engine,
100                                                         pict_data_s *picture,
101                                                         uint8_t *toporg,
102                                                         uint8_t *topref, 
103                                                         uint8_t *botorg, 
104                                                         uint8_t *botref,
105                                                         subsampled_mb_s *ssmb,
106                                                         int i, int j, int sx, int sy, int ipflag,
107                                                         mb_motion_s *bestfr,
108                                                         mb_motion_s *best8u,
109                                                         mb_motion_s *best8l,
110                                                         mb_motion_s *bestsp);
111 #if 0
112 static void dpframe_estimate (motion_engine_t *engine,
113         pict_data_s *picture,
114         uint8_t *ref,
115         subsampled_mb_s *ssmb,
116         int i, int j, int iminf[2][2], int jminf[2][2],
117         mb_motion_s *dpbest,
118         int *imindmvp, int *jmindmvp, 
119         int *vmcp);
120
121 static void dpfield_estimate (motion_engine_t *engine,
122         pict_data_s *picture,
123         uint8_t *topref,
124         uint8_t *botref, 
125         uint8_t *mb,
126         int i, int j, 
127         int imins, int jmins, 
128         mb_motion_s *dpbest,
129         int *vmcp);
130 #endif
131 static void fullsearch (motion_engine_t *engine, 
132         uint8_t *org, uint8_t *ref,
133         subsampled_mb_s *ssblk,
134         int lx, int i0, int j0, 
135         int sx, int sy, int h, 
136         int xmax, int ymax,
137         mb_motion_s *motion );
138
139 static void find_best_one_pel( motion_engine_t *engine, 
140                                                                 uint8_t *org, uint8_t *blk,
141                                                            int searched_size,
142                                                            int i0, int j0,
143                                                            int ilow, int jlow,
144                                                            int xmax, int ymax,
145                                                            int lx, int h, 
146                                                            mb_motion_s *res
147         );
148 static int build_sub22_mcomps( motion_engine_t *engine, 
149                                                                 int i0,  int j0, int ihigh, int jhigh, 
150                                                                 int null_mc_sad,
151                                                                 uint8_t *s22org,  uint8_t *s22blk, 
152                                                            int flx, int fh,  int searched_sub44_size );
153
154 #ifdef X86_CPU
155 static void find_best_one_pel_mmxe( motion_engine_t *engine, 
156                                                                 uint8_t *org, uint8_t *blk,
157                                                            int searched_size,
158                                                            int i0, int j0,
159                                                            int ilow, int jlow,
160                                                            int xmax, int ymax,
161                                                            int lx, int h, 
162                                                            mb_motion_s *res
163         );
164
165 static int build_sub22_mcomps_mmxe( motion_engine_t *engine, int i0,  int j0, int ihigh, int jhigh, 
166                                                          int null_mc_sad,
167                                                          uint8_t *s22org,  uint8_t *s22blk, 
168                                                          int flx, int fh,  int searched_sub44_size );
169 static int (*pmblock_sub44_dists)( uint8_t *blk,  uint8_t *ref,
170                                                         int ilow, int jlow,
171                                                         int ihigh, int jhigh, 
172                                                         int h, int rowstride, 
173                                                         int threshold,
174                                                         mc_result_s *resvec);
175 #endif
176
177 static int unidir_pred_var( const mb_motion_s *motion, 
178                                                         uint8_t *mb,  int lx, int h);
179 static int bidir_pred_var( const mb_motion_s *motion_f,  
180                                                    const mb_motion_s *motion_b, 
181                                                    uint8_t *mb,  int lx, int h);
182 static int bidir_pred_sad( const mb_motion_s *motion_f,  
183                                                    const mb_motion_s *motion_b, 
184                                                    uint8_t *mb,  int lx, int h);
185
186 static int variance(  uint8_t *mb, int size, int lx);
187
188 static int dist22 ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
189
190 static int dist44 ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
191 static int dist2_22( uint8_t *blk1, uint8_t *blk2,
192                                          int lx, int h);
193 static int bdist2_22( uint8_t *blk1f, uint8_t *blk1b, 
194                                           uint8_t *blk2,
195                                           int lx, int h);
196
197
198 static void (*pfind_best_one_pel)( motion_engine_t *engine,
199                                                                 uint8_t *org, uint8_t *blk,
200                                                            int searched_size,
201                                                            int i0, int j0,
202                                                            int ilow, int jlow,
203                                                            int xmax, int ymax,
204                                                            int lx, int h, 
205                                                            mb_motion_s *res
206         );
207 static int (*pbuild_sub22_mcomps)( motion_engine_t *engine,
208                                                                         int i0,  int j0, int ihigh, int jhigh, 
209                                                                    int null_mc_sad,
210                                                                    uint8_t *s22org,  uint8_t *s22blk, 
211                                                                    int flx, int fh,  int searched_sub44_size );
212
213 static int (*pdist2_22)( uint8_t *blk1, uint8_t *blk2,
214                                                  int lx, int h);
215 static int (*pbdist2_22)( uint8_t *blk1f, uint8_t *blk1b, 
216                                                   uint8_t *blk2,
217                                                   int lx, int h);
218
219 static int dist1_00( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
220 static int dist1_01(uint8_t *blk1, uint8_t *blk2, int lx, int h);
221 static int dist1_10(uint8_t *blk1, uint8_t *blk2, int lx, int h);
222 static int dist1_11(uint8_t *blk1, uint8_t *blk2, int lx, int h);
223 static int dist2 (uint8_t *blk1, uint8_t *blk2,
224                                                           int lx, int hx, int hy, int h);
225 static int bdist2 (uint8_t *pf, uint8_t *pb,
226         uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
227 static int bdist1 (uint8_t *pf, uint8_t *pb,
228                                    uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
229
230
231 static int (*pdist22) ( uint8_t *blk1, uint8_t *blk2,  int flx, int fh);
232 static int (*pdist44) ( uint8_t *blk1, uint8_t *blk2,  int qlx, int qh);
233 static int (*pdist1_00) ( uint8_t *blk1, uint8_t *blk2,  int lx, int h, int distlim);
234 static int (*pdist1_01) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
235 static int (*pdist1_10) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
236 static int (*pdist1_11) ( uint8_t *blk1, uint8_t *blk2, int lx, int h);
237
238 static int (*pdist2) (uint8_t *blk1, uint8_t *blk2,
239                                           int lx, int hx, int hy, int h);
240   
241   
242 static int (*pbdist2) (uint8_t *pf, uint8_t *pb,
243                                            uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
244
245 static int (*pbdist1) (uint8_t *pf, uint8_t *pb,
246                                            uint8_t *p2, int lx, int hxf, int hyf, int hxb, int hyb, int h);
247
248
249 /* DEBUGGER...
250 static void check_mc( const mb_motion_s *motion, int rx, int ry, int i, int j, char *str )
251 {
252         rx *= 2; ry *= 2;
253         if( motion->sad < 0 || motion->sad > 0x10000 )
254         {
255                 printf( "SAD ooops %s\n", str );
256                 exit(1);
257         }
258         if( motion->pos.x-i*2 < -rx || motion->pos.x-i*2 >= rx )
259         {
260                 printf( "X MC ooops %s = %d [%d]\n", str, motion->pos.x-i*2,rx );
261                 exit(1);
262         }
263         if( motion->pos.y-j*2 < -ry || motion->pos.y-j*2 >= ry )
264         {
265                 printf( "Y MC ooops %s %d [%d]\n", str, motion->pos.y-j*2, ry );
266                 exit(1);
267         }
268 }
269
270 static void init_mc( mb_motion_s *motion )
271 {
272         motion->sad = -123;
273         motion->pos.x = -1000;
274         motion->pos.y = -1000;
275 }
276 */
277
278 /* unidir_NI_var_sum
279    Compute the combined variance of luminance and chrominance information
280    for a particular non-intra macro block after unidirectional
281    motion compensation...  
282
283    Note: results are scaled to give chrominance equal weight to
284    chrominance.  The variance of the luminance portion is computed
285    at the time the motion compensation is computed.
286
287    TODO: Perhaps we should compute the whole thing in fullsearch not
288    seperate it.  However, that would involve a lot of fiddling with
289    field_* and until its thoroughly debugged and tested I think I'll
290    leave that alone. Furthermore, it is unclear if its really worth
291    doing these computations for B *and* P frames.
292
293    TODO: BUG: ONLY works for 420 video...
294
295 */
296
297 static int unidir_chrom_var_sum( mb_motion_s *lum_mc, 
298                                                           uint8_t **ref, 
299                                                           subsampled_mb_s *ssblk,
300                                                           int lx, int h )
301 {
302         int uvlx = (lx>>1);
303         int uvh = (h>>1);
304         /* N.b. MC co-ordinates are computed in half-pel units! */
305         int cblkoffset = (lum_mc->fieldoff>>1) +
306                 (lum_mc->pos.x>>2) + (lum_mc->pos.y>>2)*uvlx;
307         
308         return  ((*pdist2_22)( ref[1] + cblkoffset, ssblk->umb, uvlx, uvh) +
309                          (*pdist2_22)( ref[2] + cblkoffset, ssblk->vmb, uvlx, uvh))*2;
310 }
311
312 /*
313   bidir_NI_var_sum
314    Compute the combined variance of luminance and chrominance information
315    for a particular non-intra macro block after unidirectional
316    motion compensation...  
317
318    Note: results are scaled to give chrominance equal weight to
319    chrominance.  The variance of the luminance portion is computed
320    at the time the motion compensation is computed.
321
322    Note: results scaled to give chrominance equal weight to chrominance.
323   
324   TODO: BUG: ONLY works for 420 video...
325
326   NOTE: Currently unused but may be required if it turns out that taking
327   chrominance into account in B frames is needed.
328
329  */
330
331 int bidir_chrom_var_sum( mb_motion_s *lum_mc_f, 
332                                            mb_motion_s *lum_mc_b, 
333                                            uint8_t **ref_f, 
334                                            uint8_t **ref_b,
335                                            subsampled_mb_s *ssblk,
336                                            int lx, int h )
337 {
338         int uvlx = (lx>>1);
339         int uvh = (h>>1);
340         /* N.b. MC co-ordinates are computed in half-pel units! */
341         int cblkoffset_f = (lum_mc_f->fieldoff>>1) + 
342                 (lum_mc_f->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
343         int cblkoffset_b = (lum_mc_b->fieldoff>>1) + 
344                 (lum_mc_b->pos.x>>2) + (lum_mc_f->pos.y>>2)*uvlx;
345         
346         return  (
347                 (*pbdist2_22)( ref_f[1] + cblkoffset_f, ref_b[1] + cblkoffset_b,
348                                            ssblk->umb, uvlx, uvh ) +
349                 (*pbdist2_22)( ref_f[2] + cblkoffset_f, ref_b[2] + cblkoffset_b,
350                                            ssblk->vmb, uvlx, uvh ))*2;
351
352 }
353
354 static int chrom_var_sum( subsampled_mb_s *ssblk, int h, int lx )
355 {
356         return (variance(ssblk->umb,(h>>1),(lx>>1)) + 
357                         variance(ssblk->vmb,(h>>1),(lx>>1))) * 2;
358 }
359
360 /*
361  * frame picture motion estimation
362  *
363  * org: top left pel of source reference frame
364  * ref: top left pel of reconstructed reference frame
365  * ssmb:  macroblock to be matched
366  * i,j: location of mb relative to ref (=center of search window)
367  * sx,sy: half widths of search window
368  * besfr: location and SAD of best frame prediction
369  * besttop: location of best field pred. for top field of mb
370  * bestbo : location of best field pred. for bottom field of mb
371  */
372
373 static void frame_estimate(motion_engine_t *engine, 
374         uint8_t *org,
375         uint8_t *ref,
376         subsampled_mb_s *ssmb,
377         int i, int j, int sx, int sy,
378         mb_motion_s *bestfr,
379         mb_motion_s *besttop,
380         mb_motion_s *bestbot,
381         int imins[2][2],
382         int jmins[2][2]
383         )
384 {
385         subsampled_mb_s  botssmb;
386         mb_motion_s topfld_mc;
387         mb_motion_s botfld_mc;
388
389         botssmb.mb = ssmb->mb+width;
390         botssmb.fmb = ssmb->mb+(width>>1);
391         botssmb.qmb = ssmb->qmb+(width>>2);
392         botssmb.umb = ssmb->umb+(width>>1);
393         botssmb.vmb = ssmb->vmb+(width>>1);
394
395         /* frame prediction */
396         fullsearch(engine, org,ref,ssmb,width,i,j,sx,sy,16,width,height,
397                                                   bestfr );
398         bestfr->fieldsel = 0;
399         bestfr->fieldoff = 0;
400
401         /* predict top field from top field */
402         fullsearch(engine, org,ref,ssmb,width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
403                            &topfld_mc);
404
405         /* predict top field from bottom field */
406         fullsearch(engine, org+width,ref+width,ssmb, width<<1,i,j>>1,sx,sy>>1,8,
407                            width,height>>1, &botfld_mc);
408
409         /* set correct field selectors... */
410         topfld_mc.fieldsel = 0;
411         botfld_mc.fieldsel = 1;
412         topfld_mc.fieldoff = 0;
413         botfld_mc.fieldoff = width;
414
415         imins[0][0] = topfld_mc.pos.x;
416         jmins[0][0] = topfld_mc.pos.y;
417         imins[1][0] = botfld_mc.pos.x;
418         jmins[1][0] = botfld_mc.pos.y;
419
420         /* select prediction for top field */
421         if (topfld_mc.sad<=botfld_mc.sad)
422         {
423                 *besttop = topfld_mc;
424         }
425         else
426         {
427                 *besttop = botfld_mc;
428         }
429
430         /* predict bottom field from top field */
431         fullsearch(engine, org,ref,&botssmb,
432                                         width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
433                                         &topfld_mc);
434
435         /* predict bottom field from bottom field */
436         fullsearch(engine, org+width,ref+width,&botssmb,
437                                         width<<1,i,j>>1,sx,sy>>1,8,width,height>>1,
438                                         &botfld_mc);
439
440         /* set correct field selectors... */
441         topfld_mc.fieldsel = 0;
442         botfld_mc.fieldsel = 1;
443         topfld_mc.fieldoff = 0;
444         botfld_mc.fieldoff = width;
445
446         imins[0][1] = topfld_mc.pos.x;
447         jmins[0][1] = topfld_mc.pos.y;
448         imins[1][1] = botfld_mc.pos.x;
449         jmins[1][1] = botfld_mc.pos.y;
450
451         /* select prediction for bottom field */
452         if (botfld_mc.sad<=topfld_mc.sad)
453         {
454                 *bestbot = botfld_mc;
455         }
456         else
457         {
458                 *bestbot = topfld_mc;
459         }
460 }
461
462 /*
463  * field picture motion estimation subroutine
464  *
465  * toporg: address of original top reference field
466  * topref: address of reconstructed top reference field
467  * botorg: address of original bottom reference field
468  * botref: address of reconstructed bottom reference field
469  * ssmmb:  macroblock to be matched
470  * i,j: location of mb (=center of search window)
471  * sx,sy: half width/height of search window
472  *
473  * bestfld: location and distance of best field prediction
474  * best8u: location of best 16x8 pred. for upper half of mb
475  * best8lp: location of best 16x8 pred. for lower half of mb
476  * bdestsp: location and distance of best same parity field
477  *                    prediction (needed for dual prime, only valid if
478  *                    ipflag==0)
479  */
480
481 static void field_estimate (motion_engine_t *engine,
482         pict_data_s *picture,
483         uint8_t *toporg,
484         uint8_t *topref, 
485         uint8_t *botorg, 
486         uint8_t *botref,
487         subsampled_mb_s *ssmb,
488         int i, int j, int sx, int sy, int ipflag,
489         mb_motion_s *bestfld,
490         mb_motion_s *best8u,
491         mb_motion_s *best8l,
492         mb_motion_s *bestsp)
493
494 {
495         mb_motion_s topfld_mc;
496         mb_motion_s botfld_mc;
497         int dt, db;
498         int notop, nobot;
499         subsampled_mb_s botssmb;
500
501         botssmb.mb = ssmb->mb+width;
502         botssmb.umb = ssmb->umb+(width>>1);
503         botssmb.vmb = ssmb->vmb+(width>>1);
504         botssmb.fmb = ssmb->fmb+(width>>1);
505         botssmb.qmb = ssmb->qmb+(width>>2);
506
507         /* if ipflag is set, predict from field of opposite parity only */
508         notop = ipflag && (picture->pict_struct==TOP_FIELD);
509         nobot = ipflag && (picture->pict_struct==BOTTOM_FIELD);
510
511         /* field prediction */
512
513         /* predict current field from top field */
514         if (notop)
515                 topfld_mc.sad = dt = 65536; /* infinity */
516         else
517                 fullsearch(engine, toporg,topref,ssmb,width<<1,
518                                    i,j,sx,sy>>1,16,width,height>>1,
519                                    &topfld_mc);
520         dt = topfld_mc.sad;
521         /* predict current field from bottom field */
522         if (nobot)
523                 botfld_mc.sad = db = 65536; /* infinity */
524         else
525                 fullsearch(engine, botorg,botref,ssmb,width<<1,
526                                    i,j,sx,sy>>1,16,width,height>>1,
527                                    &botfld_mc);
528         db = botfld_mc.sad;
529         /* Set correct field selectors */
530         topfld_mc.fieldsel = 0;
531         botfld_mc.fieldsel = 1;
532         topfld_mc.fieldoff = 0;
533         botfld_mc.fieldoff = width;
534
535         /* same parity prediction (only valid if ipflag==0) */
536         if (picture->pict_struct==TOP_FIELD)
537         {
538                 *bestsp = topfld_mc;
539         }
540         else
541         {
542                 *bestsp = botfld_mc;
543         }
544
545         /* select field prediction */
546         if (dt<=db)
547         {
548                 *bestfld = topfld_mc;
549         }
550         else
551         {
552                 *bestfld = botfld_mc;
553         }
554
555
556         /* 16x8 motion compensation */
557
558         /* predict upper half field from top field */
559         if (notop)
560                 topfld_mc.sad = dt = 65536;
561         else
562                 fullsearch(engine, toporg,topref,ssmb,width<<1,
563                                    i,j,sx,sy>>1,8,width,height>>1,
564                                     &topfld_mc);
565         dt = topfld_mc.sad;
566         /* predict upper half field from bottom field */
567         if (nobot)
568                 botfld_mc.sad = db = 65536;
569         else
570                 fullsearch(engine, botorg,botref,ssmb,width<<1,
571                                    i,j,sx,sy>>1,8,width,height>>1,
572                                     &botfld_mc);
573         db = botfld_mc.sad;
574
575         /* Set correct field selectors */
576         topfld_mc.fieldsel = 0;
577         botfld_mc.fieldsel = 1;
578         topfld_mc.fieldoff = 0;
579         botfld_mc.fieldoff = width;
580
581         /* select prediction for upper half field */
582         if (dt<=db)
583         {
584                 *best8u = topfld_mc;
585         }
586         else
587         {
588                 *best8u = botfld_mc;
589         }
590
591         /* predict lower half field from top field */
592         /*
593           N.b. For interlaced data width<<4 (width*16) takes us 8 rows
594           down in the same field.  
595           Thus for the fast motion data (2*2
596           sub-sampled) we need to go 4 rows down in the same field.
597           This requires adding width*4 = (width<<2).
598           For the 4*4 sub-sampled motion data we need to go down 2 rows.
599           This requires adding width = width
600          
601         */
602         if (notop)
603                 topfld_mc.sad = dt = 65536;
604         else
605                 fullsearch(engine, toporg,topref,&botssmb,
606                                                 width<<1,
607                                                 i,j+8,sx,sy>>1,8,width,height>>1,
608                                    /* &imint,&jmint, &dt,*/ &topfld_mc);
609         dt = topfld_mc.sad;
610         /* predict lower half field from bottom field */
611         if (nobot)
612                 botfld_mc.sad = db = 65536;
613         else
614                 fullsearch(engine, botorg,botref,&botssmb,width<<1,
615                                                 i,j+8,sx,sy>>1,8,width,height>>1,
616                                    /* &iminb,&jminb, &db,*/ &botfld_mc);
617         db = botfld_mc.sad;
618         /* Set correct field selectors */
619         topfld_mc.fieldsel = 0;
620         botfld_mc.fieldsel = 1;
621         topfld_mc.fieldoff = 0;
622         botfld_mc.fieldoff = width;
623
624         /* select prediction for lower half field */
625         if (dt<=db)
626         {
627                 *best8l = topfld_mc;
628         }
629         else
630         {
631                 *best8l = botfld_mc;
632         }
633 }
634 #if 0
635 static void dpframe_estimate (motion_engine_t *engine,
636         pict_data_s *picture,
637         uint8_t *ref,
638         subsampled_mb_s *ssmb,
639         
640         int i, int j, int iminf[2][2], int jminf[2][2],
641         mb_motion_s *best_mc,
642         int *imindmvp, int *jmindmvp,
643         int *vmcp)
644 {
645         int pref,ppred,delta_x,delta_y;
646         int is,js,it,jt,ib,jb,it0,jt0,ib0,jb0;
647         int imins,jmins,imint,jmint,iminb,jminb,imindmv,jmindmv;
648         int vmc,local_dist;
649
650         /* Calculate Dual Prime distortions for 9 delta candidates
651          * for each of the four minimum field vectors
652          * Note: only for P pictures!
653          */
654
655         /* initialize minimum dual prime distortion to large value */
656         vmc = INT_MAX;
657
658         for (pref=0; pref<2; pref++)
659         {
660                 for (ppred=0; ppred<2; ppred++)
661                 {
662                         /* convert Cartesian absolute to relative motion vector
663                          * values (wrt current macroblock address (i,j)
664                          */
665                         is = iminf[pref][ppred] - (i<<1);
666                         js = jminf[pref][ppred] - (j<<1);
667
668                         if (pref!=ppred)
669                         {
670                                 /* vertical field shift adjustment */
671                                 if (ppred==0)
672                                         js++;
673                                 else
674                                         js--;
675
676                                 /* mvxs and mvys scaling*/
677                                 is<<=1;
678                                 js<<=1;
679                                 if (picture->topfirst == ppred)
680                                 {
681                                         /* second field: scale by 1/3 */
682                                         is = (is>=0) ? (is+1)/3 : -((-is+1)/3);
683                                         js = (js>=0) ? (js+1)/3 : -((-js+1)/3);
684                                 }
685                                 else
686                                         continue;
687                         }
688
689                         /* vector for prediction from field of opposite 'parity' */
690                         if (picture->topfirst)
691                         {
692                                 /* vector for prediction of top field from bottom field */
693                                 it0 = ((is+(is>0))>>1);
694                                 jt0 = ((js+(js>0))>>1) - 1;
695
696                                 /* vector for prediction of bottom field from top field */
697                                 ib0 = ((3*is+(is>0))>>1);
698                                 jb0 = ((3*js+(js>0))>>1) + 1;
699                         }
700                         else
701                         {
702                                 /* vector for prediction of top field from bottom field */
703                                 it0 = ((3*is+(is>0))>>1);
704                                 jt0 = ((3*js+(js>0))>>1) - 1;
705
706                                 /* vector for prediction of bottom field from top field */
707                                 ib0 = ((is+(is>0))>>1);
708                                 jb0 = ((js+(js>0))>>1) + 1;
709                         }
710
711                         /* convert back to absolute half-pel field picture coordinates */
712                         is += i<<1;
713                         js += j<<1;
714                         it0 += i<<1;
715                         jt0 += j<<1;
716                         ib0 += i<<1;
717                         jb0 += j<<1;
718
719                         if (is >= 0 && is <= (width-16)<<1 &&
720                                 js >= 0 && js <= (height-16))
721                         {
722                                 for (delta_y=-1; delta_y<=1; delta_y++)
723                                 {
724                                         for (delta_x=-1; delta_x<=1; delta_x++)
725                                         {
726                                                 /* opposite field coordinates */
727                                                 it = it0 + delta_x;
728                                                 jt = jt0 + delta_y;
729                                                 ib = ib0 + delta_x;
730                                                 jb = jb0 + delta_y;
731
732                                                 if (it >= 0 && it <= (width-16)<<1 &&
733                                                         jt >= 0 && jt <= (height-16) &&
734                                                         ib >= 0 && ib <= (width-16)<<1 &&
735                                                         jb >= 0 && jb <= (height-16))
736                                                 {
737                                                         /* compute prediction error */
738                                                         local_dist = (*pbdist2)(
739                                                                 ref + (is>>1) + (width<<1)*(js>>1),
740                                                                 ref + width + (it>>1) + (width<<1)*(jt>>1),
741                                                                 ssmb->mb,             /* current mb location */
742                                                                 width<<1,       /* adjacent line distance */
743                                                                 is&1, js&1, it&1, jt&1, /* half-pel flags */
744                                                                 8);             /* block height */
745                                                         local_dist += (*pbdist2)(
746                                                                 ref + width + (is>>1) + (width<<1)*(js>>1),
747                                                                 ref + (ib>>1) + (width<<1)*(jb>>1),
748                                                                 ssmb->mb + width,     /* current mb location */
749                                                                 width<<1,       /* adjacent line distance */
750                                                                 is&1, js&1, ib&1, jb&1, /* half-pel flags */
751                                                                 8);             /* block height */
752
753                                                         /* update delta with least distortion vector */
754                                                         if (local_dist < vmc)
755                                                         {
756                                                                 imins = is;
757                                                                 jmins = js;
758                                                                 imint = it;
759                                                                 jmint = jt;
760                                                                 iminb = ib;
761                                                                 jminb = jb;
762                                                                 imindmv = delta_x;
763                                                                 jmindmv = delta_y;
764                                                                 vmc = local_dist;
765                                                         }
766                                                 }
767                                         }  /* end delta x loop */
768                                 } /* end delta y loop */
769                         }
770                 }
771         }
772
773         /* TODO: This is now likely to be obsolete... */
774         /* Compute L1 error for decision purposes */
775         local_dist = (*pbdist1)(
776                 ref + (imins>>1) + (width<<1)*(jmins>>1),
777                 ref + width + (imint>>1) + (width<<1)*(jmint>>1),
778                 ssmb->mb,
779                 width<<1,
780                 imins&1, jmins&1, imint&1, jmint&1,
781                 8);
782 //printf("motion 1 %p\n", pbdist1);
783         local_dist += (*pbdist1)(
784                 ref + width + (imins>>1) + (width<<1)*(jmins>>1),
785                 ref + (iminb>>1) + (width<<1)*(jminb>>1),
786                 ssmb->mb + width,
787                 width<<1,
788                 imins&1, jmins&1, iminb&1, jminb&1,
789                 8);
790 //printf("motion 2\n");
791
792         best_mc->sad = local_dist;
793         best_mc->pos.x = imins;
794         best_mc->pos.y = jmins;
795         *vmcp = vmc;
796         *imindmvp = imindmv;
797         *jmindmvp = jmindmv;
798 //printf("motion 2\n");
799 }
800
801 static void dpfield_estimate(motion_engine_t *engine,
802         pict_data_s *picture,
803         uint8_t *topref,
804         uint8_t *botref, 
805         uint8_t *mb,
806         int i, int j, int imins, int jmins, 
807         mb_motion_s *bestdp_mc,
808         int *vmcp
809         )
810
811 {
812         uint8_t *sameref, *oppref;
813         int io0,jo0,io,jo,delta_x,delta_y,mvxs,mvys,mvxo0,mvyo0;
814         int imino,jmino,imindmv,vmc_dp,local_dist;
815
816         /* Calculate Dual Prime distortions for 9 delta candidates */
817         /* Note: only for P pictures! */
818
819         /* Assign opposite and same reference pointer */
820         if (picture->pict_struct==TOP_FIELD)
821         {
822                 sameref = topref;    
823                 oppref = botref;
824         }
825         else 
826         {
827                 sameref = botref;
828                 oppref = topref;
829         }
830
831         /* convert Cartesian absolute to relative motion vector
832          * values (wrt current macroblock address (i,j)
833          */
834         mvxs = imins - (i<<1);
835         mvys = jmins - (j<<1);
836
837         /* vector for prediction from field of opposite 'parity' */
838         mvxo0 = (mvxs+(mvxs>0)) >> 1;  /* mvxs / / */
839         mvyo0 = (mvys+(mvys>0)) >> 1;  /* mvys / / 2*/
840
841                         /* vertical field shift correction */
842         if (picture->pict_struct==TOP_FIELD)
843                 mvyo0--;
844         else
845                 mvyo0++;
846
847                         /* convert back to absolute coordinates */
848         io0 = mvxo0 + (i<<1);
849         jo0 = mvyo0 + (j<<1);
850
851                         /* initialize minimum dual prime distortion to large value */
852         vmc_dp = 1 << 30;
853
854         for (delta_y = -1; delta_y <= 1; delta_y++)
855         {
856                 for (delta_x = -1; delta_x <=1; delta_x++)
857                 {
858                         /* opposite field coordinates */
859                         io = io0 + delta_x;
860                         jo = jo0 + delta_y;
861
862                         if (io >= 0 && io <= (width-16)<<1 &&
863                                 jo >= 0 && jo <= (height2-16)<<1)
864                         {
865                                 /* compute prediction error */
866                                 local_dist = (*pbdist2)(
867                                         sameref + (imins>>1) + width2*(jmins>>1),
868                                         oppref  + (io>>1)    + width2*(jo>>1),
869                                         mb,             /* current mb location */
870                                         width2,         /* adjacent line distance */
871                                         imins&1, jmins&1, io&1, jo&1, /* half-pel flags */
872                                         16);            /* block height */
873
874                                 /* update delta with least distortion vector */
875                                 if (local_dist < vmc_dp)
876                                 {
877                                         imino = io;
878                                         jmino = jo;
879                                         imindmv = delta_x;
880                                         vmc_dp = local_dist;
881                                 }
882                         }
883                 }  /* end delta x loop */
884         } /* end delta y loop */
885
886         /* Compute L1 error for decision purposes */
887         bestdp_mc->sad =
888                 (*pbdist1)(
889                         sameref + (imins>>1) + width2*(jmins>>1),
890                         oppref  + (imino>>1) + width2*(jmino>>1),
891                         mb,             /* current mb location */
892                         width2,         /* adjacent line distance */
893                         imins&1, jmins&1, imino&1, jmino&1, /* half-pel flags */
894                         16);            /* block height */
895
896         bestdp_mc->pos.x = imindmv;
897         bestdp_mc->pos.x = imindmv;
898         *vmcp = vmc_dp;
899 }
900 #endif
901
902 /* 
903  *   Vectors of motion compensations 
904  */
905
906 /*
907         Take a vector of motion compensations and repeatedly make passes
908         discarding all elements whose sad "weight" is above the current mean weight.
909 */
910
911 static void sub_mean_reduction( mc_result_s *matches, int len, 
912                                                                 int times,
913                                                             int *newlen_res, int *minweight_res)
914 {
915         int i,j;
916         int weight_sum;
917         int mean_weight;
918         if( len == 0 )
919         {
920                 *minweight_res = 100000;
921                 *newlen_res = 0;
922                 return;
923         }
924
925         for(;;)
926         {
927                 weight_sum = 0;
928                 for( i = 0; i < len ; ++i )
929                         weight_sum += matches[i].weight;
930                 mean_weight = weight_sum / len;
931                 
932                 if( times <= 0)
933                         break;
934                         
935                 j = 0;
936                 for( i =0; i < len; ++i )
937                 {
938                         if( matches[i].weight <= mean_weight )
939                         {
940                                 matches[j] = matches[i];
941                                 ++j;
942                         }
943                 }
944                 len = j;
945                 --times;
946         }
947         *newlen_res = len;
948         *minweight_res = mean_weight;
949 }
950
951 /*
952   Build a vector of the top   4*4 sub-sampled motion compensations in the box
953   (ilow,jlow) to (ihigh,jhigh).
954
955         The algorithm is as follows:
956         1. coarse matches on an 8*8 grid of positions are collected that fall below
957         a (very conservative) sad threshold (basically 50% more than moving average of
958         the mean sad of such matches).
959         
960         2. The worse than-average matches are discarded.
961         
962         3. The remaining coarse matches are expanded with the left/lower neighbouring
963         4*4 grid matches. Again only those better than a threshold (this time the mean
964         of the 8*8 grid matches are retained.
965         
966         4. Multiple passes are made discarding worse than-average matches.  The number of
967         passes is specified by the user.  The default it 2 (leaving roughly 1/4 of the matches).
968         
969         The net result is very fast and find good matches if they're to be found.  I.e. the
970         penalty over exhaustive search is pretty low.
971         
972         NOTE: The "discard below average" trick depends critically on having some variation in
973         the matches.  The slight penalty imposed for distant matches (reasonably since the 
974         motion vectors have to be encoded) is *vital* as otherwise pathologically bad
975         performance results on highly uniform images.
976         
977         TODO: We should probably allow the user to eliminate the initial thinning of 8*8
978         grid matches if ultimate quality is demanded (e.g. for low bit-rate applications).
979
980 */
981
982
983 static int build_sub44_mcomps(motion_engine_t *engine,
984          int ilow, int jlow, int ihigh, int jhigh, 
985                                                         int i0, int j0,
986                                                                 int null_mc_sad,
987                                                         uint8_t *s44org, uint8_t *s44blk, int qlx, int qh )
988 {
989         uint8_t *s44orgblk;
990         int istrt = ilow-i0;
991         int jstrt = jlow-j0;
992         int iend = ihigh-i0;
993         int jend = jhigh-j0;
994         int mean_weight;
995         int threshold;
996
997 #ifdef X86_CPU
998
999         /*int rangex, rangey;
1000         static int rough_num_mcomps;
1001         static mc_result_s rough_mcomps[MAX_44_MATCHES];
1002         int k;
1003         */
1004 #else
1005         int i,j;
1006         int s1;
1007         uint8_t *old_s44orgblk;
1008 #endif
1009         /* N.b. we may ignore the right hand block of the pair going over the
1010            right edge as we have carefully allocated the buffer oversize to ensure
1011            no memory faults.  The later motion compensation calculations
1012            performed on the results of this pass will filter out
1013            out-of-range blocks...
1014         */
1015
1016
1017         engine->sub44_num_mcomps = 0;
1018         
1019         threshold = 6*null_mc_sad / (4*4*mc_44_red);
1020         s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1021         
1022         /* Exhaustive search on 4*4 sub-sampled data.  This is affordable because
1023                 (a)     it is only 16th of the size of the real 1-pel data 
1024                 (b) we ignore those matches with an sad above our threshold.    
1025         */
1026 #ifndef X86_CPU
1027
1028                 /* Invariant:  s44orgblk = s44org+(i>>2)+qlx*(j>>2) */
1029                 s44orgblk = s44org+(ilow>>2)+qlx*(jlow>>2);
1030                 for( j = jstrt; j <= jend; j += 4 )
1031                 {
1032                         old_s44orgblk = s44orgblk;
1033                         for( i = istrt; i <= iend; i += 4 )
1034                         {
1035                                 s1 = ((*pdist44)( s44orgblk,s44blk,qlx,qh) & 0xffff) + abs(i-i0) + abs(j-j0);
1036                                 if( s1 < threshold )
1037                                 {
1038                                         engine->sub44_mcomps[engine->sub44_num_mcomps].x = i;
1039                                         engine->sub44_mcomps[engine->sub44_num_mcomps].y = j;
1040                                         engine->sub44_mcomps[engine->sub44_num_mcomps].weight = s1 ;
1041                                         ++engine->sub44_num_mcomps;
1042                                 }
1043                                 s44orgblk += 1;
1044                         }
1045                         s44orgblk = old_s44orgblk + qlx;
1046                 }
1047                         
1048 #else
1049
1050                 engine->sub44_num_mcomps
1051                         = (*pmblock_sub44_dists)( s44orgblk, s44blk,
1052                                                                   istrt, jstrt,
1053                                                                   iend, jend, 
1054                                                                   qh, qlx, 
1055                                                                   threshold,
1056                                                                   engine->sub44_mcomps);
1057
1058 #endif  
1059                 /* If we're really pushing quality we reduce once otherwise twice. */
1060                         
1061                 sub_mean_reduction( engine->sub44_mcomps, engine->sub44_num_mcomps, 1+(mc_44_red>1),
1062                                                     &engine->sub44_num_mcomps, &mean_weight);
1063
1064
1065         return engine->sub44_num_mcomps;
1066 }
1067
1068
1069 /* Build a vector of the best 2*2 sub-sampled motion
1070   compensations using the best 4*4 matches as starting points.  As
1071   with with the 4*4 matches We don't collect them densely as they're
1072   just search starting points for 1-pel search and ones that are 1 out
1073   should still give better than average matches...
1074
1075 */
1076
1077
1078 static int build_sub22_mcomps(motion_engine_t *engine,
1079                 int i0,  int j0, int ihigh, int jhigh, 
1080                 int null_mc_sad,
1081                 uint8_t *s22org,  uint8_t *s22blk, 
1082                 int flx, int fh,  int searched_sub44_size )
1083 {
1084         int i,k,s;
1085         int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1086
1087         int min_weight;
1088         int ilim = ihigh-i0;
1089         int jlim = jhigh-j0;
1090         blockxy matchrec;
1091         uint8_t *s22orgblk;
1092
1093         engine->sub22_num_mcomps = 0;
1094         for( k = 0; k < searched_sub44_size; ++k )
1095         {
1096
1097                 matchrec.x = engine->sub44_mcomps[k].x;
1098                 matchrec.y = engine->sub44_mcomps[k].y;
1099
1100                 s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1101                 for( i = 0; i < 4; ++i )
1102                 {
1103                         if( matchrec.x <= ilim && matchrec.y <= jlim )
1104                         {       
1105                                 s = (*pdist22)( s22orgblk,s22blk,flx,fh);
1106                                 if( s < threshold )
1107                                 {
1108                                         engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1109                                         engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1110                                         engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1111                                         ++engine->sub22_num_mcomps;
1112                                 }
1113                         }
1114
1115                         if( i == 1 )
1116                         {
1117                                 s22orgblk += flx-1;
1118                                 matchrec.x -= 2;
1119                                 matchrec.y += 2;
1120                         }
1121                         else
1122                         {
1123                                 s22orgblk += 1;
1124                                 matchrec.x += 2;
1125                                 
1126                         }
1127                 }
1128
1129         }
1130
1131         
1132         sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1133                                                 &engine->sub22_num_mcomps, &min_weight );
1134         return engine->sub22_num_mcomps;
1135 }
1136
1137 #ifdef X86_CPU
1138 int build_sub22_mcomps_mmxe(motion_engine_t *engine,
1139         int i0,  int j0, int ihigh, int jhigh, 
1140         int null_mc_sad,
1141         uint8_t *s22org,  uint8_t *s22blk, 
1142         int flx, int fh,  int searched_sub44_size )
1143 {
1144         int i,k,s;
1145         int threshold = 6*null_mc_sad / (2 * 2*mc_22_red);
1146
1147         int min_weight;
1148         int ilim = ihigh-i0;
1149         int jlim = jhigh-j0;
1150         blockxy matchrec;
1151         uint8_t *s22orgblk;
1152         int resvec[4];
1153
1154         /* TODO: The calculation of the lstrow offset really belongs in
1155        asm code... */
1156         int lstrow=(fh-1)*flx;
1157
1158         engine->sub22_num_mcomps = 0;
1159         for( k = 0; k < searched_sub44_size; ++k )
1160         {
1161
1162                 matchrec.x = engine->sub44_mcomps[k].x;
1163                 matchrec.y = engine->sub44_mcomps[k].y;
1164
1165                 s22orgblk =  s22org +((matchrec.y+j0)>>1)*flx +((matchrec.x+i0)>>1);
1166                 mblockq_dist22_mmxe(s22orgblk+lstrow, s22blk+lstrow, flx, fh, resvec);
1167                 for( i = 0; i < 4; ++i )
1168                 {
1169                         if( matchrec.x <= ilim && matchrec.y <= jlim )
1170                         {       
1171                                 s =resvec[i];
1172                                 if( s < threshold )
1173                                 {
1174                                         engine->sub22_mcomps[engine->sub22_num_mcomps].x = (int8_t)matchrec.x;
1175                                         engine->sub22_mcomps[engine->sub22_num_mcomps].y = (int8_t)matchrec.y;
1176                                         engine->sub22_mcomps[engine->sub22_num_mcomps].weight = s;
1177                                         ++engine->sub22_num_mcomps;
1178                                 }
1179                         }
1180
1181                         if( i == 1 )
1182                         {
1183                                 matchrec.x -= 2;
1184                                 matchrec.y += 2;
1185                         }
1186                         else
1187                         {
1188                                 matchrec.x += 2;
1189                         }
1190                 }
1191
1192         }
1193
1194         
1195         sub_mean_reduction( engine->sub22_mcomps, engine->sub22_num_mcomps, 2,
1196                                                 &engine->sub22_num_mcomps, &min_weight );
1197         return engine->sub22_num_mcomps;
1198 }
1199
1200 #endif
1201
1202 /*
1203   Search for the best 1-pel match within 1-pel of a good 2*2-pel match.
1204         TODO: Its a bit silly to cart around absolute M/C co-ordinates that
1205         eventually get turned into relative ones anyway...
1206 */
1207
1208
1209 static void find_best_one_pel(motion_engine_t *engine,
1210          uint8_t *org, uint8_t *blk,
1211          int searched_size,
1212          int i0, int j0,
1213          int ilow, int jlow,
1214          int xmax, int ymax,
1215          int lx, int h, 
1216          mb_motion_s *res
1217         )
1218
1219 {
1220         int i,k;
1221         int d;
1222         blockxy minpos = res->pos;
1223         int dmin = INT_MAX;
1224         uint8_t *orgblk;
1225         int penalty;
1226         blockxy matchrec;
1227  
1228         for( k = 0; k < searched_size; ++k )
1229         {       
1230                 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1231                 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1232                 orgblk = org + matchrec.x+lx*matchrec.y;
1233                 penalty = abs(matchrec.x)+abs(matchrec.y);
1234
1235                 for( i = 0; i < 4; ++i )
1236                 {
1237                         if( matchrec.x <= xmax && matchrec.y <= ymax )
1238                         {
1239                 
1240                                 d = penalty+(*pdist1_00)(orgblk,blk,lx,h, dmin);
1241                                 if (d<dmin)
1242                                 {
1243                                         dmin = d;
1244                                         minpos = matchrec;
1245                                 }
1246                         }
1247                         if( i == 1 )
1248                         {
1249                                 orgblk += lx-1;
1250                                 matchrec.x -= 1;
1251                                 matchrec.y += 1;
1252                         }
1253                         else
1254                         {
1255                                 orgblk += 1;
1256                                 matchrec.x += 1;
1257                         }
1258                 }
1259         }
1260
1261         res->pos = minpos;
1262         res->blk = org + minpos.x+lx*minpos.y;
1263         res->sad = dmin;
1264
1265 }
1266
1267 #ifdef X86_CPU
1268 void find_best_one_pel_mmxe(motion_engine_t *engine,
1269         uint8_t *org, uint8_t *blk,
1270         int searched_size,
1271         int i0, int j0,
1272         int ilow, int jlow,
1273         int xmax, int ymax,
1274         int lx, int h, 
1275         mb_motion_s *res
1276         )
1277
1278 {
1279         int i,k;
1280         int d;
1281         blockxy minpos = res->pos;
1282         int dmin = INT_MAX;
1283         uint8_t *orgblk;
1284         int penalty;
1285         blockxy matchrec;
1286         int resvec[4];
1287
1288  
1289         for( k = 0; k < searched_size; ++k )
1290         {       
1291                 matchrec.x = i0 + engine->sub22_mcomps[k].x;
1292                 matchrec.y = j0 + engine->sub22_mcomps[k].y;
1293                 orgblk = org + matchrec.x+lx*matchrec.y;
1294                 penalty = abs(matchrec.x)+abs(matchrec.y);
1295                 
1296
1297                 mblockq_dist1_mmxe(orgblk,blk,lx,h, resvec);
1298
1299                 for( i = 0; i < 4; ++i )
1300                 {
1301                         if( matchrec.x <= xmax && matchrec.y <= ymax )
1302                         {
1303                 
1304                                 d = penalty+resvec[i];
1305                                 if (d<dmin)
1306                                 {
1307                                         dmin = d;
1308                                         minpos = matchrec;
1309                                 }
1310                         }
1311                         if( i == 1 )
1312                         {
1313                                 orgblk += lx-1;
1314                                 matchrec.x -= 1;
1315                                 matchrec.y += 1;
1316                         }
1317                         else
1318                         {
1319                                 orgblk += 1;
1320                                 matchrec.x += 1;
1321                         }
1322                 }
1323         }
1324
1325         res->pos = minpos;
1326         res->blk = org + minpos.x+lx*minpos.y;
1327         res->sad = dmin;
1328
1329 }
1330 #endif 
1331  
1332 /*
1333  * full search block matching
1334  *
1335  * A.Stevens 2000: This is now a big misnomer.  The search is now a hierarchical/sub-sampling
1336  * search not a full search.  However, experiments have shown it is always close to
1337  * optimal and almost always very close or optimal.
1338  *
1339  * blk: top left pel of (16*h) block
1340  * s22blk: top element of fast motion compensation block corresponding to blk
1341  * h: height of block
1342  * lx: distance (in bytes) of vertically adjacent pels in ref,blk
1343  * org: top left pel of source reference picture
1344  * ref: top left pel of reconstructed reference picture
1345  * i0,j0: center of search window
1346  * sx,sy: half widths of search window
1347  * xmax,ymax: right/bottom limits of search area
1348  * iminp,jminp: pointers to where the result is stored
1349  *              result is given as half pel offset from ref(0,0)
1350  *              i.e. NOT relative to (i0,j0)
1351  */
1352
1353
1354 static void fullsearch(motion_engine_t *engine,
1355         uint8_t *org,
1356         uint8_t *ref,
1357         subsampled_mb_s *ssblk,
1358         int lx, int i0, int j0, 
1359         int sx, int sy, int h,
1360         int xmax, int ymax,
1361         /* int *iminp, int *jminp, int *sadminp, */
1362         mb_motion_s *res
1363         )
1364 {
1365         mb_motion_s best;
1366         /* int imin, jmin, dmin */
1367         int i,j,ilow,ihigh,jlow,jhigh;
1368         int d;
1369
1370         /* NOTE: Surprisingly, the initial motion compensation search
1371            works better when the original image not the reference (reconstructed)
1372            image is used. 
1373         */
1374         uint8_t *s22org = (uint8_t*)(org+fsubsample_offset);
1375         uint8_t *s44org = (uint8_t*)(org+qsubsample_offset);
1376         uint8_t *orgblk;
1377         int flx = lx >> 1;
1378         int qlx = lx >> 2;
1379         int fh = h >> 1;
1380         int qh = h >> 2;
1381
1382         /* xmax and ymax into more useful form... */
1383         xmax -= 16;
1384         ymax -= h;
1385   
1386   
1387         /* The search radii are *always* multiples of 4 to avoid messiness
1388            in the initial 4*4 pel search.  This is handled by the
1389            parameter checking/processing code in readparmfile() */
1390   
1391         /* Create a distance-order mcomps of possible motion compensations
1392           based on the fast estimation data - 4*4 pel sums (4*4
1393           sub-sampled) rather than actual pel's.  1/16 the size...  */
1394         jlow = j0-sy;
1395         jlow = jlow < 0 ? 0 : jlow;
1396         jhigh =  j0+(sy-1);
1397         jhigh = jhigh > ymax ? ymax : jhigh;
1398         ilow = i0-sx;
1399         ilow = ilow < 0 ? 0 : ilow;
1400         ihigh =  i0+(sx-1);
1401         ihigh = ihigh > xmax ? xmax : ihigh;
1402
1403         /*
1404            Very rarely this may fail to find matchs due to all the good
1405            looking ones being over threshold. hence we make sure we
1406            fall back to a 0 motion compensation in this case.
1407            
1408                  The sad for the 0 motion compensation is also very useful as
1409                  a basis for setting thresholds for rejecting really dud 4*4
1410                  and 2*2 sub-sampled matches.
1411         */
1412         memset(&best, 0, sizeof(best));
1413         best.sad = (*pdist1_00)(ref + i0 + j0 * lx,
1414                 ssblk->mb,
1415                 lx,
1416                 h,
1417                 best.sad);
1418         best.pos.x = i0;
1419         best.pos.y = j0;
1420
1421         engine->sub44_num_mcomps = build_sub44_mcomps(engine,
1422                                                                          ilow, jlow, ihigh, jhigh,
1423                                                                           i0, j0,
1424                                                                           best.sad,
1425                                                                           s44org, 
1426                                                                           ssblk->qmb, qlx, qh );
1427
1428         
1429         /* Now create a distance-ordered mcomps of possible motion
1430            compensations based on the fast estimation data - 2*2 pel sums
1431            using the best fraction of the 4*4 estimates However we cover
1432            only coarsely... on 4-pel boundaries...  */
1433
1434         engine->sub22_num_mcomps = (*pbuild_sub22_mcomps)( engine, i0, j0, ihigh,  jhigh, 
1435                                                                                            best.sad,
1436                                                                                            s22org, ssblk->fmb, flx, fh, 
1437                                                                                            engine->sub44_num_mcomps );
1438
1439                 
1440     /* Now choose best 1-pel match from what approximates (not exact
1441            due to the pre-processing trick with the mean) the top 1/2 of
1442            the 2*2 matches 
1443         */
1444         
1445
1446         (*pfind_best_one_pel)( engine, ref, ssblk->mb, engine->sub22_num_mcomps,
1447                                            i0, j0,
1448                                            ilow, jlow, xmax, ymax, 
1449                                            lx, h, &best );
1450
1451         /* Final polish: half-pel search of best candidate against
1452            reconstructed image. 
1453         */
1454
1455         best.pos.x <<= 1; 
1456         best.pos.y <<= 1;
1457         best.hx = 0;
1458         best.hy = 0;
1459
1460         ilow = best.pos.x - (best.pos.x>(ilow<<1));
1461         ihigh = best.pos.x + (best.pos.x<((ihigh)<<1));
1462         jlow = best.pos.y - (best.pos.y>(jlow<<1));
1463         jhigh =  best.pos.y+ (best.pos.y<((jhigh)<<1));
1464
1465         for (j=jlow; j<=jhigh; j++)
1466         {
1467                 for (i=ilow; i<=ihigh; i++)
1468                 {
1469                         orgblk = ref+(i>>1)+((j>>1)*lx);
1470                         if( i&1 )
1471                         {
1472                                 if( j & 1 )
1473                                         d = (*pdist1_11)(orgblk,ssblk->mb,lx,h);
1474                                 else
1475                                         d = (*pdist1_01)(orgblk,ssblk->mb,lx,h);
1476                         }
1477                         else
1478                         {
1479                                 if( j & 1 )
1480                                         d = (*pdist1_10)(orgblk,ssblk->mb,lx,h);
1481                                 else
1482                                         d = (*pdist1_00)(orgblk,ssblk->mb,lx,h,best.sad);
1483                         }
1484                         if (d<best.sad)
1485                         {
1486                                 best.sad = d;
1487                                 best.pos.x = i;
1488                                 best.pos.y = j;
1489                                 best.blk = orgblk;
1490                                 best.hx = i&1;
1491                                 best.hy = j&1;
1492                         }
1493                 }
1494         }
1495         best.var = (*pdist2)(best.blk, ssblk->mb, lx, best.hx, best.hy, h);
1496         *res = best;
1497 }
1498
1499 /*
1500  * total absolute difference between two (16*h) blocks
1501  * including optional half pel interpolation of blk1 (hx,hy)
1502  * blk1,blk2: addresses of top left pels of both blocks
1503  * lx:        distance (in bytes) of vertically adjacent pels
1504  * hx,hy:     flags for horizontal and/or vertical interpolation
1505  * h:         height of block (usually 8 or 16)
1506  * distlim:   bail out if sum exceeds this value
1507  */
1508
1509 /* A.Stevens 2000: New version for highly pipelined CPUs where branching is
1510    costly.  Really it sucks that C doesn't define a stdlib abs that could
1511    be realised as a compiler intrinsic using appropriate CPU instructions.
1512    That 1970's heritage...
1513 */
1514
1515
1516 static int dist1_00(uint8_t *blk1,uint8_t *blk2,
1517                                         int lx, int h,int distlim)
1518 {
1519         uint8_t *p1,*p2;
1520         int j;
1521         int s;
1522         int v;
1523
1524         s = 0;
1525         p1 = blk1;
1526         p2 = blk2;
1527         for (j=0; j<h; j++)
1528         {
1529
1530 #define pipestep(o) v = p1[o]-p2[o]; s+= abs(v);
1531                 pipestep(0);  pipestep(1);  pipestep(2);  pipestep(3);
1532                 pipestep(4);  pipestep(5);  pipestep(6);  pipestep(7);
1533                 pipestep(8);  pipestep(9);  pipestep(10); pipestep(11);
1534                 pipestep(12); pipestep(13); pipestep(14); pipestep(15);
1535 #undef pipestep
1536
1537                 if (s >= distlim)
1538                         break;
1539                         
1540                 p1+= lx;
1541                 p2+= lx;
1542         }
1543         return s;
1544 }
1545
1546 static int dist1_01(uint8_t *blk1,uint8_t *blk2,
1547                                         int lx, int h)
1548 {
1549         uint8_t *p1,*p2;
1550         int i,j;
1551         int s;
1552         int v;
1553
1554         s = 0;
1555         p1 = blk1;
1556         p2 = blk2;
1557         for (j=0; j<h; j++)
1558         {
1559                 for (i=0; i<16; i++)
1560                 {
1561
1562                         v = ((unsigned int)(p1[i]+p1[i+1])>>1) - p2[i];
1563                         /*
1564                           v = ((p1[i]>>1)+(p1[i+1]>>1)>>1) - (p2[i]>>1);
1565                         */
1566                         s+=abs(v);
1567                 }
1568                 p1+= lx;
1569                 p2+= lx;
1570         }
1571         return s;
1572 }
1573
1574 static int dist1_10(uint8_t *blk1,uint8_t *blk2,
1575                                         int lx, int h)
1576 {
1577         uint8_t *p1,*p1a,*p2;
1578         int i,j;
1579         int s;
1580         int v;
1581
1582         s = 0;
1583         p1 = blk1;
1584         p2 = blk2;
1585         p1a = p1 + lx;
1586         for (j=0; j<h; j++)
1587         {
1588                 for (i=0; i<16; i++)
1589                 {
1590                         v = ((unsigned int)(p1[i]+p1a[i])>>1) - p2[i];
1591                         s+=abs(v);
1592                 }
1593                 p1 = p1a;
1594                 p1a+= lx;
1595                 p2+= lx;
1596         }
1597
1598         return s;
1599 }
1600
1601 static int dist1_11(uint8_t *blk1,uint8_t *blk2,
1602                                         int lx, int h)
1603 {
1604         uint8_t *p1,*p1a,*p2;
1605         int i,j;
1606         int s;
1607         int v;
1608
1609         s = 0;
1610         p1 = blk1;
1611         p2 = blk2;
1612         p1a = p1 + lx;
1613           
1614         for (j=0; j<h; j++)
1615         {
1616                 for (i=0; i<16; i++)
1617                 {
1618                         v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1])>>2) - p2[i];
1619                         s+=abs(v);
1620                 }
1621                 p1 = p1a;
1622                 p1a+= lx;
1623                 p2+= lx;
1624         }
1625         return s;
1626 }
1627
1628 /* USED only during debugging...
1629 void check_fast_motion_data(uint8_t *blk, char *label )
1630 {
1631   uint8_t *b, *nb;
1632   uint8_t *pb,*p;
1633   uint8_t *qb;
1634   uint8_t *start_s22blk, *start_s44blk;
1635   int i;
1636   unsigned int mismatch;
1637   int nextfieldline;
1638   start_s22blk = (blk+height*width);
1639   start_s44blk = (blk+height*width+(height>>1)*(width>>1));
1640
1641   if (pict_struct==FRAME_PICTURE)
1642         {
1643           nextfieldline = width;
1644         }
1645   else
1646         {
1647           nextfieldline = 2*width;
1648         }
1649
1650         mismatch = 0;
1651         pb = start_s22blk;
1652         qb = start_s44blk;
1653         p = blk;
1654         while( pb < qb )
1655         {
1656                 for( i = 0; i < nextfieldline/2; ++i )
1657                 {
1658                         mismatch |= ((p[0] + p[1] + p[nextfieldline] + p[nextfieldline+1])>>2) != *pb;
1659                         p += 2;
1660                         ++pb;                   
1661                 }
1662                 p += nextfieldline;
1663         }
1664                 if( mismatch )
1665                         {
1666                                 printf( "Mismatch detected check %s for buffer %08x\n", label,  blk );
1667                                         exit(1);
1668                         }
1669 }
1670 */
1671
1672 /* 
1673    Append fast motion estimation data to original luminance
1674    data.  N.b. memory allocation for luminance data allows space
1675    for this information...
1676  */
1677
1678 void fast_motion_data(uint8_t *blk, int picture_struct )
1679 {
1680         uint8_t *b, *nb;
1681         uint8_t *pb;
1682         uint8_t *qb;
1683         uint8_t *start_s22blk, *start_s44blk;
1684         int i;
1685         int nextfieldline;
1686 #ifdef TEST_RCSEARCH
1687         uint16_t *start_rowblk, *start_colblk;
1688         uint16_t *pc, *pr,*p;
1689         int rowsum;
1690         int j,s;
1691         int down16 = width*16;
1692         uint16_t sums[32];
1693         uint16_t rowsums[2048];
1694         uint16_t colsums[2048];  /* TODO: BUG: should resize with width */
1695         start_rowblk = (uint16_t *)blk+rowsums_offset;
1696         start_colblk = (uint16_t *)blk+colsums_offset;
1697 #endif 
1698   
1699
1700         /* In an interlaced field the "next" line is 2 width's down 
1701            rather than 1 width down                                 */
1702
1703         if (picture_struct==FRAME_PICTURE)
1704         {
1705                 nextfieldline = width;
1706         }
1707         else
1708         {
1709                 nextfieldline = 2*width;
1710         }
1711
1712         start_s22blk   = blk+fsubsample_offset;
1713         start_s44blk   = blk+qsubsample_offset;
1714         b = blk;
1715         nb = (blk+nextfieldline);
1716         /* Sneaky stuff here... we can do lines in both fields at once */
1717         pb = (uint8_t *) start_s22blk;
1718
1719         while( nb < start_s22blk )
1720         {
1721                 for( i = 0; i < nextfieldline/4; ++i ) /* We're doing 4 pels horizontally at once */
1722                 {
1723                         /* TODO: A.Stevens this has to be the most word-length dependent
1724                            code in the world.  Better than MMX assembler though I guess... */
1725                         pb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1726                         pb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;     
1727                         pb += 2;
1728                         b += 4;
1729                         nb += 4;
1730                 }
1731                 b += nextfieldline;
1732                 nb = b + nextfieldline;
1733         }
1734
1735
1736         /* Now create the 4*4 sub-sampled data from the 2*2 
1737            N.b. the 2*2 sub-sampled motion data preserves the interlace structure of the
1738            original.  Albeit half as many lines and pixels...
1739         */
1740
1741         nextfieldline = nextfieldline >> 1;
1742
1743         qb = start_s44blk;
1744         b  = start_s22blk;
1745         nb = (start_s22blk+nextfieldline);
1746
1747         while( nb < start_s44blk )
1748         {
1749                 for( i = 0; i < nextfieldline/4; ++i )
1750                 {
1751                         /* TODO: BRITTLE: A.Stevens - this only works for uint8_t = uint8_t */
1752                         qb[0] = (b[0]+b[1]+nb[0]+nb[1])>>2;
1753                         qb[1] = (b[2]+b[3]+nb[2]+nb[3])>>2;
1754                         qb += 2;
1755                         b += 4;
1756                         nb += 4;
1757                 }
1758                 b += nextfieldline;
1759                 nb = b + nextfieldline;
1760         }
1761
1762 #ifdef TEST_RCSEARCH
1763         /* TODO: BUG: THIS CODE DOES NOT YET ALLOW FOR INTERLACED FIELDS.... */
1764   
1765         /*
1766           Initial row sums....
1767         */
1768         pb = blk;
1769         for(j = 0; j < height; ++j )
1770         {
1771                 rowsum = 0;
1772                 for( i = 0; i < 16; ++ i )
1773                 {
1774                         rowsum += pb[i];
1775                 }
1776                 rowsums[j] = rowsum;
1777                 pb += width;
1778         }
1779   
1780         /*
1781           Initial column sums
1782         */
1783         for( i = 0; i < width; ++i )
1784         {
1785                 colsums[i] = 0;
1786         }
1787         pb = blk;
1788         for( j = 0; j < 16; ++j )
1789         {
1790                 for( i = 0; i < width; ++i )
1791                 {
1792                         colsums[i] += *pb;
1793                         ++pb;
1794                 }
1795         }
1796   
1797         /* Now fill in the row/column sum tables...
1798            Note: to allow efficient construction of sum/col differences for a
1799            given position row sums are held in a *column major* array
1800            (changing y co-ordinate makes for small index changes)
1801            the col sums are held in a *row major* array
1802         */
1803   
1804         pb = blk;
1805         pc = start_colblk;
1806         for(j = 0; j <32; ++j )
1807         {
1808                 pr = start_rowblk;
1809                 rowsum = rowsums[j];
1810                 for( i = 0; i < width-16; ++i )
1811                 {
1812                         pc[i] = colsums[i];
1813                         pr[j] = rowsum;
1814                         colsums[i] = (colsums[i] + pb[down16] )-pb[0];
1815                         rowsum = (rowsum + pb[16]) - pb[0];
1816                         ++pb;
1817                         pr += height;
1818                 }
1819                 pb += 16;   /* move pb on to next row... rememember we only did width-16! */
1820                 pc += width;
1821         }
1822 #endif          
1823 }
1824
1825
1826 static int dist22( uint8_t *s22blk1, uint8_t *s22blk2,int flx,int fh)
1827 {
1828         uint8_t *p1 = s22blk1;
1829         uint8_t *p2 = s22blk2;
1830         int s = 0;
1831         int j;
1832
1833         for( j = 0; j < fh; ++j )
1834         {
1835                 int diff;
1836 #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff)
1837                 pipestep(0); pipestep(1);
1838                 pipestep(2); pipestep(3);
1839                 pipestep(4); pipestep(5);
1840                 pipestep(6); pipestep(7);
1841                 p1 += flx;
1842                 p2 += flx;
1843 #undef pipestep
1844         }
1845
1846         return s;
1847 }
1848
1849
1850
1851 /*
1852   Sum absolute differences for 4*4 sub-sampled data.  
1853
1854   TODO: currently assumes  only 16*16 or 16*8 motion compensation will be used...
1855   I.e. 4*4 or 4*2 sub-sampled blocks will be compared.
1856  */
1857
1858
1859 static int dist44( uint8_t *s44blk1, uint8_t *s44blk2,int qlx,int qh)
1860 {
1861         uint8_t *p1 = s44blk1;
1862         uint8_t *p2 = s44blk2;
1863         int s = 0;
1864         int diff;
1865
1866         /* #define pipestep(o) diff = p1[o]-p2[o]; s += abs(diff) */
1867 #define pipestep(o) diff = p1[o]-p2[o]; s += diff < 0 ? -diff : diff;
1868         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1869         if( qh > 1 )
1870         {
1871                 p1 += qlx; p2 += qlx;
1872                 pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1873                 if( qh > 2 )
1874                 {
1875                         p1 += qlx; p2 += qlx;
1876                         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1877                         p1 += qlx; p2 += qlx;
1878                         pipestep(0); pipestep(1);        pipestep(2); pipestep(3);
1879                 }
1880         }
1881
1882
1883         return s;
1884 }
1885
1886 /*
1887  * total squared difference between two (8*h) blocks of 2*2 sub-sampled pels
1888  * blk1,blk2: addresses of top left pels of both blocks
1889  * lx:        distance (in bytes) of vertically adjacent pels
1890  * h:         height of block (usually 8 or 16)
1891  */
1892  
1893 static int dist2_22(uint8_t *blk1, uint8_t *blk2, int lx, int h)
1894 {
1895         uint8_t *p1 = blk1, *p2 = blk2;
1896         int i,j,v;
1897         int s = 0;
1898         for (j=0; j<h; j++)
1899         {
1900                 for (i=0; i<8; i++)
1901                 {
1902                         v = p1[i] - p2[i];
1903                         s+= v*v;
1904                 }
1905                 p1+= lx;
1906                 p2+= lx;
1907         }
1908         return s;
1909 }
1910
1911 /* total squared difference between bidirection prediction of (8*h)
1912  * blocks of 2*2 sub-sampled pels and reference 
1913  * blk1f, blk1b,blk2: addresses of top left
1914  * pels of blocks 
1915  * lx: distance (in bytes) of vertically adjacent
1916  * pels 
1917  * h: height of block (usually 4 or 8)
1918  */
1919  
1920 static int bdist2_22(uint8_t *blk1f, uint8_t *blk1b, uint8_t *blk2, 
1921                                          int lx, int h)
1922 {
1923         uint8_t *p1f = blk1f,*p1b = blk1b,*p2 = blk2;
1924         int i,j,v;
1925         int s = 0;
1926         for (j=0; j<h; j++)
1927         {
1928                 for (i=0; i<8; i++)
1929                 {
1930                         v = ((p1f[i]+p1b[i]+1)>>1) - p2[i];
1931                         s+= v*v;
1932                 }
1933                 p1f+= lx;
1934                 p1b+= lx;
1935                 p2+= lx;
1936         }
1937         return s;
1938 }
1939
1940 /*
1941  * total squared difference between two (16*h) blocks
1942  * including optional half pel interpolation of blk1 (hx,hy)
1943  * blk1,blk2: addresses of top left pels of both blocks
1944  * lx:        distance (in bytes) of vertically adjacent pels
1945  * hx,hy:     flags for horizontal and/or vertical interpolation
1946  * h:         height of block (usually 8 or 16)
1947  */
1948  
1949
1950 static int dist2(blk1,blk2,lx,hx,hy,h)
1951         uint8_t *blk1,*blk2;
1952         int lx,hx,hy,h;
1953 {
1954         uint8_t *p1,*p1a,*p2;
1955         int i,j;
1956         int s,v;
1957
1958         s = 0;
1959         p1 = blk1;
1960         p2 = blk2;
1961         if (!hx && !hy)
1962                 for (j=0; j<h; j++)
1963                 {
1964                         for (i=0; i<16; i++)
1965                         {
1966                                 v = p1[i] - p2[i];
1967                                 s+= v*v;
1968                         }
1969                         p1+= lx;
1970                         p2+= lx;
1971                 }
1972         else if (hx && !hy)
1973                 for (j=0; j<h; j++)
1974                 {
1975                         for (i=0; i<16; i++)
1976                         {
1977                                 v = ((unsigned int)(p1[i]+p1[i+1]+1)>>1) - p2[i];
1978                                 s+= v*v;
1979                         }
1980                         p1+= lx;
1981                         p2+= lx;
1982                 }
1983         else if (!hx && hy)
1984         {
1985                 p1a = p1 + lx;
1986                 for (j=0; j<h; j++)
1987                 {
1988                         for (i=0; i<16; i++)
1989                         {
1990                                 v = ((unsigned int)(p1[i]+p1a[i]+1)>>1) - p2[i];
1991                                 s+= v*v;
1992                         }
1993                         p1 = p1a;
1994                         p1a+= lx;
1995                         p2+= lx;
1996                 }
1997         }
1998         else /* if (hx && hy) */
1999         {
2000                 p1a = p1 + lx;
2001                 for (j=0; j<h; j++)
2002                 {
2003                         for (i=0; i<16; i++)
2004                         {
2005                                 v = ((unsigned int)(p1[i]+p1[i+1]+p1a[i]+p1a[i+1]+2)>>2) - p2[i];
2006                                 s+= v*v;
2007                         }
2008                         p1 = p1a;
2009                         p1a+= lx;
2010                         p2+= lx;
2011                 }
2012         }
2013  
2014         return s;
2015 }
2016
2017
2018 /*
2019  * absolute difference error between a (16*h) block and a bidirectional
2020  * prediction
2021  *
2022  * p2: address of top left pel of block
2023  * pf,hxf,hyf: address and half pel flags of forward ref. block
2024  * pb,hxb,hyb: address and half pel flags of backward ref. block
2025  * h: height of block
2026  * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2027  */
2028  
2029
2030 static int bdist1(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2031         uint8_t *pf,*pb,*p2;
2032         int lx,hxf,hyf,hxb,hyb,h;
2033 {
2034         uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2035         int i,j;
2036         int s,v;
2037
2038         pfa = pf + hxf;
2039         pfb = pf + lx*hyf;
2040         pfc = pfb + hxf;
2041
2042         pba = pb + hxb;
2043         pbb = pb + lx*hyb;
2044         pbc = pbb + hxb;
2045
2046         s = 0;
2047
2048         for (j=0; j<h; j++)
2049         {
2050                 for (i=0; i<16; i++)
2051                 {
2052                         v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2053                                   ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2054                                 - *p2++;
2055                         s += abs(v);
2056                 }
2057                 p2+= lx-16;
2058                 pf+= lx-16;
2059                 pfa+= lx-16;
2060                 pfb+= lx-16;
2061                 pfc+= lx-16;
2062                 pb+= lx-16;
2063                 pba+= lx-16;
2064                 pbb+= lx-16;
2065                 pbc+= lx-16;
2066         }
2067
2068         return s;
2069 }
2070
2071 /*
2072  * squared error between a (16*h) block and a bidirectional
2073  * prediction
2074  *
2075  * p2: address of top left pel of block
2076  * pf,hxf,hyf: address and half pel flags of forward ref. block
2077  * pb,hxb,hyb: address and half pel flags of backward ref. block
2078  * h: height of block
2079  * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
2080  */
2081  
2082
2083 static int bdist2(pf,pb,p2,lx,hxf,hyf,hxb,hyb,h)
2084         uint8_t *pf,*pb,*p2;
2085         int lx,hxf,hyf,hxb,hyb,h;
2086 {
2087         uint8_t *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
2088         int i,j;
2089         int s,v;
2090
2091         pfa = pf + hxf;
2092         pfb = pf + lx*hyf;
2093         pfc = pfb + hxf;
2094
2095         pba = pb + hxb;
2096         pbb = pb + lx*hyb;
2097         pbc = pbb + hxb;
2098
2099         s = 0;
2100
2101         for (j=0; j<h; j++)
2102         {
2103                 for (i=0; i<16; i++)
2104                 {
2105                         v = ((((unsigned int)(*pf++ + *pfa++ + *pfb++ + *pfc++ + 2)>>2) +
2106                                   ((unsigned int)(*pb++ + *pba++ + *pbb++ + *pbc++ + 2)>>2) + 1)>>1)
2107                                 - *p2++;
2108                         s+=v*v;
2109                 }
2110                 p2+= lx-16;
2111                 pf+= lx-16;
2112                 pfa+= lx-16;
2113                 pfb+= lx-16;
2114                 pfc+= lx-16;
2115                 pb+= lx-16;
2116                 pba+= lx-16;
2117                 pbb+= lx-16;
2118                 pbc+= lx-16;
2119         }
2120
2121         return s;
2122 }
2123
2124
2125 /*
2126  * variance of a (size*size) block, multiplied by 256
2127  * p:  address of top left pel of block
2128  * lx: distance (in bytes) of vertically adjacent pels
2129  */
2130 static int variance(uint8_t *p, int size,       int lx)
2131 {
2132         int i,j;
2133         unsigned int v,s,s2;
2134
2135         s = s2 = 0;
2136
2137         for (j=0; j<size; j++)
2138         {
2139                 for (i=0; i<size; i++)
2140                 {
2141                         v = *p++;
2142                         s+= v;
2143                         s2+= v*v;
2144                 }
2145                 p+= lx-size;
2146         }
2147         return s2 - (s*s)/(size*size);
2148 }
2149
2150 /*
2151   Compute the variance of the residual of uni-directionally motion
2152   compensated block.
2153  */
2154
2155 static int unidir_pred_var( const mb_motion_s *motion,
2156                                                         uint8_t *mb,  
2157                                                         int lx, 
2158                                                         int h)
2159 {
2160         return (*pdist2)(motion->blk, mb, lx, motion->hx, motion->hy, h);
2161 }
2162
2163
2164 /*
2165   Compute the variance of the residual of bi-directionally motion
2166   compensated block.
2167  */
2168
2169 static int bidir_pred_var( const mb_motion_s *motion_f, 
2170                                                    const mb_motion_s *motion_b,
2171                                                    uint8_t *mb,  
2172                                                    int lx, int h)
2173 {
2174         return (*pbdist2)( motion_f->blk, motion_b->blk,
2175                                            mb, lx, 
2176                                            motion_f->hx, motion_f->hy,
2177                                            motion_b->hx, motion_b->hy,
2178                                            h);
2179 }
2180
2181 /*
2182   Compute SAD for bi-directionally motion compensated blocks...
2183  */
2184
2185 static int bidir_pred_sad( const mb_motion_s *motion_f, 
2186                                                    const mb_motion_s *motion_b,
2187                                                    uint8_t *mb,  
2188                                                    int lx, int h)
2189 {
2190         return (*pbdist1)(motion_f->blk, motion_b->blk, 
2191                                          mb, lx, 
2192                                          motion_f->hx, motion_f->hy,
2193                                          motion_b->hx, motion_b->hy,
2194                                          h);
2195 }
2196
2197 static void frame_ME(motion_engine_t *engine,
2198                                         pict_data_s *picture,
2199                                          motion_comp_s *mc,
2200                                          int mb_row_start,
2201                                          int i, int j, 
2202                                          mbinfo_s *mbi)
2203 {
2204         mb_motion_s framef_mc;
2205         mb_motion_s frameb_mc;
2206         mb_motion_s dualpf_mc;
2207         mb_motion_s topfldf_mc;
2208         mb_motion_s botfldf_mc;
2209         mb_motion_s topfldb_mc;
2210         mb_motion_s botfldb_mc;
2211
2212         int var,v0;
2213         int vmc,vmcf,vmcr,vmci;
2214         int vmcfieldf,vmcfieldr,vmcfieldi;
2215         subsampled_mb_s ssmb;
2216         int imins[2][2],jmins[2][2];
2217 //      int vmc_dp;
2218         int imindmv,jmindmv;
2219         imindmv = 0;  jmindmv = 0;
2220         memset(&dualpf_mc,0,sizeof(dualpf_mc));
2221
2222 //printf("frame_ME 1\n");
2223
2224
2225         /* A.Stevens fast motion estimation data is appended to actual
2226            luminance information 
2227         */
2228         ssmb.mb = mc->cur[0] + mb_row_start + i;
2229         ssmb.umb = (uint8_t*)(mc->cur[1] + (i>>1) + (mb_row_start>>2));
2230         ssmb.vmb = (uint8_t*)(mc->cur[2] + (i>>1) + (mb_row_start>>2));
2231         ssmb.fmb = (uint8_t*)(mc->cur[0] + fsubsample_offset + 
2232                                                   ((i>>1) + (mb_row_start>>2)));
2233         ssmb.qmb = (uint8_t*)(mc->cur[0] + qsubsample_offset + 
2234                                                   (i>>2) + (mb_row_start>>4));
2235
2236         /* Compute variance MB as a measure of Intra-coding complexity
2237            We include chrominance information here, scaled to compensate
2238            for sub-sampling.  Silly MPEG forcing chrom/lum to have same
2239            quantisations...
2240          */
2241         var = variance(ssmb.mb,16,width);
2242
2243 //printf("motion %d\n", picture->pict_type);
2244         if (picture->pict_type==I_TYPE)
2245         {
2246                 mbi->mb_type = MB_INTRA;
2247         }
2248         else if (picture->pict_type==P_TYPE)
2249         {
2250                 /* For P pictures we take into account chrominance. This
2251                    provides much better performance at scene changes */
2252                 var += chrom_var_sum(&ssmb,16,width);
2253
2254                 if (picture->frame_pred_dct)
2255                 {
2256                         fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2257                                            width,i,j,mc->sxf,mc->syf,16,width,height,
2258                                             &framef_mc);
2259                         framef_mc.fieldoff = 0;
2260                         vmc = framef_mc.var +
2261                                 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2262                         mbi->motion_type = MC_FRAME;
2263                 }
2264                 else
2265                 {
2266 //printf("frame_ME 2 %p %p\n", mc->oldorg[0], mc->oldref[0]);
2267                         frame_estimate(engine, mc->oldorg[0],
2268                                 mc->oldref[0],
2269                                 &ssmb,
2270                                 i,
2271                                 j,
2272                                 mc->sxf,
2273                                 mc->syf,
2274                                 &framef_mc,
2275                                 &topfldf_mc,
2276                                 &botfldf_mc,
2277                                 imins,
2278                                 jmins);
2279 //printf("frame_ME 3\n");
2280                         vmcf = framef_mc.var + 
2281                                 unidir_chrom_var_sum( &framef_mc, mc->oldref, &ssmb, width, 16 );
2282                         vmcfieldf = 
2283                                 topfldf_mc.var + 
2284                                 unidir_chrom_var_sum( &topfldf_mc, mc->oldref, &ssmb, (width<<1), 8 ) +
2285                                 botfldf_mc.var + 
2286                                 unidir_chrom_var_sum( &botfldf_mc, mc->oldref, &ssmb, (width<<1), 8 );
2287                         /* DEBUG DP currently disabled... */
2288 //                      if ( M==1)
2289 //                      {
2290 //                              dpframe_estimate(engine, picture,mc->oldref[0],&ssmb,
2291 //                                                               i,j>>1,imins,jmins,
2292 //                                                               &dualpf_mc,
2293 //                                                               &imindmv,&jmindmv, &vmc_dp);
2294 //                      }
2295
2296                         /* NOTE: Typically M =3 so DP actually disabled... */
2297                         /* select between dual prime, frame and field prediction */
2298 //                      if ( M==1 && vmc_dp<vmcf && vmc_dp<vmcfieldf)
2299 //                      {
2300 //                              mbi->motion_type = MC_DMV;
2301 //                              /* No chrominance squared difference  measure yet.
2302 //                                 Assume identical to luminance */
2303 //                              vmc = vmc_dp + vmc_dp;
2304 //                      }
2305 //                      else 
2306                         if ( vmcf < vmcfieldf)
2307                         {
2308                                 mbi->motion_type = MC_FRAME;
2309                                 vmc = vmcf;
2310                                         
2311                         }
2312                         else
2313                         {
2314                                 mbi->motion_type = MC_FIELD;
2315                                 vmc = vmcfieldf;
2316                         }
2317                 }
2318
2319
2320                 /* select between intra or non-intra coding:
2321                  *
2322                  * selection is based on intra block variance (var) vs.
2323                  * prediction error variance (vmc)
2324                  *
2325                  * Used to be: blocks with small prediction error are always 
2326                  * coded non-intra even if variance is smaller (is this reasonable?
2327                  *
2328                  * TODO: A.Stevens Jul 2000
2329                  * The bbmpeg guys have found this to be *unreasonable*.
2330                  * I'm not sure I buy their solution using vmc*2.  It is probabably
2331                  * the vmc>= 9*256 test that is suspect.
2332                  * 
2333                  */
2334
2335
2336                 if (vmc>var /*&& vmc>=(3*3)*16*16*2*/ )
2337                 {
2338                         mbi->mb_type = MB_INTRA;
2339                         mbi->var = var;
2340                 }
2341                 
2342                 else
2343                 {
2344                         /* select between MC / No-MC
2345                          *
2346                          * use No-MC if var(No-MC) <= 1.25*var(MC)
2347                          * (i.e slightly biased towards No-MC)
2348                          *
2349                          * blocks with small prediction error are always coded as No-MC
2350                          * (requires no motion vectors, allows skipping)
2351                          */
2352                         v0 = (*pdist2)(mc->oldref[0]+i+width*j,ssmb.mb,width,0,0,16);
2353
2354                         if (4*v0>5*vmc )
2355                         {
2356                                 /* use MC */
2357                                 var = vmc;
2358                                 mbi->mb_type = MB_FORWARD;
2359                                 if (mbi->motion_type==MC_FRAME)
2360                                 {
2361                                         mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2362                                         mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2363                                 }
2364                                 else if (mbi->motion_type==MC_DMV)
2365                                 {
2366                                         /* these are FRAME vectors */
2367                                         /* same parity vector */
2368                                         mbi->MV[0][0][0] = dualpf_mc.pos.x - (i<<1);
2369                                         mbi->MV[0][0][1] = (dualpf_mc.pos.y<<1) - (j<<1);
2370
2371                                         /* opposite parity vector */
2372                                         mbi->dmvector[0] = imindmv;
2373                                         mbi->dmvector[1] = jmindmv;
2374                                 }
2375                                 else
2376                                 {
2377                                         /* these are FRAME vectors */
2378                                         mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2379                                         mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2380                                         mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2381                                         mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2382                                         mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2383                                         mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2384                                 }
2385                         }
2386                         else
2387                         {
2388
2389                                 /* No-MC */
2390                                 var = v0;
2391                                 mbi->mb_type = 0;
2392                                 mbi->motion_type = MC_FRAME;
2393                                 mbi->MV[0][0][0] = 0;
2394                                 mbi->MV[0][0][1] = 0;
2395                         }
2396                 }
2397         }
2398         else /* if (pict_type==B_TYPE) */
2399         {
2400                 if (picture->frame_pred_dct)
2401                 {
2402                         var = variance(ssmb.mb,16,width);
2403                         /* forward */
2404                         fullsearch(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2405                                            width,i,j,mc->sxf,mc->syf,
2406                                            16,width,height,
2407                                            &framef_mc
2408                                            );
2409                         framef_mc.fieldoff = 0;
2410                         vmcf = framef_mc.var;
2411
2412                         /* backward */
2413                         fullsearch(engine, mc->neworg[0],mc->newref[0],&ssmb,
2414                                            width,i,j,mc->sxb,mc->syb,
2415                                            16,width,height,
2416                                            &frameb_mc);
2417                         frameb_mc.fieldoff = 0;
2418                         vmcr = frameb_mc.var;
2419
2420                         /* interpolated (bidirectional) */
2421
2422                         vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2423
2424                         /* decisions */
2425
2426                         /* select between forward/backward/interpolated prediction:
2427                          * use the one with smallest mean sqaured prediction error
2428                          */
2429                         if (vmcf<=vmcr && vmcf<=vmci)
2430                         {
2431                                 vmc = vmcf;
2432                                 mbi->mb_type = MB_FORWARD;
2433                         }
2434                         else if (vmcr<=vmci)
2435                         {
2436                                 vmc = vmcr;
2437                                 mbi->mb_type = MB_BACKWARD;
2438                         }
2439                         else
2440                         {
2441                                 vmc = vmci;
2442                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2443                         }
2444
2445                         mbi->motion_type = MC_FRAME;
2446                 }
2447                 else
2448                 {
2449                         /* forward prediction */
2450                         frame_estimate(engine, mc->oldorg[0],mc->oldref[0],&ssmb,
2451                                                    i,j,mc->sxf,mc->syf,
2452                                                    &framef_mc,
2453                                                    &topfldf_mc,
2454                                                    &botfldf_mc,
2455                                                    imins,jmins);
2456
2457
2458                         /* backward prediction */
2459                         frame_estimate(engine, mc->neworg[0],mc->newref[0],&ssmb,
2460                                                    i,j,mc->sxb,mc->syb,
2461                                                    &frameb_mc,
2462                                                    &topfldb_mc,
2463                                                    &botfldb_mc,
2464                                                    imins,jmins);
2465
2466                         vmcf = framef_mc.var;
2467                         vmcr = frameb_mc.var;
2468                         vmci = bidir_pred_var( &framef_mc, &frameb_mc, ssmb.mb, width, 16 );
2469
2470                         vmcfieldf = topfldf_mc.var + botfldf_mc.var;
2471                         vmcfieldr = topfldb_mc.var + botfldb_mc.var;
2472                         vmcfieldi = bidir_pred_var( &topfldf_mc, &topfldb_mc, ssmb.mb, 
2473                                                                                 width<<1, 8) +
2474                                         bidir_pred_var( &botfldf_mc, &botfldb_mc, ssmb.mb, 
2475                                                                                 width<<1, 8);
2476
2477                         /* select prediction type of minimum distance from the
2478                          * six candidates (field/frame * forward/backward/interpolated)
2479                          */
2480                         if (vmci<vmcfieldi && vmci<vmcf && vmci<vmcfieldf
2481                                   && vmci<vmcr && vmci<vmcfieldr)
2482                         {
2483                                 /* frame, interpolated */
2484                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2485                                 mbi->motion_type = MC_FRAME;
2486                                 vmc = vmci;
2487                         }
2488                         else if ( vmcfieldi<vmcf && vmcfieldi<vmcfieldf
2489                                            && vmcfieldi<vmcr && vmcfieldi<vmcfieldr)
2490                         {
2491                                 /* field, interpolated */
2492                                 mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2493                                 mbi->motion_type = MC_FIELD;
2494                                 vmc = vmcfieldi;
2495                         }
2496                         else if (vmcf<vmcfieldf && vmcf<vmcr && vmcf<vmcfieldr)
2497                         {
2498                                 /* frame, forward */
2499                                 mbi->mb_type = MB_FORWARD;
2500                                 mbi->motion_type = MC_FRAME;
2501                                 vmc = vmcf;
2502
2503                         }
2504                         else if ( vmcfieldf<vmcr && vmcfieldf<vmcfieldr)
2505                         {
2506                                 /* field, forward */
2507                                 mbi->mb_type = MB_FORWARD;
2508                                 mbi->motion_type = MC_FIELD;
2509                                 vmc = vmcfieldf;
2510                         }
2511                         else if (vmcr<vmcfieldr)
2512                         {
2513                                 /* frame, backward */
2514                                 mbi->mb_type = MB_BACKWARD;
2515                                 mbi->motion_type = MC_FRAME;
2516                                 vmc = vmcr;
2517                         }
2518                         else
2519                         {
2520                                 /* field, backward */
2521                                 mbi->mb_type = MB_BACKWARD;
2522                                 mbi->motion_type = MC_FIELD;
2523                                 vmc = vmcfieldr;
2524
2525                         }
2526                 }
2527
2528                 /* select between intra or non-intra coding:
2529                  *
2530                  * selection is based on intra block variance (var) vs.
2531                  * prediction error variance (vmc)
2532                  *
2533                  * Used to be: blocks with small prediction error are always 
2534                  * coded non-intra even if variance is smaller (is this reasonable?
2535                  *
2536                  * TODO: A.Stevens Jul 2000
2537                  * The bbmpeg guys have found this to be *unreasonable*.
2538                  * I'm not sure I buy their solution using vmc*2 in the first comparison.
2539                  * It is probabably the vmc>= 9*256 test that is suspect.
2540                  *
2541                  */
2542                 if (vmc>var && vmc>=9*256)
2543                         mbi->mb_type = MB_INTRA;
2544                 else
2545                 {
2546                         var = vmc;
2547                         if (mbi->motion_type==MC_FRAME)
2548                         {
2549                                 /* forward */
2550                                 mbi->MV[0][0][0] = framef_mc.pos.x - (i<<1);
2551                                 mbi->MV[0][0][1] = framef_mc.pos.y - (j<<1);
2552                                 /* backward */
2553                                 mbi->MV[0][1][0] = frameb_mc.pos.x - (i<<1);
2554                                 mbi->MV[0][1][1] = frameb_mc.pos.y - (j<<1);
2555                         }
2556                         else
2557                         {
2558                                 /* these are FRAME vectors */
2559                                 /* forward */
2560                                 mbi->MV[0][0][0] = topfldf_mc.pos.x - (i<<1);
2561                                 mbi->MV[0][0][1] = (topfldf_mc.pos.y<<1) - (j<<1);
2562                                 mbi->MV[1][0][0] = botfldf_mc.pos.x - (i<<1);
2563                                 mbi->MV[1][0][1] = (botfldf_mc.pos.y<<1) - (j<<1);
2564                                 mbi->mv_field_sel[0][0] = topfldf_mc.fieldsel;
2565                                 mbi->mv_field_sel[1][0] = botfldf_mc.fieldsel;
2566                                 /* backward */
2567                                 mbi->MV[0][1][0] = topfldb_mc.pos.x - (i<<1);
2568                                 mbi->MV[0][1][1] = (topfldb_mc.pos.y<<1) - (j<<1);
2569                                 mbi->MV[1][1][0] = botfldb_mc.pos.x - (i<<1);
2570                                 mbi->MV[1][1][1] = (botfldb_mc.pos.y<<1) - (j<<1);
2571                                 mbi->mv_field_sel[0][1] = topfldb_mc.fieldsel;
2572                                 mbi->mv_field_sel[1][1] = botfldb_mc.fieldsel;
2573                         }
2574                 }
2575         }
2576 }
2577
2578
2579
2580 /*
2581  * motion estimation for field pictures
2582  *
2583  * mbi:    pointer to macroblock info structure
2584  * secondfield: indicates second field of a frame (in P fields this means
2585  *              that reference field of opposite parity is in curref instead
2586  *              of oldref)
2587  * ipflag: indicates a P type field which is the second field of a frame
2588  *         in which the first field is I type (this restricts predictions
2589  *         to be based only on the opposite parity (=I) field)
2590  *
2591  * results:
2592  * mbi->
2593  *  mb_type: 0, MB_INTRA, MB_FORWARD, MB_BACKWARD, MB_FORWARD|MB_BACKWARD
2594  *  MV[][][]: motion vectors (field format)
2595  *  mv_field_sel: top/bottom field
2596  *  motion_type: MC_FIELD, MC_16X8
2597  *
2598  * uses global vars: pict_type, pict_struct
2599  */
2600 static void field_ME(motion_engine_t *engine,
2601         pict_data_s *picture,
2602         motion_comp_s *mc,
2603         int mb_row_start,
2604         int i, int j, 
2605         mbinfo_s *mbi, 
2606         int secondfield, int ipflag)
2607 {
2608         int w2;
2609         uint8_t *toporg, *topref, *botorg, *botref;
2610         subsampled_mb_s ssmb;
2611         mb_motion_s fields_mc, dualp_mc;
2612         mb_motion_s fieldf_mc, fieldb_mc;
2613         mb_motion_s field8uf_mc, field8lf_mc;
2614         mb_motion_s field8ub_mc, field8lb_mc;
2615         int var,vmc,v0,dmcfieldi,dmcfield,dmcfieldf,dmcfieldr,dmc8i;
2616         int imins,jmins;
2617         int dmc8f,dmc8r;
2618 //      int vmc_dp,dmc_dp,dmc;
2619         imins = 0;  jmins = 0;
2620         memset(&dualp_mc,0,sizeof(dualp_mc));
2621         w2 = width<<1;
2622
2623         /* Fast motion data sits at the end of the luminance buffer */
2624         ssmb.mb = mc->cur[0] + i + w2*j;
2625         ssmb.umb = mc->cur[1] + ((i>>1)+(w2>>1)*(j>>1));
2626         ssmb.vmb = mc->cur[2] + ((i>>1)+(w2>>1)*(j>>1));
2627         ssmb.fmb = mc->cur[0] + fsubsample_offset+((i>>1)+(w2>>1)*(j>>1));
2628         ssmb.qmb = mc->cur[0] + qsubsample_offset+ (i>>2)+(w2>>2)*(j>>2);
2629
2630         if (picture->pict_struct==BOTTOM_FIELD)
2631         {
2632                 ssmb.mb += width;
2633                 ssmb.umb += (width >> 1);
2634                 ssmb.vmb += (width >> 1);
2635                 ssmb.fmb += (width >> 1);
2636                 ssmb.qmb += (width >> 2);
2637         }
2638
2639         var = variance(ssmb.mb,16,w2) + 
2640                 ( variance(ssmb.umb,8,(width>>1)) + variance(ssmb.vmb,8,(width>>1)))*2;
2641
2642         if (picture->pict_type==I_TYPE)
2643                 mbi->mb_type = MB_INTRA;
2644         else if (picture->pict_type==P_TYPE)
2645         {
2646                 toporg = mc->oldorg[0];
2647                 topref = mc->oldref[0];
2648                 botorg = mc->oldorg[0] + width;
2649                 botref = mc->oldref[0] + width;
2650
2651                 if (secondfield)
2652                 {
2653                         /* opposite parity field is in same frame */
2654                         if (picture->pict_struct==TOP_FIELD)
2655                         {
2656                                 /* current is top field */
2657                                 botorg = mc->cur[0] + width;
2658                                 botref = mc->curref[0] + width;
2659                         }
2660                         else
2661                         {
2662                                 /* current is bottom field */
2663                                 toporg = mc->cur[0];
2664                                 topref = mc->curref[0];
2665                         }
2666                 }
2667
2668                 field_estimate(engine,
2669                                                 picture,
2670                                            toporg,topref,botorg,botref,&ssmb,
2671                                            i,j,mc->sxf,mc->syf,ipflag,
2672                                            &fieldf_mc,
2673                                            &field8uf_mc,
2674                                            &field8lf_mc,
2675                                            &fields_mc);
2676                 dmcfield = fieldf_mc.sad;
2677                 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2678
2679 //              if (M==1 && !ipflag)  /* generic condition which permits Dual Prime */
2680 //              {
2681 //                      dpfield_estimate(engine,
2682 //                                                      picture,
2683 //                                                       topref,botref,ssmb.mb,i,j,imins,jmins,
2684 //                                                       &dualp_mc,
2685 //                                                       &vmc_dp);
2686 //                      dmc_dp = dualp_mc.sad;
2687 //              }
2688                 
2689 //              /* select between dual prime, field and 16x8 prediction */
2690 //              if (M==1 && !ipflag && dmc_dp<dmc8f && dmc_dp<dmcfield)
2691 //              {
2692 //                      /* Dual Prime prediction */
2693 //                      mbi->motion_type = MC_DMV;
2694 //                      dmc = dualp_mc.sad;
2695 //                      vmc = vmc_dp;
2696 //
2697 //              }
2698 //              else 
2699                 if (dmc8f<dmcfield)
2700                 {
2701                         /* 16x8 prediction */
2702                         mbi->motion_type = MC_16X8;
2703                         /* upper and lower half blocks */
2704                         vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2705                         vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2706                 }
2707                 else
2708                 {
2709                         /* field prediction */
2710                         mbi->motion_type = MC_FIELD;
2711                         vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16 );
2712                 }
2713
2714                 /* select between intra and non-intra coding */
2715
2716                 if ( vmc>var && vmc>=9*256)
2717                         mbi->mb_type = MB_INTRA;
2718                 else
2719                 {
2720                         /* zero MV field prediction from same parity ref. field
2721                          * (not allowed if ipflag is set)
2722                          */
2723                         if (!ipflag)
2724                                 v0 = (*pdist2)(((picture->pict_struct==BOTTOM_FIELD)?botref:topref) + i + w2*j,
2725                                                            ssmb.mb,w2,0,0,16);
2726                         if (ipflag  || (4*v0>5*vmc && v0>=9*256))
2727                         {
2728                                 var = vmc;
2729                                 mbi->mb_type = MB_FORWARD;
2730                                 if (mbi->motion_type==MC_FIELD)
2731                                 {
2732                                         mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2733                                         mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2734                                         mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2735                                 }
2736                                 else if (mbi->motion_type==MC_DMV)
2737                                 {
2738                                         /* same parity vector */
2739                                         mbi->MV[0][0][0] = imins - (i<<1);
2740                                         mbi->MV[0][0][1] = jmins - (j<<1);
2741
2742                                         /* opposite parity vector */
2743                                         mbi->dmvector[0] = dualp_mc.pos.x;
2744                                         mbi->dmvector[1] = dualp_mc.pos.y;
2745                                 }
2746                                 else
2747                                 {
2748                                         mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2749                                         mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2750                                         mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2751                                         mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2752                                         mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2753                                         mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2754                                 }
2755                         }
2756                         else
2757                         {
2758                                 /* No MC */
2759                                 var = v0;
2760                                 mbi->mb_type = 0;
2761                                 mbi->motion_type = MC_FIELD;
2762                                 mbi->MV[0][0][0] = 0;
2763                                 mbi->MV[0][0][1] = 0;
2764                                 mbi->mv_field_sel[0][0] = (picture->pict_struct==BOTTOM_FIELD);
2765                         }
2766                 }
2767         }
2768         else /* if (pict_type==B_TYPE) */
2769         {
2770                 /* forward prediction */
2771                 field_estimate(engine, 
2772                                                 picture,
2773                                            mc->oldorg[0],mc->oldref[0],
2774                                            mc->oldorg[0]+width,mc->oldref[0]+width,&ssmb,
2775                                            i,j,mc->sxf,mc->syf,0,
2776                                            &fieldf_mc,
2777                                            &field8uf_mc,
2778                                            &field8lf_mc,
2779                                            &fields_mc);
2780                 dmcfieldf = fieldf_mc.sad;
2781                 dmc8f = field8uf_mc.sad + field8lf_mc.sad;
2782
2783                 /* backward prediction */
2784                 field_estimate(engine,
2785                                                 picture,
2786                                            mc->neworg[0],mc->newref[0],mc->neworg[0]+width,mc->newref[0]+width,
2787                                            &ssmb,
2788                                            i,j,mc->sxb,mc->syb,0,
2789                                            &fieldb_mc,
2790                                            &field8ub_mc,
2791                                            &field8lb_mc,
2792                                            &fields_mc);
2793                 dmcfieldr = fieldb_mc.sad;
2794                 dmc8r = field8ub_mc.sad + field8lb_mc.sad;
2795
2796                 /* calculate distances for bidirectional prediction */
2797                 /* field */
2798                 dmcfieldi = bidir_pred_sad( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2799
2800                 /* 16x8 upper and lower half blocks */
2801                 dmc8i =  bidir_pred_sad( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 16 );
2802                 dmc8i += bidir_pred_sad( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 16 );
2803
2804                 /* select prediction type of minimum distance */
2805                 if (dmcfieldi<dmc8i && dmcfieldi<dmcfieldf && dmcfieldi<dmc8f
2806                         && dmcfieldi<dmcfieldr && dmcfieldi<dmc8r)
2807                 {
2808                         /* field, interpolated */
2809                         mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2810                         mbi->motion_type = MC_FIELD;
2811                         vmc = bidir_pred_var( &fieldf_mc, &fieldb_mc, ssmb.mb, w2, 16);
2812                 }
2813                 else if (dmc8i<dmcfieldf && dmc8i<dmc8f
2814                                  && dmc8i<dmcfieldr && dmc8i<dmc8r)
2815                 {
2816                         /* 16x8, interpolated */
2817                         mbi->mb_type = MB_FORWARD|MB_BACKWARD;
2818                         mbi->motion_type = MC_16X8;
2819
2820                         /* upper and lower half blocks */
2821                         vmc =  bidir_pred_var( &field8uf_mc, &field8ub_mc, ssmb.mb, w2, 8);
2822                         vmc += bidir_pred_var( &field8lf_mc, &field8lb_mc, ssmb.mb, w2, 8);
2823                 }
2824                 else if (dmcfieldf<dmc8f && dmcfieldf<dmcfieldr && dmcfieldf<dmc8r)
2825                 {
2826                         /* field, forward */
2827                         mbi->mb_type = MB_FORWARD;
2828                         mbi->motion_type = MC_FIELD;
2829                         vmc = unidir_pred_var( &fieldf_mc, ssmb.mb, w2, 16);
2830                 }
2831                 else if (dmc8f<dmcfieldr && dmc8f<dmc8r)
2832                 {
2833                         /* 16x8, forward */
2834                         mbi->mb_type = MB_FORWARD;
2835                         mbi->motion_type = MC_16X8;
2836
2837                         /* upper and lower half blocks */
2838                         vmc =  unidir_pred_var( &field8uf_mc, ssmb.mb, w2, 8);
2839                         vmc += unidir_pred_var( &field8lf_mc, ssmb.mb, w2, 8);
2840                 }
2841                 else if (dmcfieldr<dmc8r)
2842                 {
2843                         /* field, backward */
2844                         mbi->mb_type = MB_BACKWARD;
2845                         mbi->motion_type = MC_FIELD;
2846                         vmc = unidir_pred_var( &fieldb_mc, ssmb.mb, w2, 16 );
2847                 }
2848                 else
2849                 {
2850                         /* 16x8, backward */
2851                         mbi->mb_type = MB_BACKWARD;
2852                         mbi->motion_type = MC_16X8;
2853
2854                         /* upper and lower half blocks */
2855                         vmc =  unidir_pred_var( &field8ub_mc, ssmb.mb, w2, 8);
2856                         vmc += unidir_pred_var( &field8lb_mc, ssmb.mb, w2, 8);
2857
2858                 }
2859
2860                 /* select between intra and non-intra coding */
2861                 if (vmc>var && vmc>=9*256)
2862                         mbi->mb_type = MB_INTRA;
2863                 else
2864                 {
2865                         var = vmc;
2866                         if (mbi->motion_type==MC_FIELD)
2867                         {
2868                                 /* forward */
2869                                 mbi->MV[0][0][0] = fieldf_mc.pos.x - (i<<1);
2870                                 mbi->MV[0][0][1] = fieldf_mc.pos.y - (j<<1);
2871                                 mbi->mv_field_sel[0][0] = fieldf_mc.fieldsel;
2872                                 /* backward */
2873                                 mbi->MV[0][1][0] = fieldb_mc.pos.x - (i<<1);
2874                                 mbi->MV[0][1][1] = fieldb_mc.pos.y - (j<<1);
2875                                 mbi->mv_field_sel[0][1] = fieldb_mc.fieldsel;
2876                         }
2877                         else /* MC_16X8 */
2878                         {
2879                                 /* forward */
2880                                 mbi->MV[0][0][0] = field8uf_mc.pos.x - (i<<1);
2881                                 mbi->MV[0][0][1] = field8uf_mc.pos.y - (j<<1);
2882                                 mbi->mv_field_sel[0][0] = field8uf_mc.fieldsel;
2883                                 mbi->MV[1][0][0] = field8lf_mc.pos.x - (i<<1);
2884                                 mbi->MV[1][0][1] = field8lf_mc.pos.y - ((j+8)<<1);
2885                                 mbi->mv_field_sel[1][0] = field8lf_mc.fieldsel;
2886                                 /* backward */
2887                                 mbi->MV[0][1][0] = field8ub_mc.pos.x - (i<<1);
2888                                 mbi->MV[0][1][1] = field8ub_mc.pos.y - (j<<1);
2889                                 mbi->mv_field_sel[0][1] = field8ub_mc.fieldsel;
2890                                 mbi->MV[1][1][0] = field8lb_mc.pos.x - (i<<1);
2891                                 mbi->MV[1][1][1] = field8lb_mc.pos.y - ((j+8)<<1);
2892                                 mbi->mv_field_sel[1][1] = field8lb_mc.fieldsel;
2893                         }
2894                 }
2895         }
2896
2897 }
2898
2899
2900
2901 /*
2902   Initialise motion compensation - currently purely selection of which
2903   versions of the various low level computation routines to use
2904   
2905   */
2906
2907 void init_motion()
2908 {
2909         int cpucap = cpu_accel();
2910
2911         if( cpucap  == 0 )      /* No MMX/SSE etc support available */
2912         {
2913                 pdist22 = dist22;
2914                 pdist44 = dist44;
2915                 pdist1_00 = dist1_00;
2916                 pdist1_01 = dist1_01;
2917                 pdist1_10 = dist1_10;
2918                 pdist1_11 = dist1_11;
2919                 pbdist1 = bdist1;
2920                 pdist2 = dist2;
2921                 pbdist2 = bdist2;
2922                 pdist2_22 = dist2_22;
2923                 pbdist2_22 = bdist2_22;
2924                 pfind_best_one_pel = find_best_one_pel;
2925                 pbuild_sub22_mcomps     = build_sub22_mcomps;
2926          }
2927 #ifdef X86_CPU
2928         else if(cpucap & ACCEL_X86_MMXEXT ) /* AMD MMX or SSE... */
2929         {
2930                 if(verbose) fprintf( stderr, "SETTING EXTENDED MMX for MOTION!\n");
2931                 pdist22 = dist22_mmxe;
2932                 pdist44 = dist44_mmxe;
2933                 pdist1_00 = dist1_00_mmxe;
2934                 pdist1_01 = dist1_01_mmxe;
2935                 pdist1_10 = dist1_10_mmxe;
2936                 pdist1_11 = dist1_11_mmxe;
2937                 pbdist1 = bdist1_mmx;
2938                 pdist2 = dist2_mmx;
2939                 pbdist2 = bdist2_mmx;
2940                 pdist2_22 = dist2_22_mmx;
2941                 pbdist2_22 = bdist2_22_mmx;
2942                 pfind_best_one_pel = find_best_one_pel_mmxe;
2943                 pbuild_sub22_mcomps     = build_sub22_mcomps_mmxe;
2944                 pmblock_sub44_dists = mblock_sub44_dists_mmxe;
2945
2946         }
2947         else if(cpucap & ACCEL_X86_MMX) /* Ordinary MMX CPU */
2948         {
2949                 if(verbose) fprintf( stderr, "SETTING MMX for MOTION!\n");
2950                 pdist22 = dist22_mmx;
2951                 pdist44 = dist44_mmx;
2952                 pdist1_00 = dist1_00_mmx;
2953                 pdist1_01 = dist1_01_mmx;
2954                 pdist1_10 = dist1_10_mmx;
2955                 pdist1_11 = dist1_11_mmx;
2956                 pbdist1 = bdist1_mmx;
2957                 pdist2 = dist2_mmx;
2958                 pbdist2 = bdist2_mmx;
2959                 pdist2_22 = dist2_22_mmx;
2960                 pbdist2_22 = bdist2_22_mmx;
2961                 pfind_best_one_pel = find_best_one_pel;
2962                 pbuild_sub22_mcomps     = build_sub22_mcomps;
2963                 pmblock_sub44_dists = mblock_sub44_dists_mmx;
2964         }
2965 #endif
2966 }
2967
2968
2969 void motion_engine_loop(motion_engine_t *engine)
2970 {
2971         while(!engine->done)
2972         {
2973                 pthread_mutex_lock(&(engine->input_lock));
2974
2975                 if(!engine->done)
2976                 {
2977                         pict_data_s *picture = engine->pict_data;
2978                         motion_comp_s *mc_data = engine->motion_comp;
2979                         int secondfield = engine->secondfield;
2980                         int ipflag = engine->ipflag;
2981                         mbinfo_s *mbi = picture->mbinfo + (engine->start_row / 16) * (width / 16);
2982                         int i, j;
2983                         int mb_row_incr;                        /* Offset increment to go down 1 row of mb's */
2984                         int mb_row_start;
2985
2986                         if (picture->pict_struct == FRAME_PICTURE)
2987                         {                       
2988                                 mb_row_incr = 16*width;
2989                                 mb_row_start = engine->start_row / 16 * mb_row_incr;
2990 /* loop through all macroblocks of a frame picture */
2991                                 for (j=engine->start_row; j < engine->end_row; j+=16)
2992                                 {
2993                                         for (i=0; i<width; i+=16)
2994                                         {
2995                                                 frame_ME(engine, picture, mc_data, mb_row_start, i,j, mbi);
2996                                                 mbi++;
2997                                         }
2998                                         mb_row_start += mb_row_incr;
2999                                 }
3000                         }
3001                         else
3002                         {               
3003                                 mb_row_incr = (16 * 2) * width;
3004                                 mb_row_start = engine->start_row / 16 * mb_row_incr;
3005 /* loop through all macroblocks of a field picture */
3006                                 for (j=engine->start_row; j < engine->end_row; j+=16)
3007                                 {
3008                                         for (i=0; i<width; i+=16)
3009                                         {
3010                                                 field_ME(engine, picture, mc_data, mb_row_start, i,j,
3011                                                                  mbi,secondfield,ipflag);
3012                                                 mbi++;
3013                                         }
3014                                         mb_row_start += mb_row_incr;
3015                                 }
3016                         }
3017                 }
3018
3019                 pthread_mutex_unlock(&(engine->output_lock));
3020         }
3021 }
3022
3023
3024 void motion_estimation(pict_data_s *picture,
3025         motion_comp_s *mc_data,
3026         int secondfield, int ipflag)
3027 {
3028         int i;
3029 /* Start loop */
3030         for(i = 0; i < processors; i++)
3031         {
3032                 motion_engines[i].motion_comp = mc_data;
3033
3034                 motion_engines[i].pict_data = picture;
3035
3036                 motion_engines[i].secondfield = secondfield;
3037                 motion_engines[i].ipflag = ipflag;
3038                 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3039         }
3040
3041 /* Wait for completion */
3042         for(i = 0; i < processors; i++)
3043         {
3044                 pthread_mutex_lock(&(motion_engines[i].output_lock));
3045         }
3046 }
3047
3048 void start_motion_engines()
3049 {
3050         int i;
3051         int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
3052         int current_row = 0;
3053         pthread_attr_t  attr;
3054         pthread_mutexattr_t mutex_attr;
3055
3056         pthread_mutexattr_init(&mutex_attr);
3057         pthread_attr_init(&attr);
3058         motion_engines = calloc(1, sizeof(motion_engine_t) * processors);
3059         for(i = 0; i < processors; i++)
3060         {
3061                 motion_engines[i].start_row = current_row * 16;
3062                 current_row += rows_per_processor;
3063                 if(current_row > height2 / 16) current_row = height2 / 16;
3064                 motion_engines[i].end_row = current_row * 16;
3065                 pthread_mutex_init(&(motion_engines[i].input_lock), &mutex_attr);
3066                 pthread_mutex_lock(&(motion_engines[i].input_lock));
3067                 pthread_mutex_init(&(motion_engines[i].output_lock), &mutex_attr);
3068                 pthread_mutex_lock(&(motion_engines[i].output_lock));
3069                 motion_engines[i].done = 0;
3070                 pthread_create(&(motion_engines[i].tid), 
3071                         &attr, 
3072                         (void*)motion_engine_loop, 
3073                         &motion_engines[i]);
3074         }
3075 }
3076
3077 void stop_motion_engines()
3078 {
3079         int i;
3080         for(i = 0; i < processors; i++)
3081         {
3082                 motion_engines[i].done = 1;
3083                 pthread_mutex_unlock(&(motion_engines[i].input_lock));
3084                 pthread_join(motion_engines[i].tid, 0);
3085                 pthread_mutex_destroy(&(motion_engines[i].input_lock));
3086                 pthread_mutex_destroy(&(motion_engines[i].output_lock));
3087         }
3088         free(motion_engines);
3089 }
3090
3091