Merge CV, ver=5.1; ops/methods from HV, and interface from CV where possible
[goodguy/history.git] / cinelerra-5.1 / mpeg2enc / mblock_sub44_sads_x86_h.c
diff --git a/cinelerra-5.1/mpeg2enc/mblock_sub44_sads_x86_h.c b/cinelerra-5.1/mpeg2enc/mblock_sub44_sads_x86_h.c
new file mode 100644 (file)
index 0000000..f3cd0b1
--- /dev/null
@@ -0,0 +1,98 @@
+/*
+ *
+ * mblock_sub44_sads_x86_h.c
+ * Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+ *
+ * Fast block sum-absolute difference computation for a rectangular area 4*x
+ * by y where y > h against a 4 by h block.
+ *
+ * Used for 4*4 sub-sampled motion compensation calculations.
+ * 
+ *
+ * This file is part of mpeg2enc, a free MPEG-2 video stream encoder
+ * based on the original MSSG reference design
+ *
+ * mpeg2enc is free software; you can redistribute new parts
+ * and/or modify under the terms of the GNU General Public License 
+ * as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * mpeg2enc is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * See the files for those sections (c) MSSG
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ */
+
+/*
+ *
+ * Generates a vector sad's for 4*4 sub-sampled pel (qpel) data (with
+ * co-ordinates and top-left qpel address) from specified rectangle
+ * against a specified 16*h pel (4*4 qpel) reference block.  The
+ * generated vector contains results only for those sad's that fall
+ * below twice the running best sad and are aligned on 8-pel
+ * boundaries
+ *
+ * Invariant: blk points to top-left sub-sampled pel for macroblock
+ * at (ilow,ihigh)
+ * i{low,high) j(low,high) must be multiples of 4.
+ *
+ * sad = Sum Absolute Differences
+ *
+ * NOTES: for best efficiency i{low,high) should be multiples of 16.
+ *
+ * */
+
+int SIMD_SUFFIX(mblock_sub44_dists)( uint8_t *blk,  uint8_t *ref,
+                                                                        int ilow,int jlow,
+                                                                        int ihigh, int jhigh, 
+                                                                        int h, int rowstride, 
+                                                                        int threshold,
+                                                                        mc_result_s *resvec)
+{
+       int32_t x,y;
+       uint8_t *currowblk = blk;
+       uint8_t *curblk;
+       mc_result_s *cres = resvec;
+       int      gridrowstride = (rowstride);
+
+       for( y=jlow; y <= jhigh ; y+=4)
+       {
+               curblk = currowblk;
+               for( x = ilow; x <= ihigh; x += 4)
+               {
+                       int weight;
+                       if( (x & 15) == (ilow & 15) )
+                       {
+                               load_blk( curblk, rowstride, h );
+                       }
+                       weight = SIMD_SUFFIX(qblock_sad)(ref, h, rowstride);
+                       if( weight <= threshold )
+                       {
+                               threshold = intmin(weight<<2,threshold);
+                               /* Rough and-ready absolute distance penalty */
+                               /* NOTE: This penalty is *vital* to correct operation 
+                                  as otherwise the sub-mean filtering won't work on very
+                                  uniform images.
+                                */
+                               cres->weight = (uint16_t)weight+((intabs(x)+intabs(y))>>3);
+                               cres->x = (uint8_t)x;
+                               cres->y = (uint8_t)y;
+                               ++cres;
+                       }
+                       curblk += 1;
+                       shift_blk(8);
+               }
+               currowblk += gridrowstride;
+       }
+       emms();
+       return cres - resvec;
+}
+
+#undef concat