Merge CV, ver=5.1; ops/methods from HV, and interface from CV where possible

[goodguy/history.git] / cinelerra-5.1 / mpeg2enc / quantize_x86.c
diff --git a/cinelerra-5.1/mpeg2enc/quantize_x86.c b/cinelerra-5.1/mpeg2enc/quantize_x86.c

new file mode 100644 (file)

index 0000000..37bf918
--- /dev/null
+++ b/cinelerra-5.1/mpeg2enc/quantize_x86.c
@@ -0,0 +1,472 @@
+/* quantize_x86.c Quantization / inverse quantization    
+   In compiler (gcc) embdeed assmbley language...
+*/
+
+/* Copyright (C) 2000 Andrew Stevens */
+
+/* This program is free software; you can redistribute it
+ *  and/or modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version 2 of
+ *  the License, or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ *  General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ *
+ */
+
+
+
+/* 
+ * 3DNow version of
+ * Quantisation for non-intra blocks using Test Model 5 quantization
+ *
+ * this quantizer has a bias of 1/8 stepsize towards zero
+ * (except for the DC coefficient)
+ *
+ *     PRECONDITION: src dst point to *disinct* memory buffers...
+ *                   of block_count *adjacent* int16_t[64] arrays...
+ *
+ * RETURN: 1 If non-zero coefficients left after quantisaion 0 otherwise
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <math.h>
+#include <fenv.h>
+#include "global.h"
+#include "cpu_accel.h"
+#include "simd.h"
+#include "attributes.h"
+#include "mmx.h"
+
+/* 
+ * Quantisation for non-intra blocks 
+ *
+ * Various versions for various SIMD instruction sets.  Not all of them
+ * bother to implement the test model 5 quantisation of the reference source
+ * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient).
+ *
+ * Actually, as far as I can tell even the reference source doesn't quite do it
+ * for non-intra (though it *does* for intra).
+ * 
+ * Careful analysis of the code also suggests what it actually does is truncate
+ * with a modest bias towards 1 (the d>>2 factor)
+ *
+ *     PRECONDITION: src dst point to *disinct* memory buffers...
+ *                   of block_count *adjacent* int16_t[64] arrays...
+ *
+ *RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
+ */
+       
+
+/*
+ * 3D-Now version: simply truncates to zero, however, the tables have a 2% bias
+ * upwards which partly compensates.
+ */
+ 
+int quant_non_intra_hv_3dnow(  
+       pict_data_s *picture,
+       int16_t *src, int16_t *dst,
+       int mquant,
+       int *nonsat_mquant)
+{
+       int saturated;
+       int satlim = dctsatlim;
+       float *i_quant_matf; 
+       int   coeff_count = 64*block_count;
+       uint32_t nzflag, flags;
+       int16_t *psrc, *pdst;
+       float *piqf;
+       int i;
+       uint32_t tmp;
+
+       /* Initialise zero block flags */
+       /* Load 1 into mm6 */
+       __asm__ ( "movl %0, %%eax\n" 
+                         "movd %%eax, %%mm6\n"
+                         : :"g" (1) : "eax" );
+       /* Load satlim into mm1 */
+       movd_m2r( satlim, mm1 );
+       punpcklwd_r2r( mm1, mm1 );
+       punpckldq_r2r( mm1, mm1 );
+restart:
+       i_quant_matf = i_inter_q_tblf[mquant];
+       flags = 0;
+       piqf = i_quant_matf;
+       saturated = 0;
+       nzflag = 0;
+       psrc = src;
+       pdst = dst;
+       for (i=0; i < coeff_count ; i+=4)
+       {
+
+               /* TODO: For maximum efficiency this should be unrolled to allow
+                  f.p. and int MMX to be interleaved... 
+               */
+
+               /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
+                */
+
+               movq_m2r( *(mmx_t *)&psrc[0], mm2 );
+               movq_r2r( mm2, mm7 );
+               psraw_i2r( 16, mm7 );   /* Replicate sign bits mm2 in mm7 */
+               movq_r2r( mm2, mm3 );
+               punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
+               punpckhwd_r2r( mm7, mm3);
+
+               /* Multiply by sixteen... */
+               pslld_i2r( 4, mm2 );
+               pslld_i2r( 4, mm3 );
+               
+               /*
+                  Load the inverse quantisation factors from the 
+                  table in to mm4 and mm5
+                  Interleaved with converting mm2 and mm3 to float's
+                  to (hopefully) maximise parallelism.
+                */
+               movq_m2r( *(mmx_t*)&piqf[0], mm4);
+               pi2fd_r2r( mm2, mm2);
+               movq_m2r( *(mmx_t*)&piqf[2], mm5);
+               pi2fd_r2r( mm3, mm3);
+
+               /* "Divide" by multiplying by inverse quantisation
+                and convert back to integers*/
+               pfmul_r2r( mm4, mm2 );
+               pf2id_r2r( mm2, mm2);
+               pfmul_r2r( mm5, mm3);
+               pf2id_r2r( mm3, mm3);
+
+
+               /* Convert the two pairs of double words into four words */
+               packssdw_r2r(  mm3, mm2);
+
+
+               /* Accumulate saturation... */
+               movq_r2r( mm2, mm4 );
+
+               pxor_r2r( mm5, mm5 );   // mm5 = -mm2
+               pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim) 
+               psubw_r2r( mm2, mm5 );
+               pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
+               por_r2r( mm5, mm4 );  // mm4 = abs(mm2) > satlim
+               movq_r2r( mm4, mm5 );
+               psrlq_i2r( 32, mm5);
+               por_r2r( mm5, mm4 );
+
+               movd_m2r( saturated, mm5 ); // saturated |= mm4
+               por_r2r( mm4, mm5 );
+               movd_r2m( mm5, saturated );
+
+               /* Store and accumulate zero-ness */
+               movq_r2r( mm2, mm3 );
+               movq_r2m( mm2, *(mmx_t*)pdst );
+               psrlq_i2r( 32, mm3 );
+               por_r2r( mm3, mm2 );
+               movd_r2m( mm2, tmp );
+               flags |= tmp;
+
+               piqf += 4;
+               pdst += 4;
+               psrc += 4;
+
+               if( (i & 63) == (63/4)*4 )
+               {
+
+                       if( saturated )
+                       {
+                               int new_mquant = next_larger_quant_hv( picture, mquant );
+                               if( new_mquant != mquant )
+                               {
+                                       mquant = new_mquant;
+                                       goto restart;
+                               }
+                               else
+                               {
+                                       return quant_non_intra_hv(picture, src, dst, mquant, 
+                                                                                  nonsat_mquant);
+                               }
+                       }
+
+                       nzflag = (nzflag<<1) | !!flags;
+                       flags = 0;
+                       piqf = i_quant_matf;
+               }
+                       
+       }
+       femms();
+
+       //nzflag = (nzflag<<1) | (!!flags);
+       return nzflag;
+}
+
+
+
+
+
+
+
+/* Disabled in mjpegtools */
+
+#if 0
+/*
+ * SSE version: simply truncates to zero, however, the tables have a 2% bias
+ * upwards which partly compensates.
+ */
+static int trunc_mxcsr = 0x7f80;
+ 
+int quant_non_intra_hv_sse(    
+       pict_data_s *picture,
+       int16_t *src, int16_t *dst,
+       int mquant,
+       int *nonsat_mquant)
+{
+       int saturated;
+       int satlim = dctsatlim;
+       float *i_quant_matf; 
+       int   coeff_count = 64*block_count;
+       uint32_t nzflag, flags;
+       int16_t *psrc, *pdst;
+       float *piqf;
+       int i;
+       uint32_t tmp;
+
+       /* Initialise zero block flags */
+       /* Load 1 into mm6 */
+       __asm__ ( "movl %0, %%eax\n" 
+                         "movd %%eax, %%mm6\n"
+                         : :"g" (1) : "eax" );
+       /* Set up SSE rounding mode */
+       __asm__ ( "ldmxcsr (%0)\n" : : "X" (trunc_mxcsr) );
+
+       /* Load satlim into mm1 */
+       movd_m2r( satlim, mm1 );
+       punpcklwd_r2r( mm1, mm1 );
+       punpckldq_r2r( mm1, mm1 );
+restart:
+       i_quant_matf = i_inter_q_tblf[mquant];
+       flags = 0;
+       piqf = i_quant_matf;
+       saturated = 0;
+       nzflag = 0;
+       psrc = src;
+       pdst = dst;
+       for (i=0; i < coeff_count ; i+=4)
+       {
+
+               /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
+                */
+
+               movq_m2r( *(mmx_t *)&psrc[0], mm2 );
+               movq_r2r( mm2, mm7 );
+               psraw_i2r( 16, mm7 );   /* Replicate sign bits mm2 in mm7 */
+               movq_r2r( mm2, mm3 );
+               punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
+               punpckhwd_r2r( mm7, mm3);
+
+               /* Multiply by sixteen... */
+               pslld_i2r( 4, mm2 );
+               pslld_i2r( 4, mm3 );
+               
+               /*
+                 Convert mm2 and mm3 to float's  in xmm2 and xmm3
+                */
+               cvtpi2ps_r2r( mm2, xmm2 );
+               cvtpi2ps_r2r( mm3, xmm3 );
+               shufps_r2ri(  xmm3, xmm2, 0*1 + 1*4 + 0 * 16 + 1 * 64 );
+
+               /* "Divide" by multiplying by inverse quantisation
+                and convert back to integers*/
+               mulps_m2r( *(mmx_t*)&piqf[0], xmm2 );
+               cvtps2pi_r2r( xmm2, mm2 );
+               shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0 * 16 + 1 * 64 );
+               cvtps2pi_r2r( xmm2, mm3 );
+
+               /* Convert the two pairs of double words into four words */
+               packssdw_r2r(  mm3, mm2);
+
+
+               /* Accumulate saturation... */
+               movq_r2r( mm2, mm4 );
+
+               pxor_r2r( mm5, mm5 );   // mm5 = -mm2
+               pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim) 
+               psubw_r2r( mm2, mm5 );
+               pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
+               por_r2r( mm5, mm4 );  // mm4 = abs(mm2) > satlim
+               movq_r2r( mm4, mm5 );
+               psrlq_i2r( 32, mm5);
+               por_r2r( mm5, mm4 );
+
+               movd_m2r( saturated, mm5 ); // saturated |= mm4
+               por_r2r( mm4, mm5 );
+               movd_r2m( mm5, saturated );
+
+               /* Store and accumulate zero-ness */
+               movq_r2r( mm2, mm3 );
+               movq_r2m( mm2, *(mmx_t*)pdst );
+               psrlq_i2r( 32, mm3 );
+               por_r2r( mm3, mm2 );
+               movd_r2m( mm2, tmp );
+               flags |= tmp;
+                               
+               piqf += 4;
+               pdst += 4;
+               psrc += 4;
+
+               if( (i & 63) == (63/4)*4 )
+               {
+
+                       if( saturated )
+                       {
+                               int new_mquant = next_larger_quant_hv( picture, mquant );
+                               if( new_mquant != mquant )
+                               {
+                                       mquant = new_mquant;
+                                       goto restart;
+                               }
+                               else
+                               {
+                                       return quant_non_intra_hv(picture, src, dst, mquant, 
+                                                                                  nonsat_mquant);
+                               }
+                       }
+
+                       nzflag = (nzflag<<1) | !!flags;
+                       flags = 0;
+                       piqf = i_quant_matf;
+               }
+                       
+       }
+       emms();
+
+       //nzflag = (nzflag<<1) | (!!flags);
+       return nzflag;
+}
+
+#endif
+
+/*
+ * The ordinary MMX version.  Due to the limited dynamic range afforded by working
+ * with 16-bit int's it (a) has to jump through some gory fudge-factor hoops
+ * (b) give up in tough cases and fall back on the reference code. Fortunately, the
+ * latter happens *very* rarely.
+ *
+ * TODO Replace the inefficient block-by-block call to the assembler by a sweep
+ * through the whole lot...
+ */
+                                                                                                                                                                                                                                                                                    
+int quant_non_intra_hv_mmx(
+       pict_data_s *picture,
+       int16_t *src, int16_t *dst,
+       int mquant,
+       int *nonsat_mquant)
+{
+
+       int nzflag;
+       int clipvalue  = dctsatlim;
+       int flags = 0;
+       int saturated = 0;
+       uint16_t *quant_mat = inter_q;
+       int comp;
+       uint16_t *i_quant_mat = i_inter_q;
+       int imquant;
+       int16_t *psrc, *pdst;
+
+       /* MMX routine does not work right for MQ=2 ... (no unsigned mult) */
+       if( mquant == 2 )
+       {
+               return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
+       }
+       /* If available use the fast MMX quantiser.  It returns
+          flags to signal if coefficients are outside its limited range or
+          saturation would occur with the specified quantisation
+          factor
+          Top 16 bits - non zero quantised coefficient present
+          Bits 8-15   - Saturation occurred
+          Bits 0-7    - Coefficient out of range.
+       */
+
+       nzflag = 0;
+       pdst = dst;
+       psrc = src;
+       comp = 0; 
+       do
+       {
+               imquant = (IQUANT_SCALE/mquant);
+               flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat, 
+                                                                               imquant, mquant, clipvalue );
+               nzflag = (nzflag << 1) |( !!(flags & 0xffff0000));
+  
+               /* If we're saturating simply bump up quantization and start
+                       from scratch...  if we can't avoid saturation by
+                       quantising then we're hosed and we fall back to
+                       saturation using the old C code.  */
+  
+               if( (flags & 0xff00) != 0 )
+               {
+                       int new_mquant = next_larger_quant_hv( picture, mquant );
+                       if( new_mquant != mquant )
+                       {
+                               mquant = new_mquant;
+                       }
+                       else
+                       {
+                               saturated = 1;
+                               break;
+                       }
+
+                       comp = 0; 
+                       nzflag = 0;
+                       pdst = dst;
+                       psrc = src;
+               }
+               else
+               {
+                       ++comp;
+                       pdst += 64;
+                       psrc +=64;
+               }
+               /* Fall back to 32-bit(or better - if some hero(ine) made this work on
+                       non 32-bit int machines ;-)) if out of dynamic range for MMX...
+               */
+       }
+       while( comp < block_count  && (flags & 0xff) == 0  );
+
+
+       /* Coefficient out of range or can't avoid saturation:
+       fall back to the original 32-bit int version: this is rare */
+       if(  (flags & 0xff) != 0 || saturated)
+       {
+               return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
+       }
+
+       *nonsat_mquant = mquant;
+       return nzflag;
+}
+
+
+void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
+{
+  int i, val;
+  uint16_t *quant_mat = intra_q;
+
+  dst[0] = src[0] << (3-dc_prec);
+  for (i=1; i<64; i++)
+  {
+    val = (int)(src[i]*quant_mat[i]*mquant)/16;
+
+    /* mismatch control */
+    if ((val&1)==0 && val!=0)
+      val+= (val>0) ? -1 : 1;
+
+    /* saturation */
+    dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
+  }
+}