--- /dev/null
+/* quantize_x86.c Quantization / inverse quantization
+ In compiler (gcc) embdeed assmbley language...
+*/
+
+/* Copyright (C) 2000 Andrew Stevens */
+
+/* This program is free software; you can redistribute it
+ * and/or modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
+ * 02111-1307, USA.
+ *
+ */
+
+
+
+/*
+ * 3DNow version of
+ * Quantisation for non-intra blocks using Test Model 5 quantization
+ *
+ * this quantizer has a bias of 1/8 stepsize towards zero
+ * (except for the DC coefficient)
+ *
+ * PRECONDITION: src dst point to *disinct* memory buffers...
+ * of block_count *adjacent* int16_t[64] arrays...
+ *
+ * RETURN: 1 If non-zero coefficients left after quantisaion 0 otherwise
+ */
+
+#include "config.h"
+#include <stdio.h>
+#include <math.h>
+#include <fenv.h>
+#include "global.h"
+#include "cpu_accel.h"
+#include "simd.h"
+#include "attributes.h"
+#include "mmx.h"
+
+/*
+ * Quantisation for non-intra blocks
+ *
+ * Various versions for various SIMD instruction sets. Not all of them
+ * bother to implement the test model 5 quantisation of the reference source
+ * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient).
+ *
+ * Actually, as far as I can tell even the reference source doesn't quite do it
+ * for non-intra (though it *does* for intra).
+ *
+ * Careful analysis of the code also suggests what it actually does is truncate
+ * with a modest bias towards 1 (the d>>2 factor)
+ *
+ * PRECONDITION: src dst point to *disinct* memory buffers...
+ * of block_count *adjacent* int16_t[64] arrays...
+ *
+ *RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
+ */
+
+
+/*
+ * 3D-Now version: simply truncates to zero, however, the tables have a 2% bias
+ * upwards which partly compensates.
+ */
+
+int quant_non_intra_hv_3dnow(
+ pict_data_s *picture,
+ int16_t *src, int16_t *dst,
+ int mquant,
+ int *nonsat_mquant)
+{
+ int saturated;
+ int satlim = dctsatlim;
+ float *i_quant_matf;
+ int coeff_count = 64*block_count;
+ uint32_t nzflag, flags;
+ int16_t *psrc, *pdst;
+ float *piqf;
+ int i;
+ uint32_t tmp;
+
+ /* Initialise zero block flags */
+ /* Load 1 into mm6 */
+ __asm__ ( "movl %0, %%eax\n"
+ "movd %%eax, %%mm6\n"
+ : :"g" (1) : "eax" );
+ /* Load satlim into mm1 */
+ movd_m2r( satlim, mm1 );
+ punpcklwd_r2r( mm1, mm1 );
+ punpckldq_r2r( mm1, mm1 );
+restart:
+ i_quant_matf = i_inter_q_tblf[mquant];
+ flags = 0;
+ piqf = i_quant_matf;
+ saturated = 0;
+ nzflag = 0;
+ psrc = src;
+ pdst = dst;
+ for (i=0; i < coeff_count ; i+=4)
+ {
+
+ /* TODO: For maximum efficiency this should be unrolled to allow
+ f.p. and int MMX to be interleaved...
+ */
+
+ /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
+ */
+
+ movq_m2r( *(mmx_t *)&psrc[0], mm2 );
+ movq_r2r( mm2, mm7 );
+ psraw_i2r( 16, mm7 ); /* Replicate sign bits mm2 in mm7 */
+ movq_r2r( mm2, mm3 );
+ punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
+ punpckhwd_r2r( mm7, mm3);
+
+ /* Multiply by sixteen... */
+ pslld_i2r( 4, mm2 );
+ pslld_i2r( 4, mm3 );
+
+ /*
+ Load the inverse quantisation factors from the
+ table in to mm4 and mm5
+ Interleaved with converting mm2 and mm3 to float's
+ to (hopefully) maximise parallelism.
+ */
+ movq_m2r( *(mmx_t*)&piqf[0], mm4);
+ pi2fd_r2r( mm2, mm2);
+ movq_m2r( *(mmx_t*)&piqf[2], mm5);
+ pi2fd_r2r( mm3, mm3);
+
+ /* "Divide" by multiplying by inverse quantisation
+ and convert back to integers*/
+ pfmul_r2r( mm4, mm2 );
+ pf2id_r2r( mm2, mm2);
+ pfmul_r2r( mm5, mm3);
+ pf2id_r2r( mm3, mm3);
+
+
+ /* Convert the two pairs of double words into four words */
+ packssdw_r2r( mm3, mm2);
+
+
+ /* Accumulate saturation... */
+ movq_r2r( mm2, mm4 );
+
+ pxor_r2r( mm5, mm5 ); // mm5 = -mm2
+ pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
+ psubw_r2r( mm2, mm5 );
+ pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
+ por_r2r( mm5, mm4 ); // mm4 = abs(mm2) > satlim
+ movq_r2r( mm4, mm5 );
+ psrlq_i2r( 32, mm5);
+ por_r2r( mm5, mm4 );
+
+ movd_m2r( saturated, mm5 ); // saturated |= mm4
+ por_r2r( mm4, mm5 );
+ movd_r2m( mm5, saturated );
+
+ /* Store and accumulate zero-ness */
+ movq_r2r( mm2, mm3 );
+ movq_r2m( mm2, *(mmx_t*)pdst );
+ psrlq_i2r( 32, mm3 );
+ por_r2r( mm3, mm2 );
+ movd_r2m( mm2, tmp );
+ flags |= tmp;
+
+ piqf += 4;
+ pdst += 4;
+ psrc += 4;
+
+ if( (i & 63) == (63/4)*4 )
+ {
+
+ if( saturated )
+ {
+ int new_mquant = next_larger_quant_hv( picture, mquant );
+ if( new_mquant != mquant )
+ {
+ mquant = new_mquant;
+ goto restart;
+ }
+ else
+ {
+ return quant_non_intra_hv(picture, src, dst, mquant,
+ nonsat_mquant);
+ }
+ }
+
+ nzflag = (nzflag<<1) | !!flags;
+ flags = 0;
+ piqf = i_quant_matf;
+ }
+
+ }
+ femms();
+
+ //nzflag = (nzflag<<1) | (!!flags);
+ return nzflag;
+}
+
+
+
+
+
+
+
+/* Disabled in mjpegtools */
+
+#if 0
+/*
+ * SSE version: simply truncates to zero, however, the tables have a 2% bias
+ * upwards which partly compensates.
+ */
+static int trunc_mxcsr = 0x7f80;
+
+int quant_non_intra_hv_sse(
+ pict_data_s *picture,
+ int16_t *src, int16_t *dst,
+ int mquant,
+ int *nonsat_mquant)
+{
+ int saturated;
+ int satlim = dctsatlim;
+ float *i_quant_matf;
+ int coeff_count = 64*block_count;
+ uint32_t nzflag, flags;
+ int16_t *psrc, *pdst;
+ float *piqf;
+ int i;
+ uint32_t tmp;
+
+ /* Initialise zero block flags */
+ /* Load 1 into mm6 */
+ __asm__ ( "movl %0, %%eax\n"
+ "movd %%eax, %%mm6\n"
+ : :"g" (1) : "eax" );
+ /* Set up SSE rounding mode */
+ __asm__ ( "ldmxcsr (%0)\n" : : "X" (trunc_mxcsr) );
+
+ /* Load satlim into mm1 */
+ movd_m2r( satlim, mm1 );
+ punpcklwd_r2r( mm1, mm1 );
+ punpckldq_r2r( mm1, mm1 );
+restart:
+ i_quant_matf = i_inter_q_tblf[mquant];
+ flags = 0;
+ piqf = i_quant_matf;
+ saturated = 0;
+ nzflag = 0;
+ psrc = src;
+ pdst = dst;
+ for (i=0; i < coeff_count ; i+=4)
+ {
+
+ /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
+ */
+
+ movq_m2r( *(mmx_t *)&psrc[0], mm2 );
+ movq_r2r( mm2, mm7 );
+ psraw_i2r( 16, mm7 ); /* Replicate sign bits mm2 in mm7 */
+ movq_r2r( mm2, mm3 );
+ punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
+ punpckhwd_r2r( mm7, mm3);
+
+ /* Multiply by sixteen... */
+ pslld_i2r( 4, mm2 );
+ pslld_i2r( 4, mm3 );
+
+ /*
+ Convert mm2 and mm3 to float's in xmm2 and xmm3
+ */
+ cvtpi2ps_r2r( mm2, xmm2 );
+ cvtpi2ps_r2r( mm3, xmm3 );
+ shufps_r2ri( xmm3, xmm2, 0*1 + 1*4 + 0 * 16 + 1 * 64 );
+
+ /* "Divide" by multiplying by inverse quantisation
+ and convert back to integers*/
+ mulps_m2r( *(mmx_t*)&piqf[0], xmm2 );
+ cvtps2pi_r2r( xmm2, mm2 );
+ shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0 * 16 + 1 * 64 );
+ cvtps2pi_r2r( xmm2, mm3 );
+
+ /* Convert the two pairs of double words into four words */
+ packssdw_r2r( mm3, mm2);
+
+
+ /* Accumulate saturation... */
+ movq_r2r( mm2, mm4 );
+
+ pxor_r2r( mm5, mm5 ); // mm5 = -mm2
+ pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
+ psubw_r2r( mm2, mm5 );
+ pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
+ por_r2r( mm5, mm4 ); // mm4 = abs(mm2) > satlim
+ movq_r2r( mm4, mm5 );
+ psrlq_i2r( 32, mm5);
+ por_r2r( mm5, mm4 );
+
+ movd_m2r( saturated, mm5 ); // saturated |= mm4
+ por_r2r( mm4, mm5 );
+ movd_r2m( mm5, saturated );
+
+ /* Store and accumulate zero-ness */
+ movq_r2r( mm2, mm3 );
+ movq_r2m( mm2, *(mmx_t*)pdst );
+ psrlq_i2r( 32, mm3 );
+ por_r2r( mm3, mm2 );
+ movd_r2m( mm2, tmp );
+ flags |= tmp;
+
+ piqf += 4;
+ pdst += 4;
+ psrc += 4;
+
+ if( (i & 63) == (63/4)*4 )
+ {
+
+ if( saturated )
+ {
+ int new_mquant = next_larger_quant_hv( picture, mquant );
+ if( new_mquant != mquant )
+ {
+ mquant = new_mquant;
+ goto restart;
+ }
+ else
+ {
+ return quant_non_intra_hv(picture, src, dst, mquant,
+ nonsat_mquant);
+ }
+ }
+
+ nzflag = (nzflag<<1) | !!flags;
+ flags = 0;
+ piqf = i_quant_matf;
+ }
+
+ }
+ emms();
+
+ //nzflag = (nzflag<<1) | (!!flags);
+ return nzflag;
+}
+
+#endif
+
+/*
+ * The ordinary MMX version. Due to the limited dynamic range afforded by working
+ * with 16-bit int's it (a) has to jump through some gory fudge-factor hoops
+ * (b) give up in tough cases and fall back on the reference code. Fortunately, the
+ * latter happens *very* rarely.
+ *
+ * TODO Replace the inefficient block-by-block call to the assembler by a sweep
+ * through the whole lot...
+ */
+
+int quant_non_intra_hv_mmx(
+ pict_data_s *picture,
+ int16_t *src, int16_t *dst,
+ int mquant,
+ int *nonsat_mquant)
+{
+
+ int nzflag;
+ int clipvalue = dctsatlim;
+ int flags = 0;
+ int saturated = 0;
+ uint16_t *quant_mat = inter_q;
+ int comp;
+ uint16_t *i_quant_mat = i_inter_q;
+ int imquant;
+ int16_t *psrc, *pdst;
+
+ /* MMX routine does not work right for MQ=2 ... (no unsigned mult) */
+ if( mquant == 2 )
+ {
+ return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
+ }
+ /* If available use the fast MMX quantiser. It returns
+ flags to signal if coefficients are outside its limited range or
+ saturation would occur with the specified quantisation
+ factor
+ Top 16 bits - non zero quantised coefficient present
+ Bits 8-15 - Saturation occurred
+ Bits 0-7 - Coefficient out of range.
+ */
+
+ nzflag = 0;
+ pdst = dst;
+ psrc = src;
+ comp = 0;
+ do
+ {
+ imquant = (IQUANT_SCALE/mquant);
+ flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat,
+ imquant, mquant, clipvalue );
+ nzflag = (nzflag << 1) |( !!(flags & 0xffff0000));
+
+ /* If we're saturating simply bump up quantization and start
+ from scratch... if we can't avoid saturation by
+ quantising then we're hosed and we fall back to
+ saturation using the old C code. */
+
+ if( (flags & 0xff00) != 0 )
+ {
+ int new_mquant = next_larger_quant_hv( picture, mquant );
+ if( new_mquant != mquant )
+ {
+ mquant = new_mquant;
+ }
+ else
+ {
+ saturated = 1;
+ break;
+ }
+
+ comp = 0;
+ nzflag = 0;
+ pdst = dst;
+ psrc = src;
+ }
+ else
+ {
+ ++comp;
+ pdst += 64;
+ psrc +=64;
+ }
+ /* Fall back to 32-bit(or better - if some hero(ine) made this work on
+ non 32-bit int machines ;-)) if out of dynamic range for MMX...
+ */
+ }
+ while( comp < block_count && (flags & 0xff) == 0 );
+
+
+ /* Coefficient out of range or can't avoid saturation:
+ fall back to the original 32-bit int version: this is rare */
+ if( (flags & 0xff) != 0 || saturated)
+ {
+ return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
+ }
+
+ *nonsat_mquant = mquant;
+ return nzflag;
+}
+
+
+void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
+{
+ int i, val;
+ uint16_t *quant_mat = intra_q;
+
+ dst[0] = src[0] << (3-dc_prec);
+ for (i=1; i<64; i++)
+ {
+ val = (int)(src[i]*quant_mat[i]*mquant)/16;
+
+ /* mismatch control */
+ if ((val&1)==0 && val!=0)
+ val+= (val>0) ? -1 : 1;
+
+ /* saturation */
+ dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
+ }
+}