/* quantize_x86.c Quantization / inverse quantization In compiler (gcc) embdeed assmbley language... */ /* Copyright (C) 2000 Andrew Stevens */ /* This program is free software; you can redistribute it * and/or modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version 2 of * the License, or (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; if not, write to the Free Software * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA * 02111-1307, USA. * */ /* * 3DNow version of * Quantisation for non-intra blocks using Test Model 5 quantization * * this quantizer has a bias of 1/8 stepsize towards zero * (except for the DC coefficient) * * PRECONDITION: src dst point to *disinct* memory buffers... * of block_count *adjacent* int16_t[64] arrays... * * RETURN: 1 If non-zero coefficients left after quantisaion 0 otherwise */ #include "config.h" #include #include #include #include "global.h" #include "cpu_accel.h" #include "simd.h" #include "attributes.h" #include "mmx.h" /* * Quantisation for non-intra blocks * * Various versions for various SIMD instruction sets. Not all of them * bother to implement the test model 5 quantisation of the reference source * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient). * * Actually, as far as I can tell even the reference source doesn't quite do it * for non-intra (though it *does* for intra). * * Careful analysis of the code also suggests what it actually does is truncate * with a modest bias towards 1 (the d>>2 factor) * * PRECONDITION: src dst point to *disinct* memory buffers... * of block_count *adjacent* int16_t[64] arrays... * *RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1). */ /* * 3D-Now version: simply truncates to zero, however, the tables have a 2% bias * upwards which partly compensates. */ int quant_non_intra_hv_3dnow( pict_data_s *picture, int16_t *src, int16_t *dst, int mquant, int *nonsat_mquant) { int saturated; int satlim = dctsatlim; float *i_quant_matf; int coeff_count = 64*block_count; uint32_t nzflag, flags; int16_t *psrc, *pdst; float *piqf; int i; uint32_t tmp; /* Initialise zero block flags */ /* Load 1 into mm6 */ __asm__ ( "movl %0, %%eax\n" "movd %%eax, %%mm6\n" : :"g" (1) : "eax" ); /* Load satlim into mm1 */ movd_m2r( satlim, mm1 ); punpcklwd_r2r( mm1, mm1 ); punpckldq_r2r( mm1, mm1 ); restart: i_quant_matf = i_inter_q_tblf[mquant]; flags = 0; piqf = i_quant_matf; saturated = 0; nzflag = 0; psrc = src; pdst = dst; for (i=0; i < coeff_count ; i+=4) { /* TODO: For maximum efficiency this should be unrolled to allow f.p. and int MMX to be interleaved... */ /* Load 4 words, unpack into mm2 and mm3 (with sign extension!) */ movq_m2r( *(mmx_t *)&psrc[0], mm2 ); movq_r2r( mm2, mm7 ); psraw_i2r( 16, mm7 ); /* Replicate sign bits mm2 in mm7 */ movq_r2r( mm2, mm3 ); punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */ punpckhwd_r2r( mm7, mm3); /* Multiply by sixteen... */ pslld_i2r( 4, mm2 ); pslld_i2r( 4, mm3 ); /* Load the inverse quantisation factors from the table in to mm4 and mm5 Interleaved with converting mm2 and mm3 to float's to (hopefully) maximise parallelism. */ movq_m2r( *(mmx_t*)&piqf[0], mm4); pi2fd_r2r( mm2, mm2); movq_m2r( *(mmx_t*)&piqf[2], mm5); pi2fd_r2r( mm3, mm3); /* "Divide" by multiplying by inverse quantisation and convert back to integers*/ pfmul_r2r( mm4, mm2 ); pf2id_r2r( mm2, mm2); pfmul_r2r( mm5, mm3); pf2id_r2r( mm3, mm3); /* Convert the two pairs of double words into four words */ packssdw_r2r( mm3, mm2); /* Accumulate saturation... */ movq_r2r( mm2, mm4 ); pxor_r2r( mm5, mm5 ); // mm5 = -mm2 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim) psubw_r2r( mm2, mm5 ); pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim por_r2r( mm5, mm4 ); // mm4 = abs(mm2) > satlim movq_r2r( mm4, mm5 ); psrlq_i2r( 32, mm5); por_r2r( mm5, mm4 ); movd_m2r( saturated, mm5 ); // saturated |= mm4 por_r2r( mm4, mm5 ); movd_r2m( mm5, saturated ); /* Store and accumulate zero-ness */ movq_r2r( mm2, mm3 ); movq_r2m( mm2, *(mmx_t*)pdst ); psrlq_i2r( 32, mm3 ); por_r2r( mm3, mm2 ); movd_r2m( mm2, tmp ); flags |= tmp; piqf += 4; pdst += 4; psrc += 4; if( (i & 63) == (63/4)*4 ) { if( saturated ) { int new_mquant = next_larger_quant_hv( picture, mquant ); if( new_mquant != mquant ) { mquant = new_mquant; goto restart; } else { return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant); } } nzflag = (nzflag<<1) | !!flags; flags = 0; piqf = i_quant_matf; } } femms(); //nzflag = (nzflag<<1) | (!!flags); return nzflag; } /* Disabled in mjpegtools */ #if 0 /* * SSE version: simply truncates to zero, however, the tables have a 2% bias * upwards which partly compensates. */ static int trunc_mxcsr = 0x7f80; int quant_non_intra_hv_sse( pict_data_s *picture, int16_t *src, int16_t *dst, int mquant, int *nonsat_mquant) { int saturated; int satlim = dctsatlim; float *i_quant_matf; int coeff_count = 64*block_count; uint32_t nzflag, flags; int16_t *psrc, *pdst; float *piqf; int i; uint32_t tmp; /* Initialise zero block flags */ /* Load 1 into mm6 */ __asm__ ( "movl %0, %%eax\n" "movd %%eax, %%mm6\n" : :"g" (1) : "eax" ); /* Set up SSE rounding mode */ __asm__ ( "ldmxcsr (%0)\n" : : "X" (trunc_mxcsr) ); /* Load satlim into mm1 */ movd_m2r( satlim, mm1 ); punpcklwd_r2r( mm1, mm1 ); punpckldq_r2r( mm1, mm1 ); restart: i_quant_matf = i_inter_q_tblf[mquant]; flags = 0; piqf = i_quant_matf; saturated = 0; nzflag = 0; psrc = src; pdst = dst; for (i=0; i < coeff_count ; i+=4) { /* Load 4 words, unpack into mm2 and mm3 (with sign extension!) */ movq_m2r( *(mmx_t *)&psrc[0], mm2 ); movq_r2r( mm2, mm7 ); psraw_i2r( 16, mm7 ); /* Replicate sign bits mm2 in mm7 */ movq_r2r( mm2, mm3 ); punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */ punpckhwd_r2r( mm7, mm3); /* Multiply by sixteen... */ pslld_i2r( 4, mm2 ); pslld_i2r( 4, mm3 ); /* Convert mm2 and mm3 to float's in xmm2 and xmm3 */ cvtpi2ps_r2r( mm2, xmm2 ); cvtpi2ps_r2r( mm3, xmm3 ); shufps_r2ri( xmm3, xmm2, 0*1 + 1*4 + 0 * 16 + 1 * 64 ); /* "Divide" by multiplying by inverse quantisation and convert back to integers*/ mulps_m2r( *(mmx_t*)&piqf[0], xmm2 ); cvtps2pi_r2r( xmm2, mm2 ); shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0 * 16 + 1 * 64 ); cvtps2pi_r2r( xmm2, mm3 ); /* Convert the two pairs of double words into four words */ packssdw_r2r( mm3, mm2); /* Accumulate saturation... */ movq_r2r( mm2, mm4 ); pxor_r2r( mm5, mm5 ); // mm5 = -mm2 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim) psubw_r2r( mm2, mm5 ); pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim por_r2r( mm5, mm4 ); // mm4 = abs(mm2) > satlim movq_r2r( mm4, mm5 ); psrlq_i2r( 32, mm5); por_r2r( mm5, mm4 ); movd_m2r( saturated, mm5 ); // saturated |= mm4 por_r2r( mm4, mm5 ); movd_r2m( mm5, saturated ); /* Store and accumulate zero-ness */ movq_r2r( mm2, mm3 ); movq_r2m( mm2, *(mmx_t*)pdst ); psrlq_i2r( 32, mm3 ); por_r2r( mm3, mm2 ); movd_r2m( mm2, tmp ); flags |= tmp; piqf += 4; pdst += 4; psrc += 4; if( (i & 63) == (63/4)*4 ) { if( saturated ) { int new_mquant = next_larger_quant_hv( picture, mquant ); if( new_mquant != mquant ) { mquant = new_mquant; goto restart; } else { return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant); } } nzflag = (nzflag<<1) | !!flags; flags = 0; piqf = i_quant_matf; } } emms(); //nzflag = (nzflag<<1) | (!!flags); return nzflag; } #endif /* * The ordinary MMX version. Due to the limited dynamic range afforded by working * with 16-bit int's it (a) has to jump through some gory fudge-factor hoops * (b) give up in tough cases and fall back on the reference code. Fortunately, the * latter happens *very* rarely. * * TODO Replace the inefficient block-by-block call to the assembler by a sweep * through the whole lot... */ int quant_non_intra_hv_mmx( pict_data_s *picture, int16_t *src, int16_t *dst, int mquant, int *nonsat_mquant) { int nzflag; int clipvalue = dctsatlim; int flags = 0; int saturated = 0; uint16_t *quant_mat = inter_q; int comp; uint16_t *i_quant_mat = i_inter_q; int imquant; int16_t *psrc, *pdst; /* MMX routine does not work right for MQ=2 ... (no unsigned mult) */ if( mquant == 2 ) { return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant); } /* If available use the fast MMX quantiser. It returns flags to signal if coefficients are outside its limited range or saturation would occur with the specified quantisation factor Top 16 bits - non zero quantised coefficient present Bits 8-15 - Saturation occurred Bits 0-7 - Coefficient out of range. */ nzflag = 0; pdst = dst; psrc = src; comp = 0; do { imquant = (IQUANT_SCALE/mquant); flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat, imquant, mquant, clipvalue ); nzflag = (nzflag << 1) |( !!(flags & 0xffff0000)); /* If we're saturating simply bump up quantization and start from scratch... if we can't avoid saturation by quantising then we're hosed and we fall back to saturation using the old C code. */ if( (flags & 0xff00) != 0 ) { int new_mquant = next_larger_quant_hv( picture, mquant ); if( new_mquant != mquant ) { mquant = new_mquant; } else { saturated = 1; break; } comp = 0; nzflag = 0; pdst = dst; psrc = src; } else { ++comp; pdst += 64; psrc +=64; } /* Fall back to 32-bit(or better - if some hero(ine) made this work on non 32-bit int machines ;-)) if out of dynamic range for MMX... */ } while( comp < block_count && (flags & 0xff) == 0 ); /* Coefficient out of range or can't avoid saturation: fall back to the original 32-bit int version: this is rare */ if( (flags & 0xff) != 0 || saturated) { return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant); } *nonsat_mquant = mquant; return nzflag; } void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant) { int i, val; uint16_t *quant_mat = intra_q; dst[0] = src[0] << (3-dc_prec); for (i=1; i<64; i++) { val = (int)(src[i]*quant_mat[i]*mquant)/16; /* mismatch control */ if ((val&1)==0 && val!=0) val+= (val>0) ? -1 : 1; /* saturation */ dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val); } }