1 /* quantize_x86.c Quantization / inverse quantization
2 In compiler (gcc) embdeed assmbley language...
5 /* Copyright (C) 2000 Andrew Stevens */
7 /* This program is free software; you can redistribute it
8 * and/or modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version 2 of
10 * the License, or (at your option) any later version.
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * General Public License for more details.
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
28 * Quantisation for non-intra blocks using Test Model 5 quantization
30 * this quantizer has a bias of 1/8 stepsize towards zero
31 * (except for the DC coefficient)
33 * PRECONDITION: src dst point to *disinct* memory buffers...
34 * of block_count *adjacent* int16_t[64] arrays...
36 * RETURN: 1 If non-zero coefficients left after quantisaion 0 otherwise
44 #include "cpu_accel.h"
46 #include "attributes.h"
50 * Quantisation for non-intra blocks
52 * Various versions for various SIMD instruction sets. Not all of them
53 * bother to implement the test model 5 quantisation of the reference source
54 * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient).
56 * Actually, as far as I can tell even the reference source doesn't quite do it
57 * for non-intra (though it *does* for intra).
59 * Careful analysis of the code also suggests what it actually does is truncate
60 * with a modest bias towards 1 (the d>>2 factor)
62 * PRECONDITION: src dst point to *disinct* memory buffers...
63 * of block_count *adjacent* int16_t[64] arrays...
65 *RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
70 * 3D-Now version: simply truncates to zero, however, the tables have a 2% bias
71 * upwards which partly compensates.
74 int quant_non_intra_hv_3dnow(
76 int16_t *src, int16_t *dst,
81 int satlim = dctsatlim;
83 int coeff_count = 64*block_count;
84 uint32_t nzflag, flags;
90 /* Initialise zero block flags */
92 __asm__ ( "movl %0, %%eax\n"
95 /* Load satlim into mm1 */
96 movd_m2r( satlim, mm1 );
97 punpcklwd_r2r( mm1, mm1 );
98 punpckldq_r2r( mm1, mm1 );
100 i_quant_matf = i_inter_q_tblf[mquant];
107 for (i=0; i < coeff_count ; i+=4)
110 /* TODO: For maximum efficiency this should be unrolled to allow
111 f.p. and int MMX to be interleaved...
114 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
117 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
118 movq_r2r( mm2, mm7 );
119 psraw_i2r( 16, mm7 ); /* Replicate sign bits mm2 in mm7 */
120 movq_r2r( mm2, mm3 );
121 punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
122 punpckhwd_r2r( mm7, mm3);
124 /* Multiply by sixteen... */
129 Load the inverse quantisation factors from the
130 table in to mm4 and mm5
131 Interleaved with converting mm2 and mm3 to float's
132 to (hopefully) maximise parallelism.
134 movq_m2r( *(mmx_t*)&piqf[0], mm4);
135 pi2fd_r2r( mm2, mm2);
136 movq_m2r( *(mmx_t*)&piqf[2], mm5);
137 pi2fd_r2r( mm3, mm3);
139 /* "Divide" by multiplying by inverse quantisation
140 and convert back to integers*/
141 pfmul_r2r( mm4, mm2 );
142 pf2id_r2r( mm2, mm2);
143 pfmul_r2r( mm5, mm3);
144 pf2id_r2r( mm3, mm3);
147 /* Convert the two pairs of double words into four words */
148 packssdw_r2r( mm3, mm2);
151 /* Accumulate saturation... */
152 movq_r2r( mm2, mm4 );
154 pxor_r2r( mm5, mm5 ); // mm5 = -mm2
155 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
156 psubw_r2r( mm2, mm5 );
157 pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
158 por_r2r( mm5, mm4 ); // mm4 = abs(mm2) > satlim
159 movq_r2r( mm4, mm5 );
163 movd_m2r( saturated, mm5 ); // saturated |= mm4
165 movd_r2m( mm5, saturated );
167 /* Store and accumulate zero-ness */
168 movq_r2r( mm2, mm3 );
169 movq_r2m( mm2, *(mmx_t*)pdst );
170 psrlq_i2r( 32, mm3 );
172 movd_r2m( mm2, tmp );
179 if( (i & 63) == (63/4)*4 )
184 int new_mquant = next_larger_quant_hv( picture, mquant );
185 if( new_mquant != mquant )
192 return quant_non_intra_hv(picture, src, dst, mquant,
197 nzflag = (nzflag<<1) | !!flags;
205 //nzflag = (nzflag<<1) | (!!flags);
215 /* Disabled in mjpegtools */
219 * SSE version: simply truncates to zero, however, the tables have a 2% bias
220 * upwards which partly compensates.
222 static int trunc_mxcsr = 0x7f80;
224 int quant_non_intra_hv_sse(
225 pict_data_s *picture,
226 int16_t *src, int16_t *dst,
231 int satlim = dctsatlim;
233 int coeff_count = 64*block_count;
234 uint32_t nzflag, flags;
235 int16_t *psrc, *pdst;
240 /* Initialise zero block flags */
241 /* Load 1 into mm6 */
242 __asm__ ( "movl %0, %%eax\n"
243 "movd %%eax, %%mm6\n"
244 : :"g" (1) : "eax" );
245 /* Set up SSE rounding mode */
246 __asm__ ( "ldmxcsr (%0)\n" : : "X" (trunc_mxcsr) );
248 /* Load satlim into mm1 */
249 movd_m2r( satlim, mm1 );
250 punpcklwd_r2r( mm1, mm1 );
251 punpckldq_r2r( mm1, mm1 );
253 i_quant_matf = i_inter_q_tblf[mquant];
260 for (i=0; i < coeff_count ; i+=4)
263 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
266 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
267 movq_r2r( mm2, mm7 );
268 psraw_i2r( 16, mm7 ); /* Replicate sign bits mm2 in mm7 */
269 movq_r2r( mm2, mm3 );
270 punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
271 punpckhwd_r2r( mm7, mm3);
273 /* Multiply by sixteen... */
278 Convert mm2 and mm3 to float's in xmm2 and xmm3
280 cvtpi2ps_r2r( mm2, xmm2 );
281 cvtpi2ps_r2r( mm3, xmm3 );
282 shufps_r2ri( xmm3, xmm2, 0*1 + 1*4 + 0 * 16 + 1 * 64 );
284 /* "Divide" by multiplying by inverse quantisation
285 and convert back to integers*/
286 mulps_m2r( *(mmx_t*)&piqf[0], xmm2 );
287 cvtps2pi_r2r( xmm2, mm2 );
288 shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0 * 16 + 1 * 64 );
289 cvtps2pi_r2r( xmm2, mm3 );
291 /* Convert the two pairs of double words into four words */
292 packssdw_r2r( mm3, mm2);
295 /* Accumulate saturation... */
296 movq_r2r( mm2, mm4 );
298 pxor_r2r( mm5, mm5 ); // mm5 = -mm2
299 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
300 psubw_r2r( mm2, mm5 );
301 pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
302 por_r2r( mm5, mm4 ); // mm4 = abs(mm2) > satlim
303 movq_r2r( mm4, mm5 );
307 movd_m2r( saturated, mm5 ); // saturated |= mm4
309 movd_r2m( mm5, saturated );
311 /* Store and accumulate zero-ness */
312 movq_r2r( mm2, mm3 );
313 movq_r2m( mm2, *(mmx_t*)pdst );
314 psrlq_i2r( 32, mm3 );
316 movd_r2m( mm2, tmp );
323 if( (i & 63) == (63/4)*4 )
328 int new_mquant = next_larger_quant_hv( picture, mquant );
329 if( new_mquant != mquant )
336 return quant_non_intra_hv(picture, src, dst, mquant,
341 nzflag = (nzflag<<1) | !!flags;
349 //nzflag = (nzflag<<1) | (!!flags);
356 * The ordinary MMX version. Due to the limited dynamic range afforded by working
357 * with 16-bit int's it (a) has to jump through some gory fudge-factor hoops
358 * (b) give up in tough cases and fall back on the reference code. Fortunately, the
359 * latter happens *very* rarely.
361 * TODO Replace the inefficient block-by-block call to the assembler by a sweep
362 * through the whole lot...
365 int quant_non_intra_hv_mmx(
366 pict_data_s *picture,
367 int16_t *src, int16_t *dst,
373 int clipvalue = dctsatlim;
376 uint16_t *quant_mat = inter_q;
378 uint16_t *i_quant_mat = i_inter_q;
380 int16_t *psrc, *pdst;
382 /* MMX routine does not work right for MQ=2 ... (no unsigned mult) */
385 return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
387 /* If available use the fast MMX quantiser. It returns
388 flags to signal if coefficients are outside its limited range or
389 saturation would occur with the specified quantisation
391 Top 16 bits - non zero quantised coefficient present
392 Bits 8-15 - Saturation occurred
393 Bits 0-7 - Coefficient out of range.
402 imquant = (IQUANT_SCALE/mquant);
403 flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat,
404 imquant, mquant, clipvalue );
405 nzflag = (nzflag << 1) |( !!(flags & 0xffff0000));
407 /* If we're saturating simply bump up quantization and start
408 from scratch... if we can't avoid saturation by
409 quantising then we're hosed and we fall back to
410 saturation using the old C code. */
412 if( (flags & 0xff00) != 0 )
414 int new_mquant = next_larger_quant_hv( picture, mquant );
415 if( new_mquant != mquant )
436 /* Fall back to 32-bit(or better - if some hero(ine) made this work on
437 non 32-bit int machines ;-)) if out of dynamic range for MMX...
440 while( comp < block_count && (flags & 0xff) == 0 );
443 /* Coefficient out of range or can't avoid saturation:
444 fall back to the original 32-bit int version: this is rare */
445 if( (flags & 0xff) != 0 || saturated)
447 return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
450 *nonsat_mquant = mquant;
455 void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
458 uint16_t *quant_mat = intra_q;
460 dst[0] = src[0] << (3-dc_prec);
463 val = (int)(src[i]*quant_mat[i]*mquant)/16;
465 /* mismatch control */
466 if ((val&1)==0 && val!=0)
467 val+= (val>0) ? -1 : 1;
470 dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);