cinelerra-5.1/mpeg2enc/quantize_x86.c

   1 /* quantize_x86.c Quantization / inverse quantization
   2    In compiler (gcc) embdeed assmbley language...
   3 */
   4
   5 /* Copyright (C) 2000 Andrew Stevens */
   6
   7 /* This program is free software; you can redistribute it
   8  *  and/or modify it under the terms of the GNU General Public License
   9  *  as published by the Free Software Foundation; either version 2 of
  10  *  the License, or (at your option) any later version.
  11  *
  12  *  This program is distributed in the hope that it will be useful,
  13  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  *  General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU General Public License
  18  * along with this program; if not, write to the Free Software
  19  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
  20  * 02111-1307, USA.
  21  *
  22  */
  23
  24
  25
  26 /*
  27  * 3DNow version of
  28  * Quantisation for non-intra blocks using Test Model 5 quantization
  29  *
  30  * this quantizer has a bias of 1/8 stepsize towards zero
  31  * (except for the DC coefficient)
  32  *
  33  *      PRECONDITION: src dst point to *disinct* memory buffers...
  34  *                    of block_count *adjacent* int16_t[64] arrays...
  35  *
  36  * RETURN: 1 If non-zero coefficients left after quantisaion 0 otherwise
  37  */
  38
  39 #include "config.h"
  40 #include <stdio.h>
  41 #include <math.h>
  42 #include <fenv.h>
  43 #include "global.h"
  44 #include "cpu_accel.h"
  45 #include "simd.h"
  46 #include "attributes.h"
  47 #include "mmx.h"
  48
  49 /*
  50  * Quantisation for non-intra blocks
  51  *
  52  * Various versions for various SIMD instruction sets.  Not all of them
  53  * bother to implement the test model 5 quantisation of the reference source
  54  * (this has a bias of 1/8 stepsize towards zero - except for the DC coefficient).
  55  *
  56  * Actually, as far as I can tell even the reference source doesn't quite do it
  57  * for non-intra (though it *does* for intra).
  58  *
  59  * Careful analysis of the code also suggests what it actually does is truncate
  60  * with a modest bias towards 1 (the d>>2 factor)
  61  *
  62  *      PRECONDITION: src dst point to *disinct* memory buffers...
  63  *                    of block_count *adjacent* int16_t[64] arrays...
  64  *
  65  *RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
  66  */
  67
  68
  69 /*
  70  * 3D-Now version: simply truncates to zero, however, the tables have a 2% bias
  71  * upwards which partly compensates.
  72  */
  73
  74 int quant_non_intra_hv_3dnow(
  75         pict_data_s *picture,
  76         int16_t *src, int16_t *dst,
  77         int mquant,
  78         int *nonsat_mquant)
  79 {
  80         int saturated;
  81         int satlim = dctsatlim;
  82         float *i_quant_matf;
  83         int   coeff_count = 64*block_count;
  84         uint32_t nzflag, flags;
  85         int16_t *psrc, *pdst;
  86         float *piqf;
  87         int i;
  88         uint32_t tmp;
  89
  90         /* Initialise zero block flags */
  91         /* Load 1 into mm6 */
  92         __asm__ ( "movl %0, %%eax\n"
  93                           "movd %%eax, %%mm6\n"
  94                           : :"g" (1) : "eax" );
  95         /* Load satlim into mm1 */
  96         movd_m2r( satlim, mm1 );
  97         punpcklwd_r2r( mm1, mm1 );
  98         punpckldq_r2r( mm1, mm1 );
  99 restart:
 100         i_quant_matf = i_inter_q_tblf[mquant];
 101         flags = 0;
 102         piqf = i_quant_matf;
 103         saturated = 0;
 104         nzflag = 0;
 105         psrc = src;
 106         pdst = dst;
 107         for (i=0; i < coeff_count ; i+=4)
 108         {
 109
 110                 /* TODO: For maximum efficiency this should be unrolled to allow
 111                    f.p. and int MMX to be interleaved...
 112                 */
 113
 114                 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
 115                  */
 116
 117                 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
 118                 movq_r2r( mm2, mm7 );
 119                 psraw_i2r( 16, mm7 );   /* Replicate sign bits mm2 in mm7 */
 120                 movq_r2r( mm2, mm3 );
 121                 punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
 122                 punpckhwd_r2r( mm7, mm3);
 123
 124                 /* Multiply by sixteen... */
 125                 pslld_i2r( 4, mm2 );
 126                 pslld_i2r( 4, mm3 );
 127
 128                 /*
 129                    Load the inverse quantisation factors from the
 130                    table in to mm4 and mm5
 131                    Interleaved with converting mm2 and mm3 to float's
 132                    to (hopefully) maximise parallelism.
 133                  */
 134                 movq_m2r( *(mmx_t*)&piqf[0], mm4);
 135                 pi2fd_r2r( mm2, mm2);
 136                 movq_m2r( *(mmx_t*)&piqf[2], mm5);
 137                 pi2fd_r2r( mm3, mm3);
 138
 139                 /* "Divide" by multiplying by inverse quantisation
 140                  and convert back to integers*/
 141                 pfmul_r2r( mm4, mm2 );
 142                 pf2id_r2r( mm2, mm2);
 143                 pfmul_r2r( mm5, mm3);
 144                 pf2id_r2r( mm3, mm3);
 145
 146
 147                 /* Convert the two pairs of double words into four words */
 148                 packssdw_r2r(  mm3, mm2);
 149
 150
 151                 /* Accumulate saturation... */
 152                 movq_r2r( mm2, mm4 );
 153
 154                 pxor_r2r( mm5, mm5 );   // mm5 = -mm2
 155                 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
 156                 psubw_r2r( mm2, mm5 );
 157                 pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
 158                 por_r2r( mm5, mm4 );  // mm4 = abs(mm2) > satlim
 159                 movq_r2r( mm4, mm5 );
 160                 psrlq_i2r( 32, mm5);
 161                 por_r2r( mm5, mm4 );
 162
 163                 movd_m2r( saturated, mm5 ); // saturated |= mm4
 164                 por_r2r( mm4, mm5 );
 165                 movd_r2m( mm5, saturated );
 166
 167                 /* Store and accumulate zero-ness */
 168                 movq_r2r( mm2, mm3 );
 169                 movq_r2m( mm2, *(mmx_t*)pdst );
 170                 psrlq_i2r( 32, mm3 );
 171                 por_r2r( mm3, mm2 );
 172                 movd_r2m( mm2, tmp );
 173                 flags |= tmp;
 174
 175                 piqf += 4;
 176                 pdst += 4;
 177                 psrc += 4;
 178
 179                 if( (i & 63) == (63/4)*4 )
 180                 {
 181
 182                         if( saturated )
 183                         {
 184                                 int new_mquant = next_larger_quant_hv( picture, mquant );
 185                                 if( new_mquant != mquant )
 186                                 {
 187                                         mquant = new_mquant;
 188                                         goto restart;
 189                                 }
 190                                 else
 191                                 {
 192                                         return quant_non_intra_hv(picture, src, dst, mquant,
 193                                                                                    nonsat_mquant);
 194                                 }
 195                         }
 196
 197                         nzflag = (nzflag<<1) | !!flags;
 198                         flags = 0;
 199                         piqf = i_quant_matf;
 200                 }
 201
 202         }
 203         femms();
 204
 205         //nzflag = (nzflag<<1) | (!!flags);
 206         return nzflag;
 207 }
 208
 209
 210
 211
 212
 213
 214
 215 /* Disabled in mjpegtools */
 216
 217 #if 0
 218 /*
 219  * SSE version: simply truncates to zero, however, the tables have a 2% bias
 220  * upwards which partly compensates.
 221  */
 222 static int trunc_mxcsr = 0x7f80;
 223
 224 int quant_non_intra_hv_sse(
 225         pict_data_s *picture,
 226         int16_t *src, int16_t *dst,
 227         int mquant,
 228         int *nonsat_mquant)
 229 {
 230         int saturated;
 231         int satlim = dctsatlim;
 232         float *i_quant_matf;
 233         int   coeff_count = 64*block_count;
 234         uint32_t nzflag, flags;
 235         int16_t *psrc, *pdst;
 236         float *piqf;
 237         int i;
 238         uint32_t tmp;
 239
 240         /* Initialise zero block flags */
 241         /* Load 1 into mm6 */
 242         __asm__ ( "movl %0, %%eax\n"
 243                           "movd %%eax, %%mm6\n"
 244                           : :"g" (1) : "eax" );
 245         /* Set up SSE rounding mode */
 246         __asm__ ( "ldmxcsr (%0)\n" : : "X" (trunc_mxcsr) );
 247
 248         /* Load satlim into mm1 */
 249         movd_m2r( satlim, mm1 );
 250         punpcklwd_r2r( mm1, mm1 );
 251         punpckldq_r2r( mm1, mm1 );
 252 restart:
 253         i_quant_matf = i_inter_q_tblf[mquant];
 254         flags = 0;
 255         piqf = i_quant_matf;
 256         saturated = 0;
 257         nzflag = 0;
 258         psrc = src;
 259         pdst = dst;
 260         for (i=0; i < coeff_count ; i+=4)
 261         {
 262
 263                 /* Load 4 words, unpack into mm2 and mm3 (with sign extension!)
 264                  */
 265
 266                 movq_m2r( *(mmx_t *)&psrc[0], mm2 );
 267                 movq_r2r( mm2, mm7 );
 268                 psraw_i2r( 16, mm7 );   /* Replicate sign bits mm2 in mm7 */
 269                 movq_r2r( mm2, mm3 );
 270                 punpcklwd_r2r( mm7, mm2 ); /* Unpack with sign extensions */
 271                 punpckhwd_r2r( mm7, mm3);
 272
 273                 /* Multiply by sixteen... */
 274                 pslld_i2r( 4, mm2 );
 275                 pslld_i2r( 4, mm3 );
 276
 277                 /*
 278                   Convert mm2 and mm3 to float's  in xmm2 and xmm3
 279                  */
 280                 cvtpi2ps_r2r( mm2, xmm2 );
 281                 cvtpi2ps_r2r( mm3, xmm3 );
 282                 shufps_r2ri(  xmm3, xmm2, 0*1 + 1*4 + 0 * 16 + 1 * 64 );
 283
 284                 /* "Divide" by multiplying by inverse quantisation
 285                  and convert back to integers*/
 286                 mulps_m2r( *(mmx_t*)&piqf[0], xmm2 );
 287                 cvtps2pi_r2r( xmm2, mm2 );
 288                 shufps_r2ri( xmm2, xmm2, 2*1 + 3*4 + 0 * 16 + 1 * 64 );
 289                 cvtps2pi_r2r( xmm2, mm3 );
 290
 291                 /* Convert the two pairs of double words into four words */
 292                 packssdw_r2r(  mm3, mm2);
 293
 294
 295                 /* Accumulate saturation... */
 296                 movq_r2r( mm2, mm4 );
 297
 298                 pxor_r2r( mm5, mm5 );   // mm5 = -mm2
 299                 pcmpgtw_r2r( mm1, mm4 ); // mm4 = (mm2 > satlim)
 300                 psubw_r2r( mm2, mm5 );
 301                 pcmpgtw_r2r( mm1, mm5 ); // mm5 = -mm2 > satlim
 302                 por_r2r( mm5, mm4 );  // mm4 = abs(mm2) > satlim
 303                 movq_r2r( mm4, mm5 );
 304                 psrlq_i2r( 32, mm5);
 305                 por_r2r( mm5, mm4 );
 306
 307                 movd_m2r( saturated, mm5 ); // saturated |= mm4
 308                 por_r2r( mm4, mm5 );
 309                 movd_r2m( mm5, saturated );
 310
 311                 /* Store and accumulate zero-ness */
 312                 movq_r2r( mm2, mm3 );
 313                 movq_r2m( mm2, *(mmx_t*)pdst );
 314                 psrlq_i2r( 32, mm3 );
 315                 por_r2r( mm3, mm2 );
 316                 movd_r2m( mm2, tmp );
 317                 flags |= tmp;
 318
 319                 piqf += 4;
 320                 pdst += 4;
 321                 psrc += 4;
 322
 323                 if( (i & 63) == (63/4)*4 )
 324                 {
 325
 326                         if( saturated )
 327                         {
 328                                 int new_mquant = next_larger_quant_hv( picture, mquant );
 329                                 if( new_mquant != mquant )
 330                                 {
 331                                         mquant = new_mquant;
 332                                         goto restart;
 333                                 }
 334                                 else
 335                                 {
 336                                         return quant_non_intra_hv(picture, src, dst, mquant,
 337                                                                                    nonsat_mquant);
 338                                 }
 339                         }
 340
 341                         nzflag = (nzflag<<1) | !!flags;
 342                         flags = 0;
 343                         piqf = i_quant_matf;
 344                 }
 345
 346         }
 347         emms();
 348
 349         //nzflag = (nzflag<<1) | (!!flags);
 350         return nzflag;
 351 }
 352
 353 #endif
 354
 355 /*
 356  * The ordinary MMX version.  Due to the limited dynamic range afforded by working
 357  * with 16-bit int's it (a) has to jump through some gory fudge-factor hoops
 358  * (b) give up in tough cases and fall back on the reference code. Fortunately, the
 359  * latter happens *very* rarely.
 360  *
 361  * TODO Replace the inefficient block-by-block call to the assembler by a sweep
 362  * through the whole lot...
 363  */
 364
 365 int quant_non_intra_hv_mmx(
 366         pict_data_s *picture,
 367         int16_t *src, int16_t *dst,
 368         int mquant,
 369         int *nonsat_mquant)
 370 {
 371
 372         int nzflag;
 373         int clipvalue  = dctsatlim;
 374         int flags = 0;
 375         int saturated = 0;
 376         uint16_t *quant_mat = inter_q;
 377         int comp;
 378         uint16_t *i_quant_mat = i_inter_q;
 379         int imquant;
 380         int16_t *psrc, *pdst;
 381
 382         /* MMX routine does not work right for MQ=2 ... (no unsigned mult) */
 383         if( mquant == 2 )
 384         {
 385                 return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
 386         }
 387         /* If available use the fast MMX quantiser.  It returns
 388            flags to signal if coefficients are outside its limited range or
 389            saturation would occur with the specified quantisation
 390            factor
 391            Top 16 bits - non zero quantised coefficient present
 392            Bits 8-15   - Saturation occurred
 393            Bits 0-7    - Coefficient out of range.
 394         */
 395
 396         nzflag = 0;
 397         pdst = dst;
 398         psrc = src;
 399         comp = 0;
 400         do
 401         {
 402                 imquant = (IQUANT_SCALE/mquant);
 403                 flags = quantize_ni_mmx( pdst, psrc, quant_mat, i_quant_mat,
 404                                                                                 imquant, mquant, clipvalue );
 405                 nzflag = (nzflag << 1) |( !!(flags & 0xffff0000));
 406
 407                 /* If we're saturating simply bump up quantization and start
 408                         from scratch...  if we can't avoid saturation by
 409                         quantising then we're hosed and we fall back to
 410                         saturation using the old C code.  */
 411
 412                 if( (flags & 0xff00) != 0 )
 413                 {
 414                         int new_mquant = next_larger_quant_hv( picture, mquant );
 415                         if( new_mquant != mquant )
 416                         {
 417                                 mquant = new_mquant;
 418                         }
 419                         else
 420                         {
 421                                 saturated = 1;
 422                                 break;
 423                         }
 424
 425                         comp = 0;
 426                         nzflag = 0;
 427                         pdst = dst;
 428                         psrc = src;
 429                 }
 430                 else
 431                 {
 432                         ++comp;
 433                         pdst += 64;
 434                         psrc +=64;
 435                 }
 436                 /* Fall back to 32-bit(or better - if some hero(ine) made this work on
 437                         non 32-bit int machines ;-)) if out of dynamic range for MMX...
 438                 */
 439         }
 440         while( comp < block_count  && (flags & 0xff) == 0  );
 441
 442
 443         /* Coefficient out of range or can't avoid saturation:
 444         fall back to the original 32-bit int version: this is rare */
 445         if(  (flags & 0xff) != 0 || saturated)
 446         {
 447                 return quant_non_intra_hv(picture, src, dst, mquant, nonsat_mquant);
 448         }
 449
 450         *nonsat_mquant = mquant;
 451         return nzflag;
 452 }
 453
 454
 455 void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
 456 {
 457   int i, val;
 458   uint16_t *quant_mat = intra_q;
 459
 460   dst[0] = src[0] << (3-dc_prec);
 461   for (i=1; i<64; i++)
 462   {
 463     val = (int)(src[i]*quant_mat[i]*mquant)/16;
 464
 465     /* mismatch control */
 466     if ((val&1)==0 && val!=0)
 467       val+= (val>0) ? -1 : 1;
 468
 469     /* saturation */
 470     dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
 471   }
 472 }