/* quantize.c, quantization / inverse quantization                          */

/* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */

/*
 * Disclaimer of Warranty
 *
 * These software programs are available to the user without any license fee or
 * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
 * any and all warranties, whether express, implied, or statuary, including any
 * implied warranties or merchantability or of fitness for a particular
 * purpose.  In no event shall the copyright-holder be liable for any
 * incidental, punitive, or consequential damages of any kind whatsoever
 * arising from the use of these programs.
 *
 * This disclaimer of warranty extends to the user of these programs and user's
 * customers, employees, agents, transferees, successors, and assigns.
 *
 * The MPEG Software Simulation Group does not represent or warrant that the
 * programs furnished hereunder are free of infringement of any third-party
 * patents.
 *
 * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
 * are subject to royalty fees to patent holders.  Many of these patents are
 * general enough such that they are unavoidable regardless of implementation
 * design.
 *
 */

#include "config.h"
#include <stdio.h>
#include <math.h>
#include <fenv.h>
#include "global.h"
#include "cpu_accel.h"
#include "simd.h"
#include "fastintfns.h"


/*
 * Global function pointers for SIMD-dependent functions.
 * Both are filled in by init_quantizer_hv(); as file-scope objects they are
 * NULL until that has been called.
 */
/* Non-intra quantiser: a plain C or SIMD routine with the signature of
   quant_non_intra_hv(). */
int (*pquant_non_intra)(pict_data_s *picture, int16_t *src, int16_t *dst,
						int mquant, int *nonsat_mquant);
/* Weighted coefficient sum: quant_weight_coeff_sum() or an MMX replacement. */
int (*pquant_weight_coeff_sum)(int16_t *blk, uint16_t*i_quant_mat );

/* Local functions pointers for SIMD-dependent functions */

/* Inverse non-intra quantiser called by iquant_non_intra() when the global
   mpeg1 flag is set; also assigned by init_quantizer_hv(). */
static void (*piquant_non_intra_m1)(int16_t *src, int16_t *dst,  uint16_t *quant_mat);


/* Forward declarations of the plain C implementations defined further down,
   so that init_quantizer_hv() can take their addresses. */
static int quant_weight_coeff_sum( int16_t *blk, uint16_t * i_quant_mat );
static void iquant_non_intra_m1(int16_t *src, int16_t *dst, uint16_t *quant_mat);


/*
 * Initialise the quantisation routines.
 *
 * Queries cpu_accel() and fills in the three dispatch pointers
 * pquant_non_intra, pquant_weight_coeff_sum and piquant_non_intra_m1.
 *
 * On X86_CPU builds where MMX is reported:
 *   - pquant_non_intra uses the 3DNow! routine if 3DNow! is reported,
 *     otherwise the plain C quant_non_intra_hv (an SSE variant,
 *     quant_non_intra_hv_sse, used to be selectable here but is disabled);
 *   - pquant_weight_coeff_sum always uses the MMX routine;
 *   - piquant_non_intra_m1 uses the SSE routine if extended MMX is
 *     reported, otherwise the MMX routine.
 * If `verbose` is set the selection is reported on stderr.
 *
 * In every other case the plain C routines from this file are installed.
 */

void init_quantizer_hv()
{
  int cpu_caps = cpu_accel();

#ifdef X86_CPU
  if( (cpu_caps & ACCEL_X86_MMX) != 0 ) /* MMX CPU */
	{
		const int has_3dnow  = (cpu_caps & ACCEL_X86_3DNOW) != 0;
		const int has_mmxext = (cpu_caps & ACCEL_X86_MMXEXT) != 0;

		/* Forward non-intra quantiser */
		if( has_3dnow )
			pquant_non_intra = quant_non_intra_hv_3dnow;
		else
			pquant_non_intra = quant_non_intra_hv;

		/* The weighted sum has a single MMX implementation */
		pquant_weight_coeff_sum = quant_weight_coeff_sum_mmx;

		/* MPEG-1 inverse non-intra quantiser */
		if( has_mmxext )
			piquant_non_intra_m1 = iquant_non_intra_m1_sse;
		else
			piquant_non_intra_m1 = iquant_non_intra_m1_mmx;

		/* Report what was picked, in the same order it was decided */
		if( verbose )
		{
			fprintf( stderr, "SETTING " );
			if( has_3dnow )
				fprintf( stderr, "3DNOW and ");
			if( has_mmxext )
				fprintf( stderr, "EXTENDED MMX");
			else
				fprintf( stderr, "MMX");
			fprintf( stderr, " for QUANTIZER!\n");
		}
		return;
	}
#endif

  /* No usable SIMD support: fall back to the portable C routines */
  pquant_non_intra = quant_non_intra_hv;
  pquant_weight_coeff_sum = quant_weight_coeff_sum;
  piquant_non_intra_m1 = iquant_non_intra_m1;
}

/*
 * Computes the next quantisation step up from `quant`.  Used to avoid
 * saturation in macroblock coefficients - common in MPEG-1 - which causes
 * nasty artefacts.
 *
 * If picture->q_scale_type is zero the step is simply quant+2.  Otherwise
 * `quant` is mapped to an index through map_non_linear_mquant_hv[], the
 * index is advanced by one and translated back through
 * non_linear_mquant_table_hv[].
 *
 * In both cases `quant` is returned unchanged when the stepped value
 * (respectively the stepped index) would exceed 31, so callers can detect
 * "no larger quantiser available" by comparing the result with the argument.
 *
 * NOTE: Does no range checking on `quant` itself; it is used directly as an
 * index into map_non_linear_mquant_hv[].
 */

int next_larger_quant_hv( pict_data_s *picture, int quant )
{
	if( !picture->q_scale_type )
	{
		/* Linear scale: fixed increment of two */
		return ( quant+2 > 31 ) ? quant : quant+2;
	}
	else
	{
		/* Non-linear scale: step to the next table entry */
		int next_index = map_non_linear_mquant_hv[quant]+1;

		return ( next_index > 31 ) ? quant
		                           : non_linear_mquant_table_hv[next_index];
	}
}

/*
 * Quantisation for intra blocks using Test Model 5 quantization
 *
 * AC coefficients are computed as (32*|x| + 3*d/4) / (2*d) with
 * d = intra_q_tbl[mquant][i], i.e. |x|*16/d rounded with an offset of 3/8
 * instead of 1/2: a bias of 1/8 stepsize towards zero.  The DC coefficient
 * is instead divided, with rounding to nearest, by 8 >> picture->dc_prec.
 *
 * If an AC level exceeds dctsatlim the quantiser is raised with
 * next_larger_quant_hv() and the whole macroblock is quantised again from
 * the start.  If no larger quantiser is available the level is clamped to
 * dctsatlim instead, so the function always terminates.
 *
 * PRECONDITION: src and dst point to *distinct* memory buffers, each made
 *               of block_count *adjacent* int16_t[64] arrays.
 *
 * picture       supplies dc_prec and the quantiser scale type
 * src           DCT coefficients to quantise
 * dst           receives the quantised levels
 * mquant        quantiser to start with
 * nonsat_mquant receives the quantiser that was finally used
 *
 * RETURN: nothing; results are delivered through dst and *nonsat_mquant.
 */

void quant_intra_hv(
	pict_data_s *picture,
	int16_t *src, 
	int16_t *dst,
	int mquant,
	int *nonsat_mquant
	)
{
  int16_t *psrc,*pbuf;
  int i,comp;
  int x, y, d;
  int clipping;
  int clipvalue  = dctsatlim;
  uint16_t *quant_mat = intra_q_tbl[mquant] /* intra_q */;


  /* Inspired by suggestion by Juan.  Quantize a little harder if we clip...
   */

  do
	{
	  clipping = 0;
	  pbuf = dst;
	  psrc = src;
	  for( comp = 0; comp<block_count && !clipping; ++comp )
	  {
		/* DC coefficient */
		x = psrc[0];
		d = 8>>picture->dc_prec; /* intra_dc_mult */
		pbuf[0] = (x>=0) ? (x+(d>>1))/d : -((-x+(d>>1))/d); /* round(x/d) */


		/* AC coefficients */
		for (i=1; i<64 ; i++)
		  {
			x = psrc[i];
			d = quant_mat[i];
			/* RJ: save one divide operation */
			y = ((abs(x) << 5)+ ((3 * d) >> 2)) / (d << 1);
			if ( y > clipvalue )
			  {
				int new_mquant = next_larger_quant_hv( picture, mquant );
				if( new_mquant != mquant )
				  {
					/* A coarser quantiser exists: switch to it and redo
					   the macroblock from the first block. */
					clipping = 1;
					mquant = new_mquant;
					quant_mat = intra_q_tbl[mquant];
					break;
				  }
				/* Already at the largest quantiser.  Restarting would
				   reproduce the same overflow forever, so saturate this
				   level and carry on; everything written so far was
				   produced with the current quantiser and stays valid. */
				y = clipvalue;
			  }
		  
		  	pbuf[i] = intsamesign(x,y);
		  }
		pbuf += 64;
		psrc += 64;
	  }
			
	} while( clipping );
  *nonsat_mquant = mquant;
}


/*
 * Quantisation-matrix weighted coefficient sum.
 *
 * Returns the sum over all 64 entries of |blk[i]| * i_quant_mat[i].
 * To be used for rate control as a measure of DCT block complexity.
 *
 * Per the original author's notes the result is meant to be read as a
 * fixed-point number with the low 16 bits fractional, and typical average
 * values for a rather noisy video are around 20.0 - that depends on how the
 * caller scales i_quant_mat, which is not visible here.
 *
 * blk          one block of 64 coefficients
 * i_quant_mat  64 per-coefficient weights
 */

int quant_weight_coeff_sum( int16_t *blk, uint16_t * i_quant_mat )
{
	const int16_t  *coeff  = blk;
	const uint16_t *weight = i_quant_mat;
	const int16_t  *const end = blk + 64;
	int total = 0;

	while( coeff != end )
	{
		total += abs( (int)*coeff ) * (*weight);
		++coeff;
		++weight;
	}
	return total;
}


/*
 * Quantisation for non-intra blocks.
 *
 * Every level is |x|*16 / d with d = inter_q_tbl[mquant][i & 63], the sign
 * of x being restored afterwards.  A.Stevens 2000: unlike the intra
 * quantiser there is no rounding term here - the division simply
 * truncates, which was found to cost the encoder fewer bits.
 *
 * If a level exceeds dctsatlim the quantiser is raised with
 * next_larger_quant_hv() and the whole macroblock is quantised again from
 * coefficient 0.  Once no larger quantiser is available one last pass is
 * made in which offending levels are clamped to dctsatlim.
 *
 * PRECONDITION: src and dst point to *distinct* memory buffers, each made
 *               of block_count *adjacent* int16_t[64] arrays.
 *
 * picture       supplies the quantiser scale type
 * src           DCT coefficients to quantise
 * dst           receives the quantised levels
 * mquant        quantiser to start with
 * nonsat_mquant receives the quantiser that was finally used
 *
 * RETURN: A bit-mask of block_count bits indicating non-zero blocks (a 1).
 *         Block 0 is the most significant of those bits (bit block_count-1),
 *         the last block is bit 0.
 *
 * TODO: A candidate for use of efficient abs and "intsamesign". If only gcc understood
 * PPro conditional moves...
 */

int quant_non_intra_hv(
						   pict_data_s *picture,
						   int16_t *src, int16_t *dst,
						   int mquant,
						   int *nonsat_mquant)
{
	int i;
	int x, y, d;
	int nzflag = 0;      /* mask of finished blocks                        */
	int flags = 0;       /* OR of all levels written in the current block  */
	int saturated = 0;   /* set once the quantiser cannot be raised further */
	int clipvalue  = dctsatlim;
	int coeff_count = 64*block_count;
	uint16_t *quant_mat = inter_q_tbl[mquant]/* inter_q */;

	i = 0;
	while( i < coeff_count )
	{
		if( (i%64) == 0 )
		{
			/* Block boundary: record whether the block just finished had
			   any non-zero level.  At i == 0 flags is 0, so this shifts
			   in nothing. */
			nzflag = (nzflag<<1) | !!flags;
			flags = 0;
		}

		/* RJ: save one divide operation */
		x = abs( ((int)src[i]) );
		d = (int)quant_mat[(i&63)];
		y = (x<<4) / (d);
		if ( y > clipvalue )
		{
			if( saturated )
			{
				y = clipvalue;
			}
			else
			{
				int new_mquant = next_larger_quant_hv( picture, mquant );
				if( new_mquant != mquant )
				{
					mquant = new_mquant;
					quant_mat = inter_q_tbl[mquant];
				}
				else
				{
					saturated = 1;
				}
				/* Start the macroblock over.  flags has to be cleared
				   along with nzflag: it still holds levels from the
				   abandoned pass and would otherwise be shifted in at
				   i == 0, leaving a spurious bit above the block_count
				   real ones in the returned mask. */
				i = 0;
				nzflag = 0;
				flags = 0;
				continue;
			}
		}
		dst[i] = intsamesign(src[i], y) /* (src[i] >= 0 ? y : -y) */;
		flags |= dst[i];
		++i;
	}
	/* Account for the last block */
	nzflag = (nzflag<<1) | !!flags;

    *nonsat_mquant = mquant;
    return nzflag;
}

/*
 * MPEG-1 inverse quantization of one intra block.
 *
 * src      64 quantised levels
 * dst      receives the 64 reconstructed coefficients; each dst[i] is
 *          written only after src[i] has been read
 * dc_prec  DC precision; the DC level is scaled by 2^(3-dc_prec)
 * mquant   quantiser scale, applied together with the global intra_q matrix
 *
 * AC coefficients are src[i]*intra_q[i]*mquant/16, then forced to an odd
 * value (mismatch control) and saturated to [-2048, 2047].
 */
static void iquant1_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
{
  int i, val;
  uint16_t *quant_mat = intra_q;

  /* Multiply instead of shifting: the DC level may be negative, and
     left-shifting a negative value is undefined behaviour in C. */
  dst[0] = src[0] * (1 << (3-dc_prec));
  for (i=1; i<64; i++)
  {
    val = (int)(src[i]*quant_mat[i]*mquant)/16;

    /* mismatch control: move non-zero even values one step towards zero */
    if ((val&1)==0 && val!=0)
      val+= (val>0) ? -1 : 1;

    /* saturation */
    dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
  }
}


/*
 * Inverse quantization of one intra block.
 *
 * When the global mpeg1 flag is set the work is handed to iquant1_intra().
 * Otherwise the MPEG-2 rules are applied: the DC level is scaled by
 * 2^(3-dc_prec), AC coefficients are src[i]*intra_q[i]*mquant/16 saturated
 * to [-2048, 2047], and if the sum of all 64 reconstructed coefficients is
 * even the least significant bit of dst[63] is toggled (mismatch control).
 *
 * src      64 quantised levels
 * dst      receives the 64 reconstructed coefficients; each dst[i] is
 *          written only after src[i] has been read
 * dc_prec  DC precision
 * mquant   quantiser scale
 */
void iquant_intra(int16_t *src, int16_t *dst, int dc_prec, int mquant)
{
  int i, val, sum;

  if ( mpeg1  )
    iquant1_intra(src,dst,dc_prec, mquant);
  else
  {
    /* Multiply instead of shifting: the DC level may be negative, and
       left-shifting a negative value is undefined behaviour in C. */
    dst[0] = src[0] * (1 << (3-dc_prec));
    sum = dst[0];
    for (i=1; i<64; i++)
    {
      val = (int)(src[i]*intra_q[i]*mquant)/16;
      sum+= dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
    }

    /* mismatch control */
    if ((sum&1)==0)
      dst[63]^= 1;
  }
}


/*
 * Plain C inverse quantization of one non-intra block, MPEG-1 rules.
 *
 * src        64 quantised levels
 * dst        receives the 64 reconstructed coefficients; each dst[i] is
 *            written only after src[i] has been read
 * quant_mat  64 quantiser weights
 *
 * A zero level stays zero.  Any other level L becomes
 * (2*L + sign(L)) * quant_mat[i] / 32, is moved one step towards zero if
 * that result is even and non-zero (mismatch control), and is finally
 * saturated.
 *
 * Two variants are selected by the preprocessor.  The default one (built
 * when ORIGINAL_CODE is NOT defined) works on signed values and saturates
 * to [-2048, 2047].  The alternative works on magnitudes and relies on
 * fastmin()/intsamesign(), which appears to make it saturate symmetrically
 * at +/-2047.
 */
static void iquant_non_intra_m1(int16_t *src, int16_t *dst,  uint16_t *quant_mat)
{
#ifndef ORIGINAL_CODE
  int k;

  for (k = 0; k < 64; ++k)
  {
    const int level = src[k];
    int recon = 0;

    if (level != 0)
    {
      /* 2*L+1 for positive levels, 2*L-1 for negative ones */
      const int doubled = (level > 0) ? 2*level + 1 : 2*level - 1;

      recon = (doubled * quant_mat[k]) / 32;

      /* mismatch control */
      if (recon != 0 && (recon & 1) == 0)
      {
        if (recon > 0)
          recon -= 1;
        else
          recon += 1;
      }
    }

    /* saturation */
    if (recon > 2047)
      recon = 2047;
    else if (recon < -2048)
      recon = -2048;
    dst[k] = recon;
  }
#else
  int k;

  for (k = 0; k < 64; ++k)
  {
    int mag = abs(src[k]);

    if (mag != 0)
    {
      mag = ((mag+mag+1)*quant_mat[k]) >> 5;
      /* mismatch control: subtracts 1 exactly when mag is even and non-zero */
      mag -= (~(mag&1))&(mag!=0);
      /* saturation */
      mag = fastmin(mag, 2047);
    }
    dst[k] = intsamesign(src[k],mag);
  }
#endif
}


/*
 * Inverse quantization of one non-intra block.
 *
 * When the global mpeg1 flag is set the block is handed to the routine
 * installed in piquant_non_intra_m1.  Otherwise the MPEG-2 rules are
 * applied: a non-zero level L becomes (2*L + sign(L)) * weight / 32
 * saturated to [-2048, 2047], and if the sum of the 64 reconstructed
 * coefficients is even the least significant bit of dst[63] is toggled
 * (mismatch control).
 *
 * src     64 quantised levels
 * dst     receives the 64 reconstructed coefficients; each dst[i] is
 *         written only after src[i] has been read
 * mquant  quantiser scale; selects the row of inter_q_tbl to use
 *
 * The MPEG-2 path exists in two forms.  With ORIGINAL_CODE defined the
 * signed reference arithmetic on inter_q[i]*mquant is used.  The default
 * form works on magnitudes with the pre-multiplied inter_q_tbl[mquant].
 */
void iquant_non_intra(int16_t *src, int16_t *dst, int mquant )
{
  int i, val, sum;
  uint16_t *quant_mat;

  if ( mpeg1 )
    (*piquant_non_intra_m1)(src,dst,inter_q_tbl[mquant]);
  else
  {
	  sum = 0;
#ifdef ORIGINAL_CODE

	  for (i=0; i<64; i++)
	  {
		  val = src[i];
		  if (val!=0)
			  
			  val = (int)((2*val+(val>0 ? 1 : -1))*inter_q[i]*mquant)/32;
		  sum+= dst[i] = (val>2047) ? 2047 : ((val<-2048) ? -2048 : val);
	  }
#else
	  quant_mat = inter_q_tbl[mquant];
	  for (i=0; i<64; i++)
	  {
		  val = src[i];
		  if( val != 0 )
		  {
			  /* The legal range is asymmetric, [-2048, 2047], so the
			     magnitude limit depends on the sign.  Clamping both signs
			     to 2047 would reconstruct -2047 where the reference
			     arithmetic gives -2048, and would flip the parity used
			     for mismatch control below. */
			  int limit = (val < 0) ? 2048 : 2047;

			  val = abs(val);
			  val = (int)((val+val+1)*quant_mat[i])>>5;
			  val = intmin( val, limit );
			  /* Summing magnitudes is enough: only the parity is used,
			     and v and -v have the same parity. */
			  sum += val;
		  }
		  dst[i] = intsamesign(src[i],val);
	  }
#endif

    /* mismatch control */
    if ((sum&1)==0)
      dst[63]^= 1;
  }
}

/*
 * Inverse-quantise every block of a picture in place.
 *
 * Walks the mb_per_pict macroblocks of `picture`; each has block_count
 * consecutive int16_t[64] blocks in picture->qblocks.  Macroblocks whose
 * mb_type has MB_INTRA set go through iquant_intra() with the picture's
 * dc_prec, all others through iquant_non_intra().  The quantiser is taken
 * from the macroblock's own mquant.  Source and destination are the same
 * buffer, so the levels are overwritten by the reconstructed coefficients.
 *
 * All per-picture data is read from `picture`, so the routine also works
 * for a picture other than the global cur_picture.
 */
void iquantize( pict_data_s *picture )
{
	int j,k;
	int16_t (*qblocks)[64] = picture->qblocks;
	for (k=0; k<mb_per_pict; k++)
	{
		if (picture->mbinfo[k].mb_type & MB_INTRA)
			for (j=0; j<block_count; j++)
				iquant_intra(qblocks[k*block_count+j],
							 qblocks[k*block_count+j],
							 picture->dc_prec,
							 picture->mbinfo[k].mquant);
		else
			for (j=0;j<block_count;j++)
				iquant_non_intra(qblocks[k*block_count+j],
								 qblocks[k*block_count+j],
								 picture->mbinfo[k].mquant);
	}
}