--- /dev/null
+;
+; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+; This program is free software; you can redistribute it and/or
+; modify it under the terms of the GNU General Public License
+; as published by the Free Software Foundation; either version 2
+; of the License, or (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+;
+;
+;
+; quantize_ni_mmx.s: MMX optimized coefficient quantization sub-routine
+
+
+global quantize_ni_mmx
+; int quantize_ni_mmx(short *dst, short *src,
+; short *quant_mat, short *i_quant_mat,
+; int imquant, int mquant, int sat_limit)
+
+; See quantize.c: quant_non_intra_hv_inv() for reference implementation in C...
+ ;; mquant is not currently used.
+; eax = row counter...
+; ebx = pqm
+; ecx = piqm ; Matrix of quads first (2^16/quant)
+ ; then (2^16/quant)*(2^16%quant) the second part is for rounding
+; edx = temp
+; edi = pdst
+; esi = psrc
+
+; mm0 = [imquant|0..3]W
+; mm1 = [sat_limit|0..3]W
+; mm2 = *psrc -> src
+; mm3 = rounding corrections... / temp
+; mm4 = sign
+; mm5 = nzflag accumulators
+; mm6 = overflow limit
+; mm7 = temp
+
+ ;;
+ ;; private constants needed
+ ;;
+
+SECTION .data
+align 16
+	;; Four identical words: the largest coefficient magnitude (1023) that
+	;; quantize_ni_mmx accepts before taking its out_of_range exit.
+overflim:
+	times 4 dw 1024-1
+
+	;; Scratch buffer of 64 words.  No longer used by the improved main routine.
+SECTION .bss
+align 32
+quant_buf:	resw 64
+
+SECTION .text
+
+
+align 32
+quantize_ni_mmx:
+	;; int quantize_ni_mmx(short *dst, short *src,
+	;;                     short *quant_mat, short *i_quant_mat,
+	;;                     int imquant, int mquant, int sat_limit)
+	;;
+	;; Stack arguments (read relative to ebp):
+	;;   [ebp+8]  dst          [ebp+12] src
+	;;   [ebp+16] quant_mat    [ebp+20] i_quant_mat
+	;;   [ebp+24] imquant      [ebp+28] mquant (never read)
+	;;   [ebp+32] sat_limit
+	;;
+	;; Result in eax:
+	;;   0x00ff      some |src| word exceeded overflim (1023)
+	;;   0xff00      some quantised word exceeded sat_limit
+	;;   0xNNNN0000  otherwise; NNNN is the OR of every quantised magnitude,
+	;;               so it is non-zero iff at least one output word is non-zero
+	;;
+	;; On either early exit the quads processed before the offending one
+	;; have already been written to dst.
+	push	ebp			; save frame pointer
+	mov	ebp, esp		; link
+	push	ebx
+	push	ecx
+	push	edx
+	push	esi
+	push	edi
+
+	mov	edi, [ebp+8]		; edi = dst
+	mov	esi, [ebp+12]		; esi = src
+	mov	ebx, [ebp+16]		; ebx = quant_mat (pqm)
+	mov	ecx, [ebp+20]		; ecx = i_quant_mat (piqm)
+	movd	mm0, [ebp+24]		; get imquant (2^16 / mquant )
+	movq	mm1, mm0
+	punpcklwd mm0, mm1
+	punpcklwd mm0, mm0		; mm0 = [imquant|0..3]W
+
+	movq	mm6, [overflim]		; overflow limit
+
+	movd	mm1, [ebp+32]		; sat_limit
+	movq	mm2, mm1
+	punpcklwd mm1, mm2
+	punpcklwd mm1, mm1		; mm1 = [sat_limit|0..3]W
+
+	pxor	mm5, mm5		; Non-zero flag accumulator
+	mov	eax, 16			; 16 quads to do
+	jmp	nextquadniq
+
+align 32
+nextquadniq:
+	movq	mm2, [esi]		; mm2 = *psrc
+
+	pxor	mm4, mm4
+	pcmpgtw	mm4, mm2		; mm4 = *psrc < 0
+	movq	mm7, mm2		; mm7 = *psrc
+	psllw	mm7, 1			; mm7 = 2*(*psrc)
+	pand	mm7, mm4		; mm7 = 2*(*psrc)*(*psrc < 0)
+	psubw	mm2, mm7		; mm2 = abs(*psrc)
+
+	;;
+	;; Check whether we'll saturate intermediate results.
+	;; The four word masks are folded into the low dword so one
+	;; integer test covers the whole quad.
+	;;
+
+	movq	mm7, mm2
+	pcmpgtw	mm7, mm6		; Too big for 16 bit arithmetic :-( (should be *very* rare)
+	movq	mm3, mm7
+	psrlq	mm3, 32
+	por	mm7, mm3
+	movd	edx, mm7
+	test	edx, edx
+	jnz	near out_of_range
+
+	;;
+	;; Carry on with the arithmetic...
+	psllw	mm2, 5			; mm2 = 32*abs(*psrc)
+	movq	mm7, [ebx]		; mm7 = *pqm
+	psrlw	mm7, 1			; mm7 = *pqm>>1
+	paddw	mm2, mm7		; mm2 = 32*abs(*psrc)+((*pqm)/2) = "p"
+
+
+	;;
+	;; Do the first multiplication.  Cunningly we've set things up so
+	;; it is exactly the top 16 bits we're interested in...
+	;;
+	;; We need the low word results for a rounding correction.
+	;; This is *not* exact (the actual correction is the product
+	;; abs(*psrc)*(*pqm)*(2^16%*qm) >> 16).
+	;; However we get very very few wrong and none too low (the most
+	;; important) and no errors for small coefficients (also important)
+	;; if we simply add abs(*psrc)
+
+
+	movq	mm3, mm2
+	pmullw	mm3, [ecx]
+	movq	mm7, mm2
+	psrlw	mm7, 1			; Want to see if adding p would carry into upper 16 bits
+	psrlw	mm3, 1
+	paddw	mm3, mm7
+	psrlw	mm3, 15			; High bit in lsb rest 0's
+	pmulhw	mm2, [ecx]		; mm2 = (p*iqm+p) >> IQUANT_SCALE_POW2 ~= p/*qm
+
+
+
+	;;
+	;; To hide the latency lets update some pointers...
+	add	esi, 8			; 4 words
+	add	ecx, 8			; 4 words
+	sub	eax, 1
+
+	;; Add rounding correction....
+	paddw	mm2, mm3
+
+
+	;;
+	;; Do the second multiplication, again we need to make a rounding adjustment
+	;; EXPERIMENT: see comments in quantize.c:quant_non_intra_hv don't adjust...
+;	movq	mm3, mm2
+;	pmullw	mm3, mm0
+;	movq	mm7, mm2
+;	psrlw	mm7, 1			; Want to see if adding p would carry into upper 16 bits
+;	psrlw	mm3, 1
+;	paddw	mm3, mm7
+;	psrlw	mm3, 15			; High bit in lsb rest 0's
+
+	pmulhw	mm2, mm0		; mm2 ~= (p/(qm*mquant))
+
+	;;
+	;; To hide the latency lets update some more pointers...
+	add	edi, 8
+	add	ebx, 8
+
+	;; Correct rounding and the factor of two (we want p/(qm*2*mquant)
+;	paddw	mm2, mm3
+	psrlw	mm2, 1
+
+
+	;;
+	;; Check for saturation.  As above, the upper two word masks are
+	;; shifted down and OR-ed into the low dword so that all four words
+	;; of the quad take part in the test.
+	;;
+	movq	mm7, mm2
+	pcmpgtw	mm7, mm1		; mm7 = quantised value > sat_limit
+	movq	mm3, mm7
+	psrlq	mm3, 32			; mm3 = masks of words 2 and 3
+	por	mm7, mm3
+	movd	edx, mm7
+	test	edx, edx
+	jnz	saturated
+
+	;;
+	;; Accumulate non-zero flags
+	por	mm5, mm2
+
+	;;
+	;; Now correct the sign mm4 = *psrc < 0
+	;;
+
+	pxor	mm7, mm7
+	psubw	mm7, mm2
+	psllw	mm7, 1			; mm7 = -2*mm2
+	pand	mm7, mm4		; mm7 = -2*mm2 * (*psrc < 0)
+	paddw	mm2, mm7		; mm2 = samesign(*psrc, mm2 )
+
+	;;
+	;; Store the quantised words....
+	;; (edi was already advanced, hence the -8)
+	;;
+
+	movq	[edi-8], mm2
+	test	eax, eax
+
+	jnz	near nextquadniq
+
+	;; Return saturation in low word and nzflag in high word of result dword
+
+
+	movq	mm0, mm5
+	psrlq	mm0, 32
+	por	mm5, mm0
+	movd	edx, mm5
+	mov	ebx, edx
+	shl	ebx, 16
+	or	edx, ebx
+	and	edx, 0xffff0000		;; high word of edx is nzflag, low word cleared
+	mov	eax, edx
+
+return:
+	pop	edi
+	pop	esi
+	pop	edx
+	pop	ecx
+	pop	ebx
+
+	pop	ebp			; restore frame pointer
+
+	emms				; clear mmx registers
+	ret
+
+out_of_range:
+	mov	eax, 0x00ff
+	jmp	return			; unconditional: must not depend on the parity flag
+saturated:
+
+	mov	eax, 0xff00
+	jmp	return			; unconditional: must not depend on the parity flag
+
+
+
+
+;;;
+;;; void iquant_non_intra_m1_{sse,mmx}(int16_t *src, int16_t *dst, uint16_t
+;;; *quant_mat)
+;;; mmx/sse Inverse mpeg-1 quantisation routine.
+;;;
+;;; eax - block counter...
+;;; edi - src
+;;; esi - dst
+;;; edx - quant_mat
+
+ ;; MMX Register usage
+ ;; mm7 = [1|0..3]W
+ ;; mm6 = [2047|0..3]W
+ ;; mm5 = 0
+
+
+global iquant_non_intra_m1_sse
+align 32
+iquant_non_intra_m1_sse:
+	;; void iquant_non_intra_m1_sse(int16_t *src, int16_t *dst, uint16_t *quant_mat)
+	;;
+	;; Processes 64 words, four at a time.  For every word:
+	;;   val = ((2*abs(src)+1) * quant_mat) >> 5     (16-bit arithmetic)
+	;;   if val is even and non-zero, val -= 1       (mismatch control)
+	;;   val = min(val, 2047); val = 0 when src == 0
+	;;   dst = val carrying the sign of src
+	;; Uses pmaxsw/pminsw, which is what distinguishes it from the _mmx variant.
+
+	push	ebp			; save frame pointer
+	mov	ebp, esp		; link
+
+	push	eax
+	push	esi
+	push	edi
+	push	edx
+
+	mov	edi, [ebp+8]		; edi = src
+	mov	esi, [ebp+12]		; esi = dst
+	mov	edx, [ebp+16]		; edx = quant_mat
+
+	;; Build the word constants from an all-ones register
+	pcmpeqw	mm7, mm7		; mm7 = [0xffff|0..3]W
+	movq	mm6, mm7
+	psrlw	mm7, 15			; mm7 = [1|0..3]W
+	psrlw	mm6, 5			; mm6 = [2047|0..3]W
+	pxor	mm5, mm5		; mm5 = 0
+
+	mov	eax, 16			; 16 quads = 64 coeffs in a DCT block
+
+.quad:
+	movq	mm0, [edi]		; mm0 = *psrc
+	movq	mm2, mm0
+	pcmpeqw	mm2, mm5		; mm2 = 1's where *psrc == 0
+
+	;; Magnitude: max(x, -x)
+	pxor	mm1, mm1
+	psubw	mm1, mm0		; mm1 = -*psrc
+	pmaxsw	mm1, mm0		; mm1 = abs(*psrc)
+
+	psllw	mm1, 1			; mm1 = 2*abs
+	paddw	mm1, mm7		; mm1 = 2*abs + 1
+	pmullw	mm1, [edx]		; mm1 = (2*abs+1) * *quant_mat (low 16 bits)
+	psraw	mm1, 5			; mm1 = val
+
+	;; Mismatch control: knock 1 off every even, non-zero val
+	movq	mm3, mm1
+	pandn	mm3, mm7		; mm3 = ~val & 1  (1 where val is even)
+	movq	mm4, mm1
+	pcmpeqw	mm4, mm5		; mm4 = 1's where val == 0
+	pandn	mm4, mm3		; mm4 = (val != 0) & (val even)
+	psubw	mm1, mm4
+
+	pminsw	mm1, mm6		; clamp at 2047
+
+	;; Force zero where the input was zero, then put the sign back
+	pandn	mm2, mm1		; mm2 = res, or 0 where *psrc == 0
+	psraw	mm0, 15			; mm0 = 1's where *psrc < 0
+	pxor	mm2, mm0
+	psubw	mm2, mm0		; mm2 = (*psrc < 0) ? -res : res
+	movq	[esi], mm2
+
+	add	edi, 8
+	add	edx, 8
+	add	esi, 8
+	dec	eax
+	jnz	.quad
+
+	pop	edx
+	pop	edi
+	pop	esi
+	pop	eax
+
+	pop	ebp			; restore frame pointer
+
+	emms				; clear mmx registers
+	ret
+
+
+;;;
+;;; void iquant_non_intra_m1_mmx(int16_t *src, int16_t *dst, uint16_t
+;;; *quant_mat)
+;;; Inverse mpeg-1 quantisation routine using plain MMX instructions only
+;;; (abs and the 2047 clamp are synthesised instead of using pmaxsw/pminsw).
+;;;
+;;; eax - block counter...
+;;; edi - src
+;;; esi - dst
+;;; edx - quant_mat
+
+	;; MMX Register usage
+	;; mm7 = [1|0..3]W
+	;; mm6 = [MAX_INT16-2047|0..3]W   (signed bias for the paddsw/psubw clamp)
+	;; mm5 = 0
+
+
+global iquant_non_intra_m1_mmx
+align 32
+iquant_non_intra_m1_mmx:
+
+	push	ebp			; save frame pointer
+	mov	ebp, esp		; link
+
+	push	eax
+	push	esi
+	push	edi
+	push	edx
+
+	mov	edi, [ebp+8]		; get psrc
+	mov	esi, [ebp+12]		; get pdst
+	mov	edx, [ebp+16]		; get quant table
+	mov	eax,1
+	movd	mm7, eax
+	punpcklwd mm7, mm7
+	punpckldq mm7, mm7		; mm7 = [1|0..3]W
+
+	;; The clamp below uses *signed* saturating addition, so the bias has
+	;; to be measured from the signed maximum 0x7fff.  (0xffff-2047 is
+	;; -2048 as a signed word and can never push a value into saturation.)
+	mov	eax, (0x7fff-2047)
+	movd	mm6, eax
+	punpcklwd mm6, mm6
+	punpckldq mm6, mm6		; mm6 = [0x7fff-2047|0..3]W
+
+	mov	eax, 64			; 64 coeffs in a DCT block
+	pxor	mm5, mm5
+
+iquant_loop:
+	movq	mm0, [edi]		; mm0 = *psrc
+	add	edi,8
+	pxor	mm1, mm1
+	movq	mm2, mm0
+	pcmpeqw	mm2, mm5		; mm2 = 1's where *psrc == 0
+	pcmpeqw	mm2, mm5		; mm2 = 1's for non-zero in mm0
+
+	;; Work with absolute value for convenience...
+
+	psubw	mm1, mm0		; mm1 = -*psrc
+	psllw	mm1, 1			; mm1 = -2* *psrc
+	movq	mm3, mm0
+	pcmpgtw	mm3, mm5		; mm3 = *psrc > 0
+	pcmpeqw	mm3, mm5		; mm3 = *psrc <= 0
+	pand	mm3, mm1		; mm3 = (*psrc <= 0)*-2* *psrc
+	movq	mm1, mm0
+	paddw	mm1, mm3		; mm1 = (*psrc <= 0)*-2* *psrc + *psrc = abs(*psrc)
+
+
+	paddw	mm1, mm1		; mm1 *= 2;
+	paddw	mm1, mm7		; mm1 += 1
+	pmullw	mm1, [edx]		; mm1 = (val*2+1) * *quant_mat
+	add	edx, 8
+	psraw	mm1, 5			; mm1 = ((val*2+1) * *quant_mat)/32
+
+	;; Now that nasty mis-match control
+
+	movq	mm3, mm1
+	pand	mm3, mm7
+	pxor	mm3, mm7		; mm3 = ~(val&1) (in the low bits, others 0)
+	movq	mm4, mm1
+	pcmpeqw	mm4, mm5		; mm4 = (val == 0)
+	pxor	mm4, mm7		; Low bits now (val != 0)
+	pand	mm3, mm4		; mm3 = (~(val&1))&(val!=0)
+
+	psubw	mm1, mm3		; mm1 -= (~(val&1))&(val!=0)
+
+	;; Signed clamp at 2047, equivalent to pminsw against [2047|0..3]W
+	paddsw	mm1, mm6		; Will saturate at 0x7fff if > 2047
+	psubw	mm1, mm6		; 2047 if saturated... unchanged otherwise
+
+	;; Handle zero case and restoring sign
+	pand	mm1, mm2		; Zero in the zero case
+	pxor	mm3, mm3
+	psubw	mm3, mm1		; mm3 = - res
+	paddw	mm3, mm3		; mm3 = - 2*res
+	pcmpgtw	mm0, mm5		; mm0 = *psrc > 0
+	pcmpeqw	mm0, mm5		; mm0 = *psrc <= 0
+	pand	mm3, mm0		; mm3 = *psrc <= 0 ? -2 * res : 0
+	paddw	mm1, mm3		; mm1 = samesign(*psrc,res)
+	movq	[esi], mm1
+	add	esi,8
+
+	sub	eax, 4
+	jnz	near iquant_loop
+
+	pop	edx
+	pop	edi
+	pop	esi
+	pop	eax
+
+	pop	ebp			; restore frame pointer
+
+	emms				; clear mmx registers
+	ret
+
+
+
+;;; int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat)
+;;; Simply add up the absolute values of the coefficients, each weighted
+;;; by its corresponding i_quant_mat entry.
+;;;
+;;; ecx - quad counter...
+;;; edi - src
+;;; esi - i_quant_mat
+
+	;; MMX Register usage
+	;; mm7 = 0
+	;; mm6 = [partial sum|0..1]D accumulator
+
+
+global quant_weight_coeff_sum_mmx
+align 32
+quant_weight_coeff_sum_mmx:
+	;; int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat)
+	;;
+	;; Returns in eax the 32-bit sum, over 64 words, of
+	;; abs(src[i]) * i_quant_mat[i].  Each product is formed as a full
+	;; 32-bit value from pmullw (low half) and pmulhw (signed high half).
+	push	ebp			; save frame pointer
+	mov	ebp, esp		; link
+
+	push	ecx
+	push	esi
+	push	edi
+
+	mov	edi, [ebp+8]		; edi = src
+	mov	esi, [ebp+12]		; esi = i_quant_mat
+
+	pxor	mm7, mm7		; Zero
+	pxor	mm6, mm6		; Accumulator: two dword partial sums
+	mov	ecx, 16			; 16 coefficient / quantiser quads to process...
+
+.quad:
+	movq	mm0, [edi]		; mm0 = four coefficients
+	movq	mm2, [esi]		; mm2 = four weights
+
+	;; abs(x) = (x ^ m) - m with m = all-ones where x < 0
+	movq	mm1, mm7
+	pcmpgtw	mm1, mm0		; mm1 = m
+	pxor	mm0, mm1
+	psubw	mm0, mm1		; mm0 = abs(coefficients)
+
+	;; Full 32-bit products: high words in mm1, low words in mm0
+	movq	mm1, mm0
+	pmulhw	mm1, mm2
+	pmullw	mm0, mm2
+
+	;; Interleave low/high words into dwords and fold them together
+	movq	mm3, mm0
+	punpckhwd mm0, mm1		; products 2 and 3
+	punpcklwd mm3, mm1		; products 0 and 1
+	paddd	mm0, mm3
+	paddd	mm6, mm0
+
+	add	edi, 8
+	add	esi, 8
+	dec	ecx
+	jnz	.quad
+
+	;; Horizontal add of the two dword partial sums
+	movq	mm0, mm6
+	psrlq	mm0, 32
+	paddd	mm6, mm0
+	movd	eax, mm6
+
+	pop	edi
+	pop	esi
+	pop	ecx
+
+	pop	ebp			; restore frame pointer
+
+	emms				; clear mmx registers
+	ret
+
+