;
;  Copyright (C) 2000 Andrew Stevens
;
;  This program is free software; you can redistribute it and/or
;  modify it under the terms of the GNU General Public License
;  as published by the Free Software Foundation; either version 2
;  of the License, or (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU General Public License
;  along with this program; if not, write to the Free Software
;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
;
;
;
;  quantize_ni_mmx.s:  MMX optimized coefficient quantization sub-routine
;
;  Syntax: NASM (Intel operand order).  Target: 32-bit x86.
;  All routines take their arguments on the stack ([ebp+8] upwards after
;  the frame is linked) and return with a plain `ret`.


global quantize_ni_mmx

;-----------------------------------------------------------------------
; int quantize_ni_mmx(short *dst, short *src,
;                     short *quant_mat, short *i_quant_mat,
;                     int imquant, int mquant, int sat_limit)
;
; See quantize.c: quant_non_intra_hv_inv() for reference implementation in C...
;; mquant is not currently used.
;
; Processes 16 quads (64 words) of src.
;
; Returns (eax):
;   0x000000ff   some abs(src[i]) exceeded overflim (1023); the 16-bit
;                intermediate arithmetic would overflow
;   0x0000ff00   some quantised magnitude exceeded sat_limit
;   otherwise    high word = OR of all quantised magnitudes (non-zero
;                flag), low word = 0
; On either early exit dst has only been written for the quads that were
; completed before the offending quad.
;
; eax = row counter...
; ebx = pqm
; ecx = piqm  ; Matrix of quads first (2^16/quant)
;             ; then (2^16/quant)*(2^16%quant) the second part is for rounding
; edx = temp
; edi = pdst
; esi = psrc
;
; mm0 = [imquant|0..3]W
; mm1 = [sat_limit|0..3]W
; mm2 = *psrc -> src
; mm3 = rounding corrections... / temp
; mm4 = sign
; mm5 = nzflag accumulators
; mm6 = overflow limit
; mm7 = temp
;-----------------------------------------------------------------------

;;
;;  private constants needed
;;

SECTION .data
align 16
overflim:
        dw      1024-1
        dw      1024-1
        dw      1024-1
        dw      1024-1

;; BUFFER NO LONGER USED DUE TO IMPROVED MAIN ROUTINE...
SECTION .bss
align 32
quant_buf:      resw 64

SECTION .text

align 32
quantize_ni_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link
        push    ebx
        push    ecx
        push    edx
        push    esi
        push    edi

        mov     edi, [ebp+8]            ; edi = dst
        mov     esi, [ebp+12]           ; esi = src
        mov     ebx, [ebp+16]           ; ebx = pqm
        mov     ecx, [ebp+20]           ; ecx = piqm
        movd    mm0, [ebp+24]           ; get imquant (2^16 / mquant)
        movq    mm1, mm0
        punpcklwd mm0, mm1
        punpcklwd mm0, mm0              ; mm0 = [imquant|0..3]W

        movq    mm6, [overflim]         ; overflow limit

        movd    mm1, [ebp+32]           ; sat_limit
        movq    mm2, mm1
        punpcklwd mm1, mm2
        punpcklwd mm1, mm1              ; mm1 = [sat_limit|0..3]W

        pxor    mm5, mm5                ; Non-zero flag accumulator
        mov     eax, 16                 ; 16 quads to do
        jmp     nextquadniq

align 32
nextquadniq:
        movq    mm2, [esi]              ; mm2 = *psrc

        pxor    mm4, mm4
        pcmpgtw mm4, mm2                ; mm4 = *psrc < 0
        movq    mm7, mm2                ; mm7 = *psrc
        psllw   mm7, 1                  ; mm7 = 2*(*psrc)
        pand    mm7, mm4                ; mm7 = 2*(*psrc)*(*psrc < 0)
        psubw   mm2, mm7                ; mm2 = abs(*psrc)

        ;;
        ;;  Check whether we'll saturate intermediate results.
        ;;  Fold the high dword of the compare mask into the low dword so
        ;;  that a single 32-bit test covers all four words.
        ;;
        movq    mm7, mm2
        pcmpgtw mm7, mm6                ; Too big for 16 bit arithmetic :-( (should be *very* rare)
        movq    mm3, mm7
        psrlq   mm3, 32
        por     mm7, mm3
        movd    edx, mm7
        test    edx, edx
        jnz     near out_of_range

        ;;
        ;; Carry on with the arithmetic...
        psllw   mm2, 5                  ; mm2 = 32*abs(*psrc)
        movq    mm7, [ebx]              ; mm7 = *pqm
        psrlw   mm7, 1                  ; mm7 = *pqm>>1
        paddw   mm2, mm7                ; mm2 = 32*abs(*psrc)+((*pqm)/2) = "p"

        ;;
        ;; Do the first multiplication.  Cunningly we've set things up so
        ;; it is exactly the top 16 bits we're interested in...
        ;;
        ;; We need the low word results for a rounding correction.
        ;; This is *not* exact (the actual correction is the product
        ;; abs(*psrc)*(*pqm)*(2^16%*qm) >> 16).
        ;; However we get very very few wrong and none too low (the most
        ;; important) and no errors for small coefficients (also important)
        ;; if we simply add abs(*psrc)
        movq    mm3, mm2
        pmullw  mm3, [ecx]
        movq    mm7, mm2
        psrlw   mm7, 1                  ; Want to see if adding p would carry into upper 16 bits
        psrlw   mm3, 1
        paddw   mm3, mm7
        psrlw   mm3, 15                 ; High bit in lsb rest 0's
        pmulhw  mm2, [ecx]              ; mm2 = (p*iqm+p) >> IQUANT_SCALE_POW2 ~= p/*qm

        ;;
        ;; To hide the latency lets update some pointers...
        add     esi, 8                  ; 4 words
        add     ecx, 8                  ; 4 words
        sub     eax, 1

        ;; Add rounding correction....
        paddw   mm2, mm3

        ;;
        ;; Do the second multiplication, again we need to make a rounding adjustment
        ;; EXPERIMENT: see comments in quantize.c:quant_non_intra_hv don't adjust...
;       movq    mm3, mm2
;       pmullw  mm3, mm0
;       movq    mm7, mm2
;       psrlw   mm7, 1                  ; Want to see if adding p would carry into upper 16 bits
;       psrlw   mm3, 1
;       paddw   mm3, mm7
;       psrlw   mm3, 15                 ; High bit in lsb rest 0's

        pmulhw  mm2, mm0                ; mm2 ~= (p/(qm*mquant))

        ;;
        ;; To hide the latency lets update some more pointers...
        add     edi, 8
        add     ebx, 8

        ;; Correct rounding and the factor of two (we want p/(qm*2*mquant)
;       paddw   mm2, mm3
        psrlw   mm2, 1

        ;;
        ;; Check for saturation.  Same fold as the overflow check above:
        ;; the shifted copy in mm3 must survive until the por, otherwise
        ;; words 2 and 3 of the quad would never be tested.
        ;;
        movq    mm7, mm2
        pcmpgtw mm7, mm1
        movq    mm3, mm7
        psrlq   mm3, 32
        por     mm7, mm3
        movd    edx, mm7
        test    edx, edx
        jnz     saturated

        ;;
        ;; Accumulate non-zero flags
        por     mm5, mm2

        ;;
        ;; Now correct the sign mm4 = *psrc < 0
        ;;
        pxor    mm7, mm7
        psubw   mm7, mm2
        psllw   mm7, 1                  ; mm7 = -2*mm2
        pand    mm7, mm4                ; mm7 = -2*mm2 * (*psrc < 0)
        paddw   mm2, mm7                ; mm2 = samesign(*psrc, mm2)

        ;;
        ;;  Store the quantised words....
        ;;  (edi was already advanced, hence the -8)
        ;;
        movq    [edi-8], mm2
        test    eax, eax                ; flags from `sub eax, 1` were clobbered above
        jnz     near nextquadniq

        ;; Return saturation in low word and nzflag in high word of result dword
        movq    mm0, mm5
        psrlq   mm0, 32
        por     mm5, mm0
        movd    edx, mm5                ; edx = (w0|w2) | (w1|w3)<<16
        mov     ebx, edx
        shl     ebx, 16
        or      edx, ebx                ; high word = w0|w1|w2|w3
        and     edx, 0xffff0000         ;; high word is nzflag
        mov     eax, edx

return:
        pop     edi
        pop     esi
        pop     edx
        pop     ecx
        pop     ebx
        pop     ebp                     ; restore frame pointer
        emms                            ; clear mmx registers
        ret

        ;; Early exits: unconditional jumps back to the shared epilogue.
out_of_range:
        mov     eax, 0x00ff
        jmp     return
saturated:
        mov     eax, 0xff00
        jmp     return


;;;
;;;  void iquant_non_intra_m1_{sse,mmx}(int16_t *src, int16_t *dst, uint16_t
;;;                               *quant_mat)
;;; mmx/sse Inverse mpeg-1 quantisation routine.
;;;
;;; Processes 64 coefficients, four per iteration.
;;;
;;; eax - block counter...
;;; edi - src
;;; esi - dst
;;; edx - quant_mat

;; MMX Register usage
;; mm7 = [1|0..3]W
;; mm6 = [2047|0..3]W
;; mm5 = 0

global iquant_non_intra_m1_sse
align 32
iquant_non_intra_m1_sse:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    eax
        push    esi
        push    edi
        push    edx

        mov     edi, [ebp+8]            ; get psrc
        mov     esi, [ebp+12]           ; get pdst
        mov     edx, [ebp+16]           ; get quant table
        mov     eax, 1
        movd    mm7, eax
        punpcklwd mm7, mm7
        punpckldq mm7, mm7              ; mm7 = [1|0..3]W

        mov     eax, 2047
        movd    mm6, eax
        punpcklwd mm6, mm6
        punpckldq mm6, mm6              ; mm6 = [2047|0..3]W

        mov     eax, 64                 ; 64 coeffs in a DCT block
        pxor    mm5, mm5

iquant_loop_sse:
        movq    mm0, [edi]              ; mm0 = *psrc
        add     edi, 8
        pxor    mm1, mm1
        movq    mm2, mm0
        pcmpeqw mm2, mm1                ; mm2 = 1's for zero words in mm0
        pcmpeqw mm2, mm1                ; mm2 = 1's for non-zero words in mm0

        ;; Work with absolute value for convenience...
        psubw   mm1, mm0                ; mm1 = -*psrc
        pmaxsw  mm1, mm0                ; mm1 = val = max(*psrc,-*psrc) = abs(*psrc)
        paddw   mm1, mm1                ; mm1 *= 2;
        paddw   mm1, mm7                ; mm1 += 1
        pmullw  mm1, [edx]              ; mm1 = (val*2+1) * *quant_mat
        add     edx, 8
        psraw   mm1, 5                  ; mm1 = ((val*2+1) * *quant_mat)/32

        ;; Now that nasty mis-match control

        movq    mm3, mm1
        pand    mm3, mm7
        pxor    mm3, mm7                ; mm3 = ~(val&1) (in the low bits, others 0)
        movq    mm4, mm1
        pcmpeqw mm4, mm5                ; mm4 = (val == 0)
        pxor    mm4, mm7                ;  Low bits now (val != 0)
        pand    mm3, mm4                ; mm3 = (~(val&1))&(val!=0)

        psubw   mm1, mm3                ; mm1 -= (~(val&1))&(val!=0)
        pminsw  mm1, mm6                ; mm1 = saturated(res)

        ;; Handle zero case and restoring sign
        pand    mm1, mm2                ; Zero in the zero case
        pxor    mm3, mm3
        psubw   mm3, mm1                ;  mm3 = - res
        paddw   mm3, mm3                ;  mm3 = - 2*res
        pcmpgtw mm0, mm5                ;  mm0 = *psrc > 0
        pcmpeqw mm0, mm5                ;  mm0 = *psrc <= 0
        pand    mm3, mm0                ;  mm3 = *psrc <= 0 ? -2 * res : 0
        paddw   mm1, mm3                ;  mm1 = samesign(*psrc,res)
        movq    [esi], mm1

        add     esi, 8
        sub     eax, 4
        jnz     iquant_loop_sse

        pop     edx
        pop     edi
        pop     esi
        pop     eax

        pop     ebp                     ; restore frame pointer
        emms                            ; clear mmx registers
        ret


;;;
;;;  void iquant_non_intra_m1_mmx(int16_t *src, int16_t *dst, uint16_t
;;;                               *quant_mat)
;;;
;;; Plain-MMX variant of the routine above (no pmaxsw/pminsw).
;;;
;;; eax - block counter...
;;; edi - src
;;; esi - dst
;;; edx - quant_mat

;; MMX Register usage
;; mm7 = [1|0..3]W
;; mm6 = [MAX_UINT16-2047|0..3]W
;; mm5 = 0

global iquant_non_intra_m1_mmx
align 32
iquant_non_intra_m1_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    eax
        push    esi
        push    edi
        push    edx

        mov     edi, [ebp+8]            ; get psrc
        mov     esi, [ebp+12]           ; get pdst
        mov     edx, [ebp+16]           ; get quant table
        mov     eax, 1
        movd    mm7, eax
        punpcklwd mm7, mm7
        punpckldq mm7, mm7              ; mm7 = [1|0..3]W

        mov     eax, (0xffff-2047)
        movd    mm6, eax
        punpcklwd mm6, mm6
        punpckldq mm6, mm6              ; mm6 = [MAX_UINT16-2047|0..3]W

        mov     eax, 64                 ; 64 coeffs in a DCT block
        pxor    mm5, mm5

iquant_loop:
        movq    mm0, [edi]              ; mm0 = *psrc
        add     edi, 8
        pxor    mm1, mm1
        movq    mm2, mm0
        pcmpeqw mm2, mm5                ; mm2 = 1's for zero words in mm0
        pcmpeqw mm2, mm5                ; mm2 = 1's for non-zero words in mm0

        ;; Work with absolute value for convenience...

        psubw   mm1, mm0                ; mm1 = -*psrc
        psllw   mm1, 1                  ; mm1 = -2*psrc
        movq    mm3, mm0
        pcmpgtw mm3, mm5                ; mm3 = *psrc > 0
        pcmpeqw mm3, mm5                ; mm3 = *psrc <= 0
        pand    mm3, mm1                ; mm3 = (*psrc <= 0)*-2* *psrc
        movq    mm1, mm0
        paddw   mm1, mm3                ; mm1 = (*psrc <= 0)*-2* *psrc + *psrc = abs(*psrc)

        paddw   mm1, mm1                ; mm1 *= 2;
        paddw   mm1, mm7                ; mm1 += 1
        pmullw  mm1, [edx]              ; mm1 = (val*2+1) * *quant_mat
        add     edx, 8
        psraw   mm1, 5                  ; mm1 = ((val*2+1) * *quant_mat)/32

        ;; Now that nasty mis-match control

        movq    mm3, mm1
        pand    mm3, mm7
        pxor    mm3, mm7                ; mm3 = ~(val&1) (in the low bits, others 0)
        movq    mm4, mm1
        pcmpeqw mm4, mm5                ; mm4 = (val == 0)
        pxor    mm4, mm7                ;  Low bits now (val != 0)
        pand    mm3, mm4                ; mm3 = (~(val&1))&(val!=0)

        psubw   mm1, mm3                ; mm1 -= (~(val&1))&(val!=0)

        ;; Clamp to 2047 with an unsigned saturating add: anything above
        ;; 2047 (as an unsigned word) pins at 0xffff, and subtracting the
        ;; bias again leaves exactly 2047.  A signed add would treat the
        ;; 0xf800 bias as -2048 and never clamp.
        paddusw mm1, mm6                ; Will saturate if > 2047
        psubw   mm1, mm6                ; 2047 if saturated... unchanged otherwise

        ;; Handle zero case and restoring sign
        pand    mm1, mm2                ; Zero in the zero case
        pxor    mm3, mm3
        psubw   mm3, mm1                ;  mm3 = - res
        paddw   mm3, mm3                ;  mm3 = - 2*res
        pcmpgtw mm0, mm5                ;  mm0 = *psrc > 0
        pcmpeqw mm0, mm5                ;  mm0 = *psrc <= 0
        pand    mm3, mm0                ;  mm3 = *psrc <= 0 ? -2 * res : 0
        paddw   mm1, mm3                ;  mm1 = samesign(*psrc,res)
        movq    [esi], mm1

        add     esi, 8
        sub     eax, 4
        jnz     near iquant_loop

        pop     edx
        pop     edi
        pop     esi
        pop     eax

        pop     ebp                     ; restore frame pointer
        emms                            ; clear mmx registers
        ret


;;;  int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat)
;;;  Simply add up the sum of coefficients weighted
;;;  by their quantisation coefficients:
;;;      sum over 64 words of abs(src[i]) * i_quant_mat[i]
;;;  (signed 16x16 -> 32-bit products, accumulated in 32 bits)
;;;
;;; ecx - quad counter...
;;; edi - src
;;; esi - i_quant_mat
;;; eax - result

;; MMX Register usage
;; mm7 = 0
;; mm6 = two 32-bit partial sums (accumulator)

global quant_weight_coeff_sum_mmx
align 32
quant_weight_coeff_sum_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link

        push    ecx
        push    esi
        push    edi

        mov     edi, [ebp+8]            ; get psrc
        mov     esi, [ebp+12]           ; get piqm

        mov     ecx, 16                 ; 16 coefficient / quantiser quads to process...
        pxor    mm6, mm6                ; Accumulator
        pxor    mm7, mm7                ; Zero
quantsum:
        movq    mm0, [edi]
        movq    mm2, [esi]

        ;;
        ;;      Compute absolute value of coefficients...
        ;;
        movq    mm1, mm7
        pcmpgtw mm1, mm0                ; (mm0 < 0 )
        movq    mm3, mm0
        psllw   mm3, 1                  ; 2*mm0
        pand    mm3, mm1                ; 2*mm0 * (mm0 < 0)
        psubw   mm0, mm3                ; mm0 = abs(mm0)

        ;;
        ;; Compute the low and high words of the result....
        ;;
        movq    mm1, mm0
        pmullw  mm0, mm2                ; low words of the products
        add     edi, 8
        add     esi, 8
        pmulhw  mm1, mm2                ; high words of the products
        movq    mm3, mm0
        punpcklwd mm3, mm1              ; products 0,1 as dwords
        punpckhwd mm0, mm1              ; products 2,3 as dwords
        paddd   mm6, mm3
        paddd   mm6, mm0

        sub     ecx, 1
        jnz     quantsum                ; MMX ops above do not touch the flags

        movd    eax, mm6                ; fold the two partial sums
        psrlq   mm6, 32
        movd    ecx, mm6
        add     eax, ecx

        pop     edi
        pop     esi
        pop     ecx

        pop     ebp                     ; restore frame pointer
        emms                            ; clear mmx registers
        ret