Merge CV, ver=5.1; ops/methods from HV, and interface from CV where possible

[goodguy/history.git] / cinelerra-5.1 / mpeg2enc / quant_mmx.s
diff --git a/cinelerra-5.1/mpeg2enc/quant_mmx.s b/cinelerra-5.1/mpeg2enc/quant_mmx.s

new file mode 100644 (file)

index 0000000..c206918
--- /dev/null
+++ b/cinelerra-5.1/mpeg2enc/quant_mmx.s
@@ -0,0 +1,547 @@
+;
+;  Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+;  This program is free software; you can redistribute it and/or
+;  modify it under the terms of the GNU General Public License
+;  as published by the Free Software Foundation; either version 2
+;  of the License, or (at your option) any later version.
+;
+;  This program is distributed in the hope that it will be useful,
+;  but WITHOUT ANY WARRANTY; without even the implied warranty of
+;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;  GNU General Public License for more details.
+;
+;  You should have received a copy of the GNU General Public License
+;  along with this program; if not, write to the Free Software
+;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+;
+;
+;
+;  quantize_ni_mmx.s:  MMX optimized coefficient quantization sub-routine
+
+
+global quantize_ni_mmx
+; int quantize_ni_mmx(short *dst, short *src, 
+;                            short *quant_mat, short *i_quant_mat,
+;                     int imquant, int mquant, int sat_limit)
+
+;  See quantize.c: quant_non_intra_hv_inv()  for reference implementation in C...
+               ;;  mquant is not currently used.
+; eax = row counter...
+; ebx = pqm
+; ecx = piqm  ; Matrix of quads first (2^16/quant) 
+                         ; then (2^16/quant)*(2^16%quant) the second part is for rounding
+; edx = temp
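+; edi = pdst  (first argument; quantised words are stored through it)
+; esi = psrc  (second argument; coefficients are loaded through it)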
+
+; mm0 = [imquant|0..3]W
+; mm1 = [sat_limit|0..3]W
+; mm2 = *psrc -> src
+; mm3 = rounding corrections... / temp
+; mm4 = sign
+; mm5 = nzflag accumulators
+; mm6 = overflow limit
+; mm7 = temp
+
+               ;; 
+               ;;  private constants needed
+               ;; 
+
+SECTION .data
+align 16
+;; Four identical words (1023).  quantize_ni_mmx loads this quad into mm6
+;; and compares abs(*psrc) against it before doing its 16-bit arithmetic.
+overflim:      times 4 dw 1024-1
+
+                       ;; BUFFER NO LONGER USED DUE TO IMPROVED MAIN ROUTINE...
+SECTION .bss
+align 32
+quant_buf:     resw 64                 ; 64 words of zero-initialised scratch
+
+SECTION .text
+               
+
+;-----------------------------------------------------------------------
+; int quantize_ni_mmx(short *dst, short *src,
+;                     short *quant_mat, short *i_quant_mat,
+;                     int imquant, int mquant, int sat_limit)
+;
+; Quantises 16 quads (64 words) read from src and stores them to dst.
+; Arguments are read from the stack relative to ebp:
+;   [ebp+8]=dst  [ebp+12]=src  [ebp+16]=quant_mat  [ebp+20]=i_quant_mat
+;   [ebp+24]=imquant  [ebp+32]=sat_limit.   mquant ([ebp+28]) is never read.
+;
+; Result in eax:
+;   0x00ff     a word of abs(src) in the current quad compared greater
+;              than overflim (intermediate 16-bit arithmetic would overflow)
+;   0xff00     a quantised word compared greater than sat_limit
+;   otherwise  high word = OR of every quantised magnitude (non-zero flag),
+;              low word = 0
+; On either early exit, quads stored by earlier iterations remain in dst.
+;
+; Registers:
+;   eax = quads remaining      ebx = pqm       ecx = piqm     edx = temp
+;   edi = pdst                 esi = psrc
+;   mm0 = [imquant|0..3]W      mm1 = [sat_limit|0..3]W
+;   mm2 = working quad         mm3 = rounding correction / temp
+;   mm4 = sign mask            mm5 = non-zero flag accumulator
+;   mm6 = overflow limit       mm7 = temp
+; ebx, ecx, edx, esi, edi and ebp are saved and restored.
+;-----------------------------------------------------------------------
+align 32
+quantize_ni_mmx:
+       push    ebp                     ; save frame pointer
+       mov     ebp, esp                ; link
+       push    ebx
+       push    ecx
+       push    edx
+       push    esi
+       push    edi
+
+       mov     edi, [ebp+8]            ; edi = dst
+       mov     esi, [ebp+12]           ; esi = src
+       mov     ebx, [ebp+16]           ; ebx = pqm
+       mov     ecx, [ebp+20]           ; ecx = piqm
+       movd    mm0, [ebp+24]           ; imquant (2^16 / mquant)
+       movq    mm1, mm0
+       punpcklwd mm0, mm1
+       punpcklwd mm0, mm0              ; mm0 = [imquant|0..3]W
+
+       movq    mm6, [overflim]         ; overflow limit
+
+       movd    mm1, [ebp+32]           ; sat_limit
+       movq    mm2, mm1
+       punpcklwd mm1, mm2
+       punpcklwd mm1, mm1              ; mm1 = [sat_limit|0..3]W
+
+       pxor    mm5, mm5                ; non-zero flag accumulator
+       mov     eax, 16                 ; 16 quads to do
+       jmp     nextquadniq             ; skip the alignment padding
+
+align 32
+nextquadniq:
+       movq    mm2, [esi]              ; mm2 = *psrc
+
+       ;; abs(): subtract 2*x from x in the lanes where x < 0
+       pxor    mm4, mm4
+       pcmpgtw mm4, mm2                ; mm4 = (*psrc < 0) mask, kept for sign restore
+       movq    mm7, mm2                ; mm7 = *psrc
+       psllw   mm7, 1                  ; mm7 = 2*(*psrc)
+       pand    mm7, mm4                ; mm7 = 2*(*psrc) where *psrc < 0, else 0
+       psubw   mm2, mm7                ; mm2 = abs(*psrc)
+
+       ;;
+       ;;  Check whether we'll saturate intermediate results.
+       ;;  The two dword halves of the compare mask are OR'ed together
+       ;;  so that all four words are tested with one movd.
+       ;;
+       movq    mm7, mm2
+       pcmpgtw mm7, mm6                ; too big for 16 bit arithmetic (should be *very* rare)
+       movq    mm3, mm7
+       psrlq   mm3, 32
+       por     mm7, mm3
+       movd    edx, mm7
+       test    edx, edx
+       jnz     near out_of_range
+
+       ;;
+       ;; Carry on with the arithmetic...
+       psllw   mm2, 5                  ; mm2 = 32*abs(*psrc)
+       movq    mm7, [ebx]
+       psrlw   mm7, 1                  ; mm7 = (*pqm)>>1
+       paddw   mm2, mm7                ; mm2 = 32*abs(*psrc)+((*pqm)/2) = "p"
+
+       ;;
+       ;; First multiplication: only the top 16 bits of p * (*piqm) are
+       ;; wanted (pmulhw below).  The low 16 bits (pmullw) feed an
+       ;; approximate rounding correction: (low>>1) + (p>>1) is formed and
+       ;; its top bit, i.e. whether adding p to the low half would carry
+       ;; into the upper 16 bits, becomes a 0/1 adjustment.
+       ;; Per the original author this is *not* exact, but gives very few
+       ;; wrong results, none too low, and no errors for small coefficients.
+       ;;
+       movq    mm3, mm2
+       pmullw  mm3, [ecx]              ; low words of p * iqm
+       movq    mm7, mm2
+       psrlw   mm7, 1                  ; p>>1
+       psrlw   mm3, 1
+       paddw   mm3, mm7
+       psrlw   mm3, 15                 ; carry indicator in the lsb, rest 0's
+       pmulhw  mm2, [ecx]              ; mm2 = (p*iqm) >> 16  ~= p / *pqm
+
+       ;;
+       ;; To hide the latency let's update some pointers...
+       add     esi, 8                  ; 4 words
+       add     ecx, 8                  ; 4 words
+       sub     eax, 1
+
+       ;; Add rounding correction....
+       paddw   mm2, mm3
+
+       ;;
+       ;; Do the second multiplication; again we would need a rounding adjustment.
+       ;; EXPERIMENT:   see comments in quantize.c:quant_non_intra_hv don't adjust...
+;      movq    mm3, mm2
+;      pmullw  mm3, mm0
+;      movq    mm7, mm2
+;      psrlw   mm7, 1            ; Want to see if adding p would carry into upper 16 bits
+;      psrlw   mm3, 1
+;      paddw mm3, mm7
+;      psrlw   mm3, 15           ; High bit in lsb rest 0's
+
+       pmulhw  mm2, mm0                ; mm2 ~= p/(qm*mquant)
+
+       ;;
+       ;; To hide the latency let's update some more pointers...
+       add     edi, 8
+       add     ebx, 8
+
+       ;; Correct rounding and the factor of two (we want p/(qm*2*mquant))
+;      paddw mm2, mm3
+       psrlw   mm2, 1
+
+       ;;
+       ;; Check for saturation against sat_limit.  Fold the upper dword of
+       ;; the compare mask onto the lower one so all four words are tested.
+       ;; (A stray reload of mm3 used to discard the shifted half, leaving
+       ;; the upper two words unchecked.)
+       ;;
+       movq    mm7, mm2
+       pcmpgtw mm7, mm1
+       movq    mm3, mm7
+       psrlq   mm3, 32
+       por     mm7, mm3
+       movd    edx, mm7
+       test    edx, edx
+       jnz     saturated
+
+       ;;
+       ;;  Accumulate non-zero flags (magnitudes, before the sign is restored)
+       por     mm5, mm2
+
+       ;;
+       ;; Now correct the sign: mm4 = (*psrc < 0)
+       ;;
+       pxor    mm7, mm7
+       psubw   mm7, mm2
+       psllw   mm7, 1                  ; mm7 = -2*mm2
+       pand    mm7, mm4                ; mm7 = -2*mm2 where *psrc < 0, else 0
+       paddw   mm2, mm7                ; mm2 = samesign(*psrc, mm2)
+
+       ;;
+       ;;  Store the quantised words (edi was already advanced, hence -8)
+       ;;
+       movq    [edi-8], mm2
+       test    eax, eax
+       jnz     near nextquadniq
+
+       ;; Return nzflag in the high word of the result dword, low word zero.
+       movq    mm0, mm5
+       psrlq   mm0, 32
+       por     mm5, mm0                ; low dword = OR of both dword halves
+       movd    edx, mm5
+       mov     ebx, edx
+       shl     ebx, 16
+       or      edx, ebx                ; high word = OR of all four words
+       and     edx, 0xffff0000         ; keep only the nzflag high word
+       mov     eax, edx
+
+return:
+       pop     edi
+       pop     esi
+       pop     edx
+       pop     ecx
+       pop     ebx
+
+       pop     ebp                     ; restore frame pointer
+
+       emms                            ; clear mmx state
+       ret
+
+out_of_range:
+       mov     eax, 0x00ff
+       jmp     return                  ; unconditional: must not depend on PF
+saturated:
+       mov     eax, 0xff00
+       jmp     return                  ; unconditional: must not depend on PF
+
+
+
+
+;;;
+;;;  void iquant_non_intra_m1_sse(int16_t *src, int16_t *dst,
+;;;                               uint16_t *quant_mat)
+;;;
+;;; Inverse MPEG-1 quantisation of one 64-coefficient block, four words
+;;; per iteration.  This variant uses pmaxsw / pminsw for abs() and the
+;;; 2047 clamp.  Arguments are read from the stack relative to ebp.
+;;;
+;;; eax - coefficients remaining (saved and restored, so no return value)
+;;; edi - src
+;;; esi - dst
+;;; edx - quant_mat
+;;;
+;;; MMX register usage
+;;;   mm7 = [1|0..3]W
+;;;   mm6 = [2047|0..3]W
+;;;   mm5 = 0
+;;;
+;;; NOTE(review): the product (2*val+1) * *quant_mat is kept to its low 16
+;;; bits and shifted arithmetically, so products of 0x8000 or more come out
+;;; negative -- presumably callers keep inputs small enough; confirm.
+
+global iquant_non_intra_m1_sse
+align 32
+iquant_non_intra_m1_sse:
+               push    ebp                     ; save frame pointer
+               mov     ebp, esp                ; link
+
+               push    eax
+               push    esi
+               push    edi
+               push    edx
+
+               mov     edi, [ebp+8]            ; psrc
+               mov     esi, [ebp+12]           ; pdst
+               mov     edx, [ebp+16]           ; quant table
+
+               pxor    mm5, mm5                ; mm5 = 0
+
+               mov     eax, 1
+               movd    mm7, eax
+               punpcklwd mm7, mm7
+               punpckldq mm7, mm7              ; mm7 = [1|0..3]W
+
+               mov     eax, 2047
+               movd    mm6, eax
+               punpcklwd mm6, mm6
+               punpckldq mm6, mm6              ; mm6 = [2047|0..3]W
+
+               mov     eax, 64                 ; 64 coeffs in a DCT block
+
+.quad:
+               movq    mm0, [edi]              ; mm0 = *psrc
+               pxor    mm1, mm1
+               movq    mm2, mm0
+               pcmpeqw mm2, mm5                ; mm2 = (*psrc == 0)
+               pcmpeqw mm2, mm5                ; mm2 = (*psrc != 0) mask
+
+               ;; Work with the absolute value for convenience
+               psubw   mm1, mm0                ; mm1 = -*psrc
+               pmaxsw  mm1, mm0                ; mm1 = max(*psrc, -*psrc) = abs(*psrc)
+               paddw   mm1, mm1                ; mm1 = 2*val
+               paddw   mm1, mm7                ; mm1 = 2*val + 1
+               pmullw  mm1, [edx]              ; mm1 = (2*val+1) * *quant_mat (low 16 bits)
+               psraw   mm1, 5                  ; mm1 = ((2*val+1) * *quant_mat) / 32
+
+               ;; Mismatch control: subtract 1 from even, non-zero results
+               movq    mm3, mm1
+               pand    mm3, mm7
+               pxor    mm3, mm7                ; mm3 lsb = result is even, other bits 0
+               movq    mm4, mm1
+               pcmpeqw mm4, mm5                ; mm4 = (result == 0) mask
+               pxor    mm4, mm7                ; mm4 lsb = (result != 0)
+               pand    mm3, mm4                ; mm3 = 1 where even and non-zero, else 0
+
+               psubw   mm1, mm3
+               pminsw  mm1, mm6                ; signed clamp to at most 2047
+
+               ;; Zero the lanes whose source was zero, then restore the sign
+               pand    mm1, mm2
+               pxor    mm3, mm3
+               psubw   mm3, mm1                ; mm3 = -res
+               paddw   mm3, mm3                ; mm3 = -2*res
+               pcmpgtw mm0, mm5                ; mm0 = (*psrc > 0)
+               pcmpeqw mm0, mm5                ; mm0 = (*psrc <= 0)
+               pand    mm3, mm0                ; mm3 = *psrc <= 0 ? -2*res : 0
+               paddw   mm1, mm3                ; mm1 = samesign(*psrc, res)
+               movq    [esi], mm1
+
+               ;; Advance all three pointers by one quad (4 words)
+               add     edi, 8
+               add     edx, 8
+               add     esi, 8
+
+               sub     eax, 4
+               jnz     .quad
+
+               pop     edx
+               pop     edi
+               pop     esi
+               pop     eax
+
+               pop     ebp                     ; restore frame pointer
+
+               emms                            ; clear mmx state
+               ret
+
+
+;;;
+;;;  void iquant_non_intra_m1_mmx(int16_t *src, int16_t *dst, uint16_t
+;;;                               *quant_mat)
+;;;
+;;; Inverse MPEG-1 quantisation of one 64-coefficient block, four words
+;;; per iteration, using plain MMX instructions only.  Arguments are read
+;;; from the stack relative to ebp.
+;;;
+;;; eax - coefficients remaining (saved and restored, so no return value)
+;;; edi - src
+;;; esi - dst
+;;; edx - quant_mat
+
+               ;; MMX Register usage
+               ;; mm7 = [1|0..3]W
+               ;; mm6 = [MAX_UINT16-2047|0..3]W  (bias for the unsigned clamp)
+               ;; mm5 = 0
+
+
+global iquant_non_intra_m1_mmx
+align 32
+iquant_non_intra_m1_mmx:
+
+               push ebp                                ; save frame pointer
+               mov ebp, esp            ; link
+
+               push eax
+               push esi
+               push edi
+               push edx
+
+               mov             edi, [ebp+8]                    ; get psrc
+               mov             esi, [ebp+12]                   ; get pdst
+               mov             edx, [ebp+16]                   ; get quant table
+               mov             eax,1
+               movd    mm7, eax
+               punpcklwd       mm7, mm7
+               punpckldq       mm7, mm7        ; mm7 = [1|0..3]W
+
+               mov     eax, (0xffff-2047)
+               movd    mm6, eax
+               punpcklwd               mm6, mm6
+               punpckldq               mm6, mm6        ; mm6 = [0xffff-2047|0..3]W
+
+               mov             eax, 64                 ; 64 coeffs in a DCT block
+               pxor    mm5, mm5
+
+iquant_loop:
+               movq    mm0, [edi]      ; mm0 = *psrc
+               add             edi,8
+               pxor    mm1, mm1
+               movq    mm2, mm0
+               pcmpeqw mm2, mm5                ; mm2 = (*psrc == 0)
+               pcmpeqw mm2, mm5                ; mm2 = 1's for non-zero in mm0
+
+               ;; Work with absolute value for convenience...
+
+               psubw   mm1, mm0        ; mm1 = -*psrc
+               psllw   mm1, 1                  ; mm1 = -2* *psrc
+               movq    mm3, mm0
+               pcmpgtw mm3, mm5                ; mm3 = *psrc > 0
+               pcmpeqw mm3, mm5        ; mm3 = *psrc <= 0
+               pand    mm3, mm1                ; mm3 = (*psrc <= 0)*-2* *psrc
+               movq    mm1, mm0
+               paddw   mm1, mm3        ; mm1 = (*psrc <= 0)*-2* *psrc + *psrc = abs(*psrc)
+
+
+               paddw   mm1, mm1                ; mm1 *= 2;
+               paddw   mm1, mm7                ; mm1 += 1
+               pmullw  mm1, [edx]              ; mm1 = (val*2+1) * *quant_mat (low 16 bits)
+               add             edx, 8
+               psraw   mm1, 5                  ; mm1 = ((val*2+1) * *quant_mat)/32
+
+               ;; Now that nasty mis-match control
+
+               movq    mm3, mm1
+               pand    mm3, mm7
+               pxor    mm3, mm7                ; mm3 = ~(val&1) (in the low bits, others 0)
+               movq    mm4, mm1
+               pcmpeqw mm4, mm5                ; mm4 = (val == 0)
+               pxor    mm4, mm7                ;  Low bits now (val != 0)
+               pand    mm3, mm4                ; mm3 =  (~(val&1))&(val!=0)
+
+               psubw   mm1, mm3                ; mm1 -= (~(val&1))&(val!=0)
+
+               ;; Clamp to 2047.  The bias is 0xffff-2047, so the add must use
+               ;; UNSIGNED saturation: any word above 2047 pins at 0xffff and
+               ;; comes back as exactly 2047 after the subtract.  (A signed
+               ;; saturating add sees the bias as -2048 and never clamps.)
+               ;; Words that went negative in the arithmetic shift above are
+               ;; large when viewed as unsigned and therefore also clamp.
+               paddusw mm1, mm6                ; Will saturate if > 2047
+               psubw   mm1, mm6                ; 2047 if saturated... unchanged otherwise
+
+               ;; Handle zero case and restoring sign
+               pand    mm1, mm2                ; Zero in the zero case
+               pxor    mm3, mm3
+               psubw   mm3, mm1                ;  mm3 = - res
+               paddw   mm3, mm3                ;  mm3 = - 2*res
+               pcmpgtw mm0, mm5                ;  mm0 = *psrc > 0
+               pcmpeqw mm0, mm5                ;  mm0 = *psrc <= 0
+               pand    mm3, mm0                ;  mm3 = *psrc <= 0 ? -2 * res :         0
+               paddw   mm1, mm3                ;  mm1 = samesign(*psrc,res)
+               movq    [esi], mm1
+               add             esi,8
+
+               sub             eax, 4
+               jnz             near iquant_loop
+
+               pop     edx
+               pop edi
+               pop esi
+               pop eax
+
+               pop ebp                 ; restore frame pointer
+
+               emms                    ; clear mmx registers
+               ret
+                                               
+
+
+;;;  int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat)
+;;;
+;;; Adds up abs(src[i]) * i_quant_mat[i] over 64 words (16 quads) and
+;;; returns the total in eax.  Arguments are read from the stack relative
+;;; to ebp.
+;;;
+;;; ecx - quads remaining (saved and restored)
+;;; edi - src
+;;; esi - i_quant_mat
+;;;
+;;; MMX register usage
+;;;   mm7 = 0
+;;;   mm6 = two 32-bit partial sums, added together at the end
+;;;
+;;; NOTE(review): pmulhw is a signed multiply, so the 32-bit products are
+;;; only the intended values if i_quant_mat entries are below 0x8000 --
+;;; presumably guaranteed by the table builder; confirm.
+
+global quant_weight_coeff_sum_mmx
+align 32
+quant_weight_coeff_sum_mmx:
+       push    ebp                     ; save frame pointer
+       mov     ebp, esp                ; link
+
+       push    ecx
+       push    esi
+       push    edi
+
+       mov     edi, [ebp+8]            ; edi = src
+       mov     esi, [ebp+12]           ; esi = piqm
+
+       pxor    mm7, mm7                ; constant zero
+       pxor    mm6, mm6                ; accumulator
+       mov     ecx, 16                 ; 16 coefficient / quantiser quads to process
+
+.next_quad:
+       movq    mm0, [edi]              ; four coefficients
+       movq    mm2, [esi]              ; four weights
+       add     edi, 8
+       add     esi, 8
+
+       ;;
+       ;; abs(): subtract 2*x from x in the lanes where x < 0
+       ;;
+       movq    mm1, mm7
+       pcmpgtw mm1, mm0                ; mm1 = (coeff < 0) mask
+       movq    mm3, mm0
+       psllw   mm3, 1                  ; 2*coeff
+       pand    mm3, mm1                ; 2*coeff where coeff < 0, else 0
+       psubw   mm0, mm3                ; mm0 = abs(coeff)
+
+       ;;
+       ;; Low and high 16-bit halves of each product
+       ;;
+       movq    mm1, mm0
+       pmullw  mm0, mm2                ; low words
+       pmulhw  mm1, mm2                ; high words (signed)
+
+       ;;
+       ;; Interleave low/high halves into four dwords and accumulate
+       ;;
+       movq    mm3, mm0
+       punpcklwd mm3, mm1              ; products 0 and 1 as dwords
+       punpckhwd mm0, mm1              ; products 2 and 3 as dwords
+       paddd   mm6, mm3
+       paddd   mm6, mm0
+
+       sub     ecx, 1
+       jnz     .next_quad
+
+       ;; Fold the two dword lanes into the return value
+       movd    eax, mm6
+       psrlq   mm6, 32
+       movd    ecx, mm6
+       add     eax, ecx
+
+       pop     edi
+       pop     esi
+       pop     ecx
+
+       pop     ebp                     ; restore frame pointer
+
+       emms                            ; clear mmx state
+       ret
+
+