2 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
5 ; This program is free software; you can redistribute it and/or
6 ; modify it under the terms of the GNU General Public License
7 ; as published by the Free Software Foundation; either version 2
8 ; of the License, or (at your option) any later version.
10 ; This program is distributed in the hope that it will be useful,
11 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ; GNU General Public License for more details.
15 ; You should have received a copy of the GNU General Public License
16 ; along with this program; if not, write to the Free Software
17 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ; quantize_ni_mmx.s: MMX optimized coefficient quantization sub-routine
24 global quantize_ni_mmx
25 ; int quantize_ni_mmx(short *dst, short *src,
26 ; short *quant_mat, short *i_quant_mat,
27 ; int imquant, int mquant, int sat_limit)
; Quantises coefficients four 16-bit words (one "quad") at a time using MMX.
; Arguments are read relative to ebp: dst=[ebp+8], src=[ebp+12],
; quant_mat=[ebp+16], i_quant_mat=[ebp+20], imquant=[ebp+24],
; sat_limit=[ebp+32] (mquant, the slot in between, is never loaded here).
29 ; See quantize.c: quant_non_intra_hv_inv() for reference implementation in C...
30 ;; mquant is not currently used.
31 ; eax = quad counter (starts at 16)...
33 ; ecx = piqm ; Matrix of quads first (2^16/quant)
34 ; then (2^16/quant)*(2^16%quant) the second part is for rounding
39 ; mm0 = [imquant|0..3]W
40 ; mm1 = [sat_limit|0..3]W
42 ; mm3 = rounding corrections... / temp
44 ; mm5 = nzflag accumulators
45 ; mm6 = overflow limit
49 ;; private constants needed
60 ;; BUFFER NO LONGER USED DUE TO IMPROVED MAIN ROUTINE...
70 push ebp ; save frame pointer
78 mov edi, [ebp+8] ; get dst
79 mov esi, [ebp+12] ; get psrc
80 mov ebx, [ebp+16] ; get pqm
81 mov ecx, [ebp+20] ; get piqm
82 movd mm0, [ebp+24] ; get imquant (2^16 / mquant )
85 punpcklwd mm0, mm0 ; mm0 = [imquant|0..3]W
87 movq mm6, [overflim]; overflow limit
89 movd mm1, [ebp+32] ; sat_limit
91 punpcklwd mm1, mm2 ; [sat_limit|0..3]W
92 punpcklwd mm1, mm1 ; mm1 = [sat_limit|0..3]W
94 pxor mm5, mm5 ; Non-zero flag accumulator
95 mov eax, 16 ; 16 quads to do
100 movq mm2, [esi] ; mm2 = *psrc
103 pcmpgtw mm4, mm2 ; mm4 = *psrc < 0
104 movq mm7, mm2 ; mm7 = *psrc
105 psllw mm7, 1 ; mm7 = 2*(*psrc)
106 pand mm7, mm4 ; mm7 = 2*(*psrc)*(*psrc < 0)
107 psubw mm2, mm7 ; mm2 = abs(*psrc)
110 ;; Check whether we'll saturate intermediate results
111 ;; Eventually flag is low 8 bits of result
115 pcmpgtw mm7, mm6 ; Too big for 16 bit arithmetic :-( (should be *very* rare)
121 jnz near out_of_range
124 ;; Carry on with the arithmetic...
125 psllw mm2, 5 ; mm2 = 32*abs(*psrc)
126 movq mm7, [ebx] ; mm7 = *pqm>>1
128 paddw mm2, mm7 ; mm2 = 32*abs(*psrc)+((*pqm)/2) = "p"
132 ;; Do the first multiplication. Cunningly we've set things up so
133 ;; it is exactly the top 16 bits we're interested in...
135 ;; We need the low word results for a rounding correction.
136 ;; This is *not* exact (the actual
137 ;; correction is the product abs(*psrc)*(*pqm)*(2^16%*qm) >> 16).
138 ;; However we get very very few wrong and none too low (the most
139 ;; important) and no errors for small coefficients (also important)
140 ;; if we simply add abs(*psrc)
146 psrlw mm7, 1 ; Want to see if adding p would carry into upper 16 bits
149 psrlw mm3, 15 ; High bit in lsb rest 0's
150 pmulhw mm2, [ecx] ; mm2 = (p*iqm+p) >> IQUANT_SCALE_POW2 ~= p/*qm
155 ;; To hide the latency let's update some pointers...
156 add esi, 8 ; advance psrc by 4 words
157 add ecx, 8 ; advance piqm by 4 words
160 ;; Add rounding correction....
165 ;; Do the second multiplication, again we need to make a rounding adjustment
166 ;; EXPERIMENT: see comments in quantize.c:quant_non_intra_hv don't adjust...
170 ; psrlw mm7, 1 ; Want to see if adding p would carry into upper 16 bits
173 ; psrlw mm3, 15 ; High bit in lsb rest 0's
175 pmulhw mm2, mm0 ; mm2 ~= (p/(qm*mquant))
178 ;; To hide the latency let's update some more pointers...
182 ;; Correct rounding and the factor of two (we want p/(qm*2*mquant))
188 ;; Check for saturation
201 ;; Accumulate non-zero flags
205 ;; Now correct the sign mm4 = *psrc < 0
208 pxor mm7, mm7 ; mm7 = 0 (first step of forming -2*mm2)
211 pand mm7, mm4 ; mm7 = -2*mm2 * (*psrc < 0)
212 paddw mm2, mm7 ; mm2 = samesign(*psrc, mm2 )
215 ;; Store the quantised words....
223 ;; Return saturation in low word and nzflag in high word of result dword
233 and edx, 0xffff0000 ;; keep only the high word of edx (the nzflag half of the result)
243 pop ebp ; restore frame pointer
245 emms ; empty the MMX state so x87 floating point can be used again
260 ;;; void iquant_non_intra_m1_{sse,mmx}(int16_t *src, int16_t *dst, uint16_t
262 ;;; mmx/sse Inverse mpeg-1 quantisation routine.
;;; This variant uses pmaxsw/pminsw for the absolute value and the clamp;
;;; the _mmx variant performs the same steps with compare/mask and
;;; saturating-add sequences instead.
264 ;;; eax - coefficient counter (starts at 64)...
269 ;; MMX Register usage
271 ;; mm6 = [2047|0..3]W
275 global iquant_non_intra_m1_sse
277 iquant_non_intra_m1_sse:
279 push ebp ; save frame pointer
287 mov edi, [ebp+8] ; get psrc
288 mov esi, [ebp+12] ; get pdst
289 mov edx, [ebp+16] ; get quant table
300 mov eax, 64 ; 64 coeffs in a DCT block
304 movq mm0, [edi] ; mm0 = *psrc
308 pcmpeqw mm2, mm1 ; mm2 = 1's for non-zero in mm0
311 ;; Work with absolute value for convenience...
312 psubw mm1, mm0 ; mm1 = -*psrc
313 pmaxsw mm1, mm0 ; mm1 = val = max(*psrc,-*psrc) = abs(*psrc)
314 paddw mm1, mm1 ; mm1 *= 2;
315 paddw mm1, mm7 ; mm1 += 1
316 pmullw mm1, [edx] ; mm1 = (val*2+1) * *quant_mat
318 psraw mm1, 5 ; mm1 = ((val*2+1) * *quant_mat)/32
320 ;; Now that nasty mis-match control
324 pxor mm3, mm7 ; mm3 = ~(val&1) (in the low bits, others 0)
326 pcmpeqw mm4, mm5 ; mm4 = (val == 0)
327 pxor mm4, mm7 ; Low bits now (val != 0)
328 pand mm3, mm4 ; mm3 = (~(val&1))&(val!=0)
330 psubw mm1, mm3 ; mm1 -= (~(val&1))&(val!=0)
331 pminsw mm1, mm6 ; mm1 = saturated(res)
333 ;; Handle zero case and restoring sign
334 pand mm1, mm2 ; Zero in the zero case
336 psubw mm3, mm1 ; mm3 = - res
337 paddw mm3, mm3 ; mm3 = - 2*res
338 pcmpgtw mm0, mm5 ; mm0 = *psrc > 0 (mm5 is used as zero, as in the val == 0 test above)
339 pcmpeqw mm0, mm5 ; mm0 = *psrc <= 0 (inverts the mask from the previous compare)
340 pand mm3, mm0 ; mm3 = *psrc <= 0 ? -2 * res : 0
341 paddw mm1, mm3 ; mm1 = samesign(*psrc,res)
353 pop ebp ; restore frame pointer
355 emms ; empty the MMX state so x87 floating point can be used again
360 ;;; void iquant_non_intra_m1_mmx(int16_t *src, int16_t *dst, uint16_t
;;; Inverse quantisation using only compare/mask and saturating-add
;;; sequences for the absolute value and the clamp (no pmaxsw/pminsw).
362 ;;; eax - coefficient counter (starts at 64)...
367 ;; MMX Register usage
369 ;; mm6 = [MAX_UINT16-2047|0..3]W
373 global iquant_non_intra_m1_mmx
375 iquant_non_intra_m1_mmx:
377 push ebp ; save frame pointer
385 mov edi, [ebp+8] ; get psrc
386 mov esi, [ebp+12] ; get pdst
387 mov edx, [ebp+16] ; get quant table
393 mov eax, (0xffff-2047) ; saturation constant (see mm6 in the register usage notes)
398 mov eax, 64 ; 64 coeffs in a DCT block
402 movq mm0, [edi] ; mm0 = *psrc
406 pcmpeqw mm2, mm5 ; mm2 = 1's for non-zero in mm0
409 ;; Work with absolute value for convenience...
411 psubw mm1, mm0 ; mm1 = -*psrc
412 psllw mm1, 1 ; mm1 = -2*psrc
413 movq mm3, mm0 ; mm3 = *psrc > 0
415 pcmpeqw mm3, mm5 ; mm3 = *psrc <= 0
416 pand mm3, mm1 ; mm3 = (*psrc <= 0)*-2* *psrc
417 movq mm1, mm0 ; mm1 = (*psrc <= 0)*-2* *psrc + *psrc = abs(*psrc)
421 paddw mm1, mm1 ; mm1 *= 2;
422 paddw mm1, mm7 ; mm1 += 1
423 pmullw mm1, [edx] ; mm1 = (val*2+1) * *quant_mat
425 psraw mm1, 5 ; mm1 = ((val*2+1) * *quant_mat)/32
427 ;; Now that nasty mis-match control
431 pxor mm3, mm7 ; mm3 = ~(val&1) (in the low bits, others 0)
433 pcmpeqw mm4, mm5 ; mm4 = (val == 0)
434 pxor mm4, mm7 ; Low bits now (val != 0)
435 pand mm3, mm4 ; mm3 = (~(val&1))&(val!=0)
437 psubw mm1, mm3 ; mm1 -= (~(val&1))&(val!=0)
; NOTE(review): paddsw is a *signed* saturating add, and 0xffff-2047 (0xF800)
; is negative when read as a signed word. Assuming mm6 holds that constant,
; confirm this pair really clamps at 2047 - an unsigned saturating add, or
; the constant 0x7fff-2047, may have been intended.
439 paddsw mm1, mm6 ; Will saturate if > 2047
440 psubw mm1, mm6 ; 2047 if saturated... unchanged otherwise
442 ;; Handle zero case and restoring sign
443 pand mm1, mm2 ; Zero in the zero case
445 psubw mm3, mm1 ; mm3 = - res
446 paddw mm3, mm3 ; mm3 = - 2*res
447 pcmpgtw mm0, mm5 ; mm0 = *psrc > 0 (mm5 is used as zero, as in the val == 0 test above)
448 pcmpeqw mm0, mm5 ; mm0 = *psrc <= 0 (inverts the mask from the previous compare)
449 pand mm3, mm0 ; mm3 = *psrc <= 0 ? -2 * res : 0
450 paddw mm1, mm3 ; mm1 = samesign(*psrc,res)
462 pop ebp ; restore frame pointer
464 emms ; empty the MMX state so x87 floating point can be used again
469 ;;; int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat
470 ;;; Simply add up the sum of coefficients weighted
471 ;;; by their quantisation coefficients
473 ;;; ecx - quad counter (starts at 16)...
478 ;; MMX Register usage
480 ;; mm6 = accumulator (zeroed before the loop)
483 global quant_weight_coeff_sum_mmx
485 quant_weight_coeff_sum_mmx:
486 push ebp ; save frame pointer
493 mov edi, [ebp+8] ; get src (coefficients)
494 mov esi, [ebp+12] ; get piqm
496 mov ecx, 16 ; 16 coefficient / quantiser quads to process...
497 pxor mm6, mm6 ; Accumulator
504 ;; Compute absolute value of coefficients...
507 pcmpgtw mm1, mm0 ; (mm0 < 0 )
510 pand mm3, mm1 ; 2*mm0 * (mm0 < 0)
511 psubw mm0, mm3 ; mm0 = abs(mm0)
515 ;; Compute the low and high words of the result....
542 pop ebp ; restore frame pointer
544 emms ; empty the MMX state so x87 floating point can be used again