2 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
5 ; This program is free software; you can redistribute it and/or
6 ; modify it under the terms of the GNU General Public License
7 ; as published by the Free Software Foundation; either version 2
8 ; of the License, or (at your option) any later version.
10 ; This program is distributed in the hope that it will be useful,
11 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
12 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 ; GNU General Public License for more details.
15 ; You should have received a copy of the GNU General Public License
16 ; along with this program; if not, write to the Free Software
17 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ; quantize_ni_mmx.s: MMX optimized coefficient quantization sub-routine
24 global quantize_ni_mmx
25 ; int quantize_ni_mmx(short *dst, short *src,
26 ; short *quant_mat, short *i_quant_mat,
27 ; int imquant, int mquant, int sat_limit)
; The arguments are read from the stack at [ebp+8], +12, +16, +20, +24 and
; +32, i.e. in the order declared above; the slot for mquant ([ebp+28]) is
; never loaded by the code below.
29 ; See quantize.c: quant_non_intra_hv_inv() for reference implementation in C...
30 ;; mquant is not currently used.
31 ; eax = quad counter (loaded with 16 below; one quad = 4 words = 8 bytes)
33 ; ecx = piqm ; Matrix of quads first (2^16/quant)
34 ; then (2^16/quant)*(2^16%quant) the second part is for rounding
39 ; mm0 = [imquant|0..3]W
40 ; mm1 = [sat_limit|0..3]W
42 ; mm3 = rounding corrections... / temp
44 ; mm5 = nzflag accumulators
45 ; mm6 = overflow limit
49 ;; private constants needed
60 ;; BUFFER NO LONGER USED DUE TO IMPROVED MAIN ROUTINE...
68 push ebp ; save caller's frame pointer
76 mov edi, [ebp+8] ; get dst
77 mov esi, [ebp+12] ; get psrc
78 mov ebx, [ebp+16] ; get pqm
79 mov ecx, [ebp+20] ; get piqm
80 movd mm0, [ebp+24] ; get imquant (2^16 / mquant )
83 punpcklwd mm0, mm0 ; mm0 = [imquant|0..3]W
85 movq mm6, [overflim]; overflow limit (constant defined outside this routine)
87 movd mm1, [ebp+32] ; sat_limit
89 punpcklwd mm1, mm2 ; interleave low words of mm1 and mm2 (step towards the broadcast below)
90 punpcklwd mm1, mm1 ; mm1 = [sat_limit|0..3]W
92 pxor mm5, mm5 ; Non-zero flag accumulator
93 mov eax, 16 ; 16 quads to do
98 movq mm2, [esi] ; mm2 = *psrc
101 pcmpgtw mm4, mm2 ; mm4 = *psrc < 0 (requires mm4 == 0 before this compare)
102 movq mm7, mm2 ; mm7 = *psrc
103 psllw mm7, 1 ; mm7 = 2*(*psrc)
104 pand mm7, mm4 ; mm7 = 2*(*psrc)*(*psrc < 0)
105 psubw mm2, mm7 ; mm2 = abs(*psrc)
108 ;; Check whether we'll saturate intermediate results
109 ;; Eventually flag is low 8 bits of result
113 pcmpgtw mm7, mm6 ; Too big for 16 bit arithmetic :-( (should be *very* rare)
119 jnz near out_of_range ; MMX compares do not set EFLAGS, so this tests the result of an integer instruction
122 ;; Carry on with the arithmetic...
123 psllw mm2, 5 ; mm2 = 32*abs(*psrc)
124 movq mm7, [ebx] ; mm7 = *pqm (plain load; the >>1 is not performed by this instruction)
126 paddw mm2, mm7 ; mm2 = 32*abs(*psrc)+((*pqm)/2) = "p"
130 ;; Do the first multiplication. Cunningly we've set things up so
131 ;; it is exactly the top 16 bits we're interested in...
133 ;; We need the low word results for a rounding correction.
134 ;; This is *not* exact (the actual
135 ;; correction is the product abs(*psrc)*(*pqm)*(2^16%*qm) >> 16).
136 ;; However we get very very few wrong and none too low (the most
137 ;; important) and no errors for small coefficients (also important)
138 ;; if we simply add abs(*psrc)
144 psrlw mm7, 1 ; Want to see if adding p would carry into upper 16 bits
147 psrlw mm3, 15 ; High bit in lsb rest 0's
148 pmulhw mm2, [ecx] ; mm2 = (p*iqm+p) >> IQUANT_SCALE_POW2 ~= p/*qm (pmulhw keeps the high 16 bits of each signed product)
153 ;; To hide the latency lets update some pointers...
154 add esi, 8 ; advance psrc by 4 words
155 add ecx, 8 ; advance piqm by 4 words
158 ;; Add rounding correction....
163 ;; Do the second multiplication, again we need to make a rounding adjustment
164 ;; EXPERIMENT: see comments in quantize.c:quant_non_intra_hv don't adjust...
168 ; psrlw mm7, 1 ; Want to see if adding p would carry into upper 16 bits
171 ; psrlw mm3, 15 ; High bit in lsb rest 0's
173 pmulhw mm2, mm0 ; mm2 ~= (p/(qm*mquant))
176 ;; To hide the latency lets update some more pointers...
180 ;; Correct rounding and the factor of two (we want p/(qm*2*mquant))
186 ;; Check for saturation
199 ;; Accumulate non-zero flags
203 ;; Now correct the sign mm4 = *psrc < 0
206 pxor mm7, mm7 ; mm7 = 0 (starting point for building -2*mm2)
209 pand mm7, mm4 ; mm7 = -2*mm2 * (*psrc < 0)
210 paddw mm2, mm7 ; mm2 = samesign(*psrc, mm2 )
213 ;; Store the quantised words....
221 ;; Return saturation in low word and nzflag in high word of result dword
231 and edx, 0xffff0000 ;; keep only the high word of edx (the nzflag half, per the note above)
241 pop ebp ; restore caller's frame pointer
243 emms ; leave MMX state (marks the x87 tag word empty) so FPU code can follow
258 ;;; void iquant_non_intra_m1_{sse,mmx}(int16_t *src, int16_t *dst, uint16_t
260 ;;; mmx/sse Inverse mpeg-1 quantisation routine.
262 ;;; eax - coefficient counter (loaded with 64 below)
267 ;; MMX Register usage
269 ;; mm6 = [2047|0..3]W
273 global iquant_non_intra_m1_sse
275 iquant_non_intra_m1_sse:
;; This variant relies on pmaxsw/pminsw (integer instructions added with SSE)
;; for the absolute value and for the clamp to 2047.
277 push ebp ; save caller's frame pointer
285 mov edi, [ebp+8] ; get psrc
286 mov esi, [ebp+12] ; get pdst
287 mov edx, [ebp+16] ; get quant table
298 mov eax, 64 ; 64 coeffs in a DCT block
302 movq mm0, [edi] ; mm0 = *psrc
306 pcmpeqw mm2, mm1 ; mm2 = 1's for non-zero in mm0 (per original note; pcmpeqw itself flags EQUAL words, so this depends on the prior mm1/mm2 setup)
309 ;; Work with absolute value for convenience...
310 psubw mm1, mm0 ; mm1 = -*psrc (requires mm1 == 0 beforehand)
311 pmaxsw mm1, mm0 ; mm1 = val = max(*psrc,-*psrc) = abs(*psrc)
312 paddw mm1, mm1 ; mm1 *= 2;
313 paddw mm1, mm7 ; mm1 += 1 (mm7 holds 1 in each word)
314 pmullw mm1, [edx] ; mm1 = (val*2+1) * *quant_mat (low 16 bits of each product)
316 psraw mm1, 5 ; mm1 = ((val*2+1) * *quant_mat)/32
318 ;; Now that nasty mis-match control
322 pxor mm3, mm7 ; mm3 = ~(val&1) (in the low bits, others 0)
324 pcmpeqw mm4, mm5 ; mm4 = (val == 0)
325 pxor mm4, mm7 ; Low bits now (val != 0)
326 pand mm3, mm4 ; mm3 = (~(val&1))&(val!=0)
328 psubw mm1, mm3 ; mm1 -= (~(val&1))&(val!=0)
329 pminsw mm1, mm6 ; mm1 = min(res, 2047), i.e. saturated(res)
331 ;; Handle zero case and restoring sign
332 pand mm1, mm2 ; Zero in the zero case
334 psubw mm3, mm1 ; mm3 = - res (requires mm3 == 0 beforehand)
335 paddw mm3, mm3 ; mm3 = - 2*res
336 pcmpgtw mm0, mm5 ; mm0 = *psrc > 0 (mm5 is the all-zero register)
337 pcmpeqw mm0, mm5 ; mm0 = !(*psrc > 0) = *psrc <= 0
338 pand mm3, mm0 ; mm3 = *psrc <= 0 ? -2 * res : 0
339 paddw mm1, mm3 ; mm1 = samesign(*psrc,res)
351 pop ebp ; restore caller's frame pointer
353 emms ; leave MMX state (marks the x87 tag word empty) so FPU code can follow
358 ;;; void iquant_non_intra_m1_mmx(int16_t *src, int16_t *dst, uint16_t
360 ;;; eax - coefficient counter (loaded with 64 below)
365 ;; MMX Register usage
367 ;; mm6 = [MAX_INT16-2047|0..3]W
371 global iquant_non_intra_m1_mmx
373 iquant_non_intra_m1_mmx:
;; Plain-MMX variant: abs() and the clamp to 2047 are built from basic MMX
;; arithmetic instead of pmaxsw/pminsw.
375 push ebp ; save caller's frame pointer
383 mov edi, [ebp+8] ; get psrc
384 mov esi, [ebp+12] ; get pdst
385 mov edx, [ebp+16] ; get quant table
;; Bias for the clamp further down.  paddsw saturates *signed* at 0x7fff, so
;; the bias has to be 0x7fff-2047: the add then saturates exactly when the
;; value exceeds 2047.  (The previous 0xffff-2047 is -2048 as a signed word
;; and never triggers the saturation.)
;; NOTE(review): assumes this value is what gets broadcast into mm6, as the
;; register notes above state - confirm against the mm6 setup code.
391 mov eax, (0x7fff-2047)
396 mov eax, 64 ; 64 coeffs in a DCT block
400 movq mm0, [edi] ; mm0 = *psrc
404 pcmpeqw mm2, mm5 ; mm2 = 1's for non-zero in mm0 (per original note; pcmpeqw itself flags EQUAL words, so this depends on the prior mm2 setup)
407 ;; Work with absolute value for convenience...
409 psubw mm1, mm0 ; mm1 = -*psrc (requires mm1 == 0 beforehand)
410 psllw mm1, 1 ; mm1 = -2*psrc
411 movq mm3, mm0 ; mm3 = *psrc (copy; presumably turned into the (*psrc > 0) mask before the next compare)
413 pcmpeqw mm3, mm5 ; mm3 = *psrc <= 0
414 pand mm3, mm1 ; mm3 = (*psrc <= 0)*-2* *psrc
415 movq mm1, mm0 ; mm1 = *psrc; adding mm3 to it yields (*psrc <= 0)*-2* *psrc + *psrc = abs(*psrc)
419 paddw mm1, mm1 ; mm1 *= 2;
420 paddw mm1, mm7 ; mm1 += 1 (mm7 holds 1 in each word)
421 pmullw mm1, [edx] ; mm1 = (val*2+1) * *quant_mat (low 16 bits of each product)
423 psraw mm1, 5 ; mm1 = ((val*2+1) * *quant_mat)/32
425 ;; Now that nasty mis-match control
429 pxor mm3, mm7 ; mm3 = ~(val&1) (in the low bits, others 0)
431 pcmpeqw mm4, mm5 ; mm4 = (val == 0)
432 pxor mm4, mm7 ; Low bits now (val != 0)
433 pand mm3, mm4 ; mm3 = (~(val&1))&(val!=0)
435 psubw mm1, mm3 ; mm1 -= (~(val&1))&(val!=0)
437 paddsw mm1, mm6 ; Signed-saturates to 0x7fff if res > 2047
438 psubw mm1, mm6 ; 2047 if saturated... unchanged otherwise
440 ;; Handle zero case and restoring sign
441 pand mm1, mm2 ; Zero in the zero case
443 psubw mm3, mm1 ; mm3 = - res (requires mm3 == 0 beforehand)
444 paddw mm3, mm3 ; mm3 = - 2*res
445 pcmpgtw mm0, mm5 ; mm0 = *psrc > 0 (mm5 is the all-zero register)
446 pcmpeqw mm0, mm5 ; mm0 = !(*psrc > 0) = *psrc <= 0
447 pand mm3, mm0 ; mm3 = *psrc <= 0 ? -2 * res : 0
448 paddw mm1, mm3 ; mm1 = samesign(*psrc,res)
460 pop ebp ; restore caller's frame pointer
462 emms ; leave MMX state (marks the x87 tag word empty) so FPU code can follow
467 ;;; int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat
468 ;;; Simply add up the sum of coefficients weighted
469 ;;; by their quantisation coefficients
471 ;;; ecx - counter of coefficient/quantiser quads (loaded with 16 below)
476 ;; MMX Register usage
478 ;; mm6 = accumulator (zeroed during setup)
481 global quant_weight_coeff_sum_mmx
483 quant_weight_coeff_sum_mmx:
484 push ebp ; save caller's frame pointer
491 mov edi, [ebp+8] ; get src (first argument)
492 mov esi, [ebp+12] ; get piqm
494 mov ecx, 16 ; 16 coefficient / quantiser quads to process...
495 pxor mm6, mm6 ; Accumulator
502 ;; Compute absolute value of coefficients...
505 pcmpgtw mm1, mm0 ; (mm0 < 0 ) (requires mm1 == 0 before this compare)
508 pand mm3, mm1 ; 2*mm0 * (mm0 < 0)
509 psubw mm0, mm3 ; mm0 = abs(mm0)
513 ;; Compute the low and high words of the result....
540 pop ebp ; restore caller's frame pointer
542 emms ; leave MMX state (marks the x87 tag word empty) so FPU code can follow