cinelerra-5.1/mpeg2enc/quant_mmx.s

   1 ;
   2 ;  Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
   3
   4 ;
   5 ;  This program is free software; you can redistribute it and/or
   6 ;  modify it under the terms of the GNU General Public License
   7 ;  as published by the Free Software Foundation; either version 2
   8 ;  of the License, or (at your option) any later version.
   9 ;
  10 ;  This program is distributed in the hope that it will be useful,
  11 ;  but WITHOUT ANY WARRANTY; without even the implied warranty of
  12 ;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  13 ;  GNU General Public License for more details.
  14 ;
  15 ;  You should have received a copy of the GNU General Public License
  16 ;  along with this program; if not, write to the Free Software
  17 ;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  18 ;
  19 ;
  20 ;
  21 ;  quantize_ni_mmx.s:  MMX optimized coefficient quantization sub-routine
  22
  23
  24 global quantize_ni_mmx
  25 ; int quantize_ni_mmx(short *dst, short *src,
  26 ;                             short *quant_mat, short *i_quant_mat,
  27 ;                     int imquant, int mquant, int sat_limit)
  28
  29 ;  See quantize.c: quant_non_intra_hv_inv()  for reference implementation in C...
  30                 ;;  mquant is not currently used.
  31 ; eax = row counter...
  32 ; ebx = pqm
  33 ; ecx = piqm  ; Matrix of quads first (2^16/quant)
  34                           ; then (2^16/quant)*(2^16%quant) the second part is for rounding
  35 ; edx = temp
  36 ; edi = psrc
  37 ; esi = pdst
  38
  39 ; mm0 = [imquant|0..3]W
  40 ; mm1 = [sat_limit|0..3]W
  41 ; mm2 = *psrc -> src
  42 ; mm3 = rounding corrections... / temp
  43 ; mm4 = sign
  44 ; mm5 = nzflag accumulators
  45 ; mm6 = overflow limit
  46 ; mm7 = temp
  47
  48                 ;;
  49                 ;;  private constants needed
  50                 ;;
  51
  52 SECTION .data
  53 align 16
  54 overflim:
  55                         dw      1024-1
  56                         dw      1024-1
  57                         dw      1024-1
  58                         dw      1024-1
  59
  60                         ;; BUFFER NO LONGER USED DUE TO IMPROVED MAIN ROUTINE...
  61 SECTION .bss
  62 align 32
  63 quant_buf:      resw 64
  64
  65 SECTION .text
  66 align 32
  67 quantize_ni_mmx:
  68         push ebp                                ; save frame pointer
  69         mov ebp, esp            ; link
  70         push ebx
  71         push ecx
  72         push edx
  73         push esi
  74         push edi
  75
  76         mov edi, [ebp+8]    ; get dst
  77         mov esi, [ebp+12]       ; get psrc
  78         mov ebx, [ebp+16]       ; get pqm
  79         mov ecx,  [ebp+20]  ; get piqm
  80         movd mm0, [ebp+24]  ; get imquant (2^16 / mquant )
  81         movq mm1, mm0
  82         punpcklwd mm0, mm1
  83         punpcklwd mm0, mm0    ; mm0 = [imquant|0..3]W
  84
  85         movq  mm6, [overflim]; overflow limit
  86
  87         movd mm1, [ebp+32]  ; sat_limit
  88         movq mm2, mm1
  89         punpcklwd mm1, mm2   ; [sat_limit|0..3]W
  90         punpcklwd mm1, mm1   ; mm1 = [sat_limit|0..3]W
  91
  92         pxor      mm5, mm5  ; Non-zero flag accumulator
  93         mov eax,  16            ; 16 quads to do
  94         jmp nextquadniq
  95
  96 align 32
  97 nextquadniq:
  98         movq mm2, [esi]                         ; mm0 = *psrc
  99
 100         pxor    mm4, mm4
 101         pcmpgtw mm4, mm2       ; mm4 = *psrc < 0
 102         movq    mm7, mm2       ; mm7 = *psrc
 103         psllw   mm7, 1         ; mm7 = 2*(*psrc)
 104         pand    mm7, mm4       ; mm7 = 2*(*psrc)*(*psrc < 0)
 105         psubw   mm2, mm7       ; mm2 = abs(*psrc)
 106
 107         ;;
 108         ;;  Check whether we'll saturate intermediate results
 109         ;;  Eventually flag is low 8 bits of result
 110         ;;
 111
 112         movq    mm7, mm2
 113         pcmpgtw mm7, mm6    ; Tooo  big for 16 bit arithmetic :-( (should be *very* rare)
 114         movq    mm3, mm7
 115         psrlq   mm3, 32
 116         por     mm7, mm3
 117         movd    edx, mm7
 118         cmp             edx, 0
 119         jnz             near out_of_range
 120
 121         ;;
 122         ;; Carry on with the arithmetic...
 123         psllw   mm2, 5         ; mm2 = 32*abs(*psrc)
 124         movq    mm7, [ebx]     ; mm7 = *pqm>>1
 125         psrlw   mm7, 1
 126         paddw   mm2, mm7       ; mm2 = 32*abs(*psrc)+((*pqm)/2) = "p"
 127
 128
 129         ;;
 130         ;; Do the first multiplication.  Cunningly we've set things up so
 131         ;; it is exactly the top 16 bits we're interested in...
 132         ;;
 133         ;; We need the low word results for a rounding correction.
 134         ;; This is *not* exact (that actual
 135     ;; correction the product abs(*psrc)*(*pqm)*(2^16%*qm) >> 16
 136     ;;  However we get very very few wrong and none too low (the most
 137     ;; important) and no errors for small coefficients (also important)
 138         ;;      if we simply add abs(*psrc)
 139
 140
 141         movq    mm3, mm2
 142         pmullw  mm3, [ecx]
 143         movq    mm7, mm2
 144         psrlw   mm7, 1            ; Want to see if adding p would carry into upper 16 bits
 145         psrlw   mm3, 1
 146         paddw  mm3, mm7
 147         psrlw   mm3, 15           ; High bit in lsb rest 0's
 148         pmulhw  mm2, [ecx]        ; mm2 = (p*iqm+p) >> IQUANT_SCALE_POW2 ~= p/*qm
 149
 150
 151
 152         ;;
 153         ;; To hide the latency lets update some pointers...
 154         add   esi, 8                                    ; 4 word's
 155         add   ecx, 8                                    ; 4 word's
 156         sub   eax, 1
 157
 158         ;; Add rounding correction....
 159         paddw   mm2, mm3
 160
 161
 162         ;;
 163         ;; Do the second multiplication, again we ned to make a rounding adjustment
 164         ;; EXPERIMENT:   see comments in quantize.c:quant_non_intra_hv don't adjust...
 165 ;       movq    mm3, mm2
 166 ;       pmullw  mm3, mm0
 167 ;       movq    mm7, mm2
 168 ;       psrlw   mm7, 1            ; Want to see if adding p would carry into upper 16 bits
 169 ;       psrlw   mm3, 1
 170 ;       paddw mm3, mm7
 171 ;       psrlw   mm3, 15           ; High bit in lsb rest 0's
 172
 173         pmulhw  mm2, mm0     ; mm2 ~= (p/(qm*mquant))
 174
 175         ;;
 176         ;; To hide the latency lets update some more pointers...
 177         add   edi, 8
 178         add   ebx, 8
 179
 180         ;; Correct rounding and the factor of two (we want p/(qm*2*mquant)
 181 ;       paddw mm2, mm3
 182         psrlw mm2, 1
 183
 184
 185         ;;
 186         ;; Check for saturation
 187         ;;
 188         movq mm7, mm2
 189         pcmpgtw mm7, mm1
 190         movq    mm3, mm7
 191         psrlq   mm3, 32
 192         movq    mm3, mm7
 193         por             mm7, mm3
 194         movd    edx, mm7
 195         cmp             edx, 0
 196         jnz             saturated
 197
 198         ;;
 199         ;;  Accumulate non-zero flags
 200         por     mm5, mm2
 201
 202         ;;
 203         ;; Now correct the sign mm4 = *psrc < 0
 204         ;;
 205
 206         pxor mm7, mm7        ; mm7 = -2*mm2
 207         psubw mm7, mm2
 208         psllw mm7, 1
 209         pand  mm7, mm4       ; mm7 = -2*mm2 * (*psrc < 0)
 210         paddw mm2, mm7       ; mm7 = samesign(*psrc, mm2 )
 211
 212                 ;;
 213                 ;;  Store the quantised words....
 214                 ;;
 215
 216         movq [edi-8], mm2
 217         test eax, eax
 218
 219         jnz near nextquadniq
 220
 221         ;; Return saturation in low word and nzflag in high word of result dword
 222
 223
 224         movq  mm0, mm5
 225         psrlq mm0, 32
 226         por   mm5, mm0
 227         movd  edx, mm5
 228         mov   ebx, edx
 229         shl   ebx, 16
 230         or    edx, ebx
 231     and   edx, 0xffff0000  ;; hiwgh word ecx is nzflag
 232         mov   eax, edx
 233
 234 return:
 235         pop edi
 236         pop esi
 237         pop edx
 238         pop ecx
 239         pop ebx
 240
 241         pop ebp                 ; restore stack pointer
 242
 243         emms                    ; clear mmx registers
 244         ret
 245
 246 out_of_range:
 247         mov     eax,    0x00ff
 248         jp      return
 249 saturated:
 250
 251         mov eax,    0xff00
 252         jp return
 253
 254
 255
 256
 257 ;;;
 258 ;;;  void iquant_non_intra_m1_{sse,mmx}(int16_t *src, int16_t *dst, uint16_t
 259 ;;;                               *quant_mat)
 260 ;;; mmx/sse Inverse mpeg-1 quantisation routine.
 261 ;;;
 262 ;;; eax - block counter...
 263 ;;; edi - src
 264 ;;; esi - dst
 265 ;;; edx - quant_mat
 266
 267                 ;; MMX Register usage
 268                 ;; mm7 = [1|0..3]W
 269                 ;; mm6 = [2047|0..3]W
 270                 ;; mm5 = 0
 271
 272
 273 global iquant_non_intra_m1_sse
 274 align 32
 275 iquant_non_intra_m1_sse:
 276
 277                 push ebp                                ; save frame pointer
 278                 mov ebp, esp            ; link
 279
 280                 push eax
 281                 push esi
 282                 push edi
 283                 push edx
 284
 285                 mov             edi, [ebp+8]                    ; get psrc
 286                 mov             esi, [ebp+12]                   ; get pdst
 287                 mov             edx, [ebp+16]                   ; get quant table
 288                 mov             eax,1
 289                 movd    mm7, eax
 290                 punpcklwd       mm7, mm7
 291                 punpckldq       mm7, mm7
 292
 293                 mov     eax, 2047
 294                 movd    mm6, eax
 295                 punpcklwd               mm6, mm6
 296                 punpckldq               mm6, mm6
 297
 298                 mov             eax, 64                 ; 64 coeffs in a DCT block
 299                 pxor    mm5, mm5
 300
 301 iquant_loop_sse:
 302                 movq    mm0, [edi]      ; mm0 = *psrc
 303                 add             edi,8
 304                 pxor    mm1,mm1
 305                 movq    mm2, mm0
 306                 pcmpeqw mm2, mm1                ; mm2 = 1's for non-zero in mm0
 307                 pcmpeqw mm2, mm1
 308
 309                 ;; Work with absolute value for convience...
 310                 psubw   mm1, mm0        ; mm1 = -*psrc
 311                 pmaxsw  mm1, mm0        ; mm1 = val = max(*psrc,-*psrc) = abs(*psrc)
 312                 paddw   mm1, mm1                ; mm1 *= 2;
 313                 paddw   mm1, mm7                ; mm1 += 1
 314                 pmullw  mm1, [edx]              ; mm1 = (val*2+1) * *quant_mat
 315                 add             edx, 8
 316                 psraw   mm1, 5                  ; mm1 = ((val*2+1) * *quant_mat)/32
 317
 318                 ;; Now that nasty mis-match control
 319
 320                 movq    mm3, mm1
 321                 pand    mm3, mm7
 322                 pxor    mm3, mm7                ; mm3 = ~(val&1) (in the low bits, others 0)
 323                 movq    mm4, mm1
 324                 pcmpeqw mm4, mm5                ; mm4 = (val == 0)
 325                 pxor    mm4, mm7                ;  Low bits now (val != 0)
 326                 pand    mm3, mm4                ; mm3 =  (~(val&1))&(val!=0)
 327
 328                 psubw   mm1, mm3                ; mm1 -= (~(val&1))&(val!=0)
 329                 pminsw  mm1, mm6                ; mm1 = saturated(res)
 330
 331                 ;; Handle zero case and restoring sign
 332                 pand    mm1, mm2                ; Zero in the zero case
 333                 pxor    mm3, mm3
 334                 psubw   mm3, mm1                ;  mm3 = - res
 335                 paddw   mm3, mm3                ;  mm3 = - 2*res
 336                 pcmpgtw mm0, mm5                ;  mm0 = *psrc < 0
 337                 pcmpeqw mm0, mm5                ;  mm0 = *psrc >= 0
 338                 pand    mm3, mm0                ;  mm3 = *psrc <= 0 ? -2 * res :         0
 339                 paddw   mm1, mm3                ;  mm3 = samesign(*psrc,res)
 340                 movq    [esi], mm1
 341                 add             esi,8
 342
 343                 sub             eax, 4
 344                 jnz             iquant_loop_sse
 345
 346                 pop     edx
 347                 pop edi
 348                 pop esi
 349                 pop eax
 350
 351                 pop ebp                 ; restore stack pointer
 352
 353                 emms                    ; clear mmx registers
 354                 ret
 355
 356
 357 ;;;
 358 ;;;  void iquant_non_intra_m1_mmx(int16_t *src, int16_t *dst, uint16_t
 359 ;;;                               *quant_mat)
 360 ;;; eax - block counter...
 361 ;;; edi - src
 362 ;;; esi - dst
 363 ;;; edx - quant_mat
 364
 365                 ;; MMX Register usage
 366                 ;; mm7 = [1|0..3]W
 367                 ;; mm6 = [MAX_UINT16-2047|0..3]W
 368                 ;; mm5 = 0
 369
 370
 371 global iquant_non_intra_m1_mmx
 372 align 32
 373 iquant_non_intra_m1_mmx:
 374
 375                 push ebp                                ; save frame pointer
 376                 mov ebp, esp            ; link
 377
 378                 push eax
 379                 push esi
 380                 push edi
 381                 push edx
 382
 383                 mov             edi, [ebp+8]                    ; get psrc
 384                 mov             esi, [ebp+12]                   ; get pdst
 385                 mov             edx, [ebp+16]                   ; get quant table
 386                 mov             eax,1
 387                 movd    mm7, eax
 388                 punpcklwd       mm7, mm7
 389                 punpckldq       mm7, mm7
 390
 391                 mov     eax, (0xffff-2047)
 392                 movd    mm6, eax
 393                 punpcklwd               mm6, mm6
 394                 punpckldq               mm6, mm6
 395
 396                 mov             eax, 64                 ; 64 coeffs in a DCT block
 397                 pxor    mm5, mm5
 398
 399 iquant_loop:
 400                 movq    mm0, [edi]      ; mm0 = *psrc
 401                 add             edi,8
 402                 pxor    mm1, mm1
 403                 movq    mm2, mm0
 404                 pcmpeqw mm2, mm5                ; mm2 = 1's for non-zero in mm0
 405                 pcmpeqw mm2, mm5
 406
 407                 ;; Work with absolute value for convience...
 408
 409                 psubw   mm1, mm0        ; mm1 = -*psrc
 410                 psllw   mm1, 1                  ; mm1 = -2*psrc
 411                 movq    mm3, mm0                ; mm3 = *psrc > 0
 412                 pcmpgtw mm3, mm5
 413                 pcmpeqw mm3, mm5        ; mm3 = *psrc <= 0
 414                 pand    mm3, mm1                ; mm3 = (*psrc <= 0)*-2* *psrc
 415                 movq    mm1, mm0        ; mm1 = (*psrc <= 0)*-2* *psrc + *psrc = abs(*psrc)
 416                 paddw   mm1, mm3
 417
 418
 419                 paddw   mm1, mm1                ; mm1 *= 2;
 420                 paddw   mm1, mm7                ; mm1 += 1
 421                 pmullw  mm1, [edx]              ; mm1 = (val*2+1) * *quant_mat
 422                 add             edx, 8
 423                 psraw   mm1, 5                  ; mm1 = ((val*2+1) * *quant_mat)/32
 424
 425                 ;; Now that nasty mis-match control
 426
 427                 movq    mm3, mm1
 428                 pand    mm3, mm7
 429                 pxor    mm3, mm7                ; mm3 = ~(val&1) (in the low bits, others 0)
 430                 movq    mm4, mm1
 431                 pcmpeqw mm4, mm5                ; mm4 = (val == 0)
 432                 pxor    mm4, mm7                ;  Low bits now (val != 0)
 433                 pand    mm3, mm4                ; mm3 =  (~(val&1))&(val!=0)
 434
 435                 psubw   mm1, mm3                ; mm1 -= (~(val&1))&(val!=0)
 436
 437                 paddsw  mm1, mm6                ; Will saturate if > 2047
 438                 psubw   mm1, mm6                ; 2047 if saturated... unchanged otherwise
 439
 440                 ;; Handle zero case and restoring sign
 441                 pand    mm1, mm2                ; Zero in the zero case
 442                 pxor    mm3, mm3
 443                 psubw   mm3, mm1                ;  mm3 = - res
 444                 paddw   mm3, mm3                ;  mm3 = - 2*res
 445                 pcmpgtw mm0, mm5                ;  mm0 = *psrc < 0
 446                 pcmpeqw mm0, mm5                ;  mm0 = *psrc >= 0
 447                 pand    mm3, mm0                ;  mm3 = *psrc <= 0 ? -2 * res :         0
 448                 paddw   mm1, mm3                ;  mm3 = samesign(*psrc,res)
 449                 movq    [esi], mm1
 450                 add             esi,8
 451
 452                 sub             eax, 4
 453                 jnz             near iquant_loop
 454
 455                 pop     edx
 456                 pop edi
 457                 pop esi
 458                 pop eax
 459
 460                 pop ebp                 ; restore stack pointer
 461
 462                 emms                    ; clear mmx registers
 463                 ret
 464
 465
 466
 467 ;;;  int32_t quant_weight_coeff_sum_mmx(int16_t *src, int16_t *i_quant_mat
 468 ;;; Simply add up the sum of coefficients weighted
 469 ;;; by their quantisation coefficients
 470 ;;;                               )
 471 ;;; eax - block counter...
 472 ;;; edi - src
 473 ;;; esi - dst
 474 ;;; edx - quant_mat
 475
 476                 ;; MMX Register usage
 477                 ;; mm7 = [1|0..3]W
 478                 ;; mm6 = [2047|0..3]W
 479                 ;; mm5 = 0
 480
 481 global quant_weight_coeff_sum_mmx
 482 align 32
 483 quant_weight_coeff_sum_mmx:
 484         push ebp                                ; save frame pointer
 485         mov ebp, esp            ; link
 486
 487         push ecx
 488         push esi
 489         push edi
 490
 491         mov edi, [ebp+8]        ; get pdst
 492         mov esi, [ebp+12]       ; get piqm
 493
 494         mov ecx, 16                     ; 16 coefficient / quantiser quads to process...
 495         pxor mm6, mm6           ; Accumulator
 496         pxor mm7, mm7           ; Zero
 497 quantsum:
 498         movq    mm0, [edi]
 499         movq    mm2, [esi]
 500
 501         ;;
 502         ;;      Compute absolute value of coefficients...
 503         ;;
 504         movq    mm1, mm7
 505         pcmpgtw mm1, mm0   ; (mm0 < 0 )
 506         movq    mm3, mm0
 507         psllw   mm3, 1     ; 2*mm0
 508         pand    mm3, mm1   ; 2*mm0 * (mm0 < 0)
 509         psubw   mm0, mm3   ; mm0 = abs(mm0)
 510
 511
 512         ;;
 513         ;; Compute the low and high words of the result....
 514         ;;
 515         movq    mm1, mm0
 516         pmullw  mm0, mm2
 517         add             edi, 8
 518         add             esi, 8
 519         pmulhw  mm1, mm2
 520
 521         movq      mm3, mm0
 522         punpcklwd  mm3, mm1
 523         punpckhwd  mm0, mm1
 524         paddd      mm6, mm3
 525         paddd      mm6, mm0
 526
 527
 528         sub ecx,        1
 529         jnz   quantsum
 530
 531         movd   eax, mm6
 532         psrlq  mm6, 32
 533         movd   ecx, mm6
 534         add    eax, ecx
 535
 536         pop edi
 537         pop esi
 538         pop ecx
 539
 540         pop ebp                 ; restore stack pointer
 541
 542         emms                    ; clear mmx registers
 543         ret
 544
 545