cinelerra-5.1/mpeg2enc/mblockq_sad_mmxe.s

   1 ;;;
   2 ;;;  mblockq_sad_mmxe.s:
   3 ;;;
   4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblock
   5 ;;; quads (2 by 2 squares of adjacent macroblocks)
   6
   7 ;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled
   8 ;;; evaluates macroblock quads.  A lot of memory accesses can be saved
   9 ;;; if each quad is done together rather than each macroblock in the
  10 ;;; quad handled individually.
  11
  12 ;;; TODO:               Really there ought to be MMX versions and the function's
  13 ;;; specification should be documented...
  14 ;
  15 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
  16
  17
  18 ;
  19 ;  This program is free software; you can redistribute it and/or
  20 ;  modify it under the terms of the GNU General Public License
  21 ;  as published by the Free Software Foundation; either version 2
  22 ;  of the License, or (at your option) any later version.
  23 ;
  24 ;  This program is distributed in the hope that it will be useful,
  25 ;  but WITHOUT ANY WARRANTY; without even the implied warranty of
  26 ;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  27 ;  GNU General Public License for more details.
  28 ;
  29 ;  You should have received a copy of the GNU General Public License
  30 ;  along with this program; if not, write to the Free Software
  31 ;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
  32 ;
  33 ;
  34
  35 ;;; CURRENTLY not used but used in testing as reference for tweaks...
  36 global mblockq_dist1_REF
  37
  38 ; void mblockq_dist1_REF(char *blk1,char *blk2,int lx,int h,int *weightvec);
  39 ; eax = p1 (blk1 row pointer; rows 0..h are read, 17 bytes each)
  40 ; ebx = p2 (blk2 row pointer; rows 0..h-1 are read, 16 bytes each)
  41 ; ecx = unused (only saved and restored)
  42 ; edx = lx (byte stride added to p1 and p2 after every row)
  43 ; edi = rowsleft (counts down from h)
  44 ; esi = h (edi == esi identifies the first row)
  45
  46 ; mm0 = SAD (x+0,y+0) -> weightvec[0]
  47 ; mm1 = SAD (x+2,y+0) -> weightvec[1] (p1 read one byte further on)
  48 ; mm2 = SAD (x+0,y+2) -> weightvec[2] (p1 row r against p2 row r-1)
  49 ; mm3 = SAD (x+2,y+2) -> weightvec[3] (both displacements combined)
  50 ; mm4 = temp
  51 ; mm5 = temp (current p2 row, bytes 0-7)
  52 ; mm6 = temp (copy of the p1 bytes just loaded, for the y+2 SADs)
  53 ; mm7 = temp (ditto for the second 8 bytes)
  54
  55 SECTION .text
  56 align 32
  57 mblockq_dist1_REF:
  58         push ebp                                        ; save frame pointer
  59         mov ebp, esp                            ; link
  60         push eax
  61         push ebx
  62         push ecx
  63         push edx
  64         push edi
  65         push esi
  66
  67         pxor mm0, mm0           ; zero accumulators
  68         pxor mm1, mm1
  69         pxor mm2, mm2
  70         pxor mm3, mm3
  71         mov eax, [ebp+8]        ; get p1
  72         mov ebx, [ebp+12]       ; get p2
  73         mov edx, [ebp+16]       ; get lx
  74
  75         mov edi, [ebp+20]       ; get rowsleft
  76         mov esi, edi            ; keep h so the first row can be recognised
  77
  78         jmp nextrow_block_d1    ; skip the alignment padding
  79 align 32
  80 nextrow_block_d1:
  81
  82                 ;; Do the (+0,+0) SAD
  83
  84         movq mm4, [eax]         ; load 1st 8 bytes of p1
  85         movq mm6, mm4           ; keep a copy for the (0,+2) SAD
  86         movq mm5, [ebx]         ; 1st 8 bytes of p2, reused by the (+2,0) SAD
  87         psadbw mm4, mm5 ; compare to 1st 8 bytes of p2
  88         paddd mm0, mm4          ; accumulate difference
  89         movq mm4, [eax+8]       ; load 2nd 8 bytes of p1
  90         movq mm7, mm4           ; keep a copy for the (0,+2) SAD
  91         psadbw mm4, [ebx+8]     ; compare to 2nd 8 bytes of p2
  92         paddd mm0, mm4          ; accumulate difference
  93
  94
  95     cmp edi, esi                ; first row has no previous p2 row
  96         jz  firstrow0
  97
  98                 ;; Do the (0,+2) SAD
  99         sub ebx, edx            ; step p2 back to the previous row
 100         psadbw mm6, [ebx]       ; p1 bytes 0-7 against previous p2 row
 101         paddd mm2, mm6          ; accumulate difference
 102         psadbw mm7, [ebx+8]     ; p1 bytes 8-15 against previous p2 row
 103         add ebx, edx            ; restore p2 to the current row
 104         paddd mm2, mm7
 105
 106 firstrow0:
 107
 108                 ;; Do the (+2,0) SAD
 109
 110         movq mm4, [eax+1]       ; p1 bytes 1-8
 111
 112         movq mm6, mm4           ; keep a copy for the (+2,+2) SAD
 113         psadbw mm4, mm5 ; compare to 1st 8 bytes of p2
 114         paddd mm1, mm4          ; accumulate difference
 115         movq mm4, [eax+9]       ; p1 bytes 9-16
 116         movq mm7, mm4           ; keep a copy for the (+2,+2) SAD
 117         psadbw mm4, [ebx+8]     ; compare to 2nd 8 bytes of p2
 118         paddd mm1, mm4          ; accumulate difference
 119
 120     cmp edi, esi                ; first row has no previous p2 row
 121         jz  firstrow1
 122
 123                 ;; Do the (+2, +2 ) SAD
 124         sub ebx, edx            ; step p2 back to the previous row
 125         psadbw mm6, [ebx]       ; compare to 1st 8 bytes of prev p2
 126         psadbw mm7, [ebx+8]     ;  2nd 8 bytes of prev p2
 127         add ebx, edx            ; restore p2 to the current row
 128         paddd mm3, mm6          ; accumulate difference
 129         paddd mm3, mm7
 130 firstrow1:
 131
 132         add eax, edx                            ; update pointer to next row
 133         add ebx, edx            ; ditto
 134
 135         sub edi, 1
 136         jnz near nextrow_block_d1
 137
 138                 ;; Do the last row of the (0,+2) SAD
 139
 140         movq mm4, [eax]         ; load 1st 8 bytes of p1 (row h)
 141         movq mm5, [eax+8]       ; load 2nd 8 bytes of p1 (row h)
 142         sub  ebx, edx           ; p2 back to its last processed row (h-1)
 143         psadbw mm4, [ebx]       ; compare to 1st 8 bytes of that p2 row
 144         psadbw mm5, [ebx+8]     ;  2nd 8 bytes of that p2 row
 145         paddd mm2, mm4          ; accumulate difference
 146         paddd mm2, mm5
 147
 148         movq mm4, [eax+1]       ; p1 row h, bytes 1-8
 149         movq mm5, [eax+9]       ; p1 row h, bytes 9-16
 150
 151                 ;; Do the last row of the (+2, +2) SAD
 152         psadbw mm4, [ebx]       ; compare to 1st 8 bytes of prev p2
 153         psadbw mm5, [ebx+8]     ;  2nd 8 bytes of prev p2
 154         paddd mm3, mm4          ; accumulate difference
 155         paddd mm3, mm5
 156
 157
 158         mov eax, [ebp+24]                       ; Weightvec
 159         movd [eax+0], mm0       ; weightvec[0] = SAD (x+0,y+0)
 160         movd [eax+4], mm1       ; weightvec[1] = SAD (x+2,y+0)
 161         movd [eax+8], mm2       ; weightvec[2] = SAD (x+0,y+2)
 162         movd [eax+12], mm3      ; weightvec[3] = SAD (x+2,y+2)
 163
 164         pop esi
 165         pop edi
 166         pop edx
 167         pop ecx
 168         pop ebx
 169         pop eax
 170
 171         pop ebp
 172         emms                    ; leave MMX state so x87 code can run again
 173         ret
 174
 175
 176
 177 global mblockq_dist1_mmxe
 178
 179 ; void mblockq_dist1_mmxe(char *blk1,char *blk2,int lx,int h,int *weightvec);
 180
 181 ; eax = p1 (blk1 row pointer; rows 0..h are read, 17 bytes each)
 182 ; ebx = p2 (blk2 row pointer; rows 0..h-1 are read, 16 bytes each)
 183 ; ecx = unused (only saved and restored)
 184 ; edx = lx (byte stride added to p1 and p2 after every row)
 185 ; edi = rowsleft (counts down from h)
 186 ; esi = h (edi == esi identifies the first row)
 187
 188 ; mm0 = low dword SAD (x+0,y+0), high dword SAD (x+0,y+2)
 189 ; mm1 = low dword SAD (x+2,y+0), high dword SAD (x+2,y+2)
 190
 191 ; mm4 = temp
 192 ; mm5 = temp (current p2 row, bytes 0-7)
 193 ; mm6 = temp (copy of the p1 bytes just loaded, for the y+2 SADs)
 194 ; mm7 = temp (ditto for the second 8 bytes)
 195
 196 align 32
 197 mblockq_dist1_mmxe:
 198         push ebp                                        ; save frame pointer
 199         mov ebp, esp                            ; link
 200         push eax
 201         push ebx
 202         push ecx
 203         push edx
 204         push edi
 205         push esi
 206
 207         mov eax, [ebp+8]        ; get p1
 208         prefetcht0 [eax]        ; start fetching the first p1 row
 209         pxor mm0, mm0           ; zero accumulators
 210         pxor mm1, mm1
 211         mov ebx, [ebp+12]       ; get p2
 212         mov edx, [ebp+16]       ; get lx
 213
 214         mov edi, [ebp+20]       ; get rowsleft
 215         mov esi, edi            ; keep h so the first row can be recognised
 216
 217         jmp nextrow_block_e1    ; skip the alignment padding
 218 align 32
 219 nextrow_block_e1:
 220
 221                 ;; Do the (+0,+0) SAD
 222         prefetcht0 [eax+edx]    ; prefetch the next p1 row
 223         movq mm4, [eax]         ; load 1st 8 bytes of p1
 224         movq mm6, mm4           ; keep a copy for the (0,+2) SAD
 225         movq mm5, [ebx]         ; 1st 8 bytes of p2, reused by the (+2,0) SAD
 226         psadbw mm4, mm5 ; compare to 1st 8 bytes of p2
 227         paddd mm0, mm4          ; accumulate difference
 228         movq mm4, [eax+8]       ; load 2nd 8 bytes of p1
 229         movq mm7, mm4           ; keep a copy for the (0,+2) SAD
 230         psadbw mm4, [ebx+8]     ; compare to 2nd 8 bytes of p2
 231         paddd mm0, mm4          ; accumulate difference
 232
 233
 234     cmp edi, esi                ; first row has no previous p2 row
 235         jz  firstrowe0
 236
 237                 ;; Do the (0,+2) SAD
 238         sub ebx, edx            ; step p2 back to the previous row
 239         pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap dwords: (0,+2) sum now low
 240         movq   mm2, [ebx]       ; mm2 = previous p2 row, bytes 0-7 (reused for (+2,+2))
 241         psadbw mm6, mm2     ; p1 bytes 0-7 against previous p2 row
 242         paddd mm0, mm6          ; accumulate difference
 243         movq  mm3, [ebx+8]      ; mm3 = previous p2 row, bytes 8-15 (reused for (+2,+2))
 244         psadbw mm7, mm3 ; p1 bytes 8-15 against previous p2 row
 245         add ebx, edx            ; restore p2 to the current row
 246         paddd mm0, mm7
 247         pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap back: (0,0) sum low again
 248 firstrowe0:
 249
 250                 ;; Do the (+2,0) SAD
 251
 252         movq mm4, [eax+1]       ; p1 bytes 1-8
 253         movq mm6, mm4           ; keep a copy for the (+2,+2) SAD
 254
 255         psadbw mm4, mm5 ; compare to 1st 8 bytes of p2
 256         paddd mm1, mm4          ; accumulate difference
 257
 258         movq mm4, [eax+9]       ; p1 bytes 9-16
 259         movq mm7, mm4           ; keep a copy for the (+2,+2) SAD
 260
 261         psadbw mm4, [ebx+8]     ; compare to 2nd 8 bytes of p2
 262         paddd mm1, mm4          ; accumulate difference
 263
 264     cmp edi, esi                ; first row has no previous p2 row
 265         jz  firstrowe1
 266
 267                 ;; Do the (+2, +2 ) SAD
 268         sub ebx, edx            ; ebx is not dereferenced before the add below
 269         pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap dwords: (+2,+2) sum now low
 270         psadbw mm6, mm2 ; compare to 1st 8 bytes of prev p2
 271         psadbw mm7, mm3 ;  2nd 8 bytes of prev p2
 272         add ebx, edx            ; undo the sub above
 273         paddd mm1, mm6          ; accumulate difference
 274         paddd mm1, mm7
 275         pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap back: (+2,0) sum low again
 276 firstrowe1:
 277
 278         add eax, edx                            ; update pointer to next row
 279         add ebx, edx            ; ditto
 280
 281         sub edi, 1
 282         jnz near nextrow_block_e1
 283
 284                 ;; Do the last row of the (0,+2) SAD
 285         pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; (0,+2) sum low; stays so until the store
 286         movq mm4, [eax]         ; load 1st 8 bytes of p1 (row h)
 287         movq mm5, [eax+8]       ; load 2nd 8 bytes of p1 (row h)
 288         sub  ebx, edx           ; p2 back to its last processed row (h-1)
 289         psadbw mm4, [ebx]       ; compare to 1st 8 bytes of that p2 row
 290         psadbw mm5, [ebx+8]     ;  2nd 8 bytes of that p2 row
 291         paddd mm0, mm4          ; accumulate difference
 292         paddd mm0, mm5
 293
 294
 295                 ;; Do the last row of the (+2, +2) SAD
 296         pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; (+2,+2) sum low; stays so until the store
 297         movq mm4, [eax+1]       ; p1 row h, bytes 1-8
 298         movq mm5, [eax+9]       ; p1 row h, bytes 9-16
 299
 300         psadbw mm4, [ebx]       ; compare to 1st 8 bytes of prev p2
 301         psadbw mm5, [ebx+8]     ;  2nd 8 bytes of prev p2
 302         paddd mm1, mm4          ; accumulate difference
 303         paddd mm1, mm5
 304
 305
 306         mov eax, [ebp+24]                       ; Weightvec
 307         movd [eax+8], mm0       ; weightvec[2] = SAD (x+0,y+2)
 308         pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; bring the (0,0) sum to the low dword
 309         movd [eax+12], mm1      ; weightvec[3] = SAD (x+2,y+2)
 310         pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; bring the (+2,0) sum to the low dword
 311         movd [eax+0], mm0       ; weightvec[0] = SAD (x+0,y+0)
 312         movd [eax+4], mm1       ; weightvec[1] = SAD (x+2,y+0)
 313
 314         pop esi
 315         pop edi
 316         pop edx
 317         pop ecx
 318         pop ebx
 319         pop eax
 320
 321         pop ebp
 322         emms                    ; leave MMX state so x87 code can run again
 323         ret
 324
 325 global mblockq_dist22_mmxe
 326
 327 ; void mblockq_dist22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh, int* resvec);
 328
 329 ; eax = p1 (blk1 row pointer; stepped BACK by flx after every row)
 330 ; ebx = p2 (blk2 row pointer; stepped BACK by flx after every row)
 331 ; ecx = row counter (starts at fh, counts down to 0)
 332 ; edx = flx;
 333
 334 ; mm0 = low dword SAD (x+0,y+0), high dword SAD (x+2,y+0)
 335 ; mm1 = low dword SAD (x+0,y+2), high dword SAD (x+2,y+2)
 336 ; mm2 = p1 row at [eax+flx] (loaded before the loop / on the previous pass)
 337 ; mm3 = same p1 row displaced by 1 byte...
 338 ; mm4 = unused
 339 ; mm5 = current p2 row (8 bytes)
 340 ; mm6 = temp (copy of the current p1 row)
 341 ; mm7 = temp (copy of the current p1 row displaced by 1 byte)
 342
 343
 344 align 32
 345 mblockq_dist22_mmxe:
 346         push ebp                ; save frame pointer
 347         mov ebp, esp
 348         push eax
 349         push ebx
 350         push ecx
 351         push edx
 352
 353         pxor mm0, mm0           ; zero accumulator
 354         pxor mm1, mm1           ; zero accumulator
 355         pxor mm2, mm2           ; cleared, then overwritten by the movq below
 356         pxor mm3, mm3           ; cleared, then overwritten by the movq below
 357
 358         mov eax, [ebp+8]        ; get p1
 359         mov ebx, [ebp+12]       ; get p2
 360         mov edx, [ebp+16]       ; get lx
 361         mov ecx, [ebp+20]       ; get fh (number of rows)
 362         movq mm2, [eax+edx]     ; prime mm2 with the p1 row one stride past p1
 363         movq mm3, [eax+edx+1]   ; same row, displaced by 1 byte
 364         jmp nextrowbd22         ; skip the alignment padding
 365 align 32
 366 nextrowbd22:
 367         movq   mm5, [ebx]                       ; load current row of the p2 block
 368                                                                 ; mm2 / mm3 contain the p1 row at [eax+flx]
 369
 370         psadbw mm2, mm5                         ; Compare (x+0,y+2)
 371         paddd  mm1, mm2                         ; add into the low dword of mm1
 372
 373         psadbw mm3, mm5                         ; Compare (x+2,y+2)
 374         pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap dwords: (+2,+2) sum now low
 375         paddd  mm1, mm3
 376
 377         pshufw  mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap back: (0,+2) sum low again
 378
 379         movq mm2, [eax]                         ; Load current row target block into mm2 / mm3
 380         movq mm6, mm2                           ; copy, since psadbw overwrites its destination
 381         movq mm3, [eax+1]                       ; current p1 row displaced by 1 byte
 382         sub        eax, edx                     ; step p1 back one row
 383         sub        ebx, edx                     ; step p2 back one row
 384         prefetcht0 [eax]                        ; prefetch the p1 row used on the next pass
 385         movq mm7, mm3                           ; copy, since psadbw overwrites its destination
 386
 387         psadbw  mm6, mm5                        ; Compare (x+0,y+0)
 388         paddd   mm0, mm6
 389         pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap dwords: (+2,0) sum now low
 390         psadbw  mm7, mm5                        ; Compare (x+2,y+0)
 391         paddd   mm0, mm7
 392         pshufw  mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap back: (0,0) sum low again
 393
 394         sub ecx, 1
 395         jnz nextrowbd22
 396
 397         mov  eax, [ebp+24]      ; get resvec
 398         movq [eax+0], mm0       ; resvec[0] = SAD (x+0,y+0), resvec[1] = SAD (x+2,y+0)
 399         movq [eax+8], mm1       ; resvec[2] = SAD (x+0,y+2), resvec[3] = SAD (x+2,y+2)
 400         pop edx
 401         pop ecx
 402         pop ebx
 403         pop eax
 404         pop ebp
 405
 406         emms                    ; leave MMX state so x87 code can run again
 407         ret
 408
 409
 410
 411
 412