2 ; dist2_mmx.s: MMX-optimized squared distance sum
4 ; Original believed to be Copyright (C) 2000 Brent Byeler
6 ; This program is free software; you can redistribute it and/or
7 ; modify it under the terms of the GNU General Public License
8 ; as published by the Free Software Foundation; either version 2
9 ; of the License, or (at your option) any later version.
11 ; This program is distributed in the hope that it will be useful,
12 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ; GNU General Public License for more details.
16 ; You should have received a copy of the GNU General Public License
17 ; along with this program; if not, write to the Free Software
18 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ; total squared difference between two (16*h) blocks
22 ; including optional half pel interpolation of blk1 (hx,hy)
23 ; blk1,blk2: addresses of top left pels of both blocks
24 ; lx: distance (in bytes) of vertically adjacent pels
25 ; hx,hy: flags for horizontal and/or vertical interpolation
26 ; h: height of block (usually 8 or 16)
30 ; int dist2_mmx(unsigned char *blk1, unsigned char *blk2,
31 ; int lx, int hx, int hy, int h)
43 ;; private constants needed
57 push ebp ; save caller's frame pointer
65 mov esi, [ebp+16] ; esi = lx (byte distance between vertically adjacent pels)
66 mov eax, [ebp+20] ; eax = hx (horizontal interpolation flag)
67 mov edx, [ebp+24] ; edx = hy (vertical interpolation flag)
71 test edi, edi ; h == 0? (edi is presumably loaded with h on a line not shown in this excerpt)
74 pxor mm7, mm7 ; mm7 = 0
76 test eax, eax ; hx != 0?
78 test edx, edx ; hy != 0?
121 ;; Accumulate the sum in edx... mm5 is used for this
143 mov eax, [ebp+8] ; eax = blk1
144 mov ebx, [ebp+12] ; ebx = blk2 (second argument)
146 pxor mm6, mm6 ; mm6 = 0; original note: it isn't changed anyplace in the loop
163 paddw mm0, mm6 ; adds mm6, which is 0 per the note above, so this looks like a no-op - NOTE(review): confirm against the full loop
207 ; Accumulate the mm0 sum on edx... mm5 is used for this and added up at the end
228 mov eax, [ebp+8] ; eax = blk1
229 mov edx, [ebp+12] ; edx = blk2
231 add ebx, esi ; ebx = blk1 + lx (assumes ebx was copied from eax on a line not shown)
235 psubw mm6, mm1 ; mm6 = 1 per word (assumes mm6 = 0 and mm1 = all ones at this point - TODO confirm)
297 ;; Accumulate in "s" - we use mm5 for the purpose
306 ;; Originally this moved
307 mov eax, ebx ; eax = ebx, i.e. the old eax + lx per the setup above
308 add edx, esi ; edx = edx + lx (blk2 pointer moves down one row)
309 add ebx, esi ; ebx = ebx + lx
315 mov eax, [ebp+8] ; eax = blk1
316 mov edx, [ebp+12] ; edx = blk2
318 add ebx, esi ; ebx = blk1 + lx (assumes ebx was copied from eax on a line not shown)
345 ;pxor mm6, mm6 ; mm6 = 0
346 ;pcmpeqw mm5, mm5 ; mm5 = -1
347 ;psubw mm6, mm5 ; mm6 = 1
348 ;paddw mm6, mm6 ; mm6 = 2
350 paddw mm0, mm6 ; round mm0: add the rounding constant held in mm6 (2 in the disabled sequence above; presumably set up elsewhere now)
351 paddw mm1, mm6 ; round mm1 with the same constant
394 ;pxor mm6, mm6 ; Zero mm6
395 ;pcmpeqw mm5, mm5 ; mm5 = -1
396 ;psubw mm6, mm5 ; mm6 = 1
397 ;paddw mm6, mm6 ; mm6 = 2
398 ;paddw mm1, mm6 ; round mm1 and mm2
421 ;; Accumulate the result in "s" we use mm6 for the purpose... NOTE(review): the other paths name mm5 as the accumulator; confirm which register is meant here
429 mov eax, ebx ; original note: "ahem ebx = eax at start of loop and wasn't changed..." - NOTE(review): verify against the full loop
437 ;; Put the final sum in eax for return...
449 pop ebp ; restore caller's frame pointer
451 emms ; empty the x87 tag word so FPU code can follow the MMX code
455 ; total squared difference between two (8*h) blocks
456 ; blk1,blk2: addresses of top left pels of both blocks
457 ; lx: distance (in bytes) of vertically adjacent pels
458 ; h: height of block (usually 4, or 8)
462 ; int dist2_22_mmx(unsigned char *blk1, unsigned char *blk2,
476 push ebp ; save caller's frame pointer
484 mov esi, [ebp+16] ; esi = lx (byte distance between vertically adjacent pels)
485 mov edi, [ebp+20] ; edi = h (block height)
488 test edi, edi ; h == 0?
491 pxor mm7, mm7 ; mm7 = 0
493 mov eax, [ebp+8] ; eax = blk1
494 mov ebx, [ebp+12] ; ebx = blk2
523 ; total squared difference between interpolation of two (8*h) blocks and
525 ; blk1,blk2: addresses of top left pels of both blocks
526 ; lx: distance (in bytes) of vertically adjacent pels
527 ; h: height of block (usually 4, or 8)
531 ; int bdist2_22_mmx(unsigned char *blk1f, unsigned char*blk1b,
532 ; unsigned char *blk2,
546 push ebp ; save caller's frame pointer
554 mov esi, [ebp+20] ; esi = lx (byte distance between vertically adjacent pels)
555 mov edi, [ebp+24] ; edi = h (block height)
558 test edi, edi ; h == 0?
561 pxor mm7, mm7 ; mm7 = 0
563 mov eax, [ebp+8] ; eax = blk1f
564 mov ebx, [ebp+12] ; ebx = blk1b
565 mov ecx, [ebp+16] ; ecx = blk2