2 ; dist2_mmx.s: MMX-optimized squared distance sum
4 ; Original believed to be Copyright (C) 2000 Brent Byeler
6 ; This program is free software; you can redistribute it and/or
7 ; modify it under the terms of the GNU General Public License
8 ; as published by the Free Software Foundation; either version 2
9 ; of the License, or (at your option) any later version.
11 ; This program is distributed in the hope that it will be useful,
12 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
13 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 ; GNU General Public License for more details.
16 ; You should have received a copy of the GNU General Public License
17 ; along with this program; if not, write to the Free Software
18 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ; total squared difference between two (16*h) blocks
22 ; including optional half pel interpolation of blk1 (hx,hy)
23 ; blk1,blk2: addresses of top left pels of both blocks
24 ; lx: distance (in bytes) of vertically adjacent pels
25 ; hx,hy: flags for horizontal and/or vertical interpolation
26 ; h: height of block (usually 8 or 16)
30 ; int dist2_mmx(unsigned char *blk1, unsigned char *blk2,
31 ; int lx, int hx, int hy, int h)
43 ;; private constants needed
; dist2_mmx body. Arguments are read from the caller's stack through ebp:
;   [ebp+8] = blk1, [ebp+12] = blk2, [ebp+16] = lx, [ebp+20] = hx, [ebp+24] = hy
; hx and hy are tested below, presumably to select one of the interpolation
; variants; the branch instructions themselves are not visible in this view.
56 push ebp ; save caller's frame pointer
64 mov esi, [ebp+16] ; esi = lx
65 mov eax, [ebp+20] ; eax = hx
66 mov edx, [ebp+24] ; edx = hy
70 test edi, edi ; h == 0? (edi presumably holds h; its load is not visible in this view)
73 pxor mm7, mm7 ; mm7 = 0
75 test eax, eax ; hx != 0?
77 test edx, edx ; hy != 0?
120 ;; Accumulate sum in edx... we use mm5
142 mov eax, [ebp+8] ; eax = blk1
143 mov ebx, [ebp+12] ; ebx = blk2 (second pointer argument)
145 pxor mm6, mm6 ; mm6 = 0; per the original note it is not changed anywhere in the loop
162 paddw mm0, mm6 ; adds mm6, which was zeroed before the loop, so this appears to be a no-op
206 ; Accumulate mm0 sum on edx... we'll use mm5 for this and add up at the end
227 mov eax, [ebp+8] ; eax = blk1
228 mov edx, [ebp+12] ; edx = blk2
230 add ebx, esi ; ebx = blk1 + lx (presumably the row below the one at eax)
234 psubw mm6, mm1 ; mm6 = 1 per word, assuming mm6 = 0 and mm1 = -1 here (setup not visible in this view)
296 ;; Accumulate in "s" - we use mm5 for the purpose
305 ;; Originally this moved
306 mov eax, ebx ; eax = ebx, i.e. the previous eax + lx
307 add edx, esi ; edx = edx + lx
308 add ebx, esi ; ebx = ebx + lx
314 mov eax, [ebp+8] ; eax = blk1
315 mov edx, [ebp+12] ; edx = blk2
317 add ebx, esi ; ebx = blk1 + lx
344 ;pxor mm6, mm6 ; mm6 = 0
345 ;pcmpeqw mm5, mm5 ; mm5 = -1
346 ;psubw mm6, mm5 ; mm6 = 1
347 ;paddw mm6, mm6 ; mm6 = 2
349 paddw mm0, mm6 ; add the rounding constant held in mm6 to mm0
350 paddw mm1, mm6 ; add the rounding constant held in mm6 to mm1
393 ;pxor mm6, mm6 ; Zero mm6
394 ;pcmpeqw mm5, mm5 ; mm5 = -1
395 ;psubw mm6, mm5 ; mm6 = 1
396 ;paddw mm6, mm6 ; mm6 = 2
397 ;paddw mm1, mm6 ; round mm1 and mm2
420 ;; Accumulate the result in "s" we use mm6 for the purpose...
428 mov eax, ebx ; ahem ebx = eax at start of loop and wasn't changed...
436 ;; Put the final sum in eax for return...
448 pop ebp ; restore caller's frame pointer
450 emms ; empty the MMX state so x87 FPU code can be used again
454 ; total squared difference between two (8*h) blocks
455 ; blk1,blk2: addresses of top left pels of both blocks
456 ; lx: distance (in bytes) of vertically adjacent pels
457 ; h: height of block (usually 4, or 8)
461 ; int dist2_22_mmx(unsigned char *blk1, unsigned char *blk2,
; dist2_22_mmx body. Arguments are read from the caller's stack through ebp:
;   [ebp+8] = blk1, [ebp+12] = blk2, [ebp+16] = lx, [ebp+20] = h
475 push ebp ; save caller's frame pointer
483 mov esi, [ebp+16] ; esi = lx
484 mov edi, [ebp+20] ; edi = h
487 test edi, edi ; h == 0?
490 pxor mm7, mm7 ; mm7 = 0
492 mov eax, [ebp+8] ; eax = blk1
493 mov ebx, [ebp+12] ; ebx = blk2
522 ; total squared difference between interpolation of two (8*h) blocks and
524 ; blk1,blk2: addresses of top left pels of both blocks
525 ; lx: distance (in bytes) of vertically adjacent pels
526 ; h: height of block (usually 4, or 8)
530 ; int bdist2_22_mmx(unsigned char *blk1f, unsigned char*blk1b,
531 ; unsigned char *blk2,
545 push ebp ; save frame pointer
553 mov esi, [ebp+20] ; lx
554 mov edi, [ebp+24] ; h
557 test edi, edi ; h = 0?
560 pxor mm7, mm7 ; get zeros i mm7
562 mov eax, [ebp+8] ; blk1f
563 mov ebx, [ebp+12] ; blk1b
564 mov ecx, [ebp+16] ; blk2