+++ /dev/null
-;
-; bdist2_mmx.s: MMX optimized bidirectional squared distance sum
-;
-; Original believed to be Copyright (C) 2000 Brent Byeler
-;
-; This program is free software; you can redistribute it and/or
-; modify it under the terms of the GNU General Public License
-; as published by the Free Software Foundation; either version 2
-; of the License, or (at your option) any later version.
-;
-; This program is distributed in the hope that it will be useful,
-; but WITHOUT ANY WARRANTY; without even the implied warranty of
-; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-; GNU General Public License for more details.
-;
-; You should have received a copy of the GNU General Public License
-; along with this program; if not, write to the Free Software
-; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
-;
-
-;/*
-; * squared error between a (16*h) block and a bidirectional
-; * prediction
-; *
-; * p2: address of top left pel of block
-; * pf,hxf,hyf: address and half pel flags of forward ref. block
-; * pb,hxb,hyb: address and half pel flags of backward ref. block
-; * h: height of block
-; * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
-; * MMX version
-; */
-
-;int bdist2_mmx(
-;unsigned char *pf, unsigned char *pb, unsigned char *p2,
-;int lx, int hxf, int hyf, int hxb, int hyb, int h)
-;{
-; unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
-; int s;
-
-; Handy macros for readability
-
-; Stack arguments, addressed through the frame pointer. bdist2_mmx starts
-; with "push ebp / mov ebp, esp", so [ebp] holds the saved ebp, [ebp+4] the
-; return address and the first argument sits at [ebp+8].
-%define pf [ebp+8] ; forward reference block address
-%define pb [ebp+12] ; backward reference block address
-%define p2 [ebp+16] ; address of top left pel of the block being compared
-%define lx [ebp+20] ; byte distance between vertically adjacent pels
-%define hxf [ebp+24] ; forward horizontal half pel flag
-%define hyf [ebp+28] ; forward vertical half pel flag
-%define hxb [ebp+32] ; backward horizontal half pel flag
-%define hyb [ebp+36] ; backward vertical half pel flag
-%define h [ebp+40] ; height of block (rows)
-
-
-; Local pointer variables, addressed relative to esp. These aliases are only
-; meaningful after the function's "sub esp, 32" and while esp stays unchanged
-; (nothing may be pushed or popped while they are in use).
-%define pfa [esp+4] ; pf + hxf
-%define pfb [esp+8] ; pf + lx*hyf
-%define pfc [esp+12] ; pfb + hxf
-%define pba [esp+16] ; pb + hxb
-%define pbb [esp+20] ; pb + lx*hyb
-%define pbc [esp+24] ; pbb + hxb
-
-SECTION .text
-global bdist2_mmx
-
-;
-; int bdist2_mmx(unsigned char *pf, unsigned char *pb, unsigned char *p2,
-;                int lx, int hxf, int hyf, int hxb, int hyb, int h)
-;
-; Returns in eax the sum of squared differences between a block of h rows
-; of 16 pels at p2 and a bidirectional prediction built from pf and pb.
-; For every pel i of every row:
-;
-;   f = (pf[i] + pfa[i] + pfb[i] + pfc[i] + 2) >> 2
-;   b = (pb[i] + pba[i] + pbb[i] + pbc[i] + 2) >> 2
-;   v = ((f + b + 1) >> 1) - p2[i]
-;   s += v*v
-;
-; where pfa = pf + hxf, pfb = pf + lx*hyf, pfc = pfb + hxf and likewise
-; pba/pbb/pbc for pb with hxb/hyb.
-;
-; In:    arguments on the stack at [ebp+8]..[ebp+40] (see the %defines above);
-;        looks like the 32-bit cdecl layout, the function ends in a plain ret.
-; Out:   eax = s; 0 when h <= 0.
-; Regs:  esi = running sum s, edi = rows left, eax/ebx/ecx/edx = scratch,
-;        mm7 = 0 (for byte->word unpacking), mm6 = rounding constant,
-;        mm0..mm5 = work registers.
-;        ebx, ecx, edx, esi, edi and ebp are saved and restored.
-; Notes: each row reads 16 bytes ([ptr] and [ptr+8]) from every pointer.
-;        The pf, pb and p2 argument slots on the stack are advanced in
-;        place by lx per row. emms is executed before returning.
-;
-align 32
-bdist2_mmx:
- push ebp ; save frame pointer
- mov ebp, esp ; link
- push ebx
- push ecx
- push edx
- push esi
- push edi
-
- ;;
- ;; Make space for local variables on stack
- ;; (pfa..pbc live at [esp+4]..[esp+24] from here on)
- sub esp, 32
-
- mov edx, hxb
- mov eax, hxf
- mov esi, lx
-
- ;; pfa = pf + hxf
- mov ecx, pf
- add ecx, eax
- mov pfa, ecx
- ;; pfb = pf + lx*hyf
- mov ecx, esi
- imul ecx, hyf
- mov ebx, pf
- add ecx, ebx
- mov pfb, ecx
- ;; pfc = pfb + hxf
- add eax, ecx
- mov pfc, eax
- ;; pba = pb + hxb
- mov eax, pb
- add eax, edx
- mov pba, eax
- ;; pbb = pb + lx*hyb
- mov eax, esi
- imul eax, hyb
- mov ecx, pb
- add eax, ecx
- mov pbb, eax
- ;; pbc = pbb + hxb
- add edx, eax
- mov pbc, edx
- xor esi, esi ; esi = s (accumulated sum)
- mov eax, esi ; return value 0 if the loop is skipped
-
- mov edi, h
- test edi, edi ; h <= 0? (jle also catches negative h)
- jle near bdist2exit
-
- pxor mm7, mm7 ; mm7 = 0, used to zero-extend bytes to words
- pxor mm6, mm6
- pcmpeqw mm5, mm5 ; mm5 = all ones (each word = -1)
- psubw mm6, mm5 ; mm6 = 0 - (-1) = 1 in each word
- psllw mm6, 1 ; mm6 = 2 in each word: rounding term for the >>2 averages
-
-bdist2top:
- ;; ---- pels 0..7 of the row ----
- ;; Forward prediction: mm0 (low 4 pels) / mm1 (high 4 pels)
- ;; = (pf + pfa + pfb + pfc + 2) >> 2, computed on 16-bit words
- mov eax, pf
- mov ebx, pfa
- mov ecx, pfb
- mov edx, pfc
- movq mm0, [eax]
- movq mm1, mm0
- punpcklbw mm0, mm7
- punpckhbw mm1, mm7
- movq mm2, [ebx]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- movq mm2, [ecx]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- movq mm2, [edx]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- paddw mm0, mm6 ; + 2
- paddw mm1, mm6
- psrlw mm0, 2 ; >> 2
- psrlw mm1, 2
-
- ;; Backward prediction: mm2 (low) / mm3 (high)
- ;; = (pb + pba + pbb + pbc + 2) >> 2, with mm4/mm5 as temporaries
- mov eax, pb
- mov ebx, pba
- mov ecx, pbb
- mov edx, pbc
- movq mm2, [eax]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- movq mm4, [ebx]
- movq mm5, mm4
- punpcklbw mm4, mm7
- punpckhbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
- movq mm4, [ecx]
- movq mm5, mm4
- punpcklbw mm4, mm7
- punpckhbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
- movq mm4, [edx]
- movq mm5, mm4
- punpcklbw mm4, mm7
- punpckhbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
-
- paddw mm2, mm6 ; + 2
- paddw mm3, mm6
- psrlw mm2, 2 ; >> 2
- psrlw mm3, 2
-
- ;; Combine: mm0/mm1 = (forward + backward + 1) >> 1
- paddw mm0, mm2
- paddw mm1, mm3
- psrlw mm6, 1 ; temporarily turn the rounding constant 2 into 1
- paddw mm0, mm6
- paddw mm1, mm6
- psllw mm6, 1 ; back to 2 for the next averaging step
- psrlw mm0, 1
- psrlw mm1, 1
-
- ;; Load 8 pels of the block being compared and widen to words
- mov eax, p2
- movq mm2, [eax]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
-
- ;; v = prediction - p2; pmaddwd squares each word and adds
- ;; adjacent pairs, giving two dword partial sums per register
- psubw mm0, mm2
- psubw mm1, mm3
- pmaddwd mm0, mm0
- pmaddwd mm1, mm1
- paddd mm0, mm1
-
- ;; s += low dword + high dword of mm0
- movd eax, mm0
- psrlq mm0, 32
- movd ebx, mm0
- add esi, eax
- add esi, ebx
-
- ;; ---- pels 8..15 of the row: same steps at offset +8 ----
- ;; Forward prediction
- mov eax, pf
- mov ebx, pfa
- mov ecx, pfb
- mov edx, pfc
- movq mm0, [eax+8]
- movq mm1, mm0
- punpcklbw mm0, mm7
- punpckhbw mm1, mm7
- movq mm2, [ebx+8]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- movq mm2, [ecx+8]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- movq mm2, [edx+8]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- paddw mm0, mm2
- paddw mm1, mm3
- paddw mm0, mm6 ; + 2
- paddw mm1, mm6
- psrlw mm0, 2 ; >> 2
- psrlw mm1, 2
-
- ;; Backward prediction
- mov eax, pb
- mov ebx, pba
- mov ecx, pbb
- mov edx, pbc
- movq mm2, [eax+8]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
- movq mm4, [ebx+8]
- movq mm5, mm4
- punpcklbw mm4, mm7
- punpckhbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
- movq mm4, [ecx+8]
- movq mm5, mm4
- punpcklbw mm4, mm7
- punpckhbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
- movq mm4, [edx+8]
- movq mm5, mm4
- punpcklbw mm4, mm7
- punpckhbw mm5, mm7
- paddw mm2, mm4
- paddw mm3, mm5
- paddw mm2, mm6 ; + 2
- paddw mm3, mm6
- psrlw mm2, 2 ; >> 2
- psrlw mm3, 2
-
- ;; Combine: (forward + backward + 1) >> 1
- paddw mm0, mm2
- paddw mm1, mm3
- psrlw mm6, 1 ; rounding constant 2 -> 1
- paddW mm0, mm6 ; NOTE(review): mixed-case mnemonic, same operation as the paddw below
- paddw mm1, mm6
- psllw mm6, 1 ; rounding constant back to 2
- psrlw mm0, 1
- psrlw mm1, 1
-
- ;; Block pels 8..15
- mov eax, p2
- movq mm2, [eax+8]
- movq mm3, mm2
- punpcklbw mm2, mm7
- punpckhbw mm3, mm7
-
- ;; Squared differences, summed into two dwords
- psubw mm0, mm2
- psubw mm1, mm3
- pmaddwd mm0, mm0
- pmaddwd mm1, mm1
- paddd mm0, mm1
-
- ;; s += low dword + high dword of mm0
- movd eax, mm0
- psrlq mm0, 32
- movd ebx, mm0
- add esi, eax
- add esi, ebx
-
- ;; Advance every pointer to the next row. p2, pf and pb are the
- ;; argument slots themselves, so they are modified in place.
- mov eax, lx
- add p2, eax
- add pf, eax
- add pfa, eax
- add pfb, eax
- add pfc, eax
- add pb, eax
- add pba, eax
- add pbb, eax
- add pbc, eax
-
- dec edi ; one row done
- jg near bdist2top ; loop while rows remain
- mov eax, esi ; return value = s
-
-bdist2exit:
-
- ;;
- ;; Get rid of local variables
- add esp, 32
-
- ;; Restore (callee saves convention...)
- ;;
- pop edi
- pop esi
- pop edx
- pop ecx
- pop ebx
-
- pop ebp ; restore caller's frame pointer
-
- emms ; empty the MMX state so x87 FPU code can run afterwards
- ret
-