--- /dev/null
+;
+; bdist2_mmx.s: MMX optimized bidirectional squared distance sum
+;
+; Original believed to be Copyright (C) 2000 Brent Byeler
+;
+; This program is free software; you can redistribute it and/or
+; modify it under the terms of the GNU General Public License
+; as published by the Free Software Foundation; either version 2
+; of the License, or (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+;
+
+;/*
+; * squared error between a (16*h) block and a bidirectional
+; * prediction
+; *
+; * p2: address of top left pel of block
+; * pf,hxf,hyf: address and half pel flags of forward ref. block
+; * pb,hxb,hyb: address and half pel flags of backward ref. block
+; * h: height of block
+; * lx: distance (in bytes) of vertically adjacent pels in p2,pf,pb
+; * MMX version
+; */
+
+;int bdist2_mmx(
+;unsigned char *pf, unsigned char *pb, unsigned char *p2,
+;int lx, int hxf, int hyf, int hxb, int hyb, int h)
+;{
+; unsigned char *pfa,*pfb,*pfc,*pba,*pbb,*pbc;
+; int s;
+
+; Handy macros for readability
+
+; Incoming arguments. They are addressed through ebp, which the function
+; prologue sets up with "push ebp / mov ebp, esp"; the offsets follow the
+; parameter order of the C prototype quoted above (pf first, h last).
+; The row loop advances pf, pb and p2 in place by lx after every row.
+%define pf [ebp+8]
+%define pb [ebp+12]
+%define p2 [ebp+16]
+%define lx [ebp+20]
+%define hxf [ebp+24]
+%define hyf [ebp+28]
+%define hxb [ebp+32]
+%define hyb [ebp+36]
+%define h [ebp+40]
+
+
+; Local pointer variables. They are addressed through esp and are only
+; valid after the prologue has reserved the local area ("sub esp, 32").
+; The function initialises them as:
+;   pfa = pf + hxf      pfb = pf + lx*hyf      pfc = pfb + hxf
+;   pba = pb + hxb      pbb = pb + lx*hyb      pbc = pbb + hxb
+; and advances each of them by lx after every row.
+%define pfa [esp+4]
+%define pfb [esp+8]
+%define pfc [esp+12]
+%define pba [esp+16]
+%define pbb [esp+20]
+%define pbc [esp+24]
+
+; Code section; bdist2_mmx is the only symbol exported here.
+SECTION .text
+global bdist2_mmx
+
+;-----------------------------------------------------------------------
+; Helper macros for bdist2_mmx.
+;
+; Register conventions shared by all of them:
+;   mm7 = 0                 (zero source for byte -> word unpacking)
+;   mm6 = 2 in every word   (rounding constant of the 4-tap average)
+;-----------------------------------------------------------------------
+
+; Size in bytes of the local area reserved below the saved registers.
+; The pfa..pbc locals live at [esp+4]..[esp+24] inside this area.
+%define BD2_FRAME_BYTES 32
+
+; BD2_LOAD16 lo, hi, mem
+;   Load 8 bytes from mem and zero-extend them to 16-bit words:
+;   lo receives bytes 0..3, hi receives bytes 4..7.
+%macro BD2_LOAD16 3
+        movq        %1, %3
+        movq        %2, %1
+        punpcklbw   %1, mm7
+        punpckhbw   %2, mm7
+%endmacro
+
+; BD2_AVG4 lo, hi, tlo, thi, disp
+;   For the 8 pels located disp bytes past the four pointers held in
+;   eax, ebx, ecx and edx, compute per pel
+;       (A + B + C + D + 2) >> 2
+;   as words in lo:hi.  tlo:thi are scratch registers.
+%macro BD2_AVG4 5
+        BD2_LOAD16  %1, %2, [eax+%5]
+        BD2_LOAD16  %3, %4, [ebx+%5]
+        paddw       %1, %3
+        paddw       %2, %4
+        BD2_LOAD16  %3, %4, [ecx+%5]
+        paddw       %1, %3
+        paddw       %2, %4
+        BD2_LOAD16  %3, %4, [edx+%5]
+        paddw       %1, %3
+        paddw       %2, %4
+        paddw       %1, mm6             ; + 2 (rounding)
+        paddw       %2, mm6
+        psrlw       %1, 2               ; / 4
+        psrlw       %2, 2
+%endmacro
+
+; BD2_COLS8 disp
+;   Process 8 horizontally adjacent pels of the current row, starting
+;   disp bytes into the row, and add their squared errors to esi:
+;       fwd = (pf + pfa + pfb + pfc + 2) >> 2
+;       bwd = (pb + pba + pbb + pbc + 2) >> 2
+;       v   = ((fwd + bwd + 1) >> 1) - p2
+;       esi += v*v
+;   Uses eax, ebx, ecx, edx and mm0..mm5 as scratch; mm6 and mm7 hold
+;   the same values on exit as on entry.
+%macro BD2_COLS8 1
+        mov         eax, pf
+        mov         ebx, pfa
+        mov         ecx, pfb
+        mov         edx, pfc
+        BD2_AVG4    mm0, mm1, mm2, mm3, %1      ; mm0:mm1 = forward prediction
+
+        mov         eax, pb
+        mov         ebx, pba
+        mov         ecx, pbb
+        mov         edx, pbc
+        BD2_AVG4    mm2, mm3, mm4, mm5, %1      ; mm2:mm3 = backward prediction
+
+        paddw       mm0, mm2                    ; fwd + bwd
+        paddw       mm1, mm3
+        psrlw       mm6, 1                      ; borrow mm6 as 1 in every word
+        paddw       mm0, mm6                    ; + 1 (rounding)
+        paddw       mm1, mm6
+        psllw       mm6, 1                      ; mm6 back to 2 in every word
+        psrlw       mm0, 1                      ; / 2
+        psrlw       mm1, 1
+
+        mov         eax, p2
+        BD2_LOAD16  mm2, mm3, [eax+%1]          ; mm2:mm3 = pels of the block
+
+        psubw       mm0, mm2                    ; signed error per pel
+        psubw       mm1, mm3
+        pmaddwd     mm0, mm0                    ; square and add adjacent pairs
+        pmaddwd     mm1, mm1
+        paddd       mm0, mm1                    ; two dword partial sums
+
+        movd        eax, mm0                    ; fold both dwords into esi
+        psrlq       mm0, 32
+        movd        ebx, mm0
+        add         esi, eax
+        add         esi, ebx
+%endmacro
+
+;-----------------------------------------------------------------------
+; int bdist2_mmx(unsigned char *pf, unsigned char *pb, unsigned char *p2,
+;                int lx, int hxf, int hyf, int hxb, int hyb, int h)
+;
+; Sum of squared errors between the 16-pel-wide, h-row block at p2 and
+; the bidirectional prediction built from the forward block at pf and
+; the backward block at pb (see BD2_COLS8 for the per-pel formula).
+;
+; In:    all arguments on the stack (32-bit code, read through ebp)
+; Out:   eax = accumulated sum; 0 when h <= 0
+; Saved: ebx, ecx, edx, esi, edi, ebp are preserved
+; Clobb: flags, mm0..mm7 (emms is executed before returning)
+; Note:  the pf, pb and p2 argument slots are used as running row
+;        pointers and are modified in place.
+;-----------------------------------------------------------------------
+align 32
+bdist2_mmx:
+        push        ebp                         ; save caller's frame pointer
+        mov         ebp, esp                    ; arguments now at [ebp+8]...
+        push        ebx
+        push        ecx
+        push        edx
+        push        esi
+        push        edi
+
+        sub         esp, BD2_FRAME_BYTES        ; room for pfa..pbc
+
+        mov         esi, lx
+
+        ;; Forward reference: the three extra taps of the 4-tap average.
+        mov         eax, pf
+        mov         ecx, hxf
+        lea         edx, [eax+ecx]
+        mov         pfa, edx                    ; pfa = pf + hxf
+        mov         edx, esi
+        imul        edx, hyf
+        add         edx, eax
+        mov         pfb, edx                    ; pfb = pf + lx*hyf
+        add         edx, ecx
+        mov         pfc, edx                    ; pfc = pfb + hxf
+
+        ;; Backward reference, same construction.
+        mov         eax, pb
+        mov         ecx, hxb
+        lea         edx, [eax+ecx]
+        mov         pba, edx                    ; pba = pb + hxb
+        mov         edx, esi
+        imul        edx, hyb
+        add         edx, eax
+        mov         pbb, edx                    ; pbb = pb + lx*hyb
+        add         edx, ecx
+        mov         pbc, edx                    ; pbc = pbb + hxb
+
+        xor         esi, esi                    ; esi = s (accumulated sum)
+
+        mov         edi, h                      ; edi = rows remaining
+        test        edi, edi
+        jle         near .done                  ; h <= 0: return 0
+
+        pxor        mm7, mm7                    ; mm7 = 0
+        pcmpeqw     mm6, mm6                    ; mm6 = 0xFFFF in every word
+        psrlw       mm6, 15                     ; mm6 = 1 in every word
+        psllw       mm6, 1                      ; mm6 = 2 in every word
+
+.row_loop:
+        BD2_COLS8   0                           ; pels 0..7 of this row
+        BD2_COLS8   8                           ; pels 8..15 of this row
+
+        ;; Step every row pointer down by one line.
+        mov         eax, lx
+        add         p2, eax
+        add         pf, eax
+        add         pfa, eax
+        add         pfb, eax
+        add         pfc, eax
+        add         pb, eax
+        add         pba, eax
+        add         pbb, eax
+        add         pbc, eax
+
+        dec         edi
+        jg          near .row_loop
+
+.done:
+        mov         eax, esi                    ; return value
+
+        add         esp, BD2_FRAME_BYTES        ; drop local variables
+
+        ;; Restore the registers saved in the prologue, in reverse order.
+        pop         edi
+        pop         esi
+        pop         edx
+        pop         ecx
+        pop         ebx
+
+        pop         ebp                         ; restore caller's frame pointer
+
+        emms                                    ; leave MMX state clean for FPU code
+        ret
+