--- /dev/null
+;
+; dist2_mmx.s: mmX optimized squared distance sum
+;
+; Original believed to be Copyright (C) 2000 Brent Byeler
+;
+; This program is free software; you can reaxstribute it and/or
+; modify it under the terms of the GNU General Public License
+; as published by the Free Software Foundation; either version 2
+; of the License, or (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+;
+
+; total squared difference between two (16*h) blocks
+; including optional half pel interpolation of [ebp+8] ; blk1 (hx,hy)
+; blk1,blk2: addresses of top left pels of both blocks
+; lx: distance (in bytes) of vertically adjacent pels
+; hx,hy: flags for horizontal and/or vertical interpolation
+; h: height of block (usually 8 or 16)
+; mmX version
+
+global dist2_mmx
+; int dist2_mmx(unsigned char *blk1, unsigned char *blk2,
+; int lx, int hx, int hy, int h)
+
+; mm7 = 0
+
+; eax = pblk1
+; ebx = pblk2
+; ecx = temp
+; edx = distance_sum
+; edi = h
+; esi = lx
+
+ ;;
+ ;; private constants needed
+ ;;
+
+SECTION .data
+align 16
+twos:
+ dw 2
+ dw 2
+ dw 2
+ dw 2
+
+align 32
+dist2_mmx:
+ push ebp ; save frame pointer
+ mov ebp, esp ; link
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+
+ mov esi, [ebp+16] ; lx
+ mov eax, [ebp+20] ; hx
+ mov edx, [ebp+24] ; hy
+ mov edi, [ebp+28] ; h
+
+ pxor mm5, mm5 ; sum
+ test edi, edi ; h = 0?
+ jle near d2exit
+
+ pxor mm7, mm7 ; get zeros i mm7
+
+ test eax, eax ; hx != 0?
+ jne near d2is10
+ test edx, edx ; hy != 0?
+ jne near d2is10
+
+ mov eax, [ebp+8]
+ mov ebx, [ebp+12]
+ jmp d2top00
+
+align 32
+d2top00:
+ movq mm0, [eax]
+ movq mm1, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+
+ movq mm2, [ebx]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ psubw mm0, mm2
+ psubw mm1, mm3
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ paddd mm0, mm1
+
+ movq mm1, [eax+8]
+ movq mm2, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm2, mm7
+
+ movq mm3, [ebx+8]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ paddd mm1, mm2
+
+ paddd mm0, mm1
+
+ ;; Accumulate sum in edx... we use mm5
+ ;movd ecx, mm0
+ ;add edx, ecx
+ ;psrlq mm0, 32
+ ;movd ecx, mm0
+ ;add edx, ecx
+ paddd mm5, mm0
+
+ add eax, esi
+ add ebx, esi
+ dec edi
+ jg d2top00
+ jmp d2exit
+
+
+d2is10:
+ test eax, eax
+ je near d2is01
+ test edx, edx
+ jne near d2is01
+
+
+ mov eax, [ebp+8] ; blk1
+ mov ebx, [ebp+12] ; blk1
+
+ pxor mm6, mm6 ; mm6 = 0 and isn't changed anyplace in the loop..
+ pcmpeqw mm1, mm1
+ psubw mm6, mm1
+ jmp d2top10
+
+align 32
+d2top10:
+ movq mm0, [eax]
+ movq mm1, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ movq mm2, [eax+1]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ paddw mm0, mm6 ; here we add mm6 = 0.... weird...
+ paddw mm1, mm6
+ psrlw mm0, 1
+ psrlw mm1, 1
+
+ movq mm2, [ebx]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ psubw mm0, mm2
+ psubw mm1, mm3
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ paddd mm0, mm1
+
+ movq mm1, [eax+8]
+ movq mm2, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm2, mm7
+ movq mm3, [eax+9]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+ paddw mm1, mm3
+ paddw mm2, mm4
+ paddw mm1, mm6
+ paddw mm2, mm6
+ psrlw mm1, 1
+ psrlw mm2, 1
+
+ movq mm3, [ebx+8]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ paddd mm1, mm2
+
+
+ paddd mm0, mm1
+ ; Accumulate mm0 sum on edx... we'll use mm5 for this and add up at the end
+ ; movd ecx, mm0
+ ; add edx, ecx
+ ; psrlq mm0, 32
+ ; movd ecx, mm0
+ ; add edx, ecx
+ paddd mm5, mm0
+ add eax, esi
+ add ebx, esi
+ dec edi
+ jg near d2top10
+
+
+ jmp d2exit
+
+d2is01:
+ test eax, eax
+ jne near d2is11
+ test edx, edx
+ je near d2is11
+
+ mov eax, [ebp+8] ; blk1
+ mov edx, [ebp+12] ; blk2
+ mov ebx, eax
+ add ebx, esi ; blk1 + lx
+
+ pxor mm6, mm6
+ pcmpeqw mm1, mm1
+ psubw mm6, mm1 ; mm6 = 1
+ jmp d2top01
+
+align 32
+d2top01:
+ movq mm0, [eax]
+ movq mm1, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ movq mm2, [ebx]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ paddw mm0, mm6
+ paddw mm1, mm6
+ psrlw mm0, 1
+ psrlw mm1, 1
+
+ movq mm2, [edx]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ psubw mm0, mm2
+ psubw mm1, mm3
+
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ paddd mm0, mm1
+
+ movq mm1, [eax+8]
+ movq mm2, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm2, mm7
+
+ movq mm3, [ebx+8]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+
+ paddw mm1, mm3
+ paddw mm2, mm4
+ paddw mm1, mm6
+ paddw mm2, mm6
+ psrlw mm1, 1
+ psrlw mm2, 1
+
+ movq mm3, [edx+8]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ paddd mm0, mm1
+ paddd mm0, mm2
+
+ ;; Accumulate in "s" - we use mm5 for the purpose
+ ;;
+ ;movd ecx, mm0
+ ;add s, ecx
+ ;psrlq mm0, 32
+ ;movd ecx, mm0
+ ;add s, ecx
+ paddd mm5, mm0
+
+ ;; Originally this moved
+ mov eax, ebx ; eax = eax + lx
+ add edx, esi ; edx = edx + lx
+ add ebx, esi ; ebx = ebx + lx
+ dec edi
+ jg near d2top01
+ jmp d2exit
+
+d2is11:
+ mov eax, [ebp+8] ; blk1
+ mov edx, [ebp+12] ; blk2
+ mov ebx, eax ; blk1
+ add ebx, esi ; ebx = blk1 + lx
+ jmp d2top11
+
+align 32
+d2top11:
+ movq mm0, [eax]
+ movq mm1, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ movq mm2, [eax+1]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+ paddw mm0, mm2
+ paddw mm1, mm3
+ movq mm2, [ebx]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+ movq mm4, [ebx+1]
+ movq mm6, mm4
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
+ paddw mm2, mm4
+ paddw mm3, mm6
+ paddw mm0, mm2
+ paddw mm1, mm3
+ ;pxor mm6, mm6 ; mm6 = 0
+ ;pcmpeqw mm5, mm5 ; mm5 = -1
+ ;psubw mm6, mm5 ; mm6 = 1
+ ;paddw mm6, mm6 ; mm6 = 2
+ movq mm6, [twos]
+ paddw mm0, mm6 ; round mm0
+ paddw mm1, mm6 ; round mm1
+ psrlw mm0, 2
+ psrlw mm1, 2
+
+ movq mm2, [edx]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ psubw mm0, mm2
+ psubw mm1, mm3
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ paddd mm0, mm1
+
+ movq mm1, [eax+8]
+ movq mm2, mm1
+ punpcklbw mm1, mm7
+ punpckhbw mm2, mm7
+
+ movq mm3, [eax+9]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+
+ paddw mm1, mm3
+ paddw mm2, mm4
+
+ movq mm3, [ebx+8]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+ paddw mm1, mm3
+ paddw mm2, mm4
+
+ movq mm3, [ebx+9]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+
+ paddw mm1, mm3
+ paddw mm2, mm4
+
+ ;pxor mm6, mm6 ; Zero mm6
+ ;pcmpeqw mm5, mm5 ; mm5 = -1
+ ;psubw mm6, mm5 ; mm6 = 1
+ ;paddw mm6, mm6 ; mm6 = 2
+ ;paddw mm1, mm6 ; round mm1 and mm2
+ ;paddw mm2, mm6
+ movq mm6, [twos]
+ paddw mm1, mm6
+ paddw mm2, mm6
+
+ psrlw mm1, 2
+ psrlw mm2, 2
+
+ movq mm3, [edx+8]
+ movq mm4, mm3
+ punpcklbw mm3, mm7
+ punpckhbw mm4, mm7
+
+ psubw mm1, mm3
+ psubw mm2, mm4
+ pmaddwd mm1, mm1
+ pmaddwd mm2, mm2
+ paddd mm1, mm2
+
+ paddd mm0, mm1
+
+ ;;
+ ;; Accumulate the result in "s" we use mm6 for the purpose...
+ ;movd ecx, mm0
+ ; add s, ecx
+ ;psrlq mm0, 32
+ ;movd ecx, mm0
+ ;add s, ecx
+ paddd mm5, mm0
+
+ mov eax, ebx ; ahem ebx = eax at start of loop and wasn't changed...
+ add ebx, esi
+ add edx, esi
+ dec edi
+ jg near d2top11
+
+
+d2exit:
+ ;; Put the final sum in eax for return...
+ movd eax, mm5
+ psrlq mm5, 32
+ movd ecx, mm5
+ add eax, ecx
+
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+
+ pop ebp ; restore stack pointer
+
+ emms ; clear mmx registers
+ ret
+
+
+; total squared difference between two (8*h) blocks
+; blk1,blk2: addresses of top left pels of both blocks
+; lx: distance (in bytes) of vertically adjacent pels
+; h: height of block (usually 4, or 8)
+; mmX version
+
+global dist2_22_mmx
+; int dist2_22_mmx(unsigned char *blk1, unsigned char *blk2,
+; int lx, int h)
+
+; mm7 = 0
+
+; eax = pblk1
+; ebx = pblk2
+; ecx = temp
+; edx = distance_sum
+; edi = h
+; esi = lx
+
+align 32
+dist2_22_mmx:
+ push ebp ; save frame pointer
+ mov ebp, esp ; link
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+
+ mov esi, [ebp+16] ; lx
+ mov edi, [ebp+20] ; h
+
+ pxor mm5, mm5 ; sum
+ test edi, edi ; h = 0?
+ jle near d2exit
+
+ pxor mm7, mm7 ; get zeros i mm7
+
+ mov eax, [ebp+8] ; blk1
+ mov ebx, [ebp+12] ; blk2
+ jmp d2top22
+
+align 32
+d2top22:
+ movq mm0, [eax]
+ movq mm1, mm0
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+
+ movq mm2, [ebx]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ psubw mm0, mm2
+ psubw mm1, mm3
+ pmaddwd mm0, mm0
+ pmaddwd mm1, mm1
+ paddd mm5, mm0
+ paddd mm5, mm1
+
+ add eax, esi
+ add ebx, esi
+ dec edi
+ jg d2top22
+ jmp d2exit
+
+
+; total squared difference between interpolation of two (8*h) blocks and
+; another 8*h block
+; blk1,blk2: addresses of top left pels of both blocks
+; lx: distance (in bytes) of vertically adjacent pels
+; h: height of block (usually 4, or 8)
+; mmX version
+
+global bdist2_22_mmx
+; int bdist2_22_mmx(unsigned char *blk1f, unsigned char*blk1b,
+; unsigned char *blk2,
+; int lx, int h)
+
+; mm7 = 0
+
+; eax = pblk1f
+; ebx = pblk2
+; ecx = pblk1b
+; edx = distance_sum
+; edi = h
+; esi = lx
+
+align 32
+bdist2_22_mmx:
+ push ebp ; save frame pointer
+ mov ebp, esp ; link
+ push ebx
+ push ecx
+ push edx
+ push esi
+ push edi
+
+ mov esi, [ebp+20] ; lx
+ mov edi, [ebp+24] ; h
+
+ pxor mm5, mm5 ; sum
+ test edi, edi ; h = 0?
+ jle near d2exit
+
+ pxor mm7, mm7 ; get zeros i mm7
+
+ mov eax, [ebp+8] ; blk1f
+ mov ebx, [ebp+12] ; blk1b
+ mov ecx, [ebp+16] ; blk2
+ jmp bd2top22
+
+align 32
+bd2top22:
+ movq mm0, [eax]
+ movq mm1, mm0
+ movq mm4, [ebx]
+ movq mm6, mm4
+ punpcklbw mm0, mm7
+ punpckhbw mm1, mm7
+ punpcklbw mm4, mm7
+ punpckhbw mm6, mm7
+
+ movq mm2, [ecx]
+ movq mm3, mm2
+ punpcklbw mm2, mm7
+ punpckhbw mm3, mm7
+
+ paddw mm0, mm4
+ psrlw mm0, 1
+ psubw mm0, mm2
+ pmaddwd mm0, mm0
+ paddw mm1, mm6
+ psrlw mm1, 1
+ psubw mm1, mm3
+ pmaddwd mm1, mm1
+ paddd mm5, mm0
+ paddd mm5, mm1
+
+ add eax, esi
+ add ebx, esi
+ add ecx, esi
+ dec edi
+ jg bd2top22
+ jmp d2exit
+
\ No newline at end of file