--- /dev/null
+;;;
+;;; mblock_sad_mmxe.s:
+;;;
+;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
+;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
+;
+; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
+; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+; This program is free software; you can redistribute it and/or
+; modify it under the terms of the GNU General Public License
+; as published by the Free Software Foundation; either version 2
+; of the License, or (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+;
+;
+;
+
+
+global dist1_00_mmx
+
+; int dist1_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h, int distlim);
+; N.b. distlim is *ignored* as testing for it is more expensive than the
+; occasional saving by aborting the computionation early...
+; esi = p1 (init: blk1)
+; edi = p2 (init: blk2)
+; ebx = distlim
+; ecx = rowsleft (init: h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = temp
+; mm2 = temp
+; mm3 = temp
+; mm4 = temp
+; mm5 = temp
+; mm6 = 0
+; mm7 = temp
+
+
+align 32
+dist1_00_mmx:
+ push ebp ; save frame pointer
+ mov ebp, esp
+
+ push ebx ; Saves registers (called saves convention in
+ push ecx ; x86 GCC it seems)
+ push edx ;
+ push esi
+ push edi
+
+ pxor mm0, mm0 ; zero acculumators
+ pxor mm6, mm6
+ mov esi, [ebp+8] ; get p1
+ mov edi, [ebp+12] ; get p2
+ mov edx, [ebp+16] ; get lx
+ mov ecx, [ebp+20] ; get rowsleft
+ ;mov ebx, [ebp+24] ; distlim
+ jmp nextrowmm00
+align 32
+nextrowmm00:
+ movq mm4, [esi] ; load first 8 bytes of p1 row
+ movq mm5, [edi] ; load first 8 bytes of p2 row
+
+ movq mm7, mm4 ; mm5 = abs(mm4-mm5)
+ psubusb mm7, mm5
+ psubusb mm5, mm4
+ paddb mm5, mm7
+
+ ;; Add the abs(mm4-mm5) bytes to the accumulators
+ movq mm2, [esi+8] ; load second 8 bytes of p1 row (interleaved)
+ movq mm7,mm5 ; mm7 := [i : B0..3, mm1]W
+ punpcklbw mm7,mm6
+ movq mm3, [edi+8] ; load second 8 bytes of p2 row (interleaved)
+ paddw mm0, mm7
+ punpckhbw mm5,mm6
+ paddw mm0, mm5
+
+ ;; This is logically where the mm2, mm3 loads would go...
+
+ movq mm7, mm2 ; mm3 = abs(mm2-mm3)
+ psubusb mm7, mm3
+ psubusb mm3, mm2
+ paddb mm3, mm7
+
+ ;; Add the abs(mm4-mm5) bytes to the accumulators
+ movq mm7,mm3 ; mm7 := [i : B0..3, mm1]W
+ punpcklbw mm7,mm6
+ punpckhbw mm3,mm6
+ paddw mm0, mm7
+
+ add esi, edx ; update pointer to next row
+ add edi, edx ; ditto
+
+ paddw mm0, mm3
+
+
+
+ sub ecx,1
+ jnz near nextrowmm00
+
+returnmm00:
+
+ ;; Sum the Accumulators
+ movq mm5, mm0 ; mm5 := [W0+W2,W1+W3, mm0
+ psrlq mm5, 32
+ movq mm4, mm0
+ paddw mm4, mm5
+
+ movq mm7, mm4 ; mm6 := [W0+W2+W1+W3, mm0]
+ psrlq mm7, 16
+ paddw mm4, mm7
+ movd eax, mm4 ; store return value
+ and eax, 0xffff
+
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+
+ pop ebp
+
+ emms ; clear mmx registers
+ ret
+;
+;;; dist1_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
+;;; We're reduce to seven bits as otherwise we also have to mess
+;;; horribly with carries and signed only comparisons make the code
+;;; simply enormous (and probably barely faster than a simple loop).
+;;; Since signals with a bona-fide 8bit res will be rare we simply
+;;; take the precision hit...
+;;; Actually we don't worry about carries from the low-order bits
+;;; either so 1/4 of the time we'll be 1 too low...
+;;;
+; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+; This program is free software; you can redistribute it and/or
+; modify it under the terms of the GNU General Public License
+; as published by the Free Software Foundation; either version 2
+; of the License, or (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+;
+;
+;
+
+
+global dist1_01_mmx
+
+; int dist1_01_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
+
+; esi = p1 (init: blk1)
+; edi = p2 (init: blk2)
+; ecx = rowsleft (init: h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = bytes p2
+; mm2 = bytes p1
+; mm3 = bytes p1+1
+; mm4 = temp 4 bytes in words interpolating p1, p1+1
+; mm5 = temp 4 bytes in words from p2
+; mm6 = temp comparison bit mask p1,p2
+; mm7 = temp comparison bit mask p2,p1
+
+
+align 32
+dist1_01_mmx:
+ push ebp ; save stack pointer
+ mov ebp, esp ; so that we can do this
+
+ push ebx ; Saves registers (called saves convention in
+ push ecx ; x86 GCC it seems)
+ push edx ;
+ push esi
+ push edi
+
+ pxor mm0, mm0 ; zero acculumators
+
+ mov esi, [ebp+8] ; get p1
+ mov edi, [ebp+12] ; get p2
+ mov edx, [ebp+16] ; get lx
+ mov ecx, [ebp+20] ; rowsleft := h
+ jmp nextrowmm01 ; snap to it
+align 32
+nextrowmm01:
+
+ ;;
+ ;; First 8 bytes of row
+ ;;
+
+ ;; First 4 bytes of 8
+
+ movq mm4, [esi] ; mm4 := first 4 bytes p1
+ pxor mm7, mm7
+ movq mm2, mm4 ; mm2 records all 8 bytes
+ punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
+
+ movq mm6, [esi+1] ; mm6 := first 4 bytes p1+1
+ movq mm3, mm6 ; mm3 records all 8 bytes
+ punpcklbw mm6, mm7
+ paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words
+ psrlw mm4, 1
+
+ movq mm5, [edi] ; mm5:=first 4 bytes of p2 in words
+ movq mm1, mm5
+ punpcklbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+ ;; Second 4 bytes of 8
+
+ movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
+ pxor mm7, mm7
+ punpckhbw mm4, mm7
+ movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words
+ punpckhbw mm6, mm7
+
+ paddw mm4, mm6 ; mm4 := First 4 Interpolated bytes in words
+ psrlw mm4, 1
+
+ movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
+ punpckhbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+
+ ;;
+ ;; Second 8 bytes of row
+ ;;
+ ;; First 4 bytes of 8
+
+ movq mm4, [esi+8] ; mm4 := first 4 bytes p1+8
+ pxor mm7, mm7
+ movq mm2, mm4 ; mm2 records all 8 bytes
+ punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
+
+ movq mm6, [esi+9] ; mm6 := first 4 bytes p1+9
+ movq mm3, mm6 ; mm3 records all 8 bytes
+ punpcklbw mm6, mm7
+ paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words
+ psrlw mm4, 1
+
+ movq mm5, [edi+8] ; mm5:=first 4 bytes of p2+8 in words
+ movq mm1, mm5
+ punpcklbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+ ;; Second 4 bytes of 8
+
+ movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
+ pxor mm7, mm7
+ punpckhbw mm4, mm7
+ movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words
+ punpckhbw mm6, mm7
+
+ paddw mm4, mm6 ; mm4 := First 4 Interpolated bytes in words
+ psrlw mm4, 1
+
+ movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
+ punpckhbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+
+ ;;
+ ;; Loop termination condition... and stepping
+ ;;
+
+ add esi, edx ; update pointer to next row
+ add edi, edx ; ditto
+
+ sub ecx,1
+ test ecx, ecx ; check rowsleft
+ jnz near nextrowmm01
+
+
+ ;; Sum the Accumulators
+ movq mm4, mm0
+ psrlq mm4, 32
+ paddw mm0, mm4
+ movq mm6, mm0
+ psrlq mm6, 16
+ paddw mm0, mm6
+ movd eax, mm0 ; store return value
+ and eax, 0xffff
+
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+
+ pop ebp ; restore stack pointer
+
+ emms ; clear mmx registers
+ ret ; we now return you to your regular programming
+;
+;;; dist1_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
+;;; We're reduce to seven bits as otherwise we also have to mess
+;;; horribly with carries and signed only comparisons make the code
+;;; simply enormous (and probably barely faster than a simple loop).
+;;; Since signals with a bona-fide 8bit res will be rare we simply
+;;; take the precision hit...
+;;; Actually we don't worry about carries from the low-order bits
+;;; either so 1/4 of the time we'll be 1 too low...
+;;;
+; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+; This program is free software; you can redistribute it and/or
+; modify it under the terms of the GNU General Public License
+; as published by the Free Software Foundation; either version 2
+; of the License, or (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+;
+;
+;
+
+
+global dist1_10_mmx
+
+; int dist1_10_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
+
+; esi = p1 (init: blk1)
+; edi = p2 (init: blk2)
+; ebx = p1+lx
+; ecx = rowsleft (init: h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = bytes p2
+; mm2 = bytes p1
+; mm3 = bytes p1+1
+; mm4 = temp 4 bytes in words interpolating p1, p1+1
+; mm5 = temp 4 bytes in words from p2
+; mm6 = temp comparison bit mask p1,p2
+; mm7 = temp comparison bit mask p2,p1
+
+
+align 32
+dist1_10_mmx:
+ push ebp ; save stack pointer
+ mov ebp, esp ; so that we can do this
+
+ push ebx ; Saves registers (called saves convention in
+ push ecx ; x86 GCC it seems)
+ push edx ;
+ push esi
+ push edi
+
+ pxor mm0, mm0 ; zero acculumators
+
+ mov esi, [ebp+8] ; get p1
+ mov edi, [ebp+12] ; get p2
+ mov edx, [ebp+16] ; get lx
+ mov ecx, [ebp+20] ; rowsleft := h
+ mov ebx, esi
+ add ebx, edx
+ jmp nextrowmm10 ; snap to it
+align 32
+nextrowmm10:
+
+ ;;
+ ;; First 8 bytes of row
+ ;;
+
+ ;; First 4 bytes of 8
+
+ movq mm4, [esi] ; mm4 := first 4 bytes p1
+ pxor mm7, mm7
+ movq mm2, mm4 ; mm2 records all 8 bytes
+ punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
+
+ movq mm6, [ebx] ; mm6 := first 4 bytes p1+lx
+ movq mm3, mm6 ; mm3 records all 8 bytes
+ punpcklbw mm6, mm7
+ paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words
+ psrlw mm4, 1
+
+ movq mm5, [edi] ; mm5:=first 4 bytes of p2 in words
+ movq mm1, mm5
+ punpcklbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+ ;; Second 4 bytes of 8
+
+ movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
+ pxor mm7, mm7
+ punpckhbw mm4, mm7
+ movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words
+ punpckhbw mm6, mm7
+
+ paddw mm4, mm6 ; mm4 := First 4 Interpolated bytes in words
+ psrlw mm4, 1
+
+ movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
+ punpckhbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+
+ ;;
+ ;; Second 8 bytes of row
+ ;;
+ ;; First 4 bytes of 8
+
+ movq mm4, [esi+8] ; mm4 := first 4 bytes p1+8
+ pxor mm7, mm7
+ movq mm2, mm4 ; mm2 records all 8 bytes
+ punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
+
+ movq mm6, [ebx+8] ; mm6 := first 4 bytes p1+lx+8
+ movq mm3, mm6 ; mm3 records all 8 bytes
+ punpcklbw mm6, mm7
+ paddw mm4, mm6 ; mm4 := First 4 bytes interpolated in words
+ psrlw mm4, 1
+
+ movq mm5, [edi+8] ; mm5:=first 4 bytes of p2+8 in words
+ movq mm1, mm5
+ punpcklbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+ ;; Second 4 bytes of 8
+
+ movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
+ pxor mm7, mm7
+ punpckhbw mm4, mm7
+ movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words
+ punpckhbw mm6, mm7
+
+ paddw mm4, mm6 ; mm4 := First 4 Interpolated bytes in words
+ psrlw mm4, 1
+
+ movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
+ punpckhbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+
+ ;;
+ ;; Loop termination condition... and stepping
+ ;;
+
+ add esi, edx ; update pointer to next row
+ add edi, edx ; ditto
+ add ebx, edx
+
+ sub ecx,1
+ test ecx, ecx ; check rowsleft
+ jnz near nextrowmm10
+
+ ;; Sum the Accumulators
+ movq mm4, mm0
+ psrlq mm4, 32
+ paddw mm0, mm4
+ movq mm6, mm0
+ psrlq mm6, 16
+ paddw mm0, mm6
+ movd eax, mm0 ; store return value
+ and eax, 0xffff
+
+
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+
+ pop ebp ; restore stack pointer
+
+ emms ; clear mmx registers
+ ret ; we now return you to your regular programming
+;
+;;; dist1_01_mmx.s:
+;;;
+; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
+
+;
+; This program is free software; you can redistribute it and/or
+; modify it under the terms of the GNU General Public License
+; as published by the Free Software Foundation; either version 2
+; of the License, or (at your option) any later version.
+;
+; This program is distributed in the hope that it will be useful,
+; but WITHOUT ANY WARRANTY; without even the implied warranty of
+; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+; GNU General Public License for more details.
+;
+; You should have received a copy of the GNU General Public License
+; along with this program; if not, write to the Free Software
+; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+;
+;
+;
+
+
+global dist1_11_mmx
+
+; int dist1_11_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
+
+; esi = p1 (init: blk1)
+; edi = p2 (init: blk2)
+; ebx = p1+lx
+; ecx = rowsleft (init: h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = bytes p2
+; mm2 = bytes p1
+; mm3 = bytes p1+lx
+; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
+; but I don't think thats going to happen in iA32-land...
+; mm4 = temp 4 bytes in words interpolating p1, p1+1
+; mm5 = temp 4 bytes in words from p2
+; mm6 = temp comparison bit mask p1,p2
+; mm7 = temp comparison bit mask p2,p1
+
+
+align 32
+dist1_11_mmx:
+ push ebp ; save stack pointer
+ mov ebp, esp ; so that we can do this
+
+ push ebx ; Saves registers (called saves convention in
+ push ecx ; x86 GCC it seems)
+ push edx ;
+ push esi
+ push edi
+
+ pxor mm0, mm0 ; zero acculumators
+
+ mov esi, [ebp+8] ; get p1
+ mov edi, [ebp+12] ; get p2
+ mov edx, [ebp+16] ; get lx
+ mov ecx, [ebp+20] ; rowsleft := h
+ mov ebx, esi
+ add ebx, edx
+ jmp nextrowmm11 ; snap to it
+align 32
+nextrowmm11:
+
+ ;;
+ ;; First 8 bytes of row
+ ;;
+
+ ;; First 4 bytes of 8
+
+ movq mm4, [esi] ; mm4 := first 4 bytes p1
+ pxor mm7, mm7
+ movq mm2, mm4 ; mm2 records all 8 bytes
+ punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
+
+ movq mm6, [ebx] ; mm6 := first 4 bytes p1+lx
+ movq mm3, mm6 ; mm3 records all 8 bytes
+ punpcklbw mm6, mm7
+ paddw mm4, mm6
+
+
+ movq mm5, [esi+1] ; mm5 := first 4 bytes p1+1
+ punpcklbw mm5, mm7 ; First 4 bytes p1 in Words...
+ paddw mm4, mm5
+ movq mm6, [ebx+1] ; mm6 := first 4 bytes p1+lx+1
+ punpcklbw mm6, mm7
+ paddw mm4, mm6
+
+ psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words
+
+ movq mm5, [edi] ; mm5:=first 4 bytes of p2 in words
+ movq mm1, mm5
+ punpcklbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+ ;; Second 4 bytes of 8
+
+ movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
+ pxor mm7, mm7
+ punpckhbw mm4, mm7
+ movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words
+ punpckhbw mm6, mm7
+ paddw mm4, mm6
+
+ movq mm5, [esi+1] ; mm5 := first 4 bytes p1+1
+ punpckhbw mm5, mm7 ; First 4 bytes p1 in Words...
+ paddw mm4, mm5
+ movq mm6, [ebx+1] ; mm6 := first 4 bytes p1+lx+1
+ punpckhbw mm6, mm7
+ paddw mm4, mm6
+
+ psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words
+
+ movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
+ punpckhbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+
+ ;;
+ ;; Second 8 bytes of row
+ ;;
+ ;; First 4 bytes of 8
+
+ movq mm4, [esi+8] ; mm4 := first 4 bytes p1+8
+ pxor mm7, mm7
+ movq mm2, mm4 ; mm2 records all 8 bytes
+ punpcklbw mm4, mm7 ; First 4 bytes p1 in Words...
+
+ movq mm6, [ebx+8] ; mm6 := first 4 bytes p1+lx+8
+ movq mm3, mm6 ; mm3 records all 8 bytes
+ punpcklbw mm6, mm7
+ paddw mm4, mm6
+
+
+ movq mm5, [esi+9] ; mm5 := first 4 bytes p1+9
+ punpcklbw mm5, mm7 ; First 4 bytes p1 in Words...
+ paddw mm4, mm5
+ movq mm6, [ebx+9] ; mm6 := first 4 bytes p1+lx+9
+ punpcklbw mm6, mm7
+ paddw mm4, mm6
+
+ psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words
+
+ movq mm5, [edi+8] ; mm5:=first 4 bytes of p2+8 in words
+ movq mm1, mm5
+ punpcklbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+ ;; Second 4 bytes of 8
+
+ movq mm4, mm2 ; mm4 := Second 4 bytes p1 in words
+ pxor mm7, mm7
+ punpckhbw mm4, mm7
+ movq mm6, mm3 ; mm6 := Second 4 bytes p1+1 in words
+ punpckhbw mm6, mm7
+ paddw mm4, mm6
+
+ movq mm5, [esi+9] ; mm5 := first 4 bytes p1+1
+ punpckhbw mm5, mm7 ; First 4 bytes p1 in Words...
+ paddw mm4, mm5
+ movq mm6, [ebx+9] ; mm6 := first 4 bytes p1+lx+1
+ punpckhbw mm6, mm7
+ paddw mm4, mm6
+
+ psrlw mm4, 2 ; mm4 := First 4 bytes interpolated in words
+
+ movq mm5, mm1 ; mm5:= second 4 bytes of p2 in words
+ punpckhbw mm5, mm7
+
+ movq mm7,mm4
+ pcmpgtw mm7,mm5 ; mm7 := [i : W0..3,mm4>mm5]
+
+ movq mm6,mm4 ; mm6 := [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
+ psubw mm6,mm5
+ pand mm6, mm7
+
+ paddw mm0, mm6 ; Add to accumulator
+
+ movq mm6,mm5 ; mm6 := [i : W0..3,mm5>mm4]
+ pcmpgtw mm6,mm4
+ psubw mm5,mm4 ; mm5 := [i : B0..7, (mm5-mm4)*(mm5-mm4 > 0)]
+ pand mm5, mm6
+
+ paddw mm0, mm5 ; Add to accumulator
+
+
+ ;;
+ ;; Loop termination condition... and stepping
+ ;;
+
+ add esi, edx ; update pointer to next row
+ add edi, edx ; ditto
+ add ebx, edx
+
+ sub ecx,1
+ test ecx, ecx ; check rowsleft
+ jnz near nextrowmm11
+
+ ;; Sum the Accumulators
+ movq mm4, mm0
+ psrlq mm4, 32
+ paddw mm0, mm4
+ movq mm6, mm0
+ psrlq mm6, 16
+ paddw mm0, mm6
+ movd eax, mm0 ; store return value
+ and eax, 0xffff
+
+ pop edi
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+
+ pop ebp ; restore stack pointer
+
+ emms ; clear mmx registers
+ ret ; we now return you to your regular programming
+
+
+global dist22_mmx
+
+; int dist22_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h);
+
+; eax = p1 (init: blk1)
+; ebx = p2 (init: blk2)
+; ecx = rowsleft (init: h)
+; edx = lx;
+
+; mm0 = distance accumulators (4 words)
+; mm1 = temp
+; mm2 = temp
+; mm3 = temp
+; mm4 = temp
+; mm5 = temp
+; mm6 = 0
+; mm7 = temp
+
+
+align 32
+dist22_mmx:
+ push ebp ; save stack pointer
+ mov ebp, esp ; so that we can do this
+
+ push ebx ; Saves registers (called saves convention in
+ push ecx ; x86 GCC it seems)
+ push edx ;
+
+ pxor mm0, mm0 ; zero acculumators
+ pxor mm6, mm6
+ mov eax, [ebp+8] ; get p1
+ mov ebx, [ebp+12] ; get p2
+ mov edx, [ebp+16] ; get lx
+
+ mov ecx, [ebp+20] ; get rowsleft
+
+ jmp nextrow ; snap to it
+align 32
+nextrow:
+ movq mm4, [eax] ; load 8 bytes of p1
+ movq mm5, [ebx] ; load 8 bytes of p2
+
+ movq mm7, mm4 ; mm5 = abs(*p1-*p2)
+ psubusb mm7, mm5
+ psubusb mm5, mm4
+ add eax, edx ; update pointer to next row
+ paddb mm5,mm7
+
+ ;; Add the mm5 bytes to the accumulatores
+ movq mm7,mm5
+ punpcklbw mm7,mm6
+ paddw mm0, mm7
+ punpckhbw mm5,mm6
+ add ebx, edx ; update pointer to next row
+ paddw mm0, mm5
+
+ movq mm4, [eax] ; load 8 bytes of p1 (next row)
+ movq mm5, [ebx] ; load 8 bytes of p2 (next row)
+
+ movq mm7, mm4 ; mm5 = abs(*p1-*p2)
+ psubusb mm7, mm5
+ psubusb mm5, mm4
+ add eax, edx ; update pointer to next row
+ paddb mm5,mm7
+
+ ;; Add the mm5 bytes to the accumulatores
+ movq mm7,mm5
+ punpcklbw mm7,mm6
+ add ebx, edx ; update pointer to next row
+ paddw mm0, mm7
+ punpckhbw mm5,mm6
+ sub ecx,2
+ paddw mm0, mm5
+
+
+ jnz nextrow
+
+ ;; Sum the Accumulators
+ movq mm1, mm0
+ psrlq mm1, 16
+ movq mm2, mm0
+ psrlq mm2, 32
+ movq mm3, mm0
+ psrlq mm3, 48
+ paddw mm0, mm1
+ paddw mm2, mm3
+ paddw mm0, mm2
+
+ movd eax, mm0 ; store return value
+ and eax, 0xffff
+
+ pop edx ; pop pop
+ pop ecx ; fizz fizz
+ pop ebx ; ia86 needs a fizz instruction
+
+ pop ebp ; restore stack pointer
+
+ emms ; clear mmx registers
+ ret ; we now return you to your regular programming
+
+
+
+
+global dist44_mmx
+
+; int dist44_mmx(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
+
+; eax = p1
+; ebx = p2
+; ecx = temp
+; edx = qlx;
+; esi = rowsleft
+
+; mm0 = distance accumulator left block p1
+; mm1 = distance accumulator right block p1
+; mm2 = 0
+; mm3 = right block of p1
+; mm4 = left block of p1
+; mm5 = p2
+; mm6 = temp
+; mm7 = temp
+
+align 32
+dist44_mmx:
+ push ebp ; save stack pointer
+ mov ebp, esp ; so that we can do this
+
+ push ebx
+ push ecx
+ push edx
+ push esi
+
+ pxor mm0, mm0 ; zero acculumator
+ pxor mm1, mm1
+ pxor mm2, mm2
+ mov eax, [ebp+8] ; get p1
+ mov ebx, [ebp+12] ; get p2
+ mov edx, [ebp+16] ; get qlx
+ mov esi, [ebp+20] ; get rowsleft
+ jmp nextrowqd ; snap to it
+align 32
+nextrowqd:
+
+ ;;
+ ;; Beware loop obfuscated by interleaving to try to
+ ;; hide latencies...
+ ;;
+ movq mm4, [eax] ; mm4 = first 4 bytes of p1 in words
+ movq mm5, [ebx] ; mm5 = 4 bytes of p2 in words
+ movq mm3, mm4
+ punpcklbw mm4, mm2
+ punpcklbw mm5, mm2
+
+ movq mm7, mm4
+ movq mm6, mm5
+ psubusw mm7, mm5
+ psubusw mm6, mm4
+
+ add eax, edx ; update a pointer to next row
+; punpckhbw mm3, mm2 ; mm3 = 2nd 4 bytes of p1 in words
+
+ paddw mm7, mm6
+ paddw mm0, mm7 ; Add absolute differences to left block accumulators
+
+; movq mm7,mm3
+; psubusw mm7, mm5
+; psubusw mm5, mm3
+
+ add ebx, edx ; update a pointer to next row
+ sub esi, 1
+
+; paddw mm7, mm5
+; paddw mm1, mm7 ; Add absolute differences to right block accumulators
+
+
+
+ jnz nextrowqd
+
+ ;; Sum the accumulators
+
+ movq mm4, mm0
+ psrlq mm4, 32
+ paddw mm0, mm4
+ movq mm6, mm0
+ psrlq mm6, 16
+ paddw mm0, mm6
+ movd eax, mm0 ; store return value
+
+; movq mm4, mm1
+; psrlq mm4, 32
+; paddw mm1, mm4
+; movq mm6, mm1
+; psrlq mm6, 16
+; paddw mm1, mm6
+; movd ebx, mm1
+
+ and eax, 0xffff
+; sal ebx, 16
+; or eax, ebx
+
+ pop esi
+ pop edx
+ pop ecx
+ pop ebx
+
+ pop ebp ; restore stack pointer
+
+ emms ; clear mmx registers
+ ret ; we now return you to your regular programming