;;;
;;;  mblock_sad_mmxe.s:
;;;
;;;  Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
;;;  (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
;
;  dist1_* Original Copyright (C) 2000 Chris Atenasio
;  Enhancements and rest Copyright (C) 2000 Andrew Stevens

;; Yes, I tried prefetch-ing.  It makes no difference or makes
;; stuff *slower*.

;
;  This program is free software; you can redistribute it and/or
;  modify it under the terms of the GNU General Public License
;  as published by the Free Software Foundation; either version 2
;  of the License, or (at your option) any later version.
;
;  This program is distributed in the hope that it will be useful,
;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;  GNU General Public License for more details.
;
;  You should have received a copy of the GNU General Public License
;  along with this program; if not, write to the Free Software
;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
;
;

;
; Conventions shared by every routine in this file:
;
;   * 32-bit code.  Arguments are read from the stack, first argument at
;     [ebp+8], following arguments at 4-byte steps.
;   * The result is returned in eax.
;   * ebx, ecx, edx (and edi / esi where they are used) are saved on
;     entry and restored on exit.
;   * emms is executed before returning, so the caller gets the FPU/MMX
;     state back in a usable condition.
;   * Every row loop is entered with an explicit jmp so that the padding
;     emitted by the preceding "align 32" is never executed.
;   * Row loops end with "sub <counter>, N / jg <loop>".  For a positive
;     row count that is a multiple of N this runs exactly h/N iterations.
;     Any other count (zero, negative, or odd where N = 2) terminates as
;     soon as the counter is no longer positive instead of wrapping
;     through the whole 32-bit range; the body always runs at least once.
;


global dist1_00_mmxe

; int dist1_00_mmxe(char *blk1, char *blk2, int lx, int h, int distlim);
;
; Sum of absolute differences between two blocks that are 16 bytes wide
; and h rows high.  lx is the byte distance from one row to the next and
; is applied to both blk1 and blk2.  Two rows are processed per loop
; iteration, so h is expected to be even and positive.
;
; distlim unused - costs more to check than the savings of
; aborting the computation early from time to time...
;
; eax = p1
; ebx = p2
; ecx = rowsleft
; edx = lx
;
; mm0 = distance accumulator
; mm4 = temp
; mm5 = temp
; mm6 = temp

align 32
dist1_00_mmxe:
	push ebp			; save frame pointer
	mov ebp, esp			; link

	push ebx
	push ecx
	push edx

	pxor mm0, mm0			; zero accumulator

	mov eax, [ebp+8]		; get p1

	; dist1_00_Ammxe jumps here when its p1 turns out to be 8-byte
	; aligned.  At that point it has built an identical stack frame,
	; zeroed mm0 and loaded p1 into eax.
dist1_00_0misalign:
	mov ebx, [ebp+12]		; get p2
	mov edx, [ebp+16]		; get lx
	mov ecx, [ebp+20]		; get rowsleft
	jmp nextrow00sse		; skip the alignment padding

align 32
nextrow00sse:
	; Row 1.  Pointer updates are interleaved with the SAD work.
	movq mm4, [eax]			; first 8 bytes of p1
	psadbw mm4, [ebx]		; SAD against first 8 bytes of p2
	movq mm5, [eax+8]		; next 8 bytes of p1
	add eax, edx			; p1 -> next row
	paddd mm0, mm4			; accumulate
	psadbw mm5, [ebx+8]		; SAD against next 8 bytes of p2
	add ebx, edx			; p2 -> next row
	paddd mm0, mm5			; accumulate

	; Row 2.
	movq mm6, [eax]			; first 8 bytes of p1
	psadbw mm6, [ebx]		; SAD against first 8 bytes of p2
	movq mm4, [eax+8]		; next 8 bytes of p1
	add eax, edx			; p1 -> next row
	paddd mm0, mm6			; accumulate
	psadbw mm4, [ebx+8]		; SAD against next 8 bytes of p2
	add ebx, edx			; p2 -> next row
	paddd mm0, mm4			; accumulate

	sub ecx, 2			; two rows done
	jg nextrow00sse			; loop while rows remain

	movd eax, mm0			; return value

	pop edx
	pop ecx
	pop ebx

	pop ebp
	emms
	ret


global dist1_00_Ammxe

;; This is a special version that only does aligned accesses to p1...
;; Wonder if it'll make it faster on a P-III
;; ANSWER: NO its slower hence no longer used.

; int dist1_00_Ammxe(char *blk1, char *blk2, int lx, int h, int distlim);
;
; Same result as dist1_00_mmxe, but blk1 is only ever read with 8-byte
; aligned loads: p1 is rounded down to a multiple of 8, three aligned
; quadwords are loaded per row and the 16 wanted bytes are reassembled
; with shifts.
;
; NOTE: when blk1 is misaligned by m bytes (1..7) each row reads the 24
; bytes starting at blk1-m, i.e. m bytes before and 8-m bytes after the
; 16 bytes that are actually compared.
;
; When blk1 is already aligned the routine continues inside
; dist1_00_mmxe (two rows per iteration, so h must then be even).  The
; misaligned loop handles one row per iteration.
;
; distlim unused - costs more to check than the savings of
; aborting the computation early from time to time...
;
; eax = p1 (rounded down to 8-byte alignment)
; ebx = p2
; ecx = rowsleft
; edx = lx
;
; mm0 = distance accumulator
; mm2 = right shift (bits) to adjust for mis-align
; mm3 = left shift (bits) to adjust for mis-align
; mm4 = temp
; mm5 = temp
; mm6 = temp
; mm7 = temp

align 32
dist1_00_Ammxe:
	push ebp			; save frame pointer
	mov ebp, esp			; link

	push ebx
	push ecx
	push edx

	pxor mm0, mm0			; zero accumulator

	mov eax, [ebp+8]		; get p1
	mov ebx, eax
	and ebx, 7			; ebx = misalignment; sets ZF if zero
	jz near dist1_00_0misalign	; aligned: use the plain loop

	sub eax, ebx			; align eax
	mov ecx, 8			; ecx = 8 - misalignment
	sub ecx, ebx
	shl ebx, 3			; convert byte counts into bit shifts
	shl ecx, 3
	movd mm2, ebx			; mm2 = misalignment * 8
	movd mm3, ecx			; mm3 = (8 - misalignment) * 8

	mov ebx, [ebp+12]		; get p2
	mov edx, [ebp+16]		; get lx
	mov ecx, [ebp+20]		; get rowsleft
	jmp nextrow00ssea		; skip the alignment padding

align 32
nextrow00ssea:
	movq mm4, [eax]			; aligned quadword 0
	movq mm5, [eax+8]		; aligned quadword 1
	movq mm6, mm5			; quadword 1 is needed for both halves
	psrlq mm4, mm2			; drop the bytes below p1 ...
	psllq mm5, mm3			; ... and pull in the bytes that follow
	por mm4, mm5			; mm4 = first 8 bytes of p1 proper
	psadbw mm4, [ebx]		; SAD against first 8 bytes of p2
	movq mm7, [eax+16]		; aligned quadword 2
	add eax, edx			; p1 -> next row
	psrlq mm6, mm2
	psllq mm7, mm3
	por mm6, mm7			; mm6 = second 8 bytes of p1 proper
	paddd mm0, mm4			; accumulate
	psadbw mm6, [ebx+8]		; SAD against next 8 bytes of p2
	add ebx, edx			; p2 -> next row
	paddd mm0, mm6			; accumulate

	sub ecx, 1			; one row done
	jg nextrow00ssea		; loop while rows remain

	movd eax, mm0			; return value

	pop edx
	pop ecx
	pop ebx

	pop ebp
	emms
	ret


global dist1_01_mmxe

; int dist1_01_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD between blk2 and a horizontally interpolated blk1: every blk1 byte
; is averaged (pavgb) with the byte to its right before the comparison.
; Blocks are 16 bytes wide and h rows high, so 17 bytes per row of blk1
; are read.  Two rows are processed per iteration; h is expected to be
; even and positive.
;
; eax = p1
; ebx = p2
; ecx = rowsleft
; edx = lx
;
; mm0 = distance accumulator
; mm4 = temp
; mm5 = temp
; mm6 = temp
; mm7 = temp

align 32
dist1_01_mmxe:
	push ebp			; save frame pointer
	mov ebp, esp			; link

	push ebx
	push ecx
	push edx

	pxor mm0, mm0			; zero accumulator

	mov eax, [ebp+8]		; get p1
	mov ebx, [ebp+12]		; get p2
	mov edx, [ebp+16]		; get lx
	mov ecx, [ebp+20]		; get rowsleft
	jmp nextrow01			; skip the alignment padding

align 32
nextrow01:
	; Row 1.
	movq mm4, [eax]			; first 8 bytes of p1
	pavgb mm4, [eax+1]		; average with right-hand neighbours
	psadbw mm4, [ebx]		; SAD against first 8 bytes of p2
	paddd mm0, mm4			; accumulate

	movq mm5, [eax+8]		; next 8 bytes of p1
	pavgb mm5, [eax+9]		; average with right-hand neighbours
	psadbw mm5, [ebx+8]		; SAD against next 8 bytes of p2
	paddd mm0, mm5			; accumulate

	add eax, edx			; p1 -> next row
	add ebx, edx			; p2 -> next row

	; Row 2.
	movq mm6, [eax]			; first 8 bytes of p1
	pavgb mm6, [eax+1]		; average with right-hand neighbours
	psadbw mm6, [ebx]		; SAD against first 8 bytes of p2
	paddd mm0, mm6			; accumulate

	movq mm7, [eax+8]		; next 8 bytes of p1
	pavgb mm7, [eax+9]		; average with right-hand neighbours
	psadbw mm7, [ebx+8]		; SAD against next 8 bytes of p2
	paddd mm0, mm7			; accumulate

	add eax, edx			; p1 -> next row
	add ebx, edx			; p2 -> next row

	sub ecx, 2			; two rows done
	jg nextrow01			; loop while rows remain

	movd eax, mm0			; return value

	pop edx
	pop ecx
	pop ebx

	pop ebp				; restore frame pointer

	emms				; clear mmx registers
	ret


global dist1_10_mmxe

; int dist1_10_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD between blk2 and a vertically interpolated blk1: every blk1 byte
; is averaged (pavgb) with the byte one row (lx bytes) below it before
; the comparison.  Blocks are 16 bytes wide and h rows high, so h+1 rows
; of blk1 are read.  Two rows are processed per iteration; h is expected
; to be even and positive.
;
; eax = p1
; ebx = p2
; ecx = rowsleft
; edx = lx
; edi = p1+lx
;
; mm0 = distance accumulator
; mm4 = temp
; mm5 = temp
; mm6 = temp
; mm7 = temp

align 32
dist1_10_mmxe:
	push ebp			; save frame pointer
	mov ebp, esp			; link

	push ebx
	push ecx
	push edx
	push edi

	pxor mm0, mm0			; zero accumulator

	mov eax, [ebp+8]		; get p1
	mov ebx, [ebp+12]		; get p2
	mov edx, [ebp+16]		; get lx
	mov edi, eax
	add edi, edx			; edi = row below p1
	mov ecx, [ebp+20]		; get rowsleft
	jmp nextrow10			; skip the alignment padding

align 32
nextrow10:
	; Row 1.
	movq mm4, [eax]			; first 8 bytes of p1
	pavgb mm4, [edi]		; average with the row below
	psadbw mm4, [ebx]		; SAD against first 8 bytes of p2
	paddd mm0, mm4			; accumulate

	movq mm5, [eax+8]		; next 8 bytes of p1
	pavgb mm5, [edi+8]		; average with the row below
	psadbw mm5, [ebx+8]		; SAD against next 8 bytes of p2
	paddd mm0, mm5			; accumulate

	add eax, edx			; p1 -> next row
	add ebx, edx			; p2 -> next row
	add edi, edx			; row below -> next row

	; Row 2.
	movq mm6, [eax]			; first 8 bytes of p1
	pavgb mm6, [edi]		; average with the row below
	psadbw mm6, [ebx]		; SAD against first 8 bytes of p2
	paddd mm0, mm6			; accumulate

	movq mm7, [eax+8]		; next 8 bytes of p1
	pavgb mm7, [edi+8]		; average with the row below
	psadbw mm7, [ebx+8]		; SAD against next 8 bytes of p2
	paddd mm0, mm7			; accumulate

	add eax, edx			; p1 -> next row
	add ebx, edx			; p2 -> next row
	add edi, edx			; row below -> next row

	sub ecx, 2			; two rows done
	jg nextrow10			; loop while rows remain

	movd eax, mm0			; return value

	pop edi
	pop edx
	pop ecx
	pop ebx

	pop ebp				; restore frame pointer

	emms				; clear mmx registers
	ret


global dist1_11_mmxe

; int dist1_11_mmxe(char *blk1, char *blk2, int lx, int h);
;
; SAD between blk2 and a diagonally interpolated blk1: every blk1 byte
; is combined with its right, lower and lower-right neighbours before
; the comparison.  Blocks are 16 bytes wide and h rows high, so 17 bytes
; by h+1 rows of blk1 are read.  Two rows are processed per iteration;
; h is expected to be even and positive.
;
; The 4-point average is built from three pavgb operations
; (avg(avg(a,c), avg(b,d))).  pavgb rounds up at every step, so the
; value can be one higher than an exactly rounded (a+b+c+d+2)/4.
;
; eax = p1
; ebx = p2
; ecx = rowsleft
; edx = lx
; edi = p1+lx
;
; mm0 = distance accumulator
; mm4 = temp
; mm5 = temp
; mm6 = temp
; mm7 = temp

align 32
dist1_11_mmxe:
	push ebp			; save frame pointer
	mov ebp, esp			; link

	push ebx
	push ecx
	push edx
	push edi

	pxor mm0, mm0			; zero accumulator

	mov eax, [ebp+8]		; get p1
	mov ebx, [ebp+12]		; get p2
	mov edx, [ebp+16]		; get lx
	mov edi, eax
	add edi, edx			; edi = row below p1
	mov ecx, [ebp+20]		; get rowsleft
	jmp nextrow11			; skip the alignment padding

align 32
nextrow11:
	; Row 1, first 8 bytes.
	movq mm4, [eax]			; p1
	pavgb mm4, [edi]		; avg(p1, below)
	movq mm5, [eax+1]		; right neighbours
	pavgb mm5, [edi+1]		; avg(right, below-right)
	pavgb mm4, mm5			; combine the two column averages
	psadbw mm4, [ebx]		; SAD against first 8 bytes of p2
	paddd mm0, mm4			; accumulate

	; Row 1, next 8 bytes.
	movq mm6, [eax+8]		; p1
	pavgb mm6, [edi+8]		; avg(p1, below)
	movq mm7, [eax+9]		; right neighbours
	pavgb mm7, [edi+9]		; avg(right, below-right)
	pavgb mm6, mm7			; combine the two column averages
	psadbw mm6, [ebx+8]		; SAD against next 8 bytes of p2
	paddd mm0, mm6			; accumulate

	add eax, edx			; p1 -> next row
	add ebx, edx			; p2 -> next row
	add edi, edx			; row below -> next row

	; Row 2, first 8 bytes.
	movq mm4, [eax]			; p1
	pavgb mm4, [edi]		; avg(p1, below)
	movq mm5, [eax+1]		; right neighbours
	pavgb mm5, [edi+1]		; avg(right, below-right)
	pavgb mm4, mm5			; combine the two column averages
	psadbw mm4, [ebx]		; SAD against first 8 bytes of p2
	paddd mm0, mm4			; accumulate

	; Row 2, next 8 bytes.
	movq mm6, [eax+8]		; p1
	pavgb mm6, [edi+8]		; avg(p1, below)
	movq mm7, [eax+9]		; right neighbours
	pavgb mm7, [edi+9]		; avg(right, below-right)
	pavgb mm6, mm7			; combine the two column averages
	psadbw mm6, [ebx+8]		; SAD against next 8 bytes of p2
	paddd mm0, mm6			; accumulate

	add eax, edx			; p1 -> next row
	add ebx, edx			; p2 -> next row
	add edi, edx			; row below -> next row

	sub ecx, 2			; two rows done
	jg near nextrow11		; loop while rows remain

	movd eax, mm0			; return value

	pop edi
	pop edx
	pop ecx
	pop ebx

	pop ebp				; restore frame pointer

	emms				; clear mmx registers
	ret


global dist22_mmxe

; int dist22_mmxe(unsigned char *blk1, unsigned char *blk2, int flx, int fh);
;
; SAD between two blocks that are 8 bytes wide and fh rows high (the
; 2*2 sub-sampled case from the file header).  flx is the row stride of
; both blocks.  Two rows are processed per iteration; fh is expected to
; be even and positive.
;
; eax = p1
; ebx = p2
; ecx = rowsleft
; edx = flx
;
; mm0 = distance accumulator
; mm4 = temp
; mm6 = temp

align 32
dist22_mmxe:
	push ebp			; save frame pointer
	mov ebp, esp			; link

	push ebx
	push ecx
	push edx

	pxor mm0, mm0			; zero accumulator

	mov eax, [ebp+8]		; get p1
	mov ebx, [ebp+12]		; get p2
	mov edx, [ebp+16]		; get flx
	mov ecx, [ebp+20]		; get rowsleft
	jmp nextrowfd			; skip the alignment padding

align 32
nextrowfd:
	; Row 1.
	movq mm4, [eax]			; 8 bytes of p1
	add eax, edx			; p1 -> next row
	psadbw mm4, [ebx]		; SAD against 8 bytes of p2
	add ebx, edx			; p2 -> next row
	paddd mm0, mm4			; accumulate

	; Row 2.
	movq mm6, [eax]			; 8 bytes of p1
	add eax, edx			; p1 -> next row
	psadbw mm6, [ebx]		; SAD against 8 bytes of p2
	add ebx, edx			; p2 -> next row
	paddd mm0, mm6			; accumulate

	sub ecx, 2			; two rows done
	jg nextrowfd			; loop while rows remain

	movd eax, mm0			; return value

	pop edx
	pop ecx
	pop ebx

	pop ebp

	emms
	ret


global dist44_mmxe

; int dist44_mmxe(unsigned char *blk1, unsigned char *blk2, int qlx, int qh);
;
; SAD between two blocks that are 4 bytes wide and qh rows high (the
; 4*4 sub-sampled case from the file header).  qlx is the row stride of
; both blocks.  One row is processed per iteration; qh is expected to
; be positive.
;
; Each row is fetched with a 4-byte movd, which clears the upper half
; of the MMX register.  psadbw over the full register therefore sums
; exactly the four wanted byte differences, and nothing beyond the
; 4-byte row is read from either block.
;
; eax = p1
; ebx = p2
; edx = qlx
; esi = rowsleft
;
; mm0 = distance accumulator
; mm4 = temp
; mm5 = temp

align 32
dist44_mmxe:
	push ebp			; save frame pointer
	mov ebp, esp			; link

	push ebx
	push edx
	push esi

	pxor mm0, mm0			; zero accumulator

	mov eax, [ebp+8]		; get p1
	mov ebx, [ebp+12]		; get p2
	mov edx, [ebp+16]		; get qlx
	mov esi, [ebp+20]		; get rowsleft
	jmp nextrowqd			; skip the alignment padding

align 32
nextrowqd:
	movd mm4, [eax]			; 4 bytes of p1, upper half zeroed
	add eax, edx			; p1 -> next row
	movd mm5, [ebx]			; 4 bytes of p2, upper half zeroed
	add ebx, edx			; p2 -> next row
	psadbw mm4, mm5			; SAD over the 4 live bytes
	paddd mm0, mm4			; accumulate

	sub esi, 1			; one row done
	jg nextrowqd			; loop while rows remain

	movd eax, mm0			; return value

	pop esi
	pop edx
	pop ebx

	pop ebp				; restore frame pointer

	emms				; clear mmx registers
	ret