;;; ;;; mblockq_sad_mmxe.s: ;;; ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblock ;;; quads (2 by 2 squares of adjacent macroblocks) ;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled ;;; evaluates macroblock quads. A lot of memory accesses can be saved ;;; if each quad is done together rather than each macroblock in the ;;; quad handled individually. ;;; TODO: Really there ought to be MMX versions and the function's ;;; specification should be documented... ; ; Copyright (C) 2000 Andrew Stevens ; ; This program is free software; you can reaxstribute it and/or ; modify it under the terms of the GNU General Public License ; as published by the Free Software Foundation; either version 2 ; of the License, or (at your option) any later version. ; ; This program is distributed in the hope that it will be useful, ; but WITHOUT ANY WARRANTY; without even the implied warranty of ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ; GNU General Public License for more details. ; ; You should have received a copy of the GNU General Public License ; along with this program; if not, write to the Free Software ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ; ; ;;; CURRENTLY not used but used in testing as reference for tweaks... global mblockq_sad1_REF ; void mblockq_dist1_REF(char *blk1,char *blk2,int lx,int h,int *weightvec); ; eax = p1 ; ebx = p2 ; ecx = unused ; edx = lx; ; edi = rowsleft ; esi = h ; mm0 = SAD (x+0,y+0) ; mm1 = SAD (x+2,y+0) ; mm2 = SAD (x+0,y+2) ; mm3 = SAD (x+2,y+2) ; mm4 = temp ; mm5 = temp ; mm6 = temp ; mm7 = temp SECTION .text align 32 mblockq_dist1_REF: push ebp ; save frame pointer mov ebp, esp ; link push eax push ebx push ecx push edx push edi push esi pxor mm0, mm0 ; zero accumulators pxor mm1, mm1 pxor mm2, mm2 pxor mm3, mm3 mov eax, [ebp+8] ; get p1 mov ebx, [ebp+12] ; get p2 mov edx, [ebp+16] ; get lx mov edi, [ebp+20] ; get rowsleft mov esi, edi jmp nextrow_block_d1 align 32 nextrow_block_d1: ;; Do the (+0,+0) SAD movq mm4, [eax] ; load 1st 8 bytes of p1 movq mm6, mm4 movq mm5, [ebx] psadbw mm4, mm5 ; compare to 1st 8 bytes of p2 paddd mm0, mm4 ; accumulate difference movq mm4, [eax+8] ; load 2nd 8 bytes of p1 movq mm7, mm4 psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2 paddd mm0, mm4 ; accumulate difference cmp edi, esi jz firstrow0 ;; Do the (0,+2) SAD sub ebx, edx psadbw mm6, [ebx] ; compare to next 8 bytes of p2 (row 1) paddd mm2, mm6 ; accumulate difference psadbw mm7, [ebx+8] ; next 8 bytes of p1 (row 1) add ebx, edx paddd mm2, mm7 firstrow0: ;; Do the (+2,0) SAD movq mm4, [eax+1] movq mm6, mm4 psadbw mm4, mm5 ; compare to 1st 8 bytes of p2 paddd mm1, mm4 ; accumulate difference movq mm4, [eax+9] movq mm7, mm4 psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2 paddd mm1, mm4 ; accumulate difference cmp edi, esi jz firstrow1 ;; Do the (+2, +2 ) SAD sub ebx, edx psadbw mm6, [ebx] ; compare to 1st 8 bytes of prev p2 psadbw mm7, [ebx+8] ; 2nd 8 bytes of prev p2 add ebx, edx paddd mm3, mm6 ; accumulate difference paddd mm3, mm7 firstrow1: add eax, edx ; update pointer to next row add ebx, edx ; ditto sub edi, 1 jnz near nextrow_block_d1 ;; Do the last row of the (0,+2) SAD movq mm4, [eax] ; load 1st 8 bytes of p1 movq mm5, [eax+8] ; load 2nd 8 bytes of p1 sub ebx, edx psadbw mm4, [ebx] ; compare to next 8 bytes of p2 (row 1) psadbw mm5, [ebx+8] ; next 8 bytes of p1 (row 1) paddd mm2, mm4 ; accumulate difference paddd mm2, mm5 movq mm4, [eax+1] movq mm5, [eax+9] ;; Do the last row of rhw (+2, +2) SAD psadbw mm4, [ebx] ; compare to 1st 8 bytes of prev p2 psadbw mm5, [ebx+8] ; 2nd 8 bytes of prev p2 paddd mm3, mm4 ; accumulate difference paddd mm3, mm5 mov eax, [ebp+24] ; Weightvec movd [eax+0], mm0 movd [eax+4], mm1 movd [eax+8], mm2 movd [eax+12], mm3 pop esi pop edi pop edx pop ecx pop ebx pop eax pop ebp emms ret global mblockq_dist1_mmxe ; void mblockq_dist1_mmxe(char *blk1,char *blk2,int lx,int h,int *weightvec); ; eax = p1 ; ebx = p2 ; ecx = unused ; edx = lx; ; edi = rowsleft ; esi = h ; mm0 = SAD (x+0,y+0),SAD (x+0,y+2) ; mm1 = SAD (x+2,y+0),SAD (x+2,y+2) ; mm4 = temp ; mm5 = temp ; mm6 = temp ; mm7 = temp align 32 mblockq_dist1_mmxe: push ebp ; save frame pointer mov ebp, esp ; link push eax push ebx push ecx push edx push edi push esi mov eax, [ebp+8] ; get p1 prefetcht0 [eax] pxor mm0, mm0 ; zero accumulators pxor mm1, mm1 mov ebx, [ebp+12] ; get p2 mov edx, [ebp+16] ; get lx mov edi, [ebp+20] ; get rowsleft mov esi, edi jmp nextrow_block_e1 align 32 nextrow_block_e1: ;; Do the (+0,+0) SAD prefetcht0 [eax+edx] movq mm4, [eax] ; load 1st 8 bytes of p1 movq mm6, mm4 movq mm5, [ebx] psadbw mm4, mm5 ; compare to 1st 8 bytes of p2 paddd mm0, mm4 ; accumulate difference movq mm4, [eax+8] ; load 2nd 8 bytes of p1 movq mm7, mm4 psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2 paddd mm0, mm4 ; accumulate difference cmp edi, esi jz firstrowe0 ;; Do the (0,+2) SAD sub ebx, edx pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 movq mm2, [ebx] psadbw mm6, mm2 ; compare to next 8 bytes of p2 (row 1) paddd mm0, mm6 ; accumulate difference movq mm3, [ebx+8] psadbw mm7, mm3 ; next 8 bytes of p1 (row 1) add ebx, edx paddd mm0, mm7 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 firstrowe0: ;; Do the (+2,0) SAD movq mm4, [eax+1] movq mm6, mm4 psadbw mm4, mm5 ; compare to 1st 8 bytes of p2 paddd mm1, mm4 ; accumulate difference movq mm4, [eax+9] movq mm7, mm4 psadbw mm4, [ebx+8] ; compare to 2nd 8 bytes of p2 paddd mm1, mm4 ; accumulate difference cmp edi, esi jz firstrowe1 ;; Do the (+2, +2 ) SAD sub ebx, edx pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 psadbw mm6, mm2 ; compare to 1st 8 bytes of prev p2 psadbw mm7, mm3 ; 2nd 8 bytes of prev p2 add ebx, edx paddd mm1, mm6 ; accumulate difference paddd mm1, mm7 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 firstrowe1: add eax, edx ; update pointer to next row add ebx, edx ; ditto sub edi, 1 jnz near nextrow_block_e1 ;; Do the last row of the (0,+2) SAD pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 movq mm4, [eax] ; load 1st 8 bytes of p1 movq mm5, [eax+8] ; load 2nd 8 bytes of p1 sub ebx, edx psadbw mm4, [ebx] ; compare to next 8 bytes of p2 (row 1) psadbw mm5, [ebx+8] ; next 8 bytes of p1 (row 1) paddd mm0, mm4 ; accumulate difference paddd mm0, mm5 ;; Do the last row of rhw (+2, +2) SAD pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 movq mm4, [eax+1] movq mm5, [eax+9] psadbw mm4, [ebx] ; compare to 1st 8 bytes of prev p2 psadbw mm5, [ebx+8] ; 2nd 8 bytes of prev p2 paddd mm1, mm4 ; accumulate difference paddd mm1, mm5 mov eax, [ebp+24] ; Weightvec movd [eax+8], mm0 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 movd [eax+12], mm1 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 movd [eax+0], mm0 movd [eax+4], mm1 pop esi pop edi pop edx pop ecx pop ebx pop eax pop ebp emms ret global mblockq_dist22_mmxe ; void mblockq_dist22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh, int* resvec); ; eax = p1 ; ebx = p2 ; ecx = counter temp ; edx = flx; ; mm0 = distance accumulator ; mm1 = distance accumulator ; mm2 = previous p1 row ; mm3 = previous p1 displaced by 1 byte... ; mm4 = temp ; mm5 = temp ; mm6 = temp ; mm7 = temp / 0 if first row 0xff otherwise align 32 mblockq_dist22_mmxe: push ebp ; save frame pointer mov ebp, esp push eax push ebx push ecx push edx pxor mm0, mm0 ; zero acculumator pxor mm1, mm1 ; zero acculumator pxor mm2, mm2 ; zero acculumator pxor mm3, mm3 ; zero acculumator mov eax, [ebp+8] ; get p1 mov ebx, [ebp+12] ; get p2 mov edx, [ebp+16] ; get lx mov ecx, [ebp+20] movq mm2, [eax+edx] movq mm3, [eax+edx+1] jmp nextrowbd22 align 32 nextrowbd22: movq mm5, [ebx] ; load previous row reference block ; mm2 /mm3 containts current row target block psadbw mm2, mm5 ; Comparse (x+0,y+2) paddd mm1, mm2 psadbw mm3, mm5 ; Compare (x+2,y+2) pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 paddd mm1, mm3 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 movq mm2, [eax] ; Load current row traget block into mm2 / mm3 movq mm6, mm2 movq mm3, [eax+1] sub eax, edx sub ebx, edx prefetcht0 [eax] movq mm7, mm3 psadbw mm6, mm5 ; Compare (x+0,y+0) paddd mm0, mm6 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 psadbw mm7, mm5 ; Compare (x+2,y+0) paddd mm0, mm7 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 sub ecx, 1 jnz nextrowbd22 mov eax, [ebp+24] movq [eax+0], mm0 movq [eax+8], mm1 pop edx pop ecx pop ebx pop eax pop ebp emms ret