;;;
;;;  predcomp_00_mmx.s:
;;;
;;;  Extended MMX prediction composition
;;;  routines handling the four different interpolation cases...
;;;
;;;  Copyright (C) 2000 Andrew Stevens
;;;
;;;  This program is free software; you can redistribute it and/or
;;;  modify it under the terms of the GNU General Public License
;;;  as published by the Free Software Foundation; either version 2
;;;  of the License, or (at your option) any later version.
;;;
;;;  This program is distributed in the hope that it will be useful,
;;;  but WITHOUT ANY WARRANTY; without even the implied warranty of
;;;  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
;;;  GNU General Public License for more details.
;;;
;;;  You should have received a copy of the GNU General Public License
;;;  along with this program; if not, write to the Free Software
;;;  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
;;;  02111-1307, USA.
;;;
;;;
;;;

;;; The no interpolation case...

global predcomp_00_mmx

;;; void predcomp_<ix><iy>_mmx(char *src, char *dst, int lx, int w, int h, int addflag);
;;;
;;; ix - Interpolation in x
;;; iy - Interpolation in y
;;;
;;; eax = pdst
;;; ebx = psrc
;;; ecx = h left
;;; edx = lx;
;;; edi = w (8 or 16)
;;; mm1 = [1,1,1,1]W  (word-wise ones, added for rounding)
;;; mm0 = zero        (unpack partner that widens src/dst bytes to words)
;;; ----------------------------------------------------------------------
;;; void predcomp_00_mmx(char *src, char *dst, int lx, int w, int h,
;;;                      int addflag);
;;;
;;; No-interpolation case.  For each of h rows the first 8 bytes (and the
;;; second 8 bytes unless w == 8) are either copied from src to dst
;;; (addflag == 0) or replaced by the rounded average
;;; (src + dst + 1) >> 1 (addflag != 0).  Both pointers advance by lx
;;; bytes per row.
;;;
;;; Arguments are read from the stack at [ebp+8]..[ebp+28].  Every general
;;; register used is saved and restored; mm0-mm5 are scratch and emms is
;;; executed before returning.  Nothing is written when h <= 0.
;;;
;;; Register roles inside the loop:
;;;   eax = pdst      ebx = psrc      ecx = rows left     edx = lx
;;;   edi = w         esi = addflag
;;;   mm0 = 0 (unpack partner)        mm1 = [1,1,1,1]W (rounding)
;;; ----------------------------------------------------------------------

global predcomp_00_mmx                  ; repeated declaration is harmless

        align 32
predcomp_00_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, 0x00010001
        movd    mm1, eax
        punpckldq mm1, mm1              ; mm1 = [1,1,1,1]W
        pxor    mm0, mm0                ; mm0 = 0

        mov     ebx, [ebp+8]            ; psrc
        mov     eax, [ebp+12]           ; pdst
        mov     edx, [ebp+16]           ; lx
        mov     edi, [ebp+20]           ; w
        mov     ecx, [ebp+24]           ; h
        mov     esi, [ebp+28]           ; addflag

        ;; The loop is bottom-tested: without this guard h == 0 would wrap
        ;; ecx and run on for ~2^32 rows.
        test    ecx, ecx
        jle     near .done

        jmp     .row                    ; skip the alignment padding
        align 32
.row:
        ;; ---- bytes 0..7 of the row ----
        movq    mm4, [ebx]
        test    esi, esi
        jz      .store_lo               ; plain copy when addflag == 0
        movq    mm5, mm4
        punpcklbw mm4, mm0              ; src low  4 bytes -> words
        punpckhbw mm5, mm0              ; src high 4 bytes -> words
        movq    mm2, [eax]
        movq    mm3, mm2
        punpcklbw mm2, mm0              ; dst low  4 bytes -> words
        punpckhbw mm3, mm0              ; dst high 4 bytes -> words
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1                ; + 1 for rounding
        paddw   mm5, mm1
        psrlw   mm4, 1                  ; (src + dst + 1) >> 1
        psrlw   mm5, 1
        packuswb mm4, mm5               ; words -> bytes
.store_lo:
        movq    [eax], mm4

        cmp     edi, 8
        jz      .next_row               ; 8-wide block: no second half

        ;; ---- bytes 8..15 of the row ----
        movq    mm4, [ebx+8]
        test    esi, esi
        jz      .store_hi
        movq    mm5, mm4
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [eax+8]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
        packuswb mm4, mm5
.store_hi:
        movq    [eax+8], mm4

.next_row:
        add     eax, edx                ; pdst += lx
        add     ebx, edx                ; psrc += lx
        sub     ecx, 1                  ; one row fewer to go
        jnz     near .row

.done:
        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms                            ; leave the FPU usable for the caller
        ret

;;; The x-axis interpolation case...
;;; ----------------------------------------------------------------------
;;; void predcomp_10_mmx(char *src, char *dst, int lx, int w, int h,
;;;                      int addflag);
;;;
;;; x-axis interpolation case.  For each of h rows and each byte x of the
;;; first 8 (and of the second 8 unless w == 8):
;;;     p = (src[x] + src[x+1] + 1) >> 1
;;;     dst[x] = addflag ? (p + dst[x] + 1) >> 1 : p
;;; Note that one byte past the processed width is read from src.
;;; Both pointers advance by lx bytes per row.
;;;
;;; Arguments are read from the stack at [ebp+8]..[ebp+28].  Every general
;;; register used is saved and restored; mm0-mm5 are scratch and emms is
;;; executed before returning.  Nothing is written when h <= 0.
;;;
;;; Register roles inside the loop:
;;;   eax = pdst      ebx = psrc      ecx = rows left     edx = lx
;;;   edi = w         esi = addflag
;;;   mm0 = 0 (unpack partner)        mm1 = [1,1,1,1]W (rounding)
;;; ----------------------------------------------------------------------

global predcomp_10_mmx

        align 32
predcomp_10_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, 0x00010001
        movd    mm1, eax
        punpckldq mm1, mm1              ; mm1 = [1,1,1,1]W
        pxor    mm0, mm0                ; mm0 = 0

        mov     ebx, [ebp+8]            ; psrc
        mov     eax, [ebp+12]           ; pdst
        mov     edx, [ebp+16]           ; lx
        mov     edi, [ebp+20]           ; w
        mov     ecx, [ebp+24]           ; h
        mov     esi, [ebp+28]           ; addflag

        ;; The loop is bottom-tested: without this guard h == 0 would wrap
        ;; ecx and run on for ~2^32 rows.
        test    ecx, ecx
        jle     near .done

        jmp     .row                    ; skip the alignment padding
        align 32
.row:
        ;; ---- bytes 0..7: average src[x] with src[x+1] ----
        movq    mm4, [ebx]
        movq    mm5, mm4
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [ebx+1]            ; right-hand neighbours
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1                ; + 1 for rounding
        paddw   mm5, mm1
        psrlw   mm4, 1                  ; p = (a + b + 1) >> 1
        psrlw   mm5, 1

        test    esi, esi
        jz      .pack_lo
        movq    mm2, [eax]              ; average p with existing dst
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1                  ; (p + dst + 1) >> 1
        psrlw   mm5, 1
.pack_lo:
        packuswb mm4, mm5               ; words -> bytes
        movq    [eax], mm4

        cmp     edi, 8
        jz      .next_row               ; 8-wide block: no second half

        ;; ---- bytes 8..15: same computation, offset by 8 ----
        movq    mm4, [ebx+8]
        movq    mm5, mm4
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [ebx+9]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1

        test    esi, esi
        jz      .pack_hi
        movq    mm2, [eax+8]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
.pack_hi:
        packuswb mm4, mm5
        movq    [eax+8], mm4

.next_row:
        add     eax, edx                ; pdst += lx
        add     ebx, edx                ; psrc += lx
        sub     ecx, 1                  ; one row fewer to go
        jnz     near .row

.done:
        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms                            ; leave the FPU usable for the caller
        ret

;;; The y-axis interpolation case...
;;; ----------------------------------------------------------------------
;;; void predcomp_01_mmx(char *src, char *dst, int lx, int w, int h,
;;;                      int addflag);
;;;
;;; y-axis interpolation case.  For each of h rows and each byte x of the
;;; first 8 (and of the second 8 unless w == 8):
;;;     p = (src[x] + src[x+lx] + 1) >> 1
;;;     dst[x] = addflag ? (p + dst[x] + 1) >> 1 : p
;;; Note that one row past the last processed row is read from src.
;;; Both pointers advance by lx bytes per row.
;;;
;;; Arguments are read from the stack at [ebp+8]..[ebp+28].  Every general
;;; register used is saved and restored; mm0-mm5 are scratch and emms is
;;; executed before returning.  Nothing is written when h <= 0.
;;;
;;; Register roles inside the loop:
;;;   eax = pdst      ebx = psrc      ecx = rows left     edx = lx
;;;   edi = w         esi = addflag
;;;   mm0 = 0 (unpack partner)        mm1 = [1,1,1,1]W (rounding)
;;; ----------------------------------------------------------------------

global predcomp_01_mmx

        align 32
predcomp_01_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, 0x00010001
        movd    mm1, eax
        punpckldq mm1, mm1              ; mm1 = [1,1,1,1]W
        pxor    mm0, mm0                ; mm0 = 0

        mov     ebx, [ebp+8]            ; psrc
        mov     eax, [ebp+12]           ; pdst
        mov     edx, [ebp+16]           ; lx
        mov     edi, [ebp+20]           ; w
        mov     ecx, [ebp+24]           ; h
        mov     esi, [ebp+28]           ; addflag

        ;; The loop is bottom-tested: without this guard h == 0 would wrap
        ;; ecx and run on for ~2^32 rows.
        test    ecx, ecx
        jle     near .done

        jmp     .row                    ; skip the alignment padding
        align 32
.row:
        ;; ---- bytes 0..7: average this row with the row below ----
        movq    mm4, [ebx]              ; current row
        movq    mm5, mm4
        add     ebx, edx                ; psrc -> next row (kept on exit)
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [ebx]              ; row below
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1                ; + 1 for rounding
        paddw   mm5, mm1
        psrlw   mm4, 1                  ; p = (a + b + 1) >> 1
        psrlw   mm5, 1

        test    esi, esi
        jz      .pack_lo
        movq    mm2, [eax]              ; average p with existing dst
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1                  ; (p + dst + 1) >> 1
        psrlw   mm5, 1
.pack_lo:
        packuswb mm4, mm5               ; words -> bytes
        movq    [eax], mm4

        cmp     edi, 8
        jz      .next_row               ; 8-wide: psrc already on next row

        ;; ---- bytes 8..15: step back a row and repeat at offset 8 ----
        sub     ebx, edx                ; back to the current row
        movq    mm4, [ebx+8]
        movq    mm5, mm4
        add     ebx, edx                ; psrc -> next row again
        punpcklbw mm4, mm0
        punpckhbw mm5, mm0
        movq    mm2, [ebx+8]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1

        test    esi, esi
        jz      .pack_hi
        movq    mm2, [eax+8]
        movq    mm3, mm2
        punpcklbw mm2, mm0
        punpckhbw mm3, mm0
        paddw   mm4, mm2
        paddw   mm5, mm3
        paddw   mm4, mm1
        paddw   mm5, mm1
        psrlw   mm4, 1
        psrlw   mm5, 1
.pack_hi:
        packuswb mm4, mm5
        movq    [eax+8], mm4

.next_row:
        add     eax, edx                ; pdst += lx (psrc was stepped above)
        sub     ecx, 1                  ; one row fewer to go
        jnz     near .row

.done:
        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms                            ; leave the FPU usable for the caller
        ret

;;; The x-axis and y-axis interpolation case...
;;; ----------------------------------------------------------------------
;;; void predcomp_11_mmx(char *src, char *dst, int lx, int w, int h,
;;;                      int addflag);
;;;
;;; x-axis and y-axis interpolation case.  For each of h rows and each
;;; byte x of the first 8 (and of the second 8 unless w == 8):
;;;     p = (src[x] + src[x+1] + src[x+lx] + src[x+lx+1] + 2) >> 2
;;;     dst[x] = addflag ? (p + dst[x] + 1) >> 1 : p
;;; Note that one byte past the processed width and one row past the last
;;; processed row are read from src.  Both pointers advance by lx bytes
;;; per row.
;;;
;;; Arguments are read from the stack at [ebp+8]..[ebp+28].  Every general
;;; register used is saved and restored; mm0-mm7 are scratch and emms is
;;; executed before returning.  Nothing is written when h <= 0.
;;;
;;; Register roles inside the loop:
;;;   eax = pdst      ebx = psrc      ecx = rows left     edx = lx
;;;   edi = w         esi = addflag
;;;   mm0 = [0,0,0,0]W   mm1 = [1,1,1,1]W   mm2 = [2,2,2,2]W
;;;   mm4/mm6 = low/high word accumulators, mm5/mm7 = unpack temporaries
;;; ----------------------------------------------------------------------

global predcomp_11_mmx

        align 32
predcomp_11_mmx:
        push    ebp                     ; save frame pointer
        mov     ebp, esp                ; link
        push    eax
        push    ebx
        push    ecx
        push    edx
        push    edi
        push    esi

        mov     eax, 0x00020002
        movd    mm2, eax
        punpckldq mm2, mm2              ; mm2 = [2,2,2,2]W
        mov     eax, 0x00010001
        movd    mm1, eax
        punpckldq mm1, mm1              ; mm1 = [1,1,1,1]W
        pxor    mm0, mm0                ; mm0 = 0

        mov     ebx, [ebp+8]            ; psrc
        mov     eax, [ebp+12]           ; pdst
        mov     edx, [ebp+16]           ; lx
        mov     edi, [ebp+20]           ; w
        mov     ecx, [ebp+24]           ; h
        mov     esi, [ebp+28]           ; addflag

        ;; The loop is bottom-tested: without this guard h == 0 would wrap
        ;; ecx and run on for ~2^32 rows.
        test    ecx, ecx
        jle     near .done

        jmp     .row                    ; skip the alignment padding
        align 32
.row:
        ;; ---- bytes 0..7: sum the 2x2 neighbourhood into mm4/mm6 ----
        movq    mm4, [ebx]              ; current row, x
        movq    mm6, mm4
        punpcklbw mm4, mm0
        punpckhbw mm6, mm0
        movq    mm5, [ebx+1]            ; current row, x+1
        movq    mm7, mm5
        punpcklbw mm5, mm0
        paddw   mm4, mm5
        punpckhbw mm7, mm0
        paddw   mm6, mm7

        add     ebx, edx                ; psrc -> next row (kept on exit)

        movq    mm5, [ebx]              ; row below, x
        movq    mm7, mm5
        punpcklbw mm5, mm0
        paddw   mm4, mm5
        punpckhbw mm7, mm0
        paddw   mm6, mm7
        movq    mm5, [ebx+1]            ; row below, x+1
        movq    mm7, mm5
        punpcklbw mm5, mm0
        paddw   mm4, mm5
        punpckhbw mm7, mm0
        paddw   mm6, mm7

        paddw   mm4, mm2                ; + 2 for rounding
        paddw   mm6, mm2
        psrlw   mm4, 2                  ; p = (sum + 2) >> 2
        psrlw   mm6, 2

        test    esi, esi
        jz      .pack_lo
        movq    mm5, [eax]              ; average p with existing dst
        movq    mm7, mm5
        punpcklbw mm5, mm0
        punpckhbw mm7, mm0
        paddw   mm4, mm5
        paddw   mm6, mm7
        paddw   mm4, mm1
        paddw   mm6, mm1
        psrlw   mm4, 1                  ; (p + dst + 1) >> 1
        psrlw   mm6, 1
.pack_lo:
        packuswb mm4, mm6               ; words -> bytes
        movq    [eax], mm4

        cmp     edi, 8
        jz      near .next_row          ; 8-wide: psrc already on next row

        ;; ---- bytes 8..15: step back a row and repeat at offset 8 ----
        sub     ebx, edx                ; back to the current row
        movq    mm4, [ebx+8]            ; current row, x
        movq    mm6, mm4
        punpcklbw mm4, mm0
        punpckhbw mm6, mm0
        movq    mm5, [ebx+9]            ; current row, x+1
        movq    mm7, mm5
        punpcklbw mm5, mm0
        paddw   mm4, mm5
        punpckhbw mm7, mm0
        paddw   mm6, mm7

        add     ebx, edx                ; psrc -> next row again

        movq    mm5, [ebx+8]            ; row below, x
        movq    mm7, mm5
        punpcklbw mm5, mm0
        paddw   mm4, mm5
        punpckhbw mm7, mm0
        paddw   mm6, mm7
        movq    mm5, [ebx+9]            ; row below, x+1
        movq    mm7, mm5
        punpcklbw mm5, mm0
        paddw   mm4, mm5
        punpckhbw mm7, mm0
        paddw   mm6, mm7

        paddw   mm4, mm2
        paddw   mm6, mm2
        psrlw   mm4, 2
        psrlw   mm6, 2

        test    esi, esi
        jz      .pack_hi
        movq    mm5, [eax+8]
        movq    mm7, mm5
        punpcklbw mm5, mm0
        punpckhbw mm7, mm0
        paddw   mm4, mm5
        paddw   mm6, mm7
        paddw   mm4, mm1
        paddw   mm6, mm1
        psrlw   mm4, 1
        psrlw   mm6, 1
.pack_hi:
        packuswb mm4, mm6
        movq    [eax+8], mm4

.next_row:
        add     eax, edx                ; pdst += lx (psrc was stepped above)
        sub     ecx, 1                  ; one row fewer to go
        jnz     near .row

.done:
        pop     esi
        pop     edi
        pop     edx
        pop     ecx
        pop     ebx
        pop     eax
        pop     ebp
        emms                            ; leave the FPU usable for the caller
        ret