+++ /dev/null
-;;;
-;;; predcomp_00_mmx.s:
-;;;
-;;; Extended MMX prediction composition
-;;; routines handling the four different interpolation cases...
-;;;
-;;; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
-
-;;;
-;;; This program is free software; you can redistribute it and/or
-;;; modify it under the terms of the GNU General Public License
-;;; as published by the Free Software Foundation; either version 2
-;;; of the License, or (at your option) any later version.
-;;;
-;;; This program is distributed in the hope that it will be useful,
-;;; but WITHOUT ANY WARRANTY; without even the implied warranty of
-;;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-;;; GNU General Public License for more details.
-;;;
-;;; You should have received a copy of the GNU General Public License
-;;; along with this program; if not, write to the Free Software
-;;; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
-;;; 02111-1307, USA.
-;;;
-;;;
-;;;
-
-;;; The no interpolation case...
-
-global predcomp_00_mmx
-
-;;; void predcomp_<ix><iy>_mmx(char *src,char *dst,int lx, int w, int h, int addflag);
-;;; Arguments are read from the stack at [ebp+8] .. [ebp+28] in the order above.
-;;; ix - Interpolation in x iy - Interpolation in y
-;;; predcomp_00_mmx: copy src rows to dst, or average them into dst if addflag != 0.
-;;; eax = pdst
-;;; ebx = psrc
-;;; ecx = h left (rows still to process)
-;;; edx = lx (added to both pointers after each row)
-;;; edi = w (8 or 16); any value other than 8 takes the 16-byte path
-;;; esi = addflag (0 = store only, non-zero = average with existing dst)
-
-;;; mm1 = [1,1,1,1]W rounding term added before the shift right by one
-;;; mm0 = all zero, used to unpack bytes into words
-
-
-
-align 32
-predcomp_00_mmx:
- push ebp ; save frame pointer
- mov ebp, esp ; link
-
- push eax ; save every general register used below
- push ebx
- push ecx
- push edx
- push edi
- push esi
-
- mov eax, 0x00010001 ; two words, each holding 1
- movd mm1, eax
- punpckldq mm1,mm1 ; mm1 = [1,1,1,1]W
-
- mov ebx, [ebp+8] ; get psrc
- mov eax, [ebp+12] ; get pdst
- mov edx, [ebp+16] ; get lx
- mov edi, [ebp+20] ; get w
- mov ecx, [ebp+24] ; get h
- mov esi, [ebp+28] ; get addflag
- ;; addflag is tested with cmp inside the loop; no bit-mask is built
- pxor mm0, mm0 ; mm0 = 0 for byte-to-word unpacking
- jmp predrow00m ; align for speed
-align 32
-predrow00m:
- movq mm4, [ebx] ; first 8 bytes of row
- cmp esi, 0
- jz noadd00 ; addflag == 0: store the source bytes unchanged
-
- movq mm5, mm4
- punpcklbw mm4, mm0 ; mm4 = low 4 src bytes as words
- punpckhbw mm5, mm0 ; mm5 = high 4 src bytes as words
-
- movq mm2, [eax] ; current dst bytes
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
- paddw mm4, mm2 ; src + dst
- paddw mm5, mm3
- paddw mm4, mm1 ; + 1 so the shift rounds up
- paddw mm5, mm1
- psrlw mm4, 1 ; (src + dst + 1) >> 1
- psrlw mm5, 1
- packuswb mm4, mm5 ; pack the eight words back into bytes
-noadd00:
- movq [eax], mm4
-
- cmp edi, 8
- jz eightwide00 ; w == 8: skip the second half of the row
-
- movq mm4, [ebx+8] ; second 8 bytes of row
- cmp esi, 0
- jz noadd00w ; addflag == 0: store the source bytes unchanged
-
- movq mm5, mm4
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
-
- movq mm2, [eax+8] ; current dst bytes, second half
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
- paddw mm4, mm2 ; src + dst
- paddw mm5, mm3
- paddw mm4, mm1 ; + 1 so the shift rounds up
- paddw mm5, mm1
- psrlw mm4, 1 ; (src + dst + 1) >> 1
- psrlw mm5, 1
- packuswb mm4, mm5 ; pack the eight words back into bytes
-noadd00w:
- movq [eax+8], mm4
-
-eightwide00:
- add eax, edx ; update pointer to next row
- add ebx, edx ; ditto
-
- sub ecx, 1 ; check h left
- jnz near predrow00m ; NOTE(review): h == 0 on entry would wrap ecx rather than exit
-
- pop esi ; restore in reverse push order
- pop edi
- pop edx
- pop ecx
- pop ebx
- pop eax
- pop ebp
- emms ; empty the MMX state before returning
- ret
-
-
-;;; The x-axis interpolation case...
-
-global predcomp_10_mmx
-;;; Each output byte is (src[x] + src[x+1] + 1) >> 1; if addflag != 0 that
-;;; value is then averaged with the existing dst byte, again rounding up.
-align 32
-predcomp_10_mmx:
- push ebp ; save frame pointer
- mov ebp, esp ; link
-
- push eax ; save every general register used below
- push ebx
- push ecx
- push edx
- push edi
- push esi
-
- mov eax, 0x00010001 ; two words, each holding 1
- movd mm1, eax
- punpckldq mm1,mm1 ; mm1 = [1,1,1,1]W rounding term
-
- mov ebx, [ebp+8] ; get psrc
- mov eax, [ebp+12] ; get pdst
- mov edx, [ebp+16] ; get lx
- mov edi, [ebp+20] ; get w
- mov ecx, [ebp+24] ; get h
- mov esi, [ebp+28] ; get addflag
- ;; addflag is tested with cmp inside the loop; no bit-mask is built
- pxor mm0, mm0 ; mm0 = 0 for byte-to-word unpacking
- jmp predrow10m ; align for speed
-align 32
-predrow10m:
- movq mm4, [ebx] ; first 8 bytes of row
- movq mm5, mm4
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
- movq mm2, [ebx+1] ; same row, one byte to the right
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
-
- paddw mm4, mm2 ; Average mm4/mm5 and mm2/mm3
- paddw mm5, mm3
- paddw mm4, mm1 ; + 1 so the shift rounds up
- paddw mm5, mm1
- psrlw mm4, 1
- psrlw mm5, 1
-
- cmp esi, 0
- jz noadd10 ; addflag == 0: store the interpolated value as is
-
- movq mm2, [eax] ; Add: current dst bytes
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
- paddw mm4, mm2 ; Average mm4/mm5 and mm2/mm3
- paddw mm5, mm3
- paddw mm4, mm1
- paddw mm5, mm1
- psrlw mm4, 1
- psrlw mm5, 1
-noadd10:
- packuswb mm4, mm5 ; pack the eight words back into bytes
- movq [eax], mm4
-
- cmp edi, 8
- jz eightwide10 ; w == 8: skip the second half of the row
-
- movq mm4, [ebx+8] ; second 8 bytes of row
- movq mm5, mm4
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
- movq mm2, [ebx+9] ; second half, one byte to the right
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
-
- paddw mm4, mm2 ; Average mm4/mm5 and mm2/mm3
- paddw mm5, mm3
- paddw mm4, mm1 ; + 1 so the shift rounds up
- paddw mm5, mm1
- psrlw mm4, 1
- psrlw mm5, 1
-
- cmp esi, 0
- jz noadd10w ; addflag == 0: store the interpolated value as is
-
- movq mm2, [eax+8] ; Add: current dst bytes, second half
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
- paddw mm4, mm2 ; Average mm4/mm5 and mm2/mm3
- paddw mm5, mm3
- paddw mm4, mm1
- paddw mm5, mm1
- psrlw mm4, 1
- psrlw mm5, 1
-noadd10w:
- packuswb mm4, mm5 ; pack the eight words back into bytes
- movq [eax+8], mm4
-
-
-eightwide10:
- add eax, edx ; update pointer to next row
- add ebx, edx ; ditto
-
- sub ecx, 1 ; check h left
- jnz near predrow10m ; NOTE(review): h == 0 on entry would wrap ecx rather than exit
-
- pop esi ; restore in reverse push order
- pop edi
- pop edx
- pop ecx
- pop ebx
- pop eax
- pop ebp
- emms ; empty the MMX state before returning
- ret
-
-;;; The y-axis interpolation case...
-
-global predcomp_01_mmx
-;;; Each output byte is (src[x] + src[x+lx] + 1) >> 1; if addflag != 0 that
-;;; value is then averaged with the existing dst byte, again rounding up.
-align 32
-predcomp_01_mmx:
- push ebp ; save frame pointer
- mov ebp, esp ; link
-
- push eax ; save every general register used below
- push ebx
- push ecx
- push edx
- push edi
- push esi
-
- mov eax, 0x00010001 ; two words, each holding 1
- movd mm1, eax
- punpckldq mm1,mm1 ; mm1 = [1,1,1,1]W rounding term
-
- mov ebx, [ebp+8] ; get psrc
- mov eax, [ebp+12] ; get pdst
- mov edx, [ebp+16] ; get lx
- mov edi, [ebp+20] ; get w
- mov ecx, [ebp+24] ; get h
- mov esi, [ebp+28] ; get addflag
- pxor mm0, mm0 ; mm0 = 0 for byte-to-word unpacking
- jmp predrow01m ; align for speed
-
-align 32
-predrow01m:
- movq mm4, [ebx] ; first 8 bytes of row
- movq mm5, mm4
- add ebx, edx ; Next row
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
-
- movq mm2, [ebx] ; first 8 bytes of the row below
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
-
- paddw mm4, mm2 ; Average mm4/mm5 and mm2/mm3
- paddw mm5, mm3
- paddw mm4, mm1 ; + 1 so the shift rounds up
- paddw mm5, mm1
- psrlw mm4, 1
- psrlw mm5, 1
-
- cmp esi, 0
- jz noadd01 ; addflag == 0: store the interpolated value as is
-
- movq mm2, [eax] ; Add: current dst bytes
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
- paddw mm4, mm2 ; Average mm4/mm5 and mm2/mm3
- paddw mm5, mm3
- paddw mm4, mm1
- paddw mm5, mm1
- psrlw mm4, 1
- psrlw mm5, 1
-noadd01:
- packuswb mm4, mm5 ; pack the eight words back into bytes
- movq [eax], mm4
-
- cmp edi, 8
- jz eightwide01 ; w == 8: skip the second half; ebx already points at the next row
-
- sub ebx, edx ; Back to first row...
- movq mm4, [ebx+8] ; second 8 bytes of row
- movq mm5, mm4
- add ebx, edx ; Next row
- punpcklbw mm4, mm0
- punpckhbw mm5, mm0
- movq mm2, [ebx+8] ; second 8 bytes of the row below
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
-
- paddw mm4, mm2 ; Average mm4/mm5 and mm2/mm3
- paddw mm5, mm3
- paddw mm4, mm1 ; + 1 so the shift rounds up
- paddw mm5, mm1
- psrlw mm4, 1
- psrlw mm5, 1
-
- cmp esi, 0
- jz noadd01w ; addflag == 0: store the interpolated value as is
-
- movq mm2, [eax+8] ; Add: current dst bytes, second half
- movq mm3, mm2
- punpcklbw mm2, mm0
- punpckhbw mm3, mm0
- paddw mm4, mm2 ; Average mm4/mm5 and mm2/mm3
- paddw mm5, mm3
- paddw mm4, mm1
- paddw mm5, mm1
- psrlw mm4, 1
- psrlw mm5, 1
-noadd01w:
- packuswb mm4, mm5 ; pack the eight words back into bytes
- movq [eax+8], mm4
-
-
-eightwide01:
- add eax, edx ; advance pdst to the next row (psrc was advanced inside the loop)
-
- sub ecx, 1 ; check h left
- jnz near predrow01m ; NOTE(review): h == 0 on entry would wrap ecx rather than exit
-
- pop esi ; restore in reverse push order
- pop edi
- pop edx
- pop ecx
- pop ebx
- pop eax
- pop ebp
- emms ; empty the MMX state before returning
- ret
-
-
-;;; The x-axis and y-axis interpolation case...
-
-global predcomp_11_mmx
-;;; Output byte = (src[x] + src[x+1] + src[x+lx] + src[x+lx+1] + 2) >> 2, then averaged with dst if addflag != 0
-;;; mm0 = [0,0,0,0]W
-;;; mm1 = [1,1,1,1]W
-;;; mm2 = [2,2,2,2]W
-align 32
-predcomp_11_mmx:
- push ebp ; save frame pointer
- mov ebp, esp ; link
-
- push eax ; save every general register used below
- push ebx
- push ecx
- push edx
- push edi
- push esi
-
- mov eax, 0x00020002 ; two words, each holding 2
- movd mm2, eax
- punpckldq mm2,mm2 ; mm2 = [2,2,2,2]W, rounding term for the divide by 4
- mov eax, 0x00010001 ; two words, each holding 1
- movd mm1, eax
- punpckldq mm1,mm1 ; mm1 = [1,1,1,1]W, rounding term for the divide by 2
- pxor mm0, mm0 ; mm0 = 0 for byte-to-word unpacking
-
- mov ebx, [ebp+8] ; get psrc
- mov eax, [ebp+12] ; get pdst
- mov edx, [ebp+16] ; get lx
- mov edi, [ebp+20] ; get w
- mov ecx, [ebp+24] ; get h
- mov esi, [ebp+28] ; Addflags
- ;; addflag is tested with cmp inside the loop; no bit-mask is built
-
-
- jmp predrow11 ; align for speed
-align 32
-predrow11:
- movq mm4, [ebx] ; mm4 and mm6 accumulate partial sums for interp.
- movq mm6, mm4
- punpcklbw mm4, mm0
- punpckhbw mm6, mm0
-
- movq mm5, [ebx+1] ; same row, one byte to the right
- movq mm7, mm5
- punpcklbw mm5, mm0
- paddw mm4, mm5
- punpckhbw mm7, mm0
- paddw mm6, mm7
-
- add ebx, edx ; update pointer to next row
-
- movq mm5, [ebx] ; first 8 bytes of the row below
- movq mm7, mm5
- punpcklbw mm5, mm0 ; Accumulate partial interpolation
- paddw mm4, mm5
- punpckhbw mm7, mm0
- paddw mm6, mm7
-
- movq mm5, [ebx+1] ; row below, one byte to the right
- movq mm7, mm5
- punpcklbw mm5, mm0
- paddw mm4, mm5
- punpckhbw mm7, mm0
- paddw mm6, mm7
-
- ;; Now round: (sum of four + 2) >> 2
- paddw mm4, mm2
- paddw mm6, mm2
- psrlw mm4, 2
- psrlw mm6, 2
-
- cmp esi, 0
- jz noadd11 ; addflag == 0: store the interpolated value as is
-
- movq mm5, [eax] ; Add: current dst bytes
- movq mm7, mm5
- punpcklbw mm5, mm0
- punpckhbw mm7, mm0
- paddw mm4, mm5 ; Average mm4/mm6 and mm5/mm7
- paddw mm6, mm7
- paddw mm4, mm1
- paddw mm6, mm1
- psrlw mm4, 1
- psrlw mm6, 1
-
-noadd11:
- packuswb mm4, mm6 ; pack the eight words back into bytes
- movq [eax], mm4
-
- cmp edi, 8
- jz near eightwide11 ; w == 8: skip the second half; ebx already points at the next row
-
- sub ebx, edx ; Back to first row...
-
- movq mm4, [ebx+8] ; mm4 and mm6 accumulate partial sums for interp.
- movq mm6, mm4
- punpcklbw mm4, mm0
- punpckhbw mm6, mm0
-
- movq mm5, [ebx+9] ; second half, one byte to the right
- movq mm7, mm5
- punpcklbw mm5, mm0
- paddw mm4, mm5
- punpckhbw mm7, mm0
- paddw mm6, mm7
-
- add ebx, edx ; update pointer to next row
-
- movq mm5, [ebx+8] ; second 8 bytes of the row below
- movq mm7, mm5
- punpcklbw mm5, mm0 ; Accumulate partial interpolation
- paddw mm4, mm5
- punpckhbw mm7, mm0
- paddw mm6, mm7
-
- movq mm5, [ebx+9] ; row below, second half, one byte to the right
- movq mm7, mm5
- punpcklbw mm5, mm0
- paddw mm4, mm5
- punpckhbw mm7, mm0
- paddw mm6, mm7
-
- ;; Now round: (sum of four + 2) >> 2
- paddw mm4, mm2
- paddw mm6, mm2
- psrlw mm4, 2
- psrlw mm6, 2
-
- cmp esi, 0
- jz noadd11w ; addflag == 0: store the interpolated value as is
-
- movq mm5, [eax+8] ; Add and average: current dst bytes, second half
- movq mm7, mm5
- punpcklbw mm5, mm0
- punpckhbw mm7, mm0
- paddw mm4, mm5 ; Average mm4/mm6 and mm5/mm7
- paddw mm6, mm7
- paddw mm4, mm1
- paddw mm6, mm1
- psrlw mm4, 1
- psrlw mm6, 1
-noadd11w:
- packuswb mm4, mm6 ; pack the eight words back into bytes
- movq [eax+8], mm4
-
-eightwide11:
- add eax, edx ; advance pdst to the next row (psrc was advanced inside the loop)
-
-
- sub ecx, 1 ; check h left
- jnz near predrow11 ; NOTE(review): h == 0 on entry would wrap ecx rather than exit
-
- pop esi ; restore in reverse push order
- pop edi
- pop edx
- pop ecx
- pop ebx
- pop eax
- pop ebp
- emms ; empty the MMX state before returning
- ret
-
-
-