2 ;;; mblockq_sad_mmxe.s:
4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblock
5 ;;; quads (2 by 2 squares of adjacent macroblocks)
7 ;;; Explanation: the motion compensation search at 1-pel and 2*2 sub-sampled
8 ;;; evaluates macroblock quads. A lot of memory accesses can be saved
9 ;;; if each quad is done together rather than each macroblock in the
10 ;;; quad handled individually.
12 ;;; TODO: Really there ought to be MMX versions and the function's
13 ;;; specification should be documented...
15 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
19 ; This program is free software; you can redistribute it and/or
20 ; modify it under the terms of the GNU General Public License
21 ; as published by the Free Software Foundation; either version 2
22 ; of the License, or (at your option) any later version.
24 ; This program is distributed in the hope that it will be useful,
25 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
26 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
27 ; GNU General Public License for more details.
29 ; You should have received a copy of the GNU General Public License
30 ; along with this program; if not, write to the Free Software
31 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
35 ;;; CURRENTLY not used but used in testing as reference for tweaks...
; NOTE(review): the symbol exported below is mblockq_sad1_REF, but the
; prototype comment that follows names mblockq_dist1_REF -- confirm which
; spelling the entry label and the callers actually use.
36 global mblockq_sad1_REF
38 ; void mblockq_dist1_REF(char *blk1,char *blk2,int lx,int h,int *weightvec);
;
; Reference routine: accumulates four sums of absolute differences (psadbw)
; between rows of p1 and p2 in mm0..mm3.
;
; Registers, as used by the instructions visible in this excerpt:
;   eax = p1 pointer, read from [ebp+8], advanced by edx once per row
;   ebx = p2 pointer, read from [ebp+12]
;   edx = lx, read from [ebp+16]
;   edi = rowsleft, read from [ebp+20]
;   mm0, mm1 = SAD accumulators (their section headers are not visible here)
;   mm2 = accumulator for the (0,+2) SAD
;   mm3 = accumulator for the (+2,+2) SAD
;   mm4..mm7 = temporaries holding row data / psadbw results
; NOTE(review): the [ebp+N] offsets match the prototype's argument order only
; if ebp was set from esp right after the push -- that instruction is not
; visible in this excerpt.
58 push ebp ; save caller's frame pointer
67 pxor mm0, mm0 ; clear accumulator mm0
71 mov eax, [ebp+8] ; eax = p1
72 mov ebx, [ebp+12] ; ebx = p2
73 mov edx, [ebp+16] ; edx = lx
75 mov edi, [ebp+20] ; edi = rowsleft
84 movq mm4, [eax] ; mm4 = 1st 8 bytes of the current p1 row
87 psadbw mm4, mm5 ; mm4 = SAD of mm4 vs mm5 (mm5 loaded in a line not shown; originally noted as 1st 8 bytes of p2)
88 paddd mm0, mm4 ; mm0 += that SAD
89 movq mm4, [eax+8] ; mm4 = 2nd 8 bytes of the current p1 row
91 psadbw mm4, [ebx+8] ; mm4 = SAD vs the 8 bytes at [ebx+8] (2nd 8 bytes on the p2 side)
92 paddd mm0, mm4 ; mm0 += that SAD
100 psadbw mm6, [ebx] ; mm6 = SAD of mm6 vs the 8 bytes at [ebx] (mm6 loaded in a line not shown)
101 paddd mm2, mm6 ; mm2 += that SAD
102 psadbw mm7, [ebx+8] ; mm7 = SAD of mm7 vs the 8 bytes at [ebx+8] (mm7 loaded in a line not shown)
113 psadbw mm4, mm5 ; mm4 = SAD of mm4 vs mm5 (both loaded in lines not shown)
114 paddd mm1, mm4 ; mm1 += that SAD
117 psadbw mm4, [ebx+8] ; mm4 = SAD vs the 8 bytes at [ebx+8]
118 paddd mm1, mm4 ; mm1 += that SAD
123 ;; Do the (+2, +2 ) SAD
125 psadbw mm6, [ebx] ; mm6 = SAD of mm6 vs the 8 bytes at [ebx]
126 psadbw mm7, [ebx+8] ; mm7 = SAD of mm7 vs the 8 bytes at [ebx+8]
128 paddd mm3, mm6 ; mm3 += SAD held in mm6
132 add eax, edx ; advance p1 pointer by lx to the next row
136 jnz near nextrow_block_d1 ; loop while ZF is clear; the flag-setting instruction (presumably the row-counter update) is not visible here
138 ;; Do the last row of the (0,+2) SAD
140 movq mm4, [eax] ; mm4 = 1st 8 bytes of the p1 row at eax
141 movq mm5, [eax+8] ; mm5 = 2nd 8 bytes of the p1 row at eax
143 psadbw mm4, [ebx] ; mm4 = SAD vs the 8 bytes at [ebx]
144 psadbw mm5, [ebx+8] ; mm5 = SAD vs the 8 bytes at [ebx+8]
145 paddd mm2, mm4 ; mm2 += SAD held in mm4
151 ;; Do the last row of the (+2, +2) SAD
152 psadbw mm4, [ebx] ; mm4 = SAD vs the 8 bytes at [ebx] (mm4 reloaded in a line not shown)
153 psadbw mm5, [ebx+8] ; mm5 = SAD vs the 8 bytes at [ebx+8]
154 paddd mm3, mm4 ; mm3 += SAD held in mm4
158 mov eax, [ebp+24] ; eax = weightvec pointer; the stores through it are not visible in this excerpt
177 global mblockq_dist1_mmxe
179 ; void mblockq_dist1_mmxe(char *blk1,char *blk2,int lx,int h,int *weightvec);
;
; Same four SADs as the reference routine, but packed two per register:
; each of mm0 and mm1 carries two 32-bit sums (see the two lines below).
;
; The pshufw immediate 2*1 + 3*4 + 0*16 + 1*64 (= 78) selects source words
; 2,3,0,1 for destination words 0,1,2,3, i.e. it swaps the low and high
; 32-bit halves of the register. psadbw leaves its sum in the low word with
; the rest of the register zero, so a following paddd only changes the low
; dword; the swaps choose which of the two packed sums is in that position.
;
; Registers, as used by the instructions visible in this excerpt:
;   eax = p1 pointer ([ebp+8]), advanced by edx once per row
;   ebx = p2 pointer ([ebp+12])
;   edx = lx ([ebp+16])
;   edi = rowsleft ([ebp+20])
;   mm2..mm7 = row data / temporaries (several loads are not visible here)
188 ; mm0 = SAD (x+0,y+0),SAD (x+0,y+2)
189 ; mm1 = SAD (x+2,y+0),SAD (x+2,y+2)
198 push ebp ; save caller's frame pointer
207 mov eax, [ebp+8] ; eax = p1
209 pxor mm0, mm0 ; clear both packed sums in mm0
211 mov ebx, [ebp+12] ; ebx = p2
212 mov edx, [ebp+16] ; edx = lx
214 mov edi, [ebp+20] ; edi = rowsleft
221 ;; Do the (+0,+0) SAD
223 movq mm4, [eax] ; mm4 = 1st 8 bytes of the current p1 row
226 psadbw mm4, mm5 ; mm4 = SAD of mm4 vs mm5 (mm5 loaded in a line not shown; originally noted as 1st 8 bytes of p2)
227 paddd mm0, mm4 ; low dword of mm0 += that SAD
228 movq mm4, [eax+8] ; mm4 = 2nd 8 bytes of the current p1 row
230 psadbw mm4, [ebx+8] ; mm4 = SAD vs the 8 bytes at [ebx+8]
231 paddd mm0, mm4 ; low dword of mm0 += that SAD
239 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm0: bring the other packed sum into the low dword
241 psadbw mm6, mm2 ; mm6 = SAD of mm6 vs mm2 (both loaded in lines not shown)
242 paddd mm0, mm6 ; low dword of mm0 += that SAD
244 psadbw mm7, mm3 ; mm7 = SAD of mm7 vs mm3 (both loaded in lines not shown)
247 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm0 back
255 psadbw mm4, mm5 ; mm4 = SAD of mm4 vs mm5 (both loaded in lines not shown)
256 paddd mm1, mm4 ; low dword of mm1 += that SAD
261 psadbw mm4, [ebx+8] ; mm4 = SAD vs the 8 bytes at [ebx+8]
262 paddd mm1, mm4 ; low dword of mm1 += that SAD
267 ;; Do the (+2, +2 ) SAD
269 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm1: bring the other packed sum into the low dword
270 psadbw mm6, mm2 ; mm6 = SAD of mm6 vs mm2
271 psadbw mm7, mm3 ; mm7 = SAD of mm7 vs mm3
273 paddd mm1, mm6 ; low dword of mm1 += SAD held in mm6
275 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm1 back
278 add eax, edx ; advance p1 pointer by lx to the next row
282 jnz near nextrow_block_e1 ; loop while ZF is clear; the flag-setting instruction (presumably the row-counter update) is not visible here
284 ;; Do the last row of the (0,+2) SAD
285 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm0 so the next paddd lands on the other packed sum
286 movq mm4, [eax] ; mm4 = 1st 8 bytes of the p1 row at eax
287 movq mm5, [eax+8] ; mm5 = 2nd 8 bytes of the p1 row at eax
289 psadbw mm4, [ebx] ; mm4 = SAD vs the 8 bytes at [ebx]
290 psadbw mm5, [ebx+8] ; mm5 = SAD vs the 8 bytes at [ebx+8]
291 paddd mm0, mm4 ; low dword of mm0 += SAD held in mm4
295 ;; Do the last row of the (+2, +2) SAD
296 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm1 so the next paddd lands on the other packed sum
300 psadbw mm4, [ebx] ; mm4 = SAD vs the 8 bytes at [ebx] (mm4 reloaded in a line not shown)
301 psadbw mm5, [ebx+8] ; mm5 = SAD vs the 8 bytes at [ebx+8]
302 paddd mm1, mm4 ; low dword of mm1 += SAD held in mm4
306 mov eax, [ebp+24] ; eax = weightvec pointer; the stores through it are not visible in this excerpt
308 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm0 again
310 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap halves of mm1 again
325 global mblockq_dist22_mmxe
327 ; void mblockq_dist22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh, int* resvec);
;
; Register roles (the original list is only partly visible in this excerpt):
; The pshufw immediate 2*1 + 3*4 + 0*16 + 1*64 (= 78) used below swaps the
; low and high 32-bit halves of its register.
334 ; mm0 = distance accumulator
335 ; mm1 = distance accumulator
336 ; mm2 = previous p1 row
337 ; mm3 = previous p1 displaced by 1 byte...
341 ; mm7 = temp / 0 if first row 0xff otherwise
346 push ebp ; save caller's frame pointer
353 pxor mm0, mm0 ; zero accumulator mm0
354 pxor mm1, mm1 ; zero accumulator mm1
355 pxor mm2, mm2 ; clear mm2 (listed above as the previous p1 row)
356 pxor mm3, mm3 ; clear mm3 (listed above as the previous p1 row displaced by 1 byte)
358 mov eax, [ebp+8] ; eax = p1
359 mov ebx, [ebp+12] ; ebx = p2
360 mov edx, [ebp+16] ; edx = lx
363 movq mm3, [eax+edx+1] ; mm3 = 8 bytes at eax + edx + 1
367 movq mm5, [ebx] ; load previous row reference block
368 ; mm2 / mm3 contain the current row target block
370 psadbw mm2, mm5 ; Compare (x+0,y+2): mm2 = SAD of mm2 vs mm5
373 psadbw mm3, mm5 ; Compare (x+2,y+2): mm3 = SAD of mm3 vs mm5
374 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap the two 32-bit halves of mm1
377 pshufw mm1, mm1, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap the two 32-bit halves of mm1 back
379 movq mm2, [eax] ; Load current row target block into mm2 / mm3 (the mm3 load is not visible here)
387 psadbw mm6, mm5 ; Compare (x+0,y+0): mm6 = SAD of mm6 vs mm5
389 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap the two 32-bit halves of mm0
390 psadbw mm7, mm5 ; Compare (x+2,y+0): mm7 = SAD of mm7 vs mm5
392 pshufw mm0, mm0, 2*1 + 3 * 4 + 0 * 16 + 1 * 64 ; swap the two 32-bit halves of mm0 back