4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
5 ;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
7 ; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
8 ; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
10 ;; Yes, I tried prefetch-ing. It makes no difference or makes
14 ; This program is free software; you can redistribute it and/or
15 ; modify it under the terms of the GNU General Public License
16 ; as published by the Free Software Foundation; either version 2
17 ; of the License, or (at your option) any later version.
19 ; This program is distributed in the hope that it will be useful,
20 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
21 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 ; GNU General Public License for more details.
24 ; You should have received a copy of the GNU General Public License
25 ; along with this program; if not, write to the Free Software
26 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
34 ; int dist1_00(char *blk1,char *blk2,int lx,int h,int distlim);
35 ; distlim unused - costs more to check than the savings of
36 ; aborting the computation early from time to time...
;
; Sum of absolute differences (SAD) between two pixel blocks, no interpolation.
; Arguments are read from the stack relative to ebp: blk1 at +8, blk2 at +12,
; lx (added to the row pointer after each row) at +16, h (row count) at +20.
; Each row contributes 16 bytes, handled as two 8-byte psadbw operations.
; The result is returned in eax.
; NOTE(review): this listing does not show the entry label, the frame setup
; after the push of ebp, any advance of ebx between rows, or the loop branch
; and epilogue - confirm those against the complete source.
42 ; mm0 = distance accumulator
54 push ebp ; save frame pointer of the caller
61 pxor mm0, mm0 ; zero accumulator
63 mov eax, [ebp+8] ; eax = p1 (blk1)
65 mov ebx, [ebp+12] ; ebx = p2 (blk2)
66 mov edx, [ebp+16] ; edx = lx, added to the row pointer per row
68 mov ecx, [ebp+20] ; ecx = rowsleft (h)
; --- row 1 of the unrolled pair ---
72 movq mm4, [eax] ; mm4 = first 8 bytes of p1 (row 1)
73 psadbw mm4, [ebx] ; mm4 = SAD against first 8 bytes of p2 (row 1)
74 movq mm5, [eax+8] ; mm5 = next 8 bytes of p1 (row 1)
75 add eax, edx ; p1 += lx; both loads for this row are already issued
76 paddd mm0, mm4 ; accumulate difference
78 psadbw mm5, [ebx+8] ; mm5 = SAD against next 8 bytes of p2 (row 1)
80 paddd mm0, mm5 ; accumulate difference
; --- row 2 of the unrolled pair (different temporaries than row 1) ---
83 movq mm6, [eax] ; mm6 = first 8 bytes of p1 (row 2)
84 psadbw mm6, [ebx] ; mm6 = SAD against first 8 bytes of p2 (row 2)
85 movq mm4, [eax+8] ; mm4 = next 8 bytes of p1 (row 2)
86 add eax, edx ; p1 += lx
87 paddd mm0, mm6 ; accumulate difference
89 psadbw mm4, [ebx+8] ; mm4 = SAD against next 8 bytes of p2 (row 2)
91 paddd mm0, mm4 ; accumulate difference
; Disabled early-termination (distlim) logic, kept for reference only:
93 ;psubd mm2, mm3 ; decrease rowsleft
94 ;movq mm5, mm1 ; copy distlim
95 ;pcmpgtd mm5, mm0 ; distlim > dist?
96 ;pand mm2, mm5 ; mask rowsleft with answer
97 ;movd ecx, mm2 ; move rowsleft to ecx
99 ;add eax, edx ; update pointer to next row
100 ;add ebx, edx ; ditto
102 ;test ecx, ecx ; check rowsleft
106 movd eax, mm0 ; return value = low 32 bits of the accumulator
119 global dist1_00_Ammxe
120 ;; This is a special version that only does aligned accesses...
121 ;; Wonder if it'll make it faster on a P-III
122 ;; ANSWER: NO, it is slower, hence no longer used.
124 ; int dist1_00(char *blk1,char *blk2,int lx,int h,int distlim);
125 ; distlim unused - costs more to check than the savings of
126 ; aborting the computation early from time to time...
;
; Approach visible below: the low three bits of an address give its byte
; misalignment. eax (p1) is moved back by that amount so that quadword loads
; from it are 8-byte aligned, and loaded quadwords are shifted right by mm2
; (misalignment converted to bits) before being compared with psadbw.
; When the misalignment is zero the code branches to dist1_00_0misalign.
; NOTE(review): this listing does not show the entry label, where ebx gets the
; address that is masked with 7 (presumably a copy of p1), where mm6 is loaded,
; how mm3/mm5/mm7 are combined, or the loop and epilogue - confirm against the
; complete source.
132 ; mm0 = distance accumulator
134 ; mm2 = right shift to adjust for mis-align
135 ; mm3 = left shift to adjust for mis-align
143 push ebp ; save frame pointer of the caller
150 pxor mm0, mm0 ; zero accumulator
152 mov eax, [ebp+8] ; eax = p1 (blk1)
154 and ebx, 7 ; ebx = misalignment in bytes (0..7); sets ZF when aligned
156 jz near dist1_00_0misalign ; already aligned: no shifting needed
157 sub eax, ebx ; round eax down by the misalignment
158 mov ecx, 8 ; ecx = 8; NOTE(review): the subtraction giving 8-misalignment is not visible in this listing
160 shl ebx, 3 ; bytes -> bits (multiply by 8) for use as a shift count
162 movd mm2, ebx ; mm2 = right-shift count in bits (used by psrlq below)
163 movd mm3, ecx ; mm3 = complementary count (left shift, per the register notes above)
165 mov ebx, [ebp+12] ; ebx = p2 (blk2); the misalignment value is no longer needed in ebx
166 mov edx, [ebp+16] ; edx = lx, added to the row pointer per row
167 mov ecx, [ebp+20] ; ecx = rowsleft (h)
171 movq mm4, [eax] ; load first 8 bytes of aligned p1 (row 1)
172 movq mm5, [eax+8] ; load next 8 bytes of aligned p1 (row 1)
174 psrlq mm4, mm2 ; shift out the bytes that precede the real p1
177 psadbw mm4, [ebx] ; compare to first 8 bytes of p2
179 movq mm7, [eax+16] ; third aligned quadword: a misaligned 16-byte row spans three of them
180 add eax, edx ; update pointer to next row
181 psrlq mm6, mm2 ; same right shift for the 2nd 8 bytes of p1
186 paddd mm0, mm4 ; accumulate difference
188 psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
190 paddd mm0, mm6 ; accumulate difference
195 movd eax, mm0 ; return value = low 32 bits of the accumulator
208 ; int dist1_01(char *blk1,char *blk2,int lx,int h);
;
; SAD of blk2 against a horizontally interpolated blk1: each 8-byte group of
; p1 is averaged (pavgb) with the group starting one byte to its right before
; psadbw compares it with p2. 16 bytes per row, two rows per loop iteration.
; Arguments are read relative to ebp (+8 blk1, +12 blk2, +16 lx, +20 h) and
; the result is returned in eax.
; NOTE(review): the entry label, prologue, the nextrow01 label, any advance of
; ebx and the register restores are not visible in this listing.
215 ; mm0 = distance accumulator
218 ; mm3 = 2 (rows per loop) - not referenced in the code shown; ecx is decremented by an immediate 2
233 pxor mm0, mm0 ; zero accumulator
235 mov eax, [ebp+8] ; eax = p1 (blk1)
236 mov ebx, [ebp+12] ; ebx = p2 (blk2)
237 mov edx, [ebp+16] ; edx = lx, added to the row pointer per row
239 mov ecx, [ebp+20] ; ecx = rowsleft (h)
240 jmp nextrow01 ; enter the row loop
; --- row 1 of the unrolled pair ---
243 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
244 pavgb mm4, [eax+1] ; average each byte with its right-hand neighbour
245 psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
246 paddd mm0, mm4 ; accumulate difference
248 movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
249 pavgb mm5, [eax+9] ; average with right-hand neighbour
250 psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
251 paddd mm0, mm5 ; accumulate difference
253 add eax, edx ; update pointer to next row
; --- row 2 of the unrolled pair ---
256 movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
257 pavgb mm6, [eax+1] ; average with right-hand neighbour
258 psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2)
259 paddd mm0, mm6 ; accumulate difference
; NOTE(review): no pavgb for the half below is visible in this listing, unlike
; the other three halves - confirm it exists in the complete source.
261 movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2)
263 psadbw mm7, [ebx+8] ; compare to next 8 bytes of p2 (row 2)
264 paddd mm0, mm7 ; accumulate difference
266 add eax, edx ; update pointer to next row
; NOTE(review): the loop ends only when ecx reaches exactly 0, so h is
; presumably always a non-zero even number - confirm with callers.
269 sub ecx, 2 ; two rows consumed; ZF set when none are left
270 jnz nextrow01 ; loop while rows remain
272 movd eax, mm0 ; return value = low 32 bits of the accumulator
278 pop ebp ; restore frame pointer of the caller
280 emms ; leave MMX state so x87 code can run again
281 ret ; return to caller with the result in eax
286 ; int dist1_10(char *blk1,char *blk2,int lx,int h);
;
; SAD of blk2 against blk1 averaged (pavgb) with a second row addressed
; through edi. 16 bytes per row, two rows per loop iteration. Arguments are
; read relative to ebp (+8 blk1, +12 blk2, +16 lx, +20 h) and the result is
; returned in eax.
; NOTE(review): the setup and per-row advance of edi are not visible in this
; listing (presumably edi = p1 + lx, i.e. the row below) - confirm. The entry
; label, frame setup, nextrow10 label, advance of ebx and register restores
; are likewise not shown.
294 ; mm0 = distance accumulator
296 ; mm3 = 2 (rows per loop) - not initialised in the code shown; see the psubd note below
304 push ebp ; save frame pointer of the caller
312 pxor mm0, mm0 ; zero accumulator
314 mov eax, [ebp+8] ; eax = p1 (blk1)
315 mov ebx, [ebp+12] ; ebx = p2 (blk2)
316 mov edx, [ebp+16] ; edx = lx, added to the row pointer per row
319 mov ecx, [ebp+20] ; ecx = rowsleft (h)
320 jmp nextrow10 ; enter the row loop
; --- row 1 of the unrolled pair ---
323 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
324 pavgb mm4, [edi] ; average with the matching bytes at edi
325 psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
326 paddd mm0, mm4 ; accumulate difference
328 movq mm5, [eax+8] ; load next 8 bytes of p1 (row 1)
329 pavgb mm5, [edi+8] ; average with the matching bytes at edi+8
330 psadbw mm5, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
331 paddd mm0, mm5 ; accumulate difference
333 add eax, edx ; update pointer to next row
; --- row 2 of the unrolled pair ---
337 movq mm6, [eax] ; load first 8 bytes of p1 (row 2)
338 pavgb mm6, [edi] ; average with the matching bytes at edi
339 psadbw mm6, [ebx] ; compare to first 8 bytes of p2 (row 2)
340 paddd mm0, mm6 ; accumulate difference
; NOTE(review): no pavgb for the half below is visible in this listing, unlike
; the other three halves - confirm it exists in the complete source.
342 movq mm7, [eax+8] ; load next 8 bytes of p1 (row 2)
344 psadbw mm7, [ebx+8] ; compare to next 8 bytes of p2 (row 2)
345 paddd mm0, mm7 ; accumulate difference
; NOTE(review): mm2 and mm3 are never set up in the code shown and the loop is
; driven by ecx below, so this psubd looks like a leftover from the disabled
; distlim scheme - confirm before removing.
347 psubd mm2, mm3 ; decrease rowsleft
349 add eax, edx ; update pointer to next row
353 sub ecx, 2 ; check rowsleft (we're doing 2 at a time)
354 jnz nextrow10 ; loop while rows remain (ends only when ecx hits exactly 0)
356 movd eax, mm0 ; return value = low 32 bits of the accumulator
363 pop ebp ; restore frame pointer of the caller
365 emms ; leave MMX state so x87 code can run again
366 ret ; return to caller with the result in eax
371 ; int dist1_11(char *blk1,char *blk2,int lx,int h);
;
; SAD of blk2 against an interpolated blk1. In the code shown each 8-byte
; group of p1 is averaged (pavgb) with the matching group addressed through
; edi before psadbw compares it with p2. 16 bytes per row, two rows per loop
; iteration. Arguments are read relative to ebp (+8 blk1, +12 blk2, +16 lx,
; +20 h) and the result is returned in eax.
; NOTE(review): the setup and advance of edi, any further averaging steps,
; the nextrow11 label, the advance of ebx and the pops matching the pushes
; below are not visible in this listing - confirm against the complete source.
380 ; mm0 = distance accumulator
382 ; mm3 = 2 (rows per loop) - not referenced in the code shown; ecx is decremented by an immediate 2
390 push ebp ; save frame pointer of the caller
391 mov ebp, esp ; ebp = frame base, so the arguments are at [ebp+8] and up
393 push ebx ; preserve ebx (loaded with p2 below)
394 push ecx ; preserve ecx (row counter below)
395 push edx ; preserve edx (loaded with lx below)
398 pxor mm0, mm0 ; zero accumulator
400 mov eax, [ebp+8] ; eax = p1 (blk1)
401 mov ebx, [ebp+12] ; ebx = p2 (blk2)
402 mov edx, [ebp+16] ; edx = lx, added to the row pointer per row
405 mov ecx, [ebp+20] ; ecx = rowsleft (h)
406 jmp nextrow11 ; enter the row loop
; --- row 1 of the unrolled pair ---
409 movq mm4, [eax] ; load first 8 bytes of p1 (row 1)
410 pavgb mm4, [edi] ; average with the matching bytes at edi
414 psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 1)
415 paddd mm0, mm4 ; accumulate difference
417 movq mm6, [eax+8] ; load next 8 bytes of p1 (row 1)
418 pavgb mm6, [edi+8] ; average with the matching bytes at edi+8
422 psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 1)
423 paddd mm0, mm6 ; accumulate difference
425 add eax, edx ; update pointer to next row
; --- row 2 of the unrolled pair (same code, eax has moved on one row) ---
429 movq mm4, [eax] ; load first 8 bytes of p1 (row 2)
430 pavgb mm4, [edi] ; average with the matching bytes at edi
434 psadbw mm4, [ebx] ; compare to first 8 bytes of p2 (row 2)
435 paddd mm0, mm4 ; accumulate difference
437 movq mm6, [eax+8] ; load next 8 bytes of p1 (row 2)
438 pavgb mm6, [edi+8] ; average with the matching bytes at edi+8
442 psadbw mm6, [ebx+8] ; compare to next 8 bytes of p2 (row 2)
443 paddd mm0, mm6 ; accumulate difference
445 add eax, edx ; update pointer to next row
450 sub ecx, 2 ; two rows consumed; ZF set when none are left
451 jnz near nextrow11 ; loop while rows remain (ends only when ecx hits exactly 0)
453 movd eax, mm0 ; return value = low 32 bits of the accumulator
460 pop ebp ; restore frame pointer of the caller
462 emms ; leave MMX state so x87 code can run again
463 ret ; return to caller with the result in eax
467 ; int dist22_mmxe(unsigned char *blk1,unsigned char *blk2,int flx,int fh);
;
; SAD between two blocks, 8 bytes per row (one psadbw per row), two rows per
; unrolled pass. Arguments are read relative to ebp (+8 blk1, +12 blk2,
; +16 row stride); the running total is kept in mm0.
; NOTE(review): this listing does not show the entry label, the frame setup,
; the load of the row count, any advance of ebx, the loop branch or the
; return sequence - confirm against the complete source.
474 ; mm0 = distance accumulator
476 ; mm3 = 2 (rows per loop) - not referenced in the code shown
484 push ebp ; save frame pointer of the caller
491 pxor mm0, mm0 ; zero accumulator
493 mov eax, [ebp+8] ; eax = p1 (blk1)
494 mov ebx, [ebp+12] ; ebx = p2 (blk2)
495 mov edx, [ebp+16] ; edx = row stride (flx in the prototype)
; --- row 1 of the unrolled pair ---
501 movq mm4, [eax] ; load the 8 bytes of p1 (row 1)
502 add eax, edx ; p1 += stride; the load for this row is already issued
503 psadbw mm4, [ebx] ; compare to the 8 bytes of p2 (row 1)
505 paddd mm0, mm4 ; accumulate difference
; --- row 2 of the unrolled pair ---
508 movq mm6, [eax] ; load the 8 bytes of p1 (row 2)
509 add eax, edx ; p1 += stride
510 psadbw mm6, [ebx] ; compare to the 8 bytes of p2 (row 2)
512 paddd mm0, mm6 ; accumulate difference
535 ; int dist44_mmxe(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
;
; SAD over rows of 4 bytes. Each pass loads 8 bytes of p1 and 4 bytes of p2,
; interleaves the low 4 bytes of each with mm2 via punpcklbw, and compares
; the results with psadbw, accumulating into mm0. Arguments are read relative
; to ebp (+8 blk1, +12 blk2, +16 qlx, +20 qh).
; The code for a second (right-hand) block is commented out, so only the
; left-block accumulator mm0 is updated in the code shown.
; NOTE(review): this listing does not show the entry label or prologue, the
; initialisation of mm2 (presumably zero), how the 4 bytes in ecx reach mm5,
; any save of esi, the nextrowqd label, the advance of ebx, the loop branch or
; the move of the result into eax - confirm against the complete source.
543 ; mm0 = distance accumulator left block p1
544 ; mm1 = distance accumulator right block p1
562 pxor mm0, mm0 ; zero accumulator
565 mov eax, [ebp+8] ; eax = p1 (blk1)
566 mov ebx, [ebp+12] ; ebx = p2 (blk2)
567 mov edx, [ebp+16] ; edx = qlx, added to the row pointer per row
569 mov esi, [ebp+20] ; esi = rowsleft (qh)
570 jmp nextrowqd ; enter the row loop
573 movq mm4, [eax] ; load 8 bytes of p1 (two blocks!)
574 add eax, edx ; update pointer to next row
576 mov ecx, [ebx] ; load 4 bytes of p2 into a general register
577 punpcklbw mm4, mm2 ; interleave bytes 0..3 of p1 with bytes of mm2 (spaced out)
579 punpcklbw mm5, mm2 ; interleave bytes 0..3 of p2 with bytes of mm2 (spaced out)
580 psadbw mm4, mm5 ; compare to left block
583 ; punpckhbw mm6, mm2 ; mm6 = bytes 4..7 p1 (spaced out)
585 paddd mm0, mm4 ; accumulate difference left block
587 ; psadbw mm6,mm5 ; compare to right block
590 ; paddd mm1, mm6 ; accumulate difference right block
605 pop ebp ; restore frame pointer of the caller
607 emms ; leave MMX state so x87 code can run again
608 ret ; return to caller