4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
5 ;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
7 ; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
8 ; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
11 ; This program is free software; you can redistribute it and/or
12 ; modify it under the terms of the GNU General Public License
13 ; as published by the Free Software Foundation; either version 2
14 ; of the License, or (at your option) any later version.
16 ; This program is distributed in the hope that it will be useful,
17 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ; GNU General Public License for more details.
21 ; You should have received a copy of the GNU General Public License
22 ; along with this program; if not, write to the Free Software
23 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
31 ; int dist1_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h, int distlim);
; Accumulates the absolute byte differences between rows of blk1 and blk2
; (per the comments below); each row is read as two 8-byte halves, at
; offsets 0 and 8 from the row pointers.
32 ; N.b. distlim is *ignored* as testing for it is more expensive than the
33 ; occasional saving by aborting the computation early...
34 ; esi = p1 (init: blk1)
35 ; edi = p2 (init: blk2)
; edx = lx (added to esi to step to the next row)
37 ; ecx = rowsleft (init: h)
40 ; mm0 = distance accumulators (4 words)
; NOTE(review): the arguments are read through [ebp+N], but no `mov ebp, esp`,
; loop label/branch or `ret` is visible in this listing -- confirm against the
; complete routine before relying on it.
52 push ebp ; save caller's frame pointer
55 push ebx ; Saves registers (callee-saves convention in
56 push ecx ; x86 GCC it seems)
61 pxor mm0, mm0 ; zero accumulators
63 mov esi, [ebp+8] ; get p1 (1st argument, blk1)
64 mov edi, [ebp+12] ; get p2 (2nd argument, blk2)
65 mov edx, [ebp+16] ; get lx (3rd argument)
66 mov ecx, [ebp+20] ; get rowsleft (4th argument, h)
67 ;mov ebx, [ebp+24] ; distlim (5th argument, deliberately not loaded)
71 movq mm4, [esi] ; load first 8 bytes of p1 row
72 movq mm5, [edi] ; load first 8 bytes of p2 row
74 movq mm7, mm4 ; mm7 := copy of mm4; first step of mm5 = abs(mm4-mm5)
79 ;; Add the abs(mm4-mm5) bytes to the accumulators
80 movq mm2, [esi+8] ; load second 8 bytes of p1 row (interleaved)
81 movq mm7,mm5 ; mm7 := copy of mm5; presumably widened to words before accumulation
83 movq mm3, [edi+8] ; load second 8 bytes of p2 row (interleaved)
88 ;; This is logically where the mm2, mm3 loads would go...
90 movq mm7, mm2 ; mm7 := copy of mm2; first step of mm3 = abs(mm2-mm3)
95 ;; Add the abs(mm2-mm3) bytes to the accumulators
96 movq mm7,mm3 ; mm7 := copy of mm3; presumably widened to words before accumulation
101 add esi, edx ; p1 += lx: advance to next row
113 ;; Sum the Accumulators
114 movq mm5, mm0 ; mm5 := copy of mm0; first step of folding to [W0+W2, W1+W3]
119 movq mm7, mm4 ; mm7 := copy of mm4; next step of folding to W0+W2+W1+W3
122 movd eax, mm4 ; return value := low 32 bits of mm4
133 emms ; empty the MMX state so x87 FPU code can run afterwards
136 ;;; dist1_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
137 ;;; We're reduced to seven bits as otherwise we also have to mess
138 ;;; horribly with carries, and signed-only comparisons make the code
139 ;;; simply enormous (and probably barely faster than a simple loop).
140 ;;; Since signals with a bona-fide 8bit res will be rare we simply
141 ;;; take the precision hit...
142 ;;; Actually we don't worry about carries from the low-order bits
143 ;;; either, so 1/4 of the time we'll be 1 too low...
145 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
148 ; This program is free software; you can redistribute it and/or
149 ; modify it under the terms of the GNU General Public License
150 ; as published by the Free Software Foundation; either version 2
151 ; of the License, or (at your option) any later version.
153 ; This program is distributed in the hope that it will be useful,
154 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
155 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
156 ; GNU General Public License for more details.
158 ; You should have received a copy of the GNU General Public License
159 ; along with this program; if not, write to the Free Software
160 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
168 ; int dist1_01_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
; Absolute-difference sum of p2 against p1 combined with its right-hand
; neighbour (the bytes at p1 and p1+1 are added as words); 16 bytes per row
; are processed as four groups of 4.
; NOTE(review): several steps (labels, unpacking of mm5/mm6, masking, the
; loop branch) are not visible in this listing -- confirm against the
; complete routine.
170 ; esi = p1 (init: 1st argument)
171 ; edi = p2 (init: 2nd argument)
; edx = lx (added to esi to step to the next row)
172 ; ecx = rowsleft (init: h)
175 ; mm0 = distance accumulators (4 words)
; mm2 = saved copy of the 8 bytes at p1 (reused for the second 4 bytes)
; mm3 = saved copy of the 8 bytes at p1+1 (reused for the second 4 bytes)
179 ; mm4 = temp 4 bytes in words interpolating p1, p1+1
180 ; mm5 = temp 4 bytes in words from p2
181 ; mm6 = temp comparison bit mask p1,p2
182 ; mm7 = temp comparison bit mask p2,p1
187 push ebp ; save caller's frame pointer
188 mov ebp, esp ; ebp := esp, so the arguments sit at [ebp+8] and up
190 push ebx ; Saves registers (callee-saves convention in
191 push ecx ; x86 GCC it seems)
196 pxor mm0, mm0 ; zero accumulators
198 mov esi, [ebp+8] ; get p1
199 mov edi, [ebp+12] ; get p2
200 mov edx, [ebp+16] ; get lx
201 mov ecx, [ebp+20] ; rowsleft := h
202 jmp nextrowmm01 ; jump to label nextrowmm01 (presumably the row-loop entry)
207 ;; First 8 bytes of row
210 ;; First 4 bytes of 8
212 movq mm4, [esi] ; mm4 := 8 bytes at p1 (low 4 used in this half)
214 movq mm2, mm4 ; mm2 records all 8 bytes
215 punpcklbw mm4, mm7 ; low 4 bytes of p1 to words (zero-extension assumes mm7 == 0 -- TODO confirm)
217 movq mm6, [esi+1] ; mm6 := 8 bytes at p1+1 (low 4 used in this half)
218 movq mm3, mm6 ; mm3 records all 8 bytes
220 paddw mm4, mm6 ; mm4 := mm4 + mm6: first 4 bytes interpolated in words
223 movq mm5, [edi] ; mm5 := 8 bytes at p2 (first 4 wanted as words)
228 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
230 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
234 paddw mm0, mm6 ; Add to accumulator
236 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
238 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
241 paddw mm0, mm5 ; Add to accumulator
243 ;; Second 4 bytes of 8
245 movq mm4, mm2 ; mm4 := saved 8 bytes of p1 (second 4 wanted as words)
248 movq mm6, mm3 ; mm6 := saved 8 bytes of p1+1 (second 4 wanted as words)
251 paddw mm4, mm6 ; mm4 := mm4 + mm6: second 4 bytes interpolated in words
254 movq mm5, mm1 ; mm5 := second 4 bytes of p2 in words; mm1 presumably prepared earlier -- TODO confirm
258 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
260 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
264 paddw mm0, mm6 ; Add to accumulator
266 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
268 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
271 paddw mm0, mm5 ; Add to accumulator
275 ;; Second 8 bytes of row
277 ;; First 4 bytes of 8
279 movq mm4, [esi+8] ; mm4 := 8 bytes at p1+8 (low 4 used in this half)
281 movq mm2, mm4 ; mm2 records all 8 bytes
282 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 to words (zero-extension assumes mm7 == 0 -- TODO confirm)
284 movq mm6, [esi+9] ; mm6 := 8 bytes at p1+9 (low 4 used in this half)
285 movq mm3, mm6 ; mm3 records all 8 bytes
287 paddw mm4, mm6 ; mm4 := mm4 + mm6: first 4 bytes interpolated in words
290 movq mm5, [edi+8] ; mm5 := 8 bytes at p2+8 (first 4 wanted as words)
295 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
297 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
301 paddw mm0, mm6 ; Add to accumulator
303 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
305 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
308 paddw mm0, mm5 ; Add to accumulator
310 ;; Second 4 bytes of 8
312 movq mm4, mm2 ; mm4 := saved 8 bytes of p1+8 (second 4 wanted as words)
315 movq mm6, mm3 ; mm6 := saved 8 bytes of p1+9 (second 4 wanted as words)
318 paddw mm4, mm6 ; mm4 := mm4 + mm6: second 4 bytes interpolated in words
321 movq mm5, mm1 ; mm5 := second 4 bytes of p2+8 in words; mm1 presumably prepared earlier -- TODO confirm
325 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
327 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
331 paddw mm0, mm6 ; Add to accumulator
333 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
335 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
338 paddw mm0, mm5 ; Add to accumulator
342 ;; Loop termination condition... and stepping
345 add esi, edx ; p1 += lx: advance to next row
349 test ecx, ecx ; sets ZF when rowsleft == 0
353 ;; Sum the Accumulators
360 movd eax, mm0 ; return value := low 32 bits of mm0
369 pop ebp ; restore caller's frame pointer
371 emms ; empty the MMX state so x87 FPU code can run afterwards
372 ret ; return to caller; result is in eax
374 ;;; dist1_10_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
375 ;;; We're reduced to seven bits as otherwise we also have to mess
376 ;;; horribly with carries, and signed-only comparisons make the code
377 ;;; simply enormous (and probably barely faster than a simple loop).
378 ;;; Since signals with a bona-fide 8bit res will be rare we simply
379 ;;; take the precision hit...
380 ;;; Actually we don't worry about carries from the low-order bits
381 ;;; either, so 1/4 of the time we'll be 1 too low...
383 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
386 ; This program is free software; you can redistribute it and/or
387 ; modify it under the terms of the GNU General Public License
388 ; as published by the Free Software Foundation; either version 2
389 ; of the License, or (at your option) any later version.
391 ; This program is distributed in the hope that it will be useful,
392 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
393 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
394 ; GNU General Public License for more details.
396 ; You should have received a copy of the GNU General Public License
397 ; along with this program; if not, write to the Free Software
398 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
406 ; int dist1_10_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
; Absolute-difference sum of p2 against p1 combined with the row read
; through ebx (p1+lx according to the load comments); 16 bytes per row are
; processed as four groups of 4.
; NOTE(review): several steps (labels, the setup and stepping of ebx,
; unpacking of mm5/mm6, masking, the loop branch) are not visible in this
; listing -- confirm against the complete routine.
408 ; esi = p1 (init: 1st argument)
409 ; edi = p2 (init: 2nd argument)
; ebx = second source row, presumably p1+lx (its setup is not visible here)
; edx = lx (added to esi to step to the next row)
411 ; ecx = rowsleft (init: h)
414 ; mm0 = distance accumulators (4 words)
; mm2 = saved copy of the 8 bytes at p1 (reused for the second 4 bytes)
; mm3 = saved copy of the 8 bytes at [ebx] (reused for the second 4 bytes)
418 ; mm4 = temp 4 bytes in words interpolating p1, p1+lx
419 ; mm5 = temp 4 bytes in words from p2
420 ; mm6 = temp comparison bit mask p1,p2
421 ; mm7 = temp comparison bit mask p2,p1
426 push ebp ; save caller's frame pointer
427 mov ebp, esp ; ebp := esp, so the arguments sit at [ebp+8] and up
429 push ebx ; Saves registers (callee-saves convention in
430 push ecx ; x86 GCC it seems)
435 pxor mm0, mm0 ; zero accumulators
437 mov esi, [ebp+8] ; get p1
438 mov edi, [ebp+12] ; get p2
439 mov edx, [ebp+16] ; get lx
440 mov ecx, [ebp+20] ; rowsleft := h
443 jmp nextrowmm10 ; jump to label nextrowmm10 (presumably the row-loop entry)
448 ;; First 8 bytes of row
451 ;; First 4 bytes of 8
453 movq mm4, [esi] ; mm4 := 8 bytes at p1 (low 4 used in this half)
455 movq mm2, mm4 ; mm2 records all 8 bytes
456 punpcklbw mm4, mm7 ; low 4 bytes of p1 to words (zero-extension assumes mm7 == 0 -- TODO confirm)
458 movq mm6, [ebx] ; mm6 := 8 bytes at [ebx], i.e. p1+lx (low 4 used in this half)
459 movq mm3, mm6 ; mm3 records all 8 bytes
461 paddw mm4, mm6 ; mm4 := mm4 + mm6: first 4 bytes interpolated in words
464 movq mm5, [edi] ; mm5 := 8 bytes at p2 (first 4 wanted as words)
469 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
471 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
475 paddw mm0, mm6 ; Add to accumulator
477 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
479 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
482 paddw mm0, mm5 ; Add to accumulator
484 ;; Second 4 bytes of 8
486 movq mm4, mm2 ; mm4 := saved 8 bytes of p1 (second 4 wanted as words)
489 movq mm6, mm3 ; mm6 := saved 8 bytes of p1+lx (second 4 wanted as words)
492 paddw mm4, mm6 ; mm4 := mm4 + mm6: second 4 bytes interpolated in words
495 movq mm5, mm1 ; mm5 := second 4 bytes of p2 in words; mm1 presumably prepared earlier -- TODO confirm
499 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
501 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
505 paddw mm0, mm6 ; Add to accumulator
507 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
509 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
512 paddw mm0, mm5 ; Add to accumulator
516 ;; Second 8 bytes of row
518 ;; First 4 bytes of 8
520 movq mm4, [esi+8] ; mm4 := 8 bytes at p1+8 (low 4 used in this half)
522 movq mm2, mm4 ; mm2 records all 8 bytes
523 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 to words (zero-extension assumes mm7 == 0 -- TODO confirm)
525 movq mm6, [ebx+8] ; mm6 := 8 bytes at [ebx+8], i.e. p1+lx+8 (low 4 used in this half)
526 movq mm3, mm6 ; mm3 records all 8 bytes
528 paddw mm4, mm6 ; mm4 := mm4 + mm6: first 4 bytes interpolated in words
531 movq mm5, [edi+8] ; mm5 := 8 bytes at p2+8 (first 4 wanted as words)
536 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
538 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
542 paddw mm0, mm6 ; Add to accumulator
544 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
546 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
549 paddw mm0, mm5 ; Add to accumulator
551 ;; Second 4 bytes of 8
553 movq mm4, mm2 ; mm4 := saved 8 bytes of p1+8 (second 4 wanted as words)
556 movq mm6, mm3 ; mm6 := saved 8 bytes of p1+lx+8 (second 4 wanted as words)
559 paddw mm4, mm6 ; mm4 := mm4 + mm6: second 4 bytes interpolated in words
562 movq mm5, mm1 ; mm5 := second 4 bytes of p2+8 in words; mm1 presumably prepared earlier -- TODO confirm
566 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
568 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
572 paddw mm0, mm6 ; Add to accumulator
574 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
576 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
579 paddw mm0, mm5 ; Add to accumulator
583 ;; Loop termination condition... and stepping
586 add esi, edx ; p1 += lx: advance to next row
591 test ecx, ecx ; sets ZF when rowsleft == 0
594 ;; Sum the Accumulators
601 movd eax, mm0 ; return value := low 32 bits of mm0
611 pop ebp ; restore caller's frame pointer
613 emms ; empty the MMX state so x87 FPU code can run afterwards
614 ret ; return to caller; result is in eax
618 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
621 ; This program is free software; you can redistribute it and/or
622 ; modify it under the terms of the GNU General Public License
623 ; as published by the Free Software Foundation; either version 2
624 ; of the License, or (at your option) any later version.
626 ; This program is distributed in the hope that it will be useful,
627 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
628 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
629 ; GNU General Public License for more details.
631 ; You should have received a copy of the GNU General Public License
632 ; along with this program; if not, write to the Free Software
633 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
641 ; int dist1_11_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
; Absolute-difference sum of p2 against a four-sample combination of p1:
; bytes are loaded from p1, p1+1, [ebx] and [ebx+1] (p1+lx and p1+lx+1
; according to the load comments) and the word result is shifted right by 2.
; 16 bytes per row are processed as four groups of 4.
; NOTE(review): several steps (labels, the setup and stepping of ebx, the
; additions feeding mm4, masking, the loop branch) are not visible in this
; listing -- confirm against the complete routine.
643 ; esi = p1 (init: 1st argument)
644 ; edi = p2 (init: 2nd argument)
; ebx = second source row, presumably p1+lx (its setup is not visible here)
; edx = lx (added to esi to step to the next row)
646 ; ecx = rowsleft (init: h)
649 ; mm0 = distance accumulators (4 words)
; mm2 = saved copy of the 8 bytes at p1 (reused for the second 4 bytes)
; mm3 = saved copy of the 8 bytes at [ebx] (reused for the second 4 bytes)
653 ; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
654 ; but I don't think that's going to happen in IA-32 land...
655 ; mm4 = temp 4 bytes in words interpolating p1, p1+1, p1+lx, p1+lx+1
656 ; mm5 = temp 4 bytes in words from p2
657 ; mm6 = temp comparison bit mask p1,p2
658 ; mm7 = temp comparison bit mask p2,p1
663 push ebp ; save caller's frame pointer
664 mov ebp, esp ; ebp := esp, so the arguments sit at [ebp+8] and up
666 push ebx ; Saves registers (callee-saves convention in
667 push ecx ; x86 GCC it seems)
672 pxor mm0, mm0 ; zero accumulators
674 mov esi, [ebp+8] ; get p1
675 mov edi, [ebp+12] ; get p2
676 mov edx, [ebp+16] ; get lx
677 mov ecx, [ebp+20] ; rowsleft := h
680 jmp nextrowmm11 ; jump to label nextrowmm11 (presumably the row-loop entry)
685 ;; First 8 bytes of row
688 ;; First 4 bytes of 8
690 movq mm4, [esi] ; mm4 := 8 bytes at p1 (low 4 used in this half)
692 movq mm2, mm4 ; mm2 records all 8 bytes
693 punpcklbw mm4, mm7 ; low 4 bytes of p1 to words (zero-extension assumes mm7 == 0 -- TODO confirm)
695 movq mm6, [ebx] ; mm6 := 8 bytes at [ebx], i.e. p1+lx (low 4 used in this half)
696 movq mm3, mm6 ; mm3 records all 8 bytes
701 movq mm5, [esi+1] ; mm5 := 8 bytes at p1+1
702 punpcklbw mm5, mm7 ; low 4 bytes of p1+1 to words (assumes mm7 == 0 -- TODO confirm)
704 movq mm6, [ebx+1] ; mm6 := 8 bytes at [ebx+1], i.e. p1+lx+1
708 psrlw mm4, 2 ; mm4 := mm4 >> 2 per word: first 4 bytes interpolated (presumably 4-sample sum / 4)
710 movq mm5, [edi] ; mm5 := 8 bytes at p2 (first 4 wanted as words)
715 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
717 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
721 paddw mm0, mm6 ; Add to accumulator
723 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
725 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
728 paddw mm0, mm5 ; Add to accumulator
730 ;; Second 4 bytes of 8
732 movq mm4, mm2 ; mm4 := saved 8 bytes of p1 (second 4 wanted as words)
735 movq mm6, mm3 ; mm6 := saved 8 bytes of p1+lx (second 4 wanted as words)
739 movq mm5, [esi+1] ; mm5 := 8 bytes at p1+1 (reloaded; high 4 used below)
740 punpckhbw mm5, mm7 ; high 4 bytes of p1+1 to words (assumes mm7 == 0 -- TODO confirm)
742 movq mm6, [ebx+1] ; mm6 := 8 bytes at [ebx+1], i.e. p1+lx+1 (reloaded)
746 psrlw mm4, 2 ; mm4 := mm4 >> 2 per word: second 4 bytes interpolated (presumably 4-sample sum / 4)
748 movq mm5, mm1 ; mm5 := second 4 bytes of p2 in words; mm1 presumably prepared earlier -- TODO confirm
752 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
754 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
758 paddw mm0, mm6 ; Add to accumulator
760 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
762 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
765 paddw mm0, mm5 ; Add to accumulator
769 ;; Second 8 bytes of row
771 ;; First 4 bytes of 8
773 movq mm4, [esi+8] ; mm4 := 8 bytes at p1+8 (low 4 used in this half)
775 movq mm2, mm4 ; mm2 records all 8 bytes
776 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 to words (zero-extension assumes mm7 == 0 -- TODO confirm)
778 movq mm6, [ebx+8] ; mm6 := 8 bytes at [ebx+8], i.e. p1+lx+8 (low 4 used in this half)
779 movq mm3, mm6 ; mm3 records all 8 bytes
784 movq mm5, [esi+9] ; mm5 := 8 bytes at p1+9
785 punpcklbw mm5, mm7 ; low 4 bytes of p1+9 to words (assumes mm7 == 0 -- TODO confirm)
787 movq mm6, [ebx+9] ; mm6 := 8 bytes at [ebx+9], i.e. p1+lx+9
791 psrlw mm4, 2 ; mm4 := mm4 >> 2 per word: first 4 bytes interpolated (presumably 4-sample sum / 4)
793 movq mm5, [edi+8] ; mm5 := 8 bytes at p2+8 (first 4 wanted as words)
798 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
800 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
804 paddw mm0, mm6 ; Add to accumulator
806 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
808 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
811 paddw mm0, mm5 ; Add to accumulator
813 ;; Second 4 bytes of 8
815 movq mm4, mm2 ; mm4 := saved 8 bytes of p1+8 (second 4 wanted as words)
818 movq mm6, mm3 ; mm6 := saved 8 bytes of p1+lx+8 (second 4 wanted as words)
822 movq mm5, [esi+9] ; mm5 := 8 bytes at p1+9 (reloaded; high 4 used below)
823 punpckhbw mm5, mm7 ; high 4 bytes of p1+9 to words (assumes mm7 == 0 -- TODO confirm)
825 movq mm6, [ebx+9] ; mm6 := 8 bytes at [ebx+9], i.e. p1+lx+9 (reloaded)
829 psrlw mm4, 2 ; mm4 := mm4 >> 2 per word: second 4 bytes interpolated (presumably 4-sample sum / 4)
831 movq mm5, mm1 ; mm5 := second 4 bytes of p2+8 in words; mm1 presumably prepared earlier -- TODO confirm
835 pcmpgtw mm7,mm5 ; mm7 := [i : W0..3, mm7>mm5] (signed); mm7 presumably holds mm4's words here
837 movq mm6,mm4 ; mm6 := copy of mm4, toward [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
841 paddw mm0, mm6 ; Add to accumulator
843 movq mm6,mm5 ; mm6 := copy of mm5, toward the [i : W0..3, mm5>mm4] mask
845 psubw mm5,mm4 ; mm5 := [i : W0..3, mm5-mm4], toward (mm5-mm4)*(mm5-mm4 > 0)
848 paddw mm0, mm5 ; Add to accumulator
852 ;; Loop termination condition... and stepping
855 add esi, edx ; p1 += lx: advance to next row
860 test ecx, ecx ; sets ZF when rowsleft == 0
863 ;; Sum the Accumulators
870 movd eax, mm0 ; return value := low 32 bits of mm0
879 pop ebp ; restore caller's frame pointer
881 emms ; empty the MMX state so x87 FPU code can run afterwards
882 ret ; return to caller; result is in eax
887 ; int dist22_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h);
; Accumulates absolute byte differences between 8-byte rows of blk1 and
; blk2 (per the comments below). The visible body handles two consecutive
; rows back to back, advancing both pointers by lx after each.
; NOTE(review): the nextrow label, the abs/accumulate steps, the loop
; branch and a `pop ecx` matching `push ecx` are not visible in this
; listing -- confirm against the complete routine.
889 ; eax = p1 (init: blk1)
890 ; ebx = p2 (init: blk2)
; edx = lx (added to eax and ebx to step to the next row)
891 ; ecx = rowsleft (init: h)
894 ; mm0 = distance accumulators (4 words)
906 push ebp ; save caller's frame pointer
907 mov ebp, esp ; ebp := esp, so the arguments sit at [ebp+8] and up
909 push ebx ; Saves registers (callee-saves convention in
910 push ecx ; x86 GCC it seems)
913 pxor mm0, mm0 ; zero accumulators
915 mov eax, [ebp+8] ; get p1
916 mov ebx, [ebp+12] ; get p2
917 mov edx, [ebp+16] ; get lx
919 mov ecx, [ebp+20] ; get rowsleft
921 jmp nextrow ; jump to label nextrow (presumably the row-loop entry)
924 movq mm4, [eax] ; load 8 bytes of p1
925 movq mm5, [ebx] ; load 8 bytes of p2
927 movq mm7, mm4 ; mm7 := copy of mm4; first step of mm5 = abs(*p1-*p2)
930 add eax, edx ; p1 += lx: advance to next row
933 ;; Add the mm5 bytes to the accumulators
938 add ebx, edx ; p2 += lx: advance to next row
941 movq mm4, [eax] ; load 8 bytes of p1 (next row)
942 movq mm5, [ebx] ; load 8 bytes of p2 (next row)
944 movq mm7, mm4 ; mm7 := copy of mm4; first step of mm5 = abs(*p1-*p2)
947 add eax, edx ; p1 += lx: advance to next row
950 ;; Add the mm5 bytes to the accumulators
953 add ebx, edx ; p2 += lx: advance to next row
962 ;; Sum the Accumulators
973 movd eax, mm0 ; return value := low 32 bits of mm0
978 pop ebx ; restore ebx saved in the prologue
980 pop ebp ; restore caller's frame pointer
982 emms ; empty the MMX state so x87 FPU code can run afterwards
983 ret ; return to caller; result is in eax
990 ; int dist44_mmx(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
; Accumulates absolute differences between rows of blk1 and blk2 into mm0
; (per the comments below), stepping both pointers by qlx per row.
; NOTE(review): the nextrowqd label, the unpack/abs steps that produce mm7,
; the loop branch and any register saves/restores other than ebp are not
; visible in this listing -- confirm against the complete routine.
; eax = p1 (init: blk1)
; ebx = p2 (init: blk2)
; edx = qlx (added to eax and ebx to step to the next row)
; esi = rowsleft (init: qh)
998 ; mm0 = distance accumulator left block p1
999 ; mm1 = distance accumulator right block p1
1001 ; mm3 = right block of p1
1002 ; mm4 = left block of p1
1009 push ebp ; save caller's frame pointer
1010 mov ebp, esp ; ebp := esp, so the arguments sit at [ebp+8] and up
1017 pxor mm0, mm0 ; zero accumulator
1020 mov eax, [ebp+8] ; get p1
1021 mov ebx, [ebp+12] ; get p2
1022 mov edx, [ebp+16] ; get qlx
1023 mov esi, [ebp+20] ; get rowsleft
1024 jmp nextrowqd ; jump to label nextrowqd (presumably the row-loop entry)
1029 ;; Beware loop obfuscated by interleaving to try to
1030 ;; hide latencies...
1032 movq mm4, [eax] ; mm4 := 8 bytes at p1 (first 4 wanted as words)
1033 movq mm5, [ebx] ; mm5 := 8 bytes at p2 (4 wanted as words)
1043 add eax, edx ; p1 += qlx: advance to next row
1044 ; punpckhbw mm3, mm2 ; mm3 = 2nd 4 bytes of p1 in words
1047 paddw mm0, mm7 ; Add absolute differences to left block accumulators
1053 add ebx, edx ; p2 += qlx: advance to next row
1057 ; paddw mm1, mm7 ; Add absolute differences to right block accumulators
1063 ;; Sum the accumulators
1071 movd eax, mm0 ; return value := low 32 bits of mm0
1090 pop ebp ; restore caller's frame pointer
1092 emms ; empty the MMX state so x87 FPU code can run afterwards
1093 ret ; return to caller; result is in eax