4 ;;; Enhanced MMX optimized Sum Absolute Differences routines for macroblocks
5 ;;; (interpolated, 1-pel, 2*2 sub-sampled pel and 4*4 sub-sampled pel)
7 ; dist1_* Original Copyright (C) 2000 Chris Atenasio <chris@crud.net>
8 ; Enhancements and rest Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
11 ; This program is free software; you can redistribute it and/or
12 ; modify it under the terms of the GNU General Public License
13 ; as published by the Free Software Foundation; either version 2
14 ; of the License, or (at your option) any later version.
16 ; This program is distributed in the hope that it will be useful,
17 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
18 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 ; GNU General Public License for more details.
21 ; You should have received a copy of the GNU General Public License
22 ; along with this program; if not, write to the Free Software
23 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
; dist1_mmx: sum of absolute differences over 16-byte-wide rows of two blocks.
; Arguments are read from the stack frame: [ebp+8]=blk1, [ebp+12]=blk2,
; [ebp+16]=lx (added to the row pointer after each row), [ebp+20]=h.
; The result is handed back in eax.
; NOTE(review): the embedded line numbers skip, so labels, the loop branch and
; several MMX instructions are not visible in this listing; the comments below
; describe only the visible lines -- confirm against the complete source.
31 ; int dist1_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h, int distlim);
32 ; N.b. distlim is *ignored* as testing for it is more expensive than the
33 ; occasional saving by aborting the computation early...
34 ; esi = p1 (row pointer into blk1, init: blk1)
35 ; edi = p2 (row pointer into blk2, init: blk2)
37 ; ecx = rowsleft (init: h)
40 ; mm0 = distance accumulators (4 words)
53 push ebp ; save caller's frame pointer
56 push ebx ; Save registers (callee-saves convention in
57 push ecx ; x86 GCC, it seems)
62 pxor mm0, mm0 ; zero the accumulators
64 mov esi, [ebp+8] ; esi := p1 (blk1 argument)
65 mov edi, [ebp+12] ; edi := p2 (blk2 argument)
66 mov edx, [ebp+16] ; edx := lx (row stride)
67 mov ecx, [ebp+20] ; ecx := rowsleft (h argument)
68 ;mov ebx, [ebp+24] ; distlim
72 movq mm4, [esi] ; load first 8 bytes of p1 row
73 movq mm5, [edi] ; load first 8 bytes of p2 row
75 movq mm7, mm4 ; keep a copy of mm4; start of the mm5 = abs(mm4-mm5) sequence
80 ;; Add the abs(mm4-mm5) bytes to the accumulators
81 movq mm2, [esi+8] ; load second 8 bytes of p1 row (interleaved)
82 movq mm7,mm5 ; copy mm5; original note: mm7 := [i : B0..3, mm1]W (presumably low 4 bytes as words)
84 movq mm3, [edi+8] ; load second 8 bytes of p2 row (interleaved)
89 ;; This is logically where the mm2, mm3 loads would go...
91 movq mm7, mm2 ; keep a copy of mm2; start of the mm3 = abs(mm2-mm3) sequence
96 ;; Add the abs(mm2-mm3) bytes to the accumulators
97 movq mm7,mm3 ; copy mm3; original note: mm7 := [i : B0..3, mm1]W (presumably low 4 bytes as words)
102 add esi, edx ; p1 += lx: advance to the next row
114 ;; Sum the Accumulators
115 movq mm5, mm0 ; copy mm0; mm5 is to become [W0+W2,W1+W3, mm0]
120 movq mm7, mm4 ; copy mm4 into mm7 for the final fold [W0+W2+W1+W3, mm0] (original note named mm6)
123 movd eax, mm4 ; low 32 bits of mm4 -> eax (return value)
134 emms ; clear MMX state
137 ;;; dist1_01_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
138 ;;; We're reduced to seven bits as otherwise we also have to mess
139 ;;; horribly with carries and signed only comparisons make the code
140 ;;; simply enormous (and probably barely faster than a simple loop).
141 ;;; Since signals with a bona-fide 8bit res will be rare we simply
142 ;;; take the precision hit...
143 ;;; Actually we don't worry about carries from the low-order bits
144 ;;; either so 1/4 of the time we'll be 1 too low...
146 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
149 ; This program is free software; you can redistribute it and/or
150 ; modify it under the terms of the GNU General Public License
151 ; as published by the Free Software Foundation; either version 2
152 ; of the License, or (at your option) any later version.
154 ; This program is distributed in the hope that it will be useful,
155 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
156 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
157 ; GNU General Public License for more details.
159 ; You should have received a copy of the GNU General Public License
160 ; along with this program; if not, write to the Free Software
161 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
; dist1_01_mmx: absolute-difference sum between p2 and p1 interpolated with its
; right-hand neighbour (bytes at p1 and p1+1), 16 bytes per row.
; Arguments are read from the stack frame: [ebp+8]=p1, [ebp+12]=p2,
; [ebp+16]=lx, [ebp+20]=h.  The result is handed back in eax.
; NOTE(review): the embedded line numbers skip, so labels, branches and some
; instructions (e.g. whatever zeroes mm7 and fills mm1) are not visible in this
; listing -- confirm the comments below against the complete source.
169 ; int dist1_01_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
171 ; esi = p1 (row pointer, init: p1 argument)
172 ; edi = p2 (row pointer, init: p2 argument)
173 ; ecx = rowsleft (init: h)
176 ; mm0 = distance accumulators (4 words)
180 ; mm4 = temp 4 bytes in words interpolating p1, p1+1
181 ; mm5 = temp 4 bytes in words from p2
182 ; mm6 = temp comparison bit mask p1,p2
183 ; mm7 = temp comparison bit mask p2,p1
188 push ebp ; save caller's frame pointer
189 mov ebp, esp ; set up frame pointer so arguments can be read at [ebp+N]
191 push ebx ; Save registers (callee-saves convention in
192 push ecx ; x86 GCC, it seems)
197 pxor mm0, mm0 ; zero the accumulators
199 mov esi, [ebp+8] ; esi := p1
200 mov edi, [ebp+12] ; edi := p2
201 mov edx, [ebp+16] ; edx := lx (row stride)
202 mov ecx, [ebp+20] ; rowsleft := h
203 jmp nextrowmm01 ; jump to nextrowmm01 (label not visible in this listing)
208 ;; First 8 bytes of row
211 ;; First 4 bytes of 8
213 movq mm4, [esi] ; load 8 bytes of p1 (the low 4 are widened to words below)
215 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
216 punpcklbw mm4, mm7 ; low 4 bytes of p1 as words (mm7 presumably zero here)
218 movq mm6, [esi+1] ; load 8 bytes of p1+1
219 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
221 paddw mm4, mm6 ; add p1 and p1+1 words: mm4 := first 4 bytes interpolated in words
224 movq mm5, [edi] ; load 8 bytes of p2; the first 4 are used as words in mm5
229 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
231 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
235 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
237 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
239 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
242 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
244 ;; Second 4 bytes of 8
246 movq mm4, mm2 ; recall the saved p1 bytes; mm4 := second 4 bytes of p1 in words
249 movq mm6, mm3 ; recall the saved p1+1 bytes; mm6 := second 4 bytes of p1+1 in words
252 paddw mm4, mm6 ; add p1 and p1+1 words: mm4 := second 4 bytes interpolated in words
255 movq mm5, mm1 ; mm5 := second 4 bytes of p2 in words (mm1 presumably holds the saved p2 bytes)
259 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
261 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
265 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
267 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
269 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
272 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
276 ;; Second 8 bytes of row
278 ;; First 4 bytes of 8
280 movq mm4, [esi+8] ; load 8 bytes of p1+8 (the low 4 are widened to words below)
282 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
283 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 as words (mm7 presumably zero here)
285 movq mm6, [esi+9] ; load 8 bytes of p1+9
286 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
288 paddw mm4, mm6 ; add p1+8 and p1+9 words: mm4 := first 4 bytes interpolated in words
291 movq mm5, [edi+8] ; load 8 bytes of p2+8; the first 4 are used as words in mm5
296 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
298 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
302 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
304 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
306 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
309 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
311 ;; Second 4 bytes of 8
313 movq mm4, mm2 ; recall the saved p1+8 bytes; mm4 := second 4 bytes in words
316 movq mm6, mm3 ; recall the saved p1+9 bytes; mm6 := second 4 bytes in words
319 paddw mm4, mm6 ; add p1+8 and p1+9 words: mm4 := second 4 bytes interpolated in words
322 movq mm5, mm1 ; mm5 := second 4 bytes of p2+8 in words (mm1 presumably holds the saved p2 bytes)
326 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
328 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
332 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
334 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
336 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
339 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
343 ;; Loop termination condition... and stepping
346 add esi, edx ; p1 += lx: advance to the next row
350 test ecx, ecx ; set flags from rowsleft (the branch using them is not shown)
354 ;; Sum the Accumulators
361 movd eax, mm0 ; low 32 bits of mm0 -> eax (return value)
370 pop ebp ; restore caller's frame pointer
372 emms ; clear MMX state
373 ret ; return to caller
375 ;;; dist1_10_mmx.s: mmx1 optimised 7bit*8 word absolute difference sum
376 ;;; We're reduced to seven bits as otherwise we also have to mess
377 ;;; horribly with carries and signed only comparisons make the code
378 ;;; simply enormous (and probably barely faster than a simple loop).
379 ;;; Since signals with a bona-fide 8bit res will be rare we simply
380 ;;; take the precision hit...
381 ;;; Actually we don't worry about carries from the low-order bits
382 ;;; either so 1/4 of the time we'll be 1 too low...
384 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
387 ; This program is free software; you can redistribute it and/or
388 ; modify it under the terms of the GNU General Public License
389 ; as published by the Free Software Foundation; either version 2
390 ; of the License, or (at your option) any later version.
392 ; This program is distributed in the hope that it will be useful,
393 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
394 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
395 ; GNU General Public License for more details.
397 ; You should have received a copy of the GNU General Public License
398 ; along with this program; if not, write to the Free Software
399 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
; dist1_10_mmx: absolute-difference sum between p2 and p1 interpolated with the
; row below it (bytes at p1 and p1+lx, the latter read through ebx), 16 bytes
; per row.
; Arguments are read from the stack frame: [ebp+8]=p1, [ebp+12]=p2,
; [ebp+16]=lx, [ebp+20]=h.  The result is handed back in eax.
; NOTE(review): the embedded line numbers skip, so labels, branches and some
; instructions (e.g. whatever sets ebx, zeroes mm7 and fills mm1) are not
; visible in this listing -- confirm the comments below against the complete
; source.
407 ; int dist1_10_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
409 ; esi = p1 (row pointer, init: p1 argument)
410 ; edi = p2 (row pointer, init: p2 argument)
412 ; ecx = rowsleft (init: h)
415 ; mm0 = distance accumulators (4 words)
419 ; mm4 = temp 4 bytes in words interpolating p1, p1+lx
420 ; mm5 = temp 4 bytes in words from p2
421 ; mm6 = temp comparison bit mask p1,p2
422 ; mm7 = temp comparison bit mask p2,p1
427 push ebp ; save caller's frame pointer
428 mov ebp, esp ; set up frame pointer so arguments can be read at [ebp+N]
430 push ebx ; Save registers (callee-saves convention in
431 push ecx ; x86 GCC, it seems)
436 pxor mm0, mm0 ; zero the accumulators
438 mov esi, [ebp+8] ; esi := p1
439 mov edi, [ebp+12] ; edi := p2
440 mov edx, [ebp+16] ; edx := lx (row stride)
441 mov ecx, [ebp+20] ; rowsleft := h
444 jmp nextrowmm10 ; jump to nextrowmm10 (label not visible in this listing)
449 ;; First 8 bytes of row
452 ;; First 4 bytes of 8
454 movq mm4, [esi] ; load 8 bytes of p1 (the low 4 are widened to words below)
456 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
457 punpcklbw mm4, mm7 ; low 4 bytes of p1 as words (mm7 presumably zero here)
459 movq mm6, [ebx] ; load 8 bytes of p1+lx (ebx presumably = esi+lx; set-up not shown)
460 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
462 paddw mm4, mm6 ; add p1 and p1+lx words: mm4 := first 4 bytes interpolated in words
465 movq mm5, [edi] ; load 8 bytes of p2; the first 4 are used as words in mm5
470 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
472 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
476 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
478 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
480 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
483 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
485 ;; Second 4 bytes of 8
487 movq mm4, mm2 ; recall the saved p1 bytes; mm4 := second 4 bytes of p1 in words
490 movq mm6, mm3 ; recall the saved p1+lx bytes; mm6 := second 4 bytes of p1+lx in words
493 paddw mm4, mm6 ; add p1 and p1+lx words: mm4 := second 4 bytes interpolated in words
496 movq mm5, mm1 ; mm5 := second 4 bytes of p2 in words (mm1 presumably holds the saved p2 bytes)
500 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
502 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
506 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
508 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
510 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
513 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
517 ;; Second 8 bytes of row
519 ;; First 4 bytes of 8
521 movq mm4, [esi+8] ; load 8 bytes of p1+8 (the low 4 are widened to words below)
523 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
524 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 as words (mm7 presumably zero here)
526 movq mm6, [ebx+8] ; load 8 bytes of p1+lx+8
527 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
529 paddw mm4, mm6 ; add p1+8 and p1+lx+8 words: mm4 := first 4 bytes interpolated in words
532 movq mm5, [edi+8] ; load 8 bytes of p2+8; the first 4 are used as words in mm5
537 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
539 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
543 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
545 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
547 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
550 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
552 ;; Second 4 bytes of 8
554 movq mm4, mm2 ; recall the saved p1+8 bytes; mm4 := second 4 bytes in words
557 movq mm6, mm3 ; recall the saved p1+lx+8 bytes; mm6 := second 4 bytes in words
560 paddw mm4, mm6 ; add p1+8 and p1+lx+8 words: mm4 := second 4 bytes interpolated in words
563 movq mm5, mm1 ; mm5 := second 4 bytes of p2+8 in words (mm1 presumably holds the saved p2 bytes)
567 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
569 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
573 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
575 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
577 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
580 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
584 ;; Loop termination condition... and stepping
587 add esi, edx ; p1 += lx: advance to the next row
592 test ecx, ecx ; set flags from rowsleft (the branch using them is not shown)
595 ;; Sum the Accumulators
602 movd eax, mm0 ; low 32 bits of mm0 -> eax (return value)
612 pop ebp ; restore caller's frame pointer
614 emms ; clear MMX state
615 ret ; return to caller
619 ; Copyright (C) 2000 Andrew Stevens <as@comlab.ox.ac.uk>
622 ; This program is free software; you can redistribute it and/or
623 ; modify it under the terms of the GNU General Public License
624 ; as published by the Free Software Foundation; either version 2
625 ; of the License, or (at your option) any later version.
627 ; This program is distributed in the hope that it will be useful,
628 ; but WITHOUT ANY WARRANTY; without even the implied warranty of
629 ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
630 ; GNU General Public License for more details.
632 ; You should have received a copy of the GNU General Public License
633 ; along with this program; if not, write to the Free Software
634 ; Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
; dist1_11_mmx: absolute-difference sum between p2 and p1 interpolated from
; four neighbouring bytes (p1, p1+1, p1+lx, p1+lx+1; the lx-offset rows are read
; through ebx), 16 bytes per row.
; Arguments are read from the stack frame: [ebp+8]=p1, [ebp+12]=p2,
; [ebp+16]=lx, [ebp+20]=h.  The result is handed back in eax.
; NOTE(review): the embedded line numbers skip, so labels, branches and some
; instructions (e.g. whatever sets ebx, zeroes mm7, fills mm1 and sums the four
; samples) are not visible in this listing -- confirm the comments below against
; the complete source.
642 ; int dist1_11_mmx(unsigned char *p1,unsigned char *p2,int lx,int h);
644 ; esi = p1 (row pointer, init: p1 argument)
645 ; edi = p2 (row pointer, init: p2 argument)
647 ; ecx = rowsleft (init: h)
650 ; mm0 = distance accumulators (4 words)
654 ; I'd love to find someplace to stash p1+1 and p1+lx+1's bytes
655 ; but I don't think that's going to happen in iA32-land...
656 ; mm4 = temp 4 bytes in words interpolating p1, p1+1, p1+lx, p1+lx+1
657 ; mm5 = temp 4 bytes in words from p2
658 ; mm6 = temp comparison bit mask p1,p2
659 ; mm7 = temp comparison bit mask p2,p1
664 push ebp ; save caller's frame pointer
665 mov ebp, esp ; set up frame pointer so arguments can be read at [ebp+N]
667 push ebx ; Save registers (callee-saves convention in
668 push ecx ; x86 GCC, it seems)
673 pxor mm0, mm0 ; zero the accumulators
675 mov esi, [ebp+8] ; esi := p1
676 mov edi, [ebp+12] ; edi := p2
677 mov edx, [ebp+16] ; edx := lx (row stride)
678 mov ecx, [ebp+20] ; rowsleft := h
681 jmp nextrowmm11 ; jump to nextrowmm11 (label not visible in this listing)
686 ;; First 8 bytes of row
689 ;; First 4 bytes of 8
691 movq mm4, [esi] ; load 8 bytes of p1 (the low 4 are widened to words below)
693 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
694 punpcklbw mm4, mm7 ; low 4 bytes of p1 as words (mm7 presumably zero here)
696 movq mm6, [ebx] ; load 8 bytes of p1+lx (ebx presumably = esi+lx; set-up not shown)
697 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
702 movq mm5, [esi+1] ; load 8 bytes of p1+1
703 punpcklbw mm5, mm7 ; low 4 bytes of p1+1 as words (mm7 presumably zero here)
705 movq mm6, [ebx+1] ; load 8 bytes of p1+lx+1
709 psrlw mm4, 2 ; divide each word by 4: mm4 := first 4 bytes interpolated in words
711 movq mm5, [edi] ; load 8 bytes of p2; the first 4 are used as words in mm5
716 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
718 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
722 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
724 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
726 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
729 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
731 ;; Second 4 bytes of 8
733 movq mm4, mm2 ; recall the saved p1 bytes; mm4 := second 4 bytes of p1 in words
736 movq mm6, mm3 ; recall the saved p1+lx bytes; mm6 := second 4 bytes of p1+lx in words
740 movq mm5, [esi+1] ; reload 8 bytes of p1+1
741 punpckhbw mm5, mm7 ; high 4 bytes of p1+1 as words (mm7 presumably zero here)
743 movq mm6, [ebx+1] ; reload 8 bytes of p1+lx+1
747 psrlw mm4, 2 ; divide each word by 4: mm4 := second 4 bytes interpolated in words
749 movq mm5, mm1 ; mm5 := second 4 bytes of p2 in words (mm1 presumably holds the saved p2 bytes)
753 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
755 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
759 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
761 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
763 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
766 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
770 ;; Second 8 bytes of row
772 ;; First 4 bytes of 8
774 movq mm4, [esi+8] ; load 8 bytes of p1+8 (the low 4 are widened to words below)
776 movq mm2, mm4 ; mm2 keeps all 8 bytes for the second half
777 punpcklbw mm4, mm7 ; low 4 bytes of p1+8 as words (mm7 presumably zero here)
779 movq mm6, [ebx+8] ; load 8 bytes of p1+lx+8
780 movq mm3, mm6 ; mm3 keeps all 8 bytes for the second half
785 movq mm5, [esi+9] ; load 8 bytes of p1+9
786 punpcklbw mm5, mm7 ; low 4 bytes of p1+9 as words (mm7 presumably zero here)
788 movq mm6, [ebx+9] ; load 8 bytes of p1+lx+9
792 psrlw mm4, 2 ; divide each word by 4: mm4 := first 4 bytes interpolated in words
794 movq mm5, [edi+8] ; load 8 bytes of p2+8; the first 4 are used as words in mm5
799 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
801 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
805 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
807 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
809 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
812 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
814 ;; Second 4 bytes of 8
816 movq mm4, mm2 ; recall the saved p1+8 bytes; mm4 := second 4 bytes in words
819 movq mm6, mm3 ; recall the saved p1+lx+8 bytes; mm6 := second 4 bytes in words
823 movq mm5, [esi+9] ; reload 8 bytes of p1+9
824 punpckhbw mm5, mm7 ; high 4 bytes of p1+9 as words (mm7 presumably zero here)
826 movq mm6, [ebx+9] ; reload 8 bytes of p1+lx+9
830 psrlw mm4, 2 ; divide each word by 4: mm4 := second 4 bytes interpolated in words
832 movq mm5, mm1 ; mm5 := second 4 bytes of p2+8 in words (mm1 presumably holds the saved p2 bytes)
836 pcmpgtw mm7,mm5 ; per-word mask mm7>mm5 (mm7 presumably a copy of mm4): [i : W0..3,mm4>mm5]
838 movq mm6,mm4 ; copy mm4; mm6 is to become [i : W0..3, (mm4-mm5)*(mm4-mm5 > 0)]
842 paddw mm0, mm6 ; accumulate the positive (mm4-mm5) differences
844 movq mm6,mm5 ; copy mm5; mm6 is to become the mask [i : W0..3,mm5>mm4]
846 psubw mm5,mm4 ; mm5 := mm5-mm4 per word; to become [i : W0..3, (mm5-mm4)*(mm5-mm4 > 0)]
849 paddw mm0, mm5 ; accumulate the positive (mm5-mm4) differences
853 ;; Loop termination condition... and stepping
856 add esi, edx ; p1 += lx: advance to the next row
861 test ecx, ecx ; set flags from rowsleft (the branch using them is not shown)
864 ;; Sum the Accumulators
871 movd eax, mm0 ; low 32 bits of mm0 -> eax (return value)
880 pop ebp ; restore caller's frame pointer
882 emms ; clear MMX state
883 ret ; return to caller
; dist22_mmx: sum of absolute differences over 8-byte-wide rows of two blocks;
; the visible code handles two consecutive rows one after the other.
; Arguments are read from the stack frame: [ebp+8]=blk1, [ebp+12]=blk2,
; [ebp+16]=lx (added to both row pointers after each row), [ebp+20]=h.
; The result is handed back in eax.
; NOTE(review): the embedded line numbers skip, so labels, branches and the
; instructions that form the absolute difference are not visible in this
; listing -- confirm the comments below against the complete source.
888 ; int dist22_mmx(unsigned char *blk1,unsigned char *blk2,int lx,int h);
890 ; eax = p1 (row pointer, init: blk1)
891 ; ebx = p2 (row pointer, init: blk2)
892 ; ecx = rowsleft (init: h)
895 ; mm0 = distance accumulators (4 words)
907 push ebp ; save caller's frame pointer
908 mov ebp, esp ; set up frame pointer so arguments can be read at [ebp+N]
910 push ebx ; Save registers (callee-saves convention in
911 push ecx ; x86 GCC, it seems)
914 pxor mm0, mm0 ; zero the accumulators
916 mov eax, [ebp+8] ; eax := p1 (blk1 argument)
917 mov ebx, [ebp+12] ; ebx := p2 (blk2 argument)
918 mov edx, [ebp+16] ; edx := lx (row stride)
920 mov ecx, [ebp+20] ; ecx := rowsleft (h argument)
922 jmp nextrow ; jump to nextrow (label not visible in this listing)
925 movq mm4, [eax] ; load 8 bytes of p1
926 movq mm5, [ebx] ; load 8 bytes of p2
928 movq mm7, mm4 ; keep a copy of mm4; start of the mm5 = abs(*p1-*p2) sequence
931 add eax, edx ; p1 += lx: advance to the next row
934 ;; Add the mm5 bytes to the accumulators
939 add ebx, edx ; p2 += lx: advance to the next row
942 movq mm4, [eax] ; load 8 bytes of p1 (next row)
943 movq mm5, [ebx] ; load 8 bytes of p2 (next row)
945 movq mm7, mm4 ; keep a copy of mm4; start of the mm5 = abs(*p1-*p2) sequence
948 add eax, edx ; p1 += lx: advance to the next row
951 ;; Add the mm5 bytes to the accumulators
954 add ebx, edx ; p2 += lx: advance to the next row
963 ;; Sum the Accumulators
974 movd eax, mm0 ; low 32 bits of mm0 -> eax (return value)
979 pop ebx ; restore ebx saved in the prologue
981 pop ebp ; restore caller's frame pointer
983 emms ; clear MMX state
984 ret ; return to caller
; dist44_mmx: absolute-difference accumulation between blk1 and blk2 rows,
; stepping both pointers by qlx per row; presumably the 4*4 sub-sampled variant
; named in the file header.
; Arguments are read from the stack frame: [ebp+8]=blk1, [ebp+12]=blk2,
; [ebp+16]=qlx, [ebp+20]=qh.  The result is handed back in eax.
; NOTE(review): the embedded line numbers skip, so labels, branches, register
; saves and most of the MMX arithmetic are not visible in this listing --
; confirm the comments below against the complete source.
991 ; int dist44_mmx(unsigned char *blk1,unsigned char *blk2,int qlx,int qh);
999 ; mm0 = distance accumulator left block p1
1000 ; mm1 = distance accumulator right block p1 (the paddw into mm1 below is commented out)
1002 ; mm3 = right block of p1
1003 ; mm4 = left block of p1
1010 push ebp ; save caller's frame pointer
1011 mov ebp, esp ; set up frame pointer so arguments can be read at [ebp+N]
1018 pxor mm0, mm0 ; zero the accumulator
1021 mov eax, [ebp+8] ; eax := p1 (blk1 argument)
1022 mov ebx, [ebp+12] ; ebx := p2 (blk2 argument)
1023 mov edx, [ebp+16] ; edx := qlx (row stride)
1024 mov esi, [ebp+20] ; esi := rowsleft (qh argument)
1025 jmp nextrowqd ; jump to nextrowqd (label not visible in this listing)
1030 ;; Beware loop obfuscated by interleaving to try to
1031 ;; hide latencies...
1033 movq mm4, [eax] ; load 8 bytes of p1; mm4 = first 4 bytes of p1 in words (after unpacking)
1034 movq mm5, [ebx] ; load 8 bytes of p2; mm5 = 4 bytes of p2 in words (after unpacking)
1044 add eax, edx ; p1 += qlx: advance to the next row
1045 ; punpckhbw mm3, mm2 ; mm3 = 2nd 4 bytes of p1 in words
1048 paddw mm0, mm7 ; Add absolute differences to left block accumulators
1054 add ebx, edx ; p2 += qlx: advance to the next row
1058 ; paddw mm1, mm7 ; Add absolute differences to right block accumulators
1064 ;; Sum the accumulators
1072 movd eax, mm0 ; low 32 bits of mm0 -> eax (return value)
1091 pop ebp ; restore caller's frame pointer
1093 emms ; clear MMX state
1094 ret ; return to caller