/*
 * m_(v): treat the address v as the address of an mmx_t object and yield that
 * object as an lvalue, so it can be handed to the MMX load/store macros used
 * below.  mmx_t is declared outside this excerpt.
 * NOTE(review): the cast goes through (long long *), so v is accessed with a
 * type other than its declared one -- confirm alignment/aliasing is acceptable.
 */
11 #define m_(v) (*(mmx_t*)(((long long *)(v))))
/* 64 bits holding the value 1 in each of four 16-bit lanes. */
13 static uint32_t sadd1[2] = { 0x00010001, 0x00010001 };
/* 64 bits holding the value 2 in each of four 16-bit lanes. */
14 static uint32_t sadd2[2] = { 0x00020002, 0x00020002 };
/* 0x7f in every byte; not referenced in the visible part of the file. */
16 static uint32_t bmask[2] = { 0x7f7f7f7f, 0x7f7f7f7f };
/* 0x01 in every byte; not referenced in the visible part of the file. */
17 static uint32_t badd1[2] = { 0x01010101, 0x01010101 };
/*
 * reca: blend a 16-byte-wide block of s into d.
 *
 * For each of the h rows every destination byte is replaced by the
 * rounded-up mean of itself and the source byte at the same offset:
 *     d[i] = (d[i] + s[i] + 1) >> 1,   i = 0..15
 * after which both row pointers advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (read, then overwritten)
 *   lx2  distance in bytes from one row to the next, for both s and d
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void reca(uint8_t *s, uint8_t *d, int lx2, int h)
{
    const uint8_t *sp = s;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        for (int i = 0; i < 16; ++i)
            dp[i] = (uint8_t)((uint32_t)(dp[i] + sp[i] + 1) >> 1);
        sp += lx2;
        dp += lx2;
    }
}
/*
 * mreca: MMX counterpart of reca; main() runs both on the same source buffer
 * and compares the two destination buffers.
 *
 * NOTE(review): this excerpt is incomplete.  The declarations of dp/sp, any
 * setup of mm0, mm2 and mm4, the add/shift steps between unpack and pack, the
 * row advance and the closing braces are not visible here.  The *_m2r/_r2r/
 * _r2m macros are defined outside this view; the notes below assume they
 * behave like the MMX instructions they are named after.
 */
45 static inline void mreca(uint8_t *s, uint8_t *d, int lx2, int h)
/* mm7 <- sadd1 (1 in each 16-bit lane) */
49 movq_m2r(m_(sadd1),mm7);
50 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 of the row: fetch source into mm1 and destination into mm3 */
51 movq_m2r(m_(sp+0),mm1);
52 movq_m2r(m_(dp+0),mm3);
/* interleave with mm0 to widen bytes to 16-bit lanes (low half / high half) */
55 punpcklbw_r2r(mm0,mm1);
56 punpckhbw_r2r(mm0,mm2);
57 punpcklbw_r2r(mm0,mm3);
58 punpckhbw_r2r(mm0,mm4);
/* narrow mm1/mm2 back to 8 bytes and write them to the destination */
65 packuswb_r2r(mm2,mm1);
66 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 of the row: same sequence */
67 movq_m2r(m_(sp+8),mm1);
68 movq_m2r(m_(dp+8),mm3);
71 punpcklbw_r2r(mm0,mm1);
72 punpckhbw_r2r(mm0,mm2);
73 punpcklbw_r2r(mm0,mm3);
74 punpckhbw_r2r(mm0,mm4);
81 packuswb_r2r(mm2,mm1);
82 movq_r2m(mm1,m_(dp+8));
/*
 * recac: blend an 8-byte-wide block of s into d.
 *
 * For each of the h rows every destination byte is replaced by the
 * rounded-up mean of itself and the source byte at the same offset:
 *     d[i] = (d[i] + s[i] + 1) >> 1,   i = 0..7
 * after which both row pointers advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (read, then overwritten)
 *   lx2  distance in bytes from one row to the next, for both s and d
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void recac(uint8_t *s, uint8_t *d, int lx2, int h)
{
    const uint8_t *sp = s;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        for (int i = 0; i < 8; ++i)
            dp[i] = (uint8_t)((uint32_t)(dp[i] + sp[i] + 1) >> 1);
        sp += lx2;
        dp += lx2;
    }
}
/*
 * mrecac: MMX counterpart of recac; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 8 bytes per row.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2 and mm4,
 * a shift/divide step before the pack, and the braces are not visible here.
 * The *_m2r/_r2r/_r2m macros are defined outside this view; the notes below
 * assume they behave like the MMX instructions they are named after.
 */
104 static inline void mrecac(uint8_t *s, uint8_t *d, int lx2, int h)
106 uint8_t *dp=d, *sp=s;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
108 movq_m2r(m_(sadd1),mm7);
109 int j; for( j=0; j<h; ++j ) {
/* fetch 8 source bytes into mm1 and 8 destination bytes into mm3 */
110 movq_m2r(m_(sp+0),mm1);
111 movq_m2r(m_(dp+0),mm3);
/* interleave with mm0 to widen bytes to 16-bit lanes (low half / high half) */
114 punpcklbw_r2r(mm0,mm1);
115 punpckhbw_r2r(mm0,mm2);
116 punpcklbw_r2r(mm0,mm3);
117 punpckhbw_r2r(mm0,mm4);
/* source + destination, then add the mm7 constant */
118 paddusw_r2r(mm3,mm1);
119 paddusw_r2r(mm4,mm2);
120 paddusw_r2r(mm7,mm1);
121 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store to the destination row */
124 packuswb_r2r(mm2,mm1);
125 movq_r2m(mm1,m_(dp+0));
/* step both pointers to the next row */
126 sp += lx2; dp += lx2;
/*
 * recv: vertical half-sample average of a 16-byte-wide block.
 *
 * For each of the h rows, every destination byte becomes the rounded-up
 * mean of the source byte and the byte lx further on:
 *     d[i] = (s[i] + s[lx + i] + 1) >> 1,   i = 0..15
 * after which all row pointers advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (overwritten)
 *   lx   byte offset from s to the second source row
 *   lx2  distance in bytes from one processed row to the next
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void recv(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
{
    const uint8_t *top = s;
    const uint8_t *bottom = s + lx;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        for (int i = 0; i < 16; ++i)
            dp[i] = (uint8_t)((uint32_t)(top[i] + bottom[i] + 1) >> 1);
        top += lx2;
        bottom += lx2;
        dp += lx2;
    }
}
/*
 * mrecv: MMX counterpart of recv; main() runs both on the same source buffer
 * and compares the two destination buffers.  Handles 16 bytes per row as two
 * 8-byte halves, combining the row at sp with the row at sp2 = s + lx.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2 and mm4,
 * a shift/divide step before each pack, and the braces are not visible here.
 * The *_m2r/_r2r/_r2m macros are defined outside this view; the notes below
 * assume they behave like the MMX instructions they are named after.
 */
155 static inline void mrecv(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
157 uint8_t *dp=d, *sp=s, *sp2=s+lx;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
159 movq_m2r(m_(sadd1),mm7);
160 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7: fetch both source rows */
161 movq_m2r(m_(sp +0),mm1);
162 movq_m2r(m_(sp2+0),mm3);
/* interleave with mm0 to widen bytes to 16-bit lanes */
165 punpcklbw_r2r(mm0,mm1);
166 punpckhbw_r2r(mm0,mm2);
167 punpcklbw_r2r(mm0,mm3);
168 punpckhbw_r2r(mm0,mm4);
/* row + row, then add the mm7 constant */
169 paddusw_r2r(mm3,mm1);
170 paddusw_r2r(mm4,mm2);
171 paddusw_r2r(mm7,mm1);
172 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
175 packuswb_r2r(mm2,mm1);
176 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
177 movq_m2r(m_(sp +8),mm1);
178 movq_m2r(m_(sp2+8),mm3);
181 punpcklbw_r2r(mm0,mm1);
182 punpckhbw_r2r(mm0,mm2);
183 punpcklbw_r2r(mm0,mm3);
184 punpckhbw_r2r(mm0,mm4);
185 paddusw_r2r(mm3,mm1);
186 paddusw_r2r(mm4,mm2);
187 paddusw_r2r(mm7,mm1);
188 paddusw_r2r(mm7,mm2);
191 packuswb_r2r(mm2,mm1);
192 movq_r2m(mm1,m_(dp+8));
/* step all three pointers to the next row */
193 sp += lx2; sp2 += lx2 ; dp += lx2;
/*
 * recvc: vertical half-sample average of an 8-byte-wide block.
 *
 * For each of the h rows, every destination byte becomes the rounded-up
 * mean of the source byte and the byte lx further on:
 *     d[i] = (s[i] + s[lx + i] + 1) >> 1,   i = 0..7
 * after which all row pointers advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (overwritten)
 *   lx   byte offset from s to the second source row
 *   lx2  distance in bytes from one processed row to the next
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void recvc(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
{
    const uint8_t *top = s;
    const uint8_t *bottom = s + lx;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        for (int i = 0; i < 8; ++i)
            dp[i] = (uint8_t)((uint32_t)(top[i] + bottom[i] + 1) >> 1);
        top += lx2;
        bottom += lx2;
        dp += lx2;
    }
}
/*
 * mrecvc: MMX counterpart of recvc; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 8 bytes per row,
 * combining the row at sp with the row at sp2 = s + lx.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2 and mm4,
 * a shift/divide step before the pack, and the braces are not visible here.
 * The *_m2r/_r2r/_r2m macros are defined outside this view; the notes below
 * assume they behave like the MMX instructions they are named after.
 */
214 static inline void mrecvc(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
216 uint8_t *dp=d, *sp=s, *sp2=s+lx;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
218 movq_m2r(m_(sadd1),mm7);
219 int j; for( j=0; j<h; ++j ) {
/* fetch 8 bytes from each of the two source rows */
220 movq_m2r(m_(sp +0),mm1);
221 movq_m2r(m_(sp2+0),mm3);
/* interleave with mm0 to widen bytes to 16-bit lanes */
224 punpcklbw_r2r(mm0,mm1);
225 punpckhbw_r2r(mm0,mm2);
226 punpcklbw_r2r(mm0,mm3);
227 punpckhbw_r2r(mm0,mm4);
/* row + row, then add the mm7 constant */
228 paddusw_r2r(mm3,mm1);
229 paddusw_r2r(mm4,mm2);
230 paddusw_r2r(mm7,mm1);
231 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
234 packuswb_r2r(mm2,mm1);
235 movq_r2m(mm1,m_(dp+0));
/* step all three pointers to the next row */
236 sp += lx2; sp2 += lx2; dp += lx2;
/*
 * recva: vertical half-sample average of a 16-byte-wide block, blended into d.
 *
 * For each of the h rows:
 *     v    = (s[i] + s[lx + i] + 1) >> 1
 *     d[i] = (d[i] + v + 1) >> 1,          i = 0..15
 * after which all row pointers advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (read, then overwritten)
 *   lx   byte offset from s to the second source row
 *   lx2  distance in bytes from one processed row to the next
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void recva(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
{
    const uint8_t *top = s;
    const uint8_t *bottom = s + lx;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        for (int i = 0; i < 16; ++i) {
            uint32_t v = (uint32_t)(top[i] + bottom[i] + 1) >> 1;
            dp[i] = (uint8_t)((dp[i] + v + 1) >> 1);
        }
        top += lx2;
        bottom += lx2;
        dp += lx2;
    }
}
/*
 * mrecva: MMX counterpart of recva; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 16 bytes per row
 * as two 8-byte halves, combining the rows at sp and sp2 = s + lx with the
 * existing destination bytes.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2, mm4 and
 * mm6, shift/divide steps after each constant add, and the braces are not
 * visible here.  The *_m2r/_r2r/_r2m macros are defined outside this view;
 * the notes below assume they behave like the MMX instructions they are
 * named after.
 */
266 static inline void mrecva(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
268 uint8_t *dp=d, *sp=s, *sp2=s+lx;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
270 movq_m2r(m_(sadd1),mm7);
271 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7: fetch both source rows and the destination */
272 movq_m2r(m_(sp +0),mm1);
273 movq_m2r(m_(sp2+0),mm3);
274 movq_m2r(m_(dp +0),mm5);
/* interleave with mm0 to widen bytes to 16-bit lanes */
278 punpcklbw_r2r(mm0,mm1);
279 punpckhbw_r2r(mm0,mm2);
280 punpcklbw_r2r(mm0,mm3);
281 punpckhbw_r2r(mm0,mm4);
282 punpcklbw_r2r(mm0,mm5);
283 punpckhbw_r2r(mm0,mm6);
/* first stage: row + row, then add the mm7 constant */
284 paddusw_r2r(mm3,mm1);
285 paddusw_r2r(mm4,mm2);
286 paddusw_r2r(mm7,mm1);
287 paddusw_r2r(mm7,mm2);
/* second stage: add the destination lanes, then the mm7 constant again */
290 paddusw_r2r(mm5,mm1);
291 paddusw_r2r(mm6,mm2);
292 paddusw_r2r(mm7,mm1);
293 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
296 packuswb_r2r(mm2,mm1);
297 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
298 movq_m2r(m_(sp +8),mm1);
299 movq_m2r(m_(sp2+8),mm3);
300 movq_m2r(m_(dp +8),mm5);
304 punpcklbw_r2r(mm0,mm1);
305 punpckhbw_r2r(mm0,mm2);
306 punpcklbw_r2r(mm0,mm3);
307 punpckhbw_r2r(mm0,mm4);
308 punpcklbw_r2r(mm0,mm5);
309 punpckhbw_r2r(mm0,mm6);
310 paddusw_r2r(mm3,mm1);
311 paddusw_r2r(mm4,mm2);
312 paddusw_r2r(mm7,mm1);
313 paddusw_r2r(mm7,mm2);
316 paddusw_r2r(mm5,mm1);
317 paddusw_r2r(mm6,mm2);
318 paddusw_r2r(mm7,mm1);
319 paddusw_r2r(mm7,mm2);
322 packuswb_r2r(mm2,mm1);
323 movq_r2m(mm1,m_(dp+8));
/* step all three pointers to the next row */
324 sp += lx2; sp2 += lx2; dp += lx2;
/*
 * recvac: vertical half-sample average of an 8-byte-wide block, blended
 * into d.
 *
 * For each of the h rows:
 *     v    = (s[i] + s[lx + i] + 1) >> 1
 *     d[i] = (d[i] + v + 1) >> 1,          i = 0..7
 * after which all row pointers advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (read, then overwritten)
 *   lx   byte offset from s to the second source row
 *   lx2  distance in bytes from one processed row to the next
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void recvac(uint8_t *s, uint8_t *d, int lx,int lx2, int h)
{
    const uint8_t *top = s;
    const uint8_t *bottom = s + lx;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        for (int i = 0; i < 8; ++i) {
            uint32_t v = (uint32_t)(top[i] + bottom[i] + 1) >> 1;
            dp[i] = (uint8_t)((dp[i] + v + 1) >> 1);
        }
        top += lx2;
        bottom += lx2;
        dp += lx2;
    }
}
/*
 * mrecvac: MMX counterpart of recvac; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 8 bytes per row,
 * combining the rows at sp and sp2 = s + lx with the existing destination.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2, mm4 and
 * mm6, shift/divide steps after each constant add, and the braces are not
 * visible here.  The *_m2r/_r2r/_r2m macros are defined outside this view;
 * the notes below assume they behave like the MMX instructions they are
 * named after.
 */
346 static inline void mrecvac(uint8_t *s, uint8_t *d, int lx,int lx2, int h)
348 uint8_t *dp=d, *sp=s, *sp2=s+lx;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
350 movq_m2r(m_(sadd1),mm7);
351 int j; for( j=0; j<h; ++j ) {
/* fetch 8 bytes from both source rows and from the destination */
352 movq_m2r(m_(sp +0),mm1);
353 movq_m2r(m_(sp2+0),mm3);
354 movq_m2r(m_(dp +0),mm5);
/* interleave with mm0 to widen bytes to 16-bit lanes */
358 punpcklbw_r2r(mm0,mm1);
359 punpckhbw_r2r(mm0,mm2);
360 punpcklbw_r2r(mm0,mm3);
361 punpckhbw_r2r(mm0,mm4);
362 punpcklbw_r2r(mm0,mm5);
363 punpckhbw_r2r(mm0,mm6);
/* first stage: row + row, then add the mm7 constant */
364 paddusw_r2r(mm3,mm1);
365 paddusw_r2r(mm4,mm2);
366 paddusw_r2r(mm7,mm1);
367 paddusw_r2r(mm7,mm2);
/* second stage: add the destination lanes, then the mm7 constant again */
370 paddusw_r2r(mm5,mm1);
371 paddusw_r2r(mm6,mm2);
372 paddusw_r2r(mm7,mm1);
373 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
376 packuswb_r2r(mm2,mm1);
377 movq_r2m(mm1,m_(dp+0));
/* step all three pointers to the next row */
378 sp += lx2; sp2 += lx2; dp += lx2;
/*
 * rech: horizontal half-sample average of a 16-byte-wide block.
 *
 * For each of the h rows, every destination byte becomes the rounded-up
 * mean of a source byte and its right-hand neighbour:
 *     d[i] = (s[i] + s[i + 1] + 1) >> 1,   i = 0..15
 * so 17 source bytes are read per row.  Both row pointers then advance by
 * lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (overwritten)
 *   lx2  distance in bytes from one row to the next, for both s and d
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void rech(uint8_t *s, uint8_t *d, int lx2, int h)
{
    const uint8_t *sp = s;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        /* Carry the right-hand sample over as the next left-hand one so
           each source byte is fetched exactly once, before any later store. */
        uint32_t left = sp[0];
        for (int i = 0; i < 16; ++i) {
            uint32_t right = sp[i + 1];
            dp[i] = (uint8_t)((left + right + 1) >> 1);
            left = right;
        }
        sp += lx2;
        dp += lx2;
    }
}
/*
 * mrech: MMX counterpart of rech; main() runs both on the same source buffer
 * and compares the two destination buffers.  Handles 16 bytes per row as two
 * 8-byte halves, combining each source byte group with the group one byte
 * further on (sp+0 with sp+1, sp+8 with sp+9).
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2 and mm4,
 * a shift/divide step before each pack, and the braces are not visible here.
 * The *_m2r/_r2r/_r2m macros are defined outside this view; the notes below
 * assume they behave like the MMX instructions they are named after.
 */
410 static inline void mrech(uint8_t *s, uint8_t *d, int lx2, int h)
412 uint8_t *dp=d, *sp=s;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
414 movq_m2r(m_(sadd1),mm7);
415 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 and their right-hand neighbours 1..8 */
416 movq_m2r(m_(sp+0),mm1);
417 movq_m2r(m_(sp+1),mm3);
/* interleave with mm0 to widen bytes to 16-bit lanes */
420 punpcklbw_r2r(mm0,mm1);
421 punpckhbw_r2r(mm0,mm2);
422 punpcklbw_r2r(mm0,mm3);
423 punpckhbw_r2r(mm0,mm4);
/* sample + neighbour, then add the mm7 constant */
424 paddusw_r2r(mm3,mm1);
425 paddusw_r2r(mm4,mm2);
426 paddusw_r2r(mm7,mm1);
427 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
430 packuswb_r2r(mm2,mm1);
431 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 and their right-hand neighbours 9..16 */
432 movq_m2r(m_(sp+8),mm1);
433 movq_m2r(m_(sp+9),mm3);
436 punpcklbw_r2r(mm0,mm1);
437 punpckhbw_r2r(mm0,mm2);
438 punpcklbw_r2r(mm0,mm3);
439 punpckhbw_r2r(mm0,mm4);
440 paddusw_r2r(mm3,mm1);
441 paddusw_r2r(mm4,mm2);
442 paddusw_r2r(mm7,mm1);
443 paddusw_r2r(mm7,mm2);
446 packuswb_r2r(mm2,mm1);
447 movq_r2m(mm1,m_(dp+8));
/* step both pointers to the next row */
448 sp += lx2; dp += lx2;
/*
 * rechc: horizontal half-sample average of an 8-byte-wide block.
 *
 * For each of the h rows, every destination byte becomes the rounded-up
 * mean of a source byte and its right-hand neighbour:
 *     d[i] = (s[i] + s[i + 1] + 1) >> 1,   i = 0..7
 * so 9 source bytes are read per row.  Both row pointers then advance by
 * lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (overwritten)
 *   lx2  distance in bytes from one row to the next, for both s and d
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void rechc(uint8_t *s, uint8_t *d, int lx2, int h)
{
    const uint8_t *sp = s;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        /* Carry the right-hand sample over as the next left-hand one so
           each source byte is fetched exactly once, before any later store. */
        uint32_t left = sp[0];
        for (int i = 0; i < 8; ++i) {
            uint32_t right = sp[i + 1];
            dp[i] = (uint8_t)((left + right + 1) >> 1);
            left = right;
        }
        sp += lx2;
        dp += lx2;
    }
}
/*
 * mrechc: MMX counterpart of rechc; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 8 bytes per row,
 * combining the bytes at sp+0 with those at sp+1.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2 and mm4,
 * a shift/divide step before the pack, and the braces are not visible here.
 * The *_m2r/_r2r/_r2m macros are defined outside this view; the notes below
 * assume they behave like the MMX instructions they are named after.
 */
472 static inline void mrechc(uint8_t *s, uint8_t *d, int lx2, int h)
474 uint8_t *dp=d, *sp=s;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
476 movq_m2r(m_(sadd1),mm7);
477 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 and their right-hand neighbours 1..8 */
478 movq_m2r(m_(sp+0),mm1);
479 movq_m2r(m_(sp+1),mm3);
/* interleave with mm0 to widen bytes to 16-bit lanes */
482 punpcklbw_r2r(mm0,mm1);
483 punpckhbw_r2r(mm0,mm2);
484 punpcklbw_r2r(mm0,mm3);
485 punpckhbw_r2r(mm0,mm4);
/* sample + neighbour, then add the mm7 constant */
486 paddusw_r2r(mm3,mm1);
487 paddusw_r2r(mm4,mm2);
488 paddusw_r2r(mm7,mm1);
489 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
492 packuswb_r2r(mm2,mm1);
493 movq_r2m(mm1,m_(dp+0));
/* step both pointers to the next row */
494 sp += lx2; dp += lx2;
/*
 * recha: horizontal half-sample average of a 16-byte-wide block, blended
 * into d.
 *
 * For each of the h rows:
 *     v    = (s[i] + s[i + 1] + 1) >> 1
 *     d[i] = (d[i] + v + 1) >> 1,          i = 0..15
 * so 17 source bytes are read per row.  Both row pointers then advance by
 * lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (read, then overwritten)
 *   lx2  distance in bytes from one row to the next, for both s and d
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void recha(uint8_t *s, uint8_t *d,int lx2, int h)
{
    const uint8_t *sp = s;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        /* Carry the right-hand sample over as the next left-hand one so
           each source byte is fetched exactly once, before any later store. */
        uint32_t left = sp[0];
        for (int i = 0; i < 16; ++i) {
            uint32_t right = sp[i + 1];
            uint32_t v = (left + right + 1) >> 1;
            dp[i] = (uint8_t)((dp[i] + v + 1) >> 1);
            left = right;
        }
        sp += lx2;
        dp += lx2;
    }
}
/*
 * mrecha: MMX counterpart of recha; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 16 bytes per row
 * as two 8-byte halves, combining each source byte group with the group one
 * byte further on and with the existing destination bytes.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2, mm4 and
 * mm6, shift/divide steps after each constant add, and the braces are not
 * visible here.  The *_m2r/_r2r/_r2m macros are defined outside this view;
 * the notes below assume they behave like the MMX instructions they are
 * named after.
 */
525 static inline void mrecha(uint8_t *s, uint8_t *d,int lx2, int h)
527 uint8_t *dp=d, *sp=s;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
529 movq_m2r(m_(sadd1),mm7);
530 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7, their right-hand neighbours, and the destination */
531 movq_m2r(m_(sp+0),mm1);
532 movq_m2r(m_(sp+1),mm3);
533 movq_m2r(m_(dp+0),mm5);
/* interleave with mm0 to widen bytes to 16-bit lanes */
537 punpcklbw_r2r(mm0,mm1);
538 punpckhbw_r2r(mm0,mm2);
539 punpcklbw_r2r(mm0,mm3);
540 punpckhbw_r2r(mm0,mm4);
541 punpcklbw_r2r(mm0,mm5);
542 punpckhbw_r2r(mm0,mm6);
/* first stage: sample + neighbour, then add the mm7 constant */
543 paddusw_r2r(mm3,mm1);
544 paddusw_r2r(mm4,mm2);
545 paddusw_r2r(mm7,mm1);
546 paddusw_r2r(mm7,mm2);
/* second stage: add the destination lanes, then the mm7 constant again */
549 paddusw_r2r(mm5,mm1);
550 paddusw_r2r(mm6,mm2);
551 paddusw_r2r(mm7,mm1);
552 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
555 packuswb_r2r(mm2,mm1);
556 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
557 movq_m2r(m_(sp+8),mm1);
558 movq_m2r(m_(sp+9),mm3);
559 movq_m2r(m_(dp+8),mm5);
563 punpcklbw_r2r(mm0,mm1);
564 punpckhbw_r2r(mm0,mm2);
565 punpcklbw_r2r(mm0,mm3);
566 punpckhbw_r2r(mm0,mm4);
567 punpcklbw_r2r(mm0,mm5);
568 punpckhbw_r2r(mm0,mm6);
569 paddusw_r2r(mm3,mm1);
570 paddusw_r2r(mm4,mm2);
571 paddusw_r2r(mm7,mm1);
572 paddusw_r2r(mm7,mm2);
575 paddusw_r2r(mm5,mm1);
576 paddusw_r2r(mm6,mm2);
577 paddusw_r2r(mm7,mm1);
578 paddusw_r2r(mm7,mm2);
581 packuswb_r2r(mm2,mm1);
582 movq_r2m(mm1,m_(dp+8));
/* step both pointers to the next row */
583 sp += lx2; dp += lx2;
/*
 * rechac: horizontal half-sample average of an 8-byte-wide block, blended
 * into d.
 *
 * For each of the h rows:
 *     v    = (s[i] + s[i + 1] + 1) >> 1
 *     d[i] = (d[i] + v + 1) >> 1,          i = 0..7
 * so 9 source bytes are read per row.  Both row pointers then advance by
 * lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (read, then overwritten)
 *   lx2  distance in bytes from one row to the next, for both s and d
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void rechac(uint8_t *s, uint8_t *d, int lx2, int h)
{
    const uint8_t *sp = s;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        /* Carry the right-hand sample over as the next left-hand one so
           each source byte is fetched exactly once, before any later store. */
        uint32_t left = sp[0];
        for (int i = 0; i < 8; ++i) {
            uint32_t right = sp[i + 1];
            uint32_t v = (left + right + 1) >> 1;
            dp[i] = (uint8_t)((dp[i] + v + 1) >> 1);
            left = right;
        }
        sp += lx2;
        dp += lx2;
    }
}
/*
 * mrechac: MMX counterpart of rechac; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 8 bytes per row,
 * combining the bytes at sp+0 with those at sp+1 and with the existing
 * destination bytes.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2, mm4 and
 * mm6, shift/divide steps after each constant add, and the braces are not
 * visible here.  The *_m2r/_r2r/_r2m macros are defined outside this view;
 * the notes below assume they behave like the MMX instructions they are
 * named after.
 */
607 static inline void mrechac(uint8_t *s, uint8_t *d, int lx2, int h)
609 uint8_t *dp=d, *sp=s;
/* mm7 <- sadd1 (1 in each 16-bit lane) */
611 movq_m2r(m_(sadd1),mm7);
612 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7, their right-hand neighbours, and the destination */
613 movq_m2r(m_(sp+0),mm1);
614 movq_m2r(m_(sp+1),mm3);
615 movq_m2r(m_(dp+0),mm5);
/* interleave with mm0 to widen bytes to 16-bit lanes */
619 punpcklbw_r2r(mm0,mm1);
620 punpckhbw_r2r(mm0,mm2);
621 punpcklbw_r2r(mm0,mm3);
622 punpckhbw_r2r(mm0,mm4);
623 punpcklbw_r2r(mm0,mm5);
624 punpckhbw_r2r(mm0,mm6);
/* first stage: sample + neighbour, then add the mm7 constant */
625 paddusw_r2r(mm3,mm1);
626 paddusw_r2r(mm4,mm2);
627 paddusw_r2r(mm7,mm1);
628 paddusw_r2r(mm7,mm2);
/* second stage: add the destination lanes, then the mm7 constant again */
631 paddusw_r2r(mm5,mm1);
632 paddusw_r2r(mm6,mm2);
633 paddusw_r2r(mm7,mm1);
634 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
637 packuswb_r2r(mm2,mm1);
638 movq_r2m(mm1,m_(dp+0));
/* step both pointers to the next row */
639 sp += lx2; dp += lx2;
/*
 * rec4: four-point (horizontal + vertical) half-sample average of a
 * 16-byte-wide block.
 *
 * For each of the h rows, with the second source row lx bytes after the
 * first:
 *     d[i] = (s[i] + s[i+1] + s[lx+i] + s[lx+i+1] + 2) >> 2,   i = 0..15
 * so 17 bytes are read from each of the two source rows.  All row pointers
 * then advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (overwritten)
 *   lx   byte offset from s to the second source row
 *   lx2  distance in bytes from one processed row to the next
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void rec4(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
{
    const uint8_t *top = s;
    const uint8_t *bottom = s + lx;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        /* Seed the left-hand column at the start of every row; the
           right-hand column of one step becomes the left-hand of the next. */
        uint32_t tl = top[0];
        uint32_t bl = bottom[0];
        for (int i = 0; i < 16; ++i) {
            uint32_t tr = top[i + 1];
            uint32_t br = bottom[i + 1];
            dp[i] = (uint8_t)((tl + tr + bl + br + 2) >> 2);
            tl = tr;
            bl = br;
        }
        top += lx2;
        bottom += lx2;
        dp += lx2;
    }
}
/*
 * mrec4: MMX counterpart of rec4; main() runs both on the same source buffer
 * and compares the two destination buffers.  Handles 16 bytes per row as two
 * 8-byte halves, summing four byte groups: sp+k, sp+k+1, sp2+k and sp2+k+1
 * (k = 0 and 8), where sp2 = s + lx.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2, mm4 and
 * mm6, a shift/divide step before each pack, and the braces are not visible
 * here.  The *_m2r/_r2r/_r2m macros are defined outside this view; the notes
 * below assume they behave like the MMX instructions they are named after.
 */
671 static inline void mrec4(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
673 uint8_t *dp=d, *sp=s, *sp2=s+lx;
/* mm7 <- sadd2 (2 in each 16-bit lane) */
675 movq_m2r(m_(sadd2),mm7);
676 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7: upper row, upper row shifted by one, lower row */
677 movq_m2r(m_(sp +0),mm1);
678 movq_m2r(m_(sp +1),mm3);
679 movq_m2r(m_(sp2+0),mm5);
/* interleave with mm0 to widen bytes to 16-bit lanes */
683 punpcklbw_r2r(mm0,mm1);
684 punpckhbw_r2r(mm0,mm2);
685 punpcklbw_r2r(mm0,mm3);
686 punpckhbw_r2r(mm0,mm4);
687 punpcklbw_r2r(mm0,mm5);
688 punpckhbw_r2r(mm0,mm6);
/* upper pair summed; mm3 is then reused for the lower row shifted by one */
689 paddusw_r2r(mm3,mm1);
690 movq_m2r(m_(sp2+1),mm3);
691 paddusw_r2r(mm4,mm2);
693 punpcklbw_r2r(mm0,mm3);
694 punpckhbw_r2r(mm0,mm4);
/* lower pair summed, added to the upper pair, then the mm7 constant */
695 paddusw_r2r(mm5,mm3);
696 paddusw_r2r(mm6,mm4);
697 paddusw_r2r(mm3,mm1);
698 paddusw_r2r(mm4,mm2);
699 paddusw_r2r(mm7,mm1);
700 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
703 packuswb_r2r(mm2,mm1);
704 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
705 movq_m2r(m_(sp +8),mm1);
706 movq_m2r(m_(sp +9),mm3);
707 movq_m2r(m_(sp2+8),mm5);
711 punpcklbw_r2r(mm0,mm1);
712 punpckhbw_r2r(mm0,mm2);
713 punpcklbw_r2r(mm0,mm3);
714 punpckhbw_r2r(mm0,mm4);
715 punpcklbw_r2r(mm0,mm5);
716 punpckhbw_r2r(mm0,mm6);
717 paddusw_r2r(mm3,mm1);
718 movq_m2r(m_(sp2+9),mm3);
719 paddusw_r2r(mm4,mm2);
721 punpcklbw_r2r(mm0,mm3);
722 punpckhbw_r2r(mm0,mm4);
723 paddusw_r2r(mm5,mm3);
724 paddusw_r2r(mm6,mm4);
725 paddusw_r2r(mm3,mm1);
726 paddusw_r2r(mm4,mm2);
727 paddusw_r2r(mm7,mm1);
728 paddusw_r2r(mm7,mm2);
731 packuswb_r2r(mm2,mm1);
732 movq_r2m(mm1,m_(dp+8));
/* step all three pointers to the next row */
733 sp += lx2; sp2 += lx2; dp += lx2;
/*
 * rec4c: four-point (horizontal + vertical) half-sample average of an
 * 8-byte-wide block.
 *
 * For each of the h rows, with the second source row lx bytes after the
 * first:
 *     d[i] = (s[i] + s[i+1] + s[lx+i] + s[lx+i+1] + 2) >> 2,   i = 0..7
 * so 9 bytes are read from each of the two source rows.  All row pointers
 * then advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (overwritten)
 *   lx   byte offset from s to the second source row
 *   lx2  distance in bytes from one processed row to the next
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void rec4c(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
{
    const uint8_t *top = s;
    const uint8_t *bottom = s + lx;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        /* Seed the left-hand column at the start of every row; the
           right-hand column of one step becomes the left-hand of the next. */
        uint32_t tl = top[0];
        uint32_t bl = bottom[0];
        for (int i = 0; i < 8; ++i) {
            uint32_t tr = top[i + 1];
            uint32_t br = bottom[i + 1];
            dp[i] = (uint8_t)((tl + tr + bl + br + 2) >> 2);
            tl = tr;
            bl = br;
        }
        top += lx2;
        bottom += lx2;
        dp += lx2;
    }
}
/*
 * mrec4c: MMX counterpart of rec4c; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 8 bytes per row,
 * summing four byte groups: sp+0, sp+1, sp2+0 and sp2+1, where sp2 = s + lx.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2, mm4 and
 * mm6, a shift/divide step before the pack, and the braces are not visible
 * here.  The *_m2r/_r2r/_r2m macros are defined outside this view; the notes
 * below assume they behave like the MMX instructions they are named after.
 */
757 static inline void mrec4c(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
759 uint8_t *dp=d, *sp=s, *sp2=s+lx;
/* mm7 <- sadd2 (2 in each 16-bit lane) */
761 movq_m2r(m_(sadd2),mm7);
762 int j; for( j=0; j<h; ++j ) {
/* upper row, upper row shifted by one, lower row */
763 movq_m2r(m_(sp +0),mm1);
764 movq_m2r(m_(sp +1),mm3);
765 movq_m2r(m_(sp2+0),mm5);
/* interleave with mm0 to widen bytes to 16-bit lanes */
769 punpcklbw_r2r(mm0,mm1);
770 punpckhbw_r2r(mm0,mm2);
771 punpcklbw_r2r(mm0,mm3);
772 punpckhbw_r2r(mm0,mm4);
773 punpcklbw_r2r(mm0,mm5);
774 punpckhbw_r2r(mm0,mm6);
/* upper pair summed; mm3 is then reused for the lower row shifted by one */
775 paddusw_r2r(mm3,mm1);
776 movq_m2r(m_(sp2+1),mm3);
777 paddusw_r2r(mm4,mm2);
779 punpcklbw_r2r(mm0,mm3);
780 punpckhbw_r2r(mm0,mm4);
/* lower pair summed, added to the upper pair, then the mm7 constant */
781 paddusw_r2r(mm5,mm3);
782 paddusw_r2r(mm6,mm4);
783 paddusw_r2r(mm3,mm1);
784 paddusw_r2r(mm4,mm2);
785 paddusw_r2r(mm7,mm1);
786 paddusw_r2r(mm7,mm2);
/* narrow back to bytes and store */
789 packuswb_r2r(mm2,mm1);
790 movq_r2m(mm1,m_(dp+0));
/* step all three pointers to the next row */
791 sp += lx2; sp2 += lx2; dp += lx2;
/*
 * rec4a: four-point (horizontal + vertical) half-sample average of a
 * 16-byte-wide block, blended into d.
 *
 * For each of the h rows, with the second source row lx bytes after the
 * first:
 *     v    = (s[i] + s[i+1] + s[lx+i] + s[lx+i+1] + 2) >> 2
 *     d[i] = (d[i] + v + 1) >> 1,                         i = 0..15
 * so 17 bytes are read from each of the two source rows.  All row pointers
 * then advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (read, then overwritten)
 *   lx   byte offset from s to the second source row
 *   lx2  distance in bytes from one processed row to the next
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void rec4a(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
{
    const uint8_t *top = s;
    const uint8_t *bottom = s + lx;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        /* Seed the left-hand column at the start of every row; the
           right-hand column of one step becomes the left-hand of the next. */
        uint32_t tl = top[0];
        uint32_t bl = bottom[0];
        for (int i = 0; i < 16; ++i) {
            uint32_t tr = top[i + 1];
            uint32_t br = bottom[i + 1];
            uint32_t v = (tl + tr + bl + br + 2) >> 2;
            dp[i] = (uint8_t)((dp[i] + v + 1) >> 1);
            tl = tr;
            bl = br;
        }
        top += lx2;
        bottom += lx2;
        dp += lx2;
    }
}
/*
 * mrec4a: MMX counterpart of rec4a; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 16 bytes per row
 * as two 8-byte halves, summing four source byte groups (sp+k, sp+k+1,
 * sp2+k, sp2+k+1 with sp2 = s + lx) and then combining with the destination.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2, mm4 and
 * mm6, shift/divide steps after each constant add, and the braces are not
 * visible here.  The *_m2r/_r2r/_r2m macros are defined outside this view;
 * the notes below assume they behave like the MMX instructions they are
 * named after.
 */
823 static inline void mrec4a(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
825 uint8_t *dp=d, *sp=s, *sp2=s+lx;
/* mm7 <- sadd2 (2 in each 16-bit lane) */
827 movq_m2r(m_(sadd2),mm7);
828 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7: upper row, upper row shifted by one, lower row */
829 movq_m2r(m_(sp +0),mm1);
830 movq_m2r(m_(sp +1),mm3);
831 movq_m2r(m_(sp2+0),mm5);
/* interleave with mm0 to widen bytes to 16-bit lanes */
835 punpcklbw_r2r(mm0,mm1);
836 punpckhbw_r2r(mm0,mm2);
837 punpcklbw_r2r(mm0,mm3);
838 punpckhbw_r2r(mm0,mm4);
839 punpcklbw_r2r(mm0,mm5);
840 punpckhbw_r2r(mm0,mm6);
/* upper pair summed; mm3 is then reused for the lower row shifted by one */
841 paddusw_r2r(mm3,mm1);
842 movq_m2r(m_(sp2+1),mm3);
843 paddusw_r2r(mm4,mm2);
845 punpcklbw_r2r(mm0,mm3);
846 punpckhbw_r2r(mm0,mm4);
/* lower pair summed and added to the upper pair */
847 paddusw_r2r(mm5,mm3);
848 paddusw_r2r(mm6,mm4);
849 paddusw_r2r(mm3,mm1);
850 paddusw_r2r(mm4,mm2);
/* mm3 is reused once more, now for the destination bytes; add mm7 constant */
851 movq_m2r(m_(dp +0),mm3);
852 paddusw_r2r(mm7,mm1);
854 paddusw_r2r(mm7,mm2);
855 punpcklbw_r2r(mm0,mm3);
856 punpckhbw_r2r(mm0,mm4);
/* second stage uses sadd1 (1 per lane), reloaded into mm5 on every pass */
859 movq_m2r(m_(sadd1),mm5);
860 paddusw_r2r(mm3,mm1);
861 paddusw_r2r(mm4,mm2);
862 paddusw_r2r(mm5,mm1);
863 paddusw_r2r(mm5,mm2);
/* narrow back to bytes and store */
866 packuswb_r2r(mm2,mm1);
867 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
868 movq_m2r(m_(sp +8),mm1);
869 movq_m2r(m_(sp +9),mm3);
870 movq_m2r(m_(sp2+8),mm5);
874 punpcklbw_r2r(mm0,mm1);
875 punpckhbw_r2r(mm0,mm2);
876 punpcklbw_r2r(mm0,mm3);
877 punpckhbw_r2r(mm0,mm4);
878 punpcklbw_r2r(mm0,mm5);
879 punpckhbw_r2r(mm0,mm6);
880 paddusw_r2r(mm3,mm1);
881 movq_m2r(m_(sp2+9),mm3);
882 paddusw_r2r(mm4,mm2);
884 punpcklbw_r2r(mm0,mm3);
885 punpckhbw_r2r(mm0,mm4);
886 paddusw_r2r(mm5,mm3);
887 paddusw_r2r(mm6,mm4);
888 paddusw_r2r(mm3,mm1);
889 paddusw_r2r(mm4,mm2);
890 movq_m2r(m_(dp +8),mm3);
891 paddusw_r2r(mm7,mm1);
893 paddusw_r2r(mm7,mm2);
894 punpcklbw_r2r(mm0,mm3);
895 punpckhbw_r2r(mm0,mm4);
898 movq_m2r(m_(sadd1),mm5);
899 paddusw_r2r(mm3,mm1);
900 paddusw_r2r(mm4,mm2);
901 paddusw_r2r(mm5,mm1);
902 paddusw_r2r(mm5,mm2);
905 packuswb_r2r(mm2,mm1);
906 movq_r2m(mm1,m_(dp+8));
/* step all three pointers to the next row */
907 sp += lx2; sp2 += lx2; dp += lx2;
/*
 * rec4ac: four-point (horizontal + vertical) half-sample average of an
 * 8-byte-wide block, blended into d.
 *
 * For each of the h rows, with the second source row lx bytes after the
 * first:
 *     v    = (s[i] + s[i+1] + s[lx+i] + s[lx+i+1] + 2) >> 2
 *     d[i] = (d[i] + v + 1) >> 1,                         i = 0..7
 * so 9 bytes are read from each of the two source rows.  All row pointers
 * then advance by lx2 bytes.
 *
 *   s    source bytes (only read)
 *   d    destination bytes (read, then overwritten)
 *   lx   byte offset from s to the second source row
 *   lx2  distance in bytes from one processed row to the next
 *   h    number of rows; nothing is touched when h <= 0
 */
static inline void rec4ac(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
{
    const uint8_t *top = s;
    const uint8_t *bottom = s + lx;
    uint8_t *dp = d;

    for (int j = 0; j < h; ++j) {
        /* Seed the left-hand column at the start of every row; the
           right-hand column of one step becomes the left-hand of the next. */
        uint32_t tl = top[0];
        uint32_t bl = bottom[0];
        for (int i = 0; i < 8; ++i) {
            uint32_t tr = top[i + 1];
            uint32_t br = bottom[i + 1];
            uint32_t v = (tl + tr + bl + br + 2) >> 2;
            dp[i] = (uint8_t)((dp[i] + v + 1) >> 1);
            tl = tr;
            bl = br;
        }
        top += lx2;
        bottom += lx2;
        dp += lx2;
    }
}
/*
 * mrec4ac: MMX counterpart of rec4ac; main() runs both on the same source
 * buffer and compares the two destination buffers.  Handles 8 bytes per row,
 * summing four source byte groups (sp+0, sp+1, sp2+0, sp2+1 with
 * sp2 = s + lx) and then combining with the destination.
 *
 * NOTE(review): this excerpt is incomplete.  Any setup of mm0, mm2, mm4 and
 * mm6, shift/divide steps after each constant add, and the braces are not
 * visible here.  The *_m2r/_r2r/_r2m macros are defined outside this view;
 * the notes below assume they behave like the MMX instructions they are
 * named after.
 */
931 static inline void mrec4ac(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
933 uint8_t *dp=d, *sp=s, *sp2=s+lx;
/* mm7 <- sadd2 (2 in each 16-bit lane) */
935 movq_m2r(m_(sadd2),mm7);
936 int j; for( j=0; j<h; ++j ) {
/* upper row, upper row shifted by one, lower row */
937 movq_m2r(m_(sp +0),mm1);
938 movq_m2r(m_(sp +1),mm3);
939 movq_m2r(m_(sp2+0),mm5);
/* interleave with mm0 to widen bytes to 16-bit lanes */
943 punpcklbw_r2r(mm0,mm1);
944 punpckhbw_r2r(mm0,mm2);
945 punpcklbw_r2r(mm0,mm3);
946 punpckhbw_r2r(mm0,mm4);
947 punpcklbw_r2r(mm0,mm5);
948 punpckhbw_r2r(mm0,mm6);
/* upper pair summed; mm3 is then reused for the lower row shifted by one */
949 paddusw_r2r(mm3,mm1);
950 movq_m2r(m_(sp2+1),mm3);
951 paddusw_r2r(mm4,mm2);
953 punpcklbw_r2r(mm0,mm3);
954 punpckhbw_r2r(mm0,mm4);
/* lower pair summed and added to the upper pair */
955 paddusw_r2r(mm5,mm3);
956 paddusw_r2r(mm6,mm4);
957 paddusw_r2r(mm3,mm1);
958 paddusw_r2r(mm4,mm2);
/* mm3 is reused once more, now for the destination bytes; add mm7 constant */
959 movq_m2r(m_(dp +0),mm3);
960 paddusw_r2r(mm7,mm1);
962 paddusw_r2r(mm7,mm2);
963 punpcklbw_r2r(mm0,mm3);
964 punpckhbw_r2r(mm0,mm4);
/* second stage uses sadd1 (1 per lane), reloaded into mm5 on every pass */
967 movq_m2r(m_(sadd1),mm5);
968 paddusw_r2r(mm3,mm1);
969 paddusw_r2r(mm4,mm2);
970 paddusw_r2r(mm5,mm1);
971 paddusw_r2r(mm5,mm2);
/* narrow back to bytes and store */
974 packuswb_r2r(mm2,mm1);
975 movq_r2m(mm1,m_(dp+0));
/* step all three pointers to the next row */
976 sp += lx2; sp2 += lx2; dp += lx2;
/*
 * main: self-test driver.  Each scalar routine is run with dat1 as the
 * destination and its m-prefixed MMX counterpart with dat2, both reading the
 * same source dat0; the two destinations are then compared and dumped.
 *
 * NOTE(review): this excerpt is incomplete -- the enclosing loops, the code
 * that selects which pair of routines runs (presumably keyed on m), the
 * statements guarded by the comparisons, and the end of the function are not
 * visible here.
 */
983 int main(int ac, char **av)
985 int i, j, k, l, m, n, done;
/* 32-byte buffers: common source, scalar destination, MMX destination */
986 uint8_t dat0[32], dat1[32], dat2[32];
993 printf("m=%d, i=%d, j=%d, k=%d, n=%d\n",m, i, j, k, n);
/* fill the first n bytes of both destinations identically, and byte l+16 of
   all three buffers with k */
994 for( l=0; l<n; ++l ) {
996 dat1[l] = dat2[l] = j+l;
997 dat0[l+16] = dat1[l+16] = dat2[l+16] = k;
/* scalar/MMX pairs; every call processes a single row (h = 1) with a row
   stride of 16, and the lx-taking variants use lx = 0x10 */
1001 reca (&dat0[0], &dat1[0], 16, 1);
1002 mreca (&dat0[0], &dat2[0], 16, 1);
1005 recac (&dat0[0], &dat1[0], 16, 1);
1006 mrecac (&dat0[0], &dat2[0], 16, 1);
1009 recv (&dat0[0], &dat1[0], 0x10, 16, 1);
1010 mrecv (&dat0[0], &dat2[0], 0x10, 16, 1);
1013 recvc (&dat0[0], &dat1[0], 0x10, 16, 1);
1014 mrecvc (&dat0[0], &dat2[0], 0x10, 16, 1);
1017 recva (&dat0[0], &dat1[0], 0x10, 16, 1);
1018 mrecva (&dat0[0], &dat2[0], 0x10, 16, 1);
1021 recvac (&dat0[0], &dat1[0], 0x10, 16, 1);
1022 mrecvac(&dat0[0], &dat2[0], 0x10, 16, 1);
1025 rech (&dat0[0], &dat1[0], 16, 1);
1026 mrech (&dat0[0], &dat2[0], 16, 1);
1029 rechc (&dat0[0], &dat1[0], 16, 1);
1030 mrechc (&dat0[0], &dat2[0], 16, 1);
1033 recha (&dat0[0], &dat1[0], 16, 1);
1034 mrecha (&dat0[0], &dat2[0], 16, 1);
1037 rechac (&dat0[0], &dat1[0], 16, 1);
1038 mrechac(&dat0[0], &dat2[0], 16, 1);
1041 rec4 (&dat0[0], &dat1[0], 0x10, 16, 1);
1042 mrec4 (&dat0[0], &dat2[0], 0x10, 16, 1);
1045 rec4c (&dat0[0], &dat1[0], 0x10, 16, 1);
1046 mrec4c (&dat0[0], &dat2[0], 0x10, 16, 1);
1049 rec4a (&dat0[0], &dat1[0], 0x10, 16, 1);
1050 mrec4a (&dat0[0], &dat2[0], 0x10, 16, 1);
1053 rec4ac (&dat0[0], &dat1[0], 0x10, 16, 1);
1054 mrec4ac(&dat0[0], &dat2[0], 0x10, 16, 1);
/* 8-byte comparison of the two destinations.
   NOTE(review): the uint8_t arrays are read through uint64_t pointers --
   confirm alignment/aliasing is acceptable on the target. */
1059 if( *(uint64_t *)&dat1[0] != *(uint64_t *)&dat2[0] )
/* dump the first 8 bytes: the "i=" line shows dat1, the "j=" line dat2 */
1062 printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", i,
1063 dat1[0], dat1[1], dat1[2], dat1[3], dat1[4], dat1[5], dat1[6], dat1[7]);
1064 printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", j,
1065 dat2[0], dat2[1], dat2[2], dat2[3], dat2[4], dat2[5], dat2[6], dat2[7]);
/* 16-byte comparison, done as two 8-byte halves */
1070 if( *(uint64_t *)&dat1[0] != *(uint64_t *)&dat2[0] ||
1071 *(uint64_t *)&dat1[8] != *(uint64_t *)&dat2[8] )
/* dump all 16 bytes: the "i=" line shows dat1, the "j=" line dat2 */
1074 printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x", i,
1075 dat1[0], dat1[1], dat1[2], dat1[3], dat1[4], dat1[5], dat1[6], dat1[7]);
1076 printf(" %02x %02x %02x %02x %02x %02x %02x %02x\n", dat1[8],
1077 dat1[9], dat1[10], dat1[11], dat1[12], dat1[13], dat1[14], dat1[15]);
1078 printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x", j,
1079 dat2[0], dat2[1], dat2[2], dat2[3], dat2[4], dat2[5], dat2[6], dat2[7]);
1080 printf(" %02x %02x %02x %02x %02x %02x %02x %02x\n", dat2[8],
1081 dat2[9], dat2[10], dat2[11], dat2[12], dat2[13], dat2[14], dat2[15]);