/* m_(v): treat the 8 bytes at address v as an mmx_t lvalue (the cast goes
 * through long long *).  mmx_t itself is declared outside this listing.
 * NOTE(review): nothing here checks alignment or aliasing; the routines below
 * also pass byte-offset addresses such as sp+1 and sp+9. */
10 #define m_(v) (*(mmx_t*)(((long long *)(v))))
/* 64-bit constants written as two 32-bit halves.  Each 16-bit lane holds 1
 * (sadd1) or 2 (sadd2); the m* routines load them into mm7 (and mm5). */
12 static uint32_t sadd1[2] = { 0x00010001, 0x00010001 };
13 static uint32_t sadd2[2] = { 0x00020002, 0x00020002 };
/* Per-byte constants 0x7f and 0x01.  Neither is referenced anywhere in the
 * visible part of this listing. */
15 static uint32_t bmask[2] = { 0x7f7f7f7f, 0x7f7f7f7f };
16 static uint32_t badd1[2] = { 0x01010101, 0x01010101 };
/* reca: scalar reference, 16 bytes wide, average source into destination.
 * On each of the h loop iterations every dp[i] (i = 0..15) is replaced by
 * (dp[i] + sp[i] + 1) >> 1, i.e. the mean of the two bytes rounded up.
 * NOTE(review): the declarations of dp/sp and any per-row pointer advance are
 * not visible here (embedded numbering jumps 20->23 and 39->44); presumably
 * dp=d and sp=s, both stepped by lx2 per row - confirm against the full file. */
20 static inline void reca(uint8_t *s, uint8_t *d, int lx2, int h)
23 int j; for( j=0; j<h; ++j ) {
24 dp[0] = (uint32_t)(dp[0] + sp[0] + 1) >> 1;
25 dp[1] = (uint32_t)(dp[1] + sp[1] + 1) >> 1;
26 dp[2] = (uint32_t)(dp[2] + sp[2] + 1) >> 1;
27 dp[3] = (uint32_t)(dp[3] + sp[3] + 1) >> 1;
28 dp[4] = (uint32_t)(dp[4] + sp[4] + 1) >> 1;
29 dp[5] = (uint32_t)(dp[5] + sp[5] + 1) >> 1;
30 dp[6] = (uint32_t)(dp[6] + sp[6] + 1) >> 1;
31 dp[7] = (uint32_t)(dp[7] + sp[7] + 1) >> 1;
32 dp[8] = (uint32_t)(dp[8] + sp[8] + 1) >> 1;
33 dp[9] = (uint32_t)(dp[9] + sp[9] + 1) >> 1;
34 dp[10] = (uint32_t)(dp[10] + sp[10] + 1) >> 1;
35 dp[11] = (uint32_t)(dp[11] + sp[11] + 1) >> 1;
36 dp[12] = (uint32_t)(dp[12] + sp[12] + 1) >> 1;
37 dp[13] = (uint32_t)(dp[13] + sp[13] + 1) >> 1;
38 dp[14] = (uint32_t)(dp[14] + sp[14] + 1) >> 1;
39 dp[15] = (uint32_t)(dp[15] + sp[15] + 1) >> 1;
/* mreca: MMX counterpart of the 16-byte source/destination average.
 * Visible steps per loop iteration, done twice (offsets +0 and +8): load
 * 8 bytes of sp into mm1 and 8 bytes of dp into mm3, unpack against mm0 into
 * the pairs mm1/mm2 and mm3/mm4, pack mm2 into mm1 and store mm1 back to dp.
 * mm7 is loaded with sadd1 once before the loop.
 * NOTE(review): this listing omits several source lines (numbering jumps
 * 44->48, 51->54, 57->64, 67->70, 73->80, 81->87).  The setup of mm0, the
 * writes to mm2/mm4, the add/shift between unpack and pack, and the per-row
 * pointer advance are therefore not shown; presumably mm0 is zero and mm2/mm4
 * are copies of mm1/mm3 - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
44 static inline void mreca(uint8_t *s, uint8_t *d, int lx2, int h)
48 movq_m2r(m_(sadd1),mm7);
49 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 */
50 movq_m2r(m_(sp+0),mm1);
51 movq_m2r(m_(dp+0),mm3);
54 punpcklbw_r2r(mm0,mm1);
55 punpckhbw_r2r(mm0,mm2);
56 punpcklbw_r2r(mm0,mm3);
57 punpckhbw_r2r(mm0,mm4);
64 packuswb_r2r(mm2,mm1);
65 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
66 movq_m2r(m_(sp+8),mm1);
67 movq_m2r(m_(dp+8),mm3);
70 punpcklbw_r2r(mm0,mm1);
71 punpckhbw_r2r(mm0,mm2);
72 punpcklbw_r2r(mm0,mm3);
73 punpckhbw_r2r(mm0,mm4);
80 packuswb_r2r(mm2,mm1);
81 movq_r2m(mm1,m_(dp+8));
/* recac: scalar reference, 8 bytes wide, average source into destination.
 * On each of the h loop iterations every dp[i] (i = 0..7) is replaced by
 * (dp[i] + sp[i] + 1) >> 1.
 * NOTE(review): the dp/sp declarations and any per-row pointer advance are not
 * visible (numbering jumps 87->90 and 98->103); presumably dp=d, sp=s, both
 * stepped by lx2 - confirm against the full file. */
87 static inline void recac(uint8_t *s, uint8_t *d, int lx2, int h)
90 int j; for( j=0; j<h; ++j ) {
91 dp[0] = (uint32_t)(dp[0] + sp[0] + 1)>>1;
92 dp[1] = (uint32_t)(dp[1] + sp[1] + 1)>>1;
93 dp[2] = (uint32_t)(dp[2] + sp[2] + 1)>>1;
94 dp[3] = (uint32_t)(dp[3] + sp[3] + 1)>>1;
95 dp[4] = (uint32_t)(dp[4] + sp[4] + 1)>>1;
96 dp[5] = (uint32_t)(dp[5] + sp[5] + 1)>>1;
97 dp[6] = (uint32_t)(dp[6] + sp[6] + 1)>>1;
98 dp[7] = (uint32_t)(dp[7] + sp[7] + 1)>>1;
/* mrecac: MMX counterpart of the 8-byte source/destination average.
 * Per loop iteration: load 8 bytes of sp into mm1 and 8 bytes of dp into mm3,
 * unpack against mm0, add the dp halves (mm3/mm4) and the constant in mm7
 * (sadd1) into mm1/mm2, pack mm2 into mm1, store mm1 to dp, then advance both
 * pointers by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 105->107, 110->113,
 * 120->123).  The setup of mm0, the writes to mm2/mm4 and the shift that would
 * turn the sum into an average are not shown; presumably mm0 is zero and
 * mm2/mm4 are copies of mm1/mm3 - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
103 static inline void mrecac(uint8_t *s, uint8_t *d, int lx2, int h)
105 uint8_t *dp=d, *sp=s;
107 movq_m2r(m_(sadd1),mm7);
108 int j; for( j=0; j<h; ++j ) {
109 movq_m2r(m_(sp+0),mm1);
110 movq_m2r(m_(dp+0),mm3);
113 punpcklbw_r2r(mm0,mm1);
114 punpckhbw_r2r(mm0,mm2);
115 punpcklbw_r2r(mm0,mm3);
116 punpckhbw_r2r(mm0,mm4);
/* source + destination, then + mm7 */
117 paddusw_r2r(mm3,mm1);
118 paddusw_r2r(mm4,mm2);
119 paddusw_r2r(mm7,mm1);
120 paddusw_r2r(mm7,mm2);
123 packuswb_r2r(mm2,mm1);
124 movq_r2m(mm1,m_(dp+0));
125 sp += lx2; dp += lx2;
/* recv: scalar reference, 16 bytes wide, average of two source rows.
 * sp starts at s and sp2 at s+lx.  On each of the h loop iterations every
 * dp[i] (i = 0..15) is set to (sp[i] + sp2[i] + 1) >> 1, after which sp, sp2
 * and dp all advance by lx2.  The previous contents of dp are not read. */
130 static inline void recv(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
132 uint8_t *dp=d, *sp=s, *sp2=s+lx;
133 int j; for( j=0; j<h; ++j ) {
134 dp[0] = (uint32_t)(sp[0] + sp2[0] + 1) >> 1;
135 dp[1] = (uint32_t)(sp[1] + sp2[1] + 1) >> 1;
136 dp[2] = (uint32_t)(sp[2] + sp2[2] + 1) >> 1;
137 dp[3] = (uint32_t)(sp[3] + sp2[3] + 1) >> 1;
138 dp[4] = (uint32_t)(sp[4] + sp2[4] + 1) >> 1;
139 dp[5] = (uint32_t)(sp[5] + sp2[5] + 1) >> 1;
140 dp[6] = (uint32_t)(sp[6] + sp2[6] + 1) >> 1;
141 dp[7] = (uint32_t)(sp[7] + sp2[7] + 1) >> 1;
142 dp[8] = (uint32_t)(sp[8] + sp2[8] + 1) >> 1;
143 dp[9] = (uint32_t)(sp[9] + sp2[9] + 1) >> 1;
144 dp[10] = (uint32_t)(sp[10] + sp2[10] + 1) >> 1;
145 dp[11] = (uint32_t)(sp[11] + sp2[11] + 1) >> 1;
146 dp[12] = (uint32_t)(sp[12] + sp2[12] + 1) >> 1;
147 dp[13] = (uint32_t)(sp[13] + sp2[13] + 1) >> 1;
148 dp[14] = (uint32_t)(sp[14] + sp2[14] + 1) >> 1;
149 dp[15] = (uint32_t)(sp[15] + sp2[15] + 1) >> 1;
150 sp += lx2; sp2 += lx2 ; dp += lx2;
/* mrecv: MMX counterpart of the 16-byte two-row average.
 * sp starts at s and sp2 at s+lx.  Per loop iteration, done twice (offsets +0
 * and +8): load 8 bytes of sp into mm1 and 8 bytes of sp2 into mm3, unpack
 * against mm0, add mm3/mm4 and the constant in mm7 (sadd1) into mm1/mm2, pack
 * mm2 into mm1 and store mm1 to dp.  All three pointers then advance by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 156->158, 161->164,
 * 171->174, 177->180, 187->190).  The setup of mm0, the writes to mm2/mm4 and
 * the shift that would finish the average are not shown; presumably mm0 is
 * zero and mm2/mm4 are copies of mm1/mm3 - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
154 static inline void mrecv(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
156 uint8_t *dp=d, *sp=s, *sp2=s+lx;
158 movq_m2r(m_(sadd1),mm7);
159 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 */
160 movq_m2r(m_(sp +0),mm1);
161 movq_m2r(m_(sp2+0),mm3);
164 punpcklbw_r2r(mm0,mm1);
165 punpckhbw_r2r(mm0,mm2);
166 punpcklbw_r2r(mm0,mm3);
167 punpckhbw_r2r(mm0,mm4);
168 paddusw_r2r(mm3,mm1);
169 paddusw_r2r(mm4,mm2);
170 paddusw_r2r(mm7,mm1);
171 paddusw_r2r(mm7,mm2);
174 packuswb_r2r(mm2,mm1);
175 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
176 movq_m2r(m_(sp +8),mm1);
177 movq_m2r(m_(sp2+8),mm3);
180 punpcklbw_r2r(mm0,mm1);
181 punpckhbw_r2r(mm0,mm2);
182 punpcklbw_r2r(mm0,mm3);
183 punpckhbw_r2r(mm0,mm4);
184 paddusw_r2r(mm3,mm1);
185 paddusw_r2r(mm4,mm2);
186 paddusw_r2r(mm7,mm1);
187 paddusw_r2r(mm7,mm2);
190 packuswb_r2r(mm2,mm1);
191 movq_r2m(mm1,m_(dp+8));
192 sp += lx2; sp2 += lx2 ; dp += lx2;
/* recvc: scalar reference, 8 bytes wide, average of two source rows.
 * sp starts at s and sp2 at s+lx.  On each of the h loop iterations every
 * dp[i] (i = 0..7) is set to (sp[i] + sp2[i] + 1) >> 1, after which sp, sp2
 * and dp all advance by lx2.  The previous contents of dp are not read. */
197 static inline void recvc(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
199 uint8_t *dp=d, *sp=s, *sp2=s+lx;
200 int j; for( j=0; j<h; ++j ) {
201 dp[0] = (uint32_t)(sp[0]+sp2[0]+1)>>1;
202 dp[1] = (uint32_t)(sp[1]+sp2[1]+1)>>1;
203 dp[2] = (uint32_t)(sp[2]+sp2[2]+1)>>1;
204 dp[3] = (uint32_t)(sp[3]+sp2[3]+1)>>1;
205 dp[4] = (uint32_t)(sp[4]+sp2[4]+1)>>1;
206 dp[5] = (uint32_t)(sp[5]+sp2[5]+1)>>1;
207 dp[6] = (uint32_t)(sp[6]+sp2[6]+1)>>1;
208 dp[7] = (uint32_t)(sp[7]+sp2[7]+1)>>1;
209 sp += lx2; sp2 += lx2; dp += lx2;
/* mrecvc: MMX counterpart of the 8-byte two-row average.
 * sp starts at s and sp2 at s+lx.  Per loop iteration: load 8 bytes of sp into
 * mm1 and 8 bytes of sp2 into mm3, unpack against mm0, add mm3/mm4 and the
 * constant in mm7 (sadd1) into mm1/mm2, pack mm2 into mm1, store mm1 to dp,
 * then advance all three pointers by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 215->217, 220->223,
 * 230->233).  The setup of mm0, the writes to mm2/mm4 and the shift that would
 * finish the average are not shown; presumably mm0 is zero and mm2/mm4 are
 * copies of mm1/mm3 - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
213 static inline void mrecvc(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
215 uint8_t *dp=d, *sp=s, *sp2=s+lx;
217 movq_m2r(m_(sadd1),mm7);
218 int j; for( j=0; j<h; ++j ) {
219 movq_m2r(m_(sp +0),mm1);
220 movq_m2r(m_(sp2+0),mm3);
223 punpcklbw_r2r(mm0,mm1);
224 punpckhbw_r2r(mm0,mm2);
225 punpcklbw_r2r(mm0,mm3);
226 punpckhbw_r2r(mm0,mm4);
227 paddusw_r2r(mm3,mm1);
228 paddusw_r2r(mm4,mm2);
229 paddusw_r2r(mm7,mm1);
230 paddusw_r2r(mm7,mm2);
233 packuswb_r2r(mm2,mm1);
234 movq_r2m(mm1,m_(dp+0));
235 sp += lx2; sp2 += lx2; dp += lx2;
/* recva: scalar reference, 16 bytes wide, two-row average blended into dp.
 * sp starts at s and sp2 at s+lx.  On each of the h loop iterations, for
 * i = 0..15, the inner term (sp[i] + sp2[i] + 1) >> 1 is computed first and
 * then averaged with the existing dp[i], again with +1 before the shift.
 * Rounding is therefore applied twice.  sp, sp2 and dp advance by lx2 per row. */
241 static inline void recva(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
243 uint8_t *dp=d, *sp=s, *sp2=s+lx;
244 int j; for( j=0; j<h; ++j ) {
245 dp[0] = (dp[0] + ((uint32_t)(sp[0]+sp2[0]+1)>>1) + 1)>>1;
246 dp[1] = (dp[1] + ((uint32_t)(sp[1]+sp2[1]+1)>>1) + 1)>>1;
247 dp[2] = (dp[2] + ((uint32_t)(sp[2]+sp2[2]+1)>>1) + 1)>>1;
248 dp[3] = (dp[3] + ((uint32_t)(sp[3]+sp2[3]+1)>>1) + 1)>>1;
249 dp[4] = (dp[4] + ((uint32_t)(sp[4]+sp2[4]+1)>>1) + 1)>>1;
250 dp[5] = (dp[5] + ((uint32_t)(sp[5]+sp2[5]+1)>>1) + 1)>>1;
251 dp[6] = (dp[6] + ((uint32_t)(sp[6]+sp2[6]+1)>>1) + 1)>>1;
252 dp[7] = (dp[7] + ((uint32_t)(sp[7]+sp2[7]+1)>>1) + 1)>>1;
253 dp[8] = (dp[8] + ((uint32_t)(sp[8]+sp2[8]+1)>>1) + 1)>>1;
254 dp[9] = (dp[9] + ((uint32_t)(sp[9]+sp2[9]+1)>>1) + 1)>>1;
255 dp[10] = (dp[10] + ((uint32_t)(sp[10]+sp2[10]+1)>>1) + 1)>>1;
256 dp[11] = (dp[11] + ((uint32_t)(sp[11]+sp2[11]+1)>>1) + 1)>>1;
257 dp[12] = (dp[12] + ((uint32_t)(sp[12]+sp2[12]+1)>>1) + 1)>>1;
258 dp[13] = (dp[13] + ((uint32_t)(sp[13]+sp2[13]+1)>>1) + 1)>>1;
259 dp[14] = (dp[14] + ((uint32_t)(sp[14]+sp2[14]+1)>>1) + 1)>>1;
260 dp[15] = (dp[15] + ((uint32_t)(sp[15]+sp2[15]+1)>>1) + 1)>>1;
261 sp += lx2; sp2 += lx2; dp += lx2;
/* mrecva: MMX counterpart of the 16-byte two-row average blended into dp.
 * sp starts at s and sp2 at s+lx.  Per loop iteration, done twice (offsets +0
 * and +8): load 8 bytes each of sp, sp2 and dp into mm1, mm3 and mm5; unpack
 * against mm0; add the sp2 halves and mm7 (sadd1) into mm1/mm2; add the dp
 * halves (mm5/mm6) and mm7 again; pack mm2 into mm1 and store mm1 to dp.
 * All three pointers then advance by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 267->269, 273->277,
 * 286->289, 292->295 and the matching places in the second half).  The setup
 * of mm0, the writes to mm2/mm4/mm6 and the shifts after each add stage are
 * not shown; presumably mm0 is zero and mm2/mm4/mm6 are copies of mm1/mm3/mm5
 * - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
265 static inline void mrecva(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
267 uint8_t *dp=d, *sp=s, *sp2=s+lx;
269 movq_m2r(m_(sadd1),mm7);
270 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 */
271 movq_m2r(m_(sp +0),mm1);
272 movq_m2r(m_(sp2+0),mm3);
273 movq_m2r(m_(dp +0),mm5);
277 punpcklbw_r2r(mm0,mm1);
278 punpckhbw_r2r(mm0,mm2);
279 punpcklbw_r2r(mm0,mm3);
280 punpckhbw_r2r(mm0,mm4);
281 punpcklbw_r2r(mm0,mm5);
282 punpckhbw_r2r(mm0,mm6);
/* stage 1: sp + sp2 + mm7 */
283 paddusw_r2r(mm3,mm1);
284 paddusw_r2r(mm4,mm2);
285 paddusw_r2r(mm7,mm1);
286 paddusw_r2r(mm7,mm2);
/* stage 2: + dp + mm7 */
289 paddusw_r2r(mm5,mm1);
290 paddusw_r2r(mm6,mm2);
291 paddusw_r2r(mm7,mm1);
292 paddusw_r2r(mm7,mm2);
295 packuswb_r2r(mm2,mm1);
296 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
297 movq_m2r(m_(sp +8),mm1);
298 movq_m2r(m_(sp2+8),mm3);
299 movq_m2r(m_(dp +8),mm5);
303 punpcklbw_r2r(mm0,mm1);
304 punpckhbw_r2r(mm0,mm2);
305 punpcklbw_r2r(mm0,mm3);
306 punpckhbw_r2r(mm0,mm4);
307 punpcklbw_r2r(mm0,mm5);
308 punpckhbw_r2r(mm0,mm6);
309 paddusw_r2r(mm3,mm1);
310 paddusw_r2r(mm4,mm2);
311 paddusw_r2r(mm7,mm1);
312 paddusw_r2r(mm7,mm2);
315 paddusw_r2r(mm5,mm1);
316 paddusw_r2r(mm6,mm2);
317 paddusw_r2r(mm7,mm1);
318 paddusw_r2r(mm7,mm2);
321 packuswb_r2r(mm2,mm1);
322 movq_r2m(mm1,m_(dp+8));
323 sp += lx2; sp2 += lx2; dp += lx2;
/* recvac: scalar reference, 8 bytes wide, two-row average blended into dp.
 * sp starts at s and sp2 at s+lx.  On each of the h loop iterations, for
 * i = 0..7, the inner term (sp[i] + sp2[i] + 1) >> 1 is computed first and
 * then averaged with the existing dp[i], again with +1 before the shift.
 * sp, sp2 and dp advance by lx2 per row. */
329 static inline void recvac(uint8_t *s, uint8_t *d, int lx,int lx2, int h)
331 uint8_t *dp=d, *sp=s, *sp2=s+lx;
332 int j; for( j=0; j<h; ++j ) {
333 dp[0] = (dp[0] + ((uint32_t)(sp[0]+sp2[0]+1)>>1) + 1)>>1;
334 dp[1] = (dp[1] + ((uint32_t)(sp[1]+sp2[1]+1)>>1) + 1)>>1;
335 dp[2] = (dp[2] + ((uint32_t)(sp[2]+sp2[2]+1)>>1) + 1)>>1;
336 dp[3] = (dp[3] + ((uint32_t)(sp[3]+sp2[3]+1)>>1) + 1)>>1;
337 dp[4] = (dp[4] + ((uint32_t)(sp[4]+sp2[4]+1)>>1) + 1)>>1;
338 dp[5] = (dp[5] + ((uint32_t)(sp[5]+sp2[5]+1)>>1) + 1)>>1;
339 dp[6] = (dp[6] + ((uint32_t)(sp[6]+sp2[6]+1)>>1) + 1)>>1;
340 dp[7] = (dp[7] + ((uint32_t)(sp[7]+sp2[7]+1)>>1) + 1)>>1;
341 sp += lx2; sp2 += lx2; dp+= lx2;
/* mrecvac: MMX counterpart of the 8-byte two-row average blended into dp.
 * sp starts at s and sp2 at s+lx.  Per loop iteration: load 8 bytes each of
 * sp, sp2 and dp into mm1, mm3 and mm5; unpack against mm0; add the sp2 halves
 * and mm7 (sadd1) into mm1/mm2; add the dp halves (mm5/mm6) and mm7 again;
 * pack mm2 into mm1, store mm1 to dp and advance all three pointers by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 347->349, 353->357,
 * 366->369, 372->375).  The setup of mm0, the writes to mm2/mm4/mm6 and the
 * shifts after each add stage are not shown; presumably mm0 is zero and
 * mm2/mm4/mm6 are copies of mm1/mm3/mm5 - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
345 static inline void mrecvac(uint8_t *s, uint8_t *d, int lx,int lx2, int h)
347 uint8_t *dp=d, *sp=s, *sp2=s+lx;
349 movq_m2r(m_(sadd1),mm7);
350 int j; for( j=0; j<h; ++j ) {
351 movq_m2r(m_(sp +0),mm1);
352 movq_m2r(m_(sp2+0),mm3);
353 movq_m2r(m_(dp +0),mm5);
357 punpcklbw_r2r(mm0,mm1);
358 punpckhbw_r2r(mm0,mm2);
359 punpcklbw_r2r(mm0,mm3);
360 punpckhbw_r2r(mm0,mm4);
361 punpcklbw_r2r(mm0,mm5);
362 punpckhbw_r2r(mm0,mm6);
/* stage 1: sp + sp2 + mm7 */
363 paddusw_r2r(mm3,mm1);
364 paddusw_r2r(mm4,mm2);
365 paddusw_r2r(mm7,mm1);
366 paddusw_r2r(mm7,mm2);
/* stage 2: + dp + mm7 */
369 paddusw_r2r(mm5,mm1);
370 paddusw_r2r(mm6,mm2);
371 paddusw_r2r(mm7,mm1);
372 paddusw_r2r(mm7,mm2);
375 packuswb_r2r(mm2,mm1);
376 movq_r2m(mm1,m_(dp+0));
377 sp += lx2; sp2 += lx2; dp += lx2;
/* rech: scalar reference, 16 bytes wide, average of horizontally adjacent
 * source bytes.  Each dp[i] is set to (previous byte + sp[i+1] + 1) >> 1.
 * s1 and s2 alternate as the carrier of the byte read by the previous
 * statement, so every sp[i+1] is loaded once.  The last statement reads
 * sp[16], so 17 source bytes are touched per row.  sp and dp advance by lx2.
 * NOTE(review): the declaration of s1/s2 and the value s1 holds before the
 * first statement are not visible (numbering jumps 385->387->389); presumably
 * s1 = sp[0] at the top of each row - confirm against the full file. */
383 static inline void rech(uint8_t *s, uint8_t *d, int lx2, int h)
385 uint8_t *dp=d, *sp=s;
387 int j; for( j=0; j<h; ++j ) {
389 dp[0] = (uint32_t)(s1+(s2=sp[1])+1)>>1;
390 dp[1] = (uint32_t)(s2+(s1=sp[2])+1)>>1;
391 dp[2] = (uint32_t)(s1+(s2=sp[3])+1)>>1;
392 dp[3] = (uint32_t)(s2+(s1=sp[4])+1)>>1;
393 dp[4] = (uint32_t)(s1+(s2=sp[5])+1)>>1;
394 dp[5] = (uint32_t)(s2+(s1=sp[6])+1)>>1;
395 dp[6] = (uint32_t)(s1+(s2=sp[7])+1)>>1;
396 dp[7] = (uint32_t)(s2+(s1=sp[8])+1)>>1;
397 dp[8] = (uint32_t)(s1+(s2=sp[9])+1)>>1;
398 dp[9] = (uint32_t)(s2+(s1=sp[10])+1)>>1;
399 dp[10] = (uint32_t)(s1+(s2=sp[11])+1)>>1;
400 dp[11] = (uint32_t)(s2+(s1=sp[12])+1)>>1;
401 dp[12] = (uint32_t)(s1+(s2=sp[13])+1)>>1;
402 dp[13] = (uint32_t)(s2+(s1=sp[14])+1)>>1;
403 dp[14] = (uint32_t)(s1+(s2=sp[15])+1)>>1;
404 dp[15] = (uint32_t)(s2+sp[16]+1)>>1;
405 sp += lx2; dp += lx2;
/* mrech: MMX counterpart of the 16-byte horizontal-neighbour average.
 * Per loop iteration, done twice (offsets +0 and +8): load 8 bytes at sp+k
 * into mm1 and 8 bytes at sp+k+1 into mm3, unpack against mm0, add mm3/mm4 and
 * the constant in mm7 (sadd1) into mm1/mm2, pack mm2 into mm1 and store mm1 to
 * dp+k.  The load at sp+9 covers sp[9..16].  sp and dp advance by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 411->413, 416->419,
 * 426->429, 432->435, 442->445).  The setup of mm0, the writes to mm2/mm4 and
 * the shift that would finish the average are not shown; presumably mm0 is
 * zero and mm2/mm4 are copies of mm1/mm3 - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
409 static inline void mrech(uint8_t *s, uint8_t *d, int lx2, int h)
411 uint8_t *dp=d, *sp=s;
413 movq_m2r(m_(sadd1),mm7);
414 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 */
415 movq_m2r(m_(sp+0),mm1);
416 movq_m2r(m_(sp+1),mm3);
419 punpcklbw_r2r(mm0,mm1);
420 punpckhbw_r2r(mm0,mm2);
421 punpcklbw_r2r(mm0,mm3);
422 punpckhbw_r2r(mm0,mm4);
423 paddusw_r2r(mm3,mm1);
424 paddusw_r2r(mm4,mm2);
425 paddusw_r2r(mm7,mm1);
426 paddusw_r2r(mm7,mm2);
429 packuswb_r2r(mm2,mm1);
430 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
431 movq_m2r(m_(sp+8),mm1);
432 movq_m2r(m_(sp+9),mm3);
435 punpcklbw_r2r(mm0,mm1);
436 punpckhbw_r2r(mm0,mm2);
437 punpcklbw_r2r(mm0,mm3);
438 punpckhbw_r2r(mm0,mm4);
439 paddusw_r2r(mm3,mm1);
440 paddusw_r2r(mm4,mm2);
441 paddusw_r2r(mm7,mm1);
442 paddusw_r2r(mm7,mm2);
445 packuswb_r2r(mm2,mm1);
446 movq_r2m(mm1,m_(dp+8));
447 sp += lx2; dp += lx2;
/* rechc: scalar reference, 8 bytes wide, average of horizontally adjacent
 * source bytes.  Each dp[i] is set to (previous byte + sp[i+1] + 1) >> 1, with
 * s1/s2 alternately carrying the byte read by the previous statement.  The
 * last statement reads sp[8], so 9 source bytes are touched per row.  sp and
 * dp advance by lx2.
 * NOTE(review): the declaration of s1/s2 and the value s1 holds before the
 * first statement are not visible (numbering jumps 455->457->459); presumably
 * s1 = sp[0] at the top of each row - confirm against the full file. */
453 static inline void rechc(uint8_t *s, uint8_t *d, int lx2, int h)
455 uint8_t *dp=d, *sp=s;
457 int j; for( j=0; j<h; ++j ) {
459 dp[0] = (uint32_t)(s1+(s2=sp[1])+1)>>1;
460 dp[1] = (uint32_t)(s2+(s1=sp[2])+1)>>1;
461 dp[2] = (uint32_t)(s1+(s2=sp[3])+1)>>1;
462 dp[3] = (uint32_t)(s2+(s1=sp[4])+1)>>1;
463 dp[4] = (uint32_t)(s1+(s2=sp[5])+1)>>1;
464 dp[5] = (uint32_t)(s2+(s1=sp[6])+1)>>1;
465 dp[6] = (uint32_t)(s1+(s2=sp[7])+1)>>1;
466 dp[7] = (uint32_t)(s2+sp[8]+1)>>1;
467 sp += lx2; dp += lx2;
/* mrechc: MMX counterpart of the 8-byte horizontal-neighbour average.
 * Per loop iteration: load 8 bytes at sp into mm1 and 8 bytes at sp+1 into
 * mm3, unpack against mm0, add mm3/mm4 and the constant in mm7 (sadd1) into
 * mm1/mm2, pack mm2 into mm1, store mm1 to dp, then advance sp and dp by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 473->475, 478->481,
 * 488->491).  The setup of mm0, the writes to mm2/mm4 and the shift that would
 * finish the average are not shown; presumably mm0 is zero and mm2/mm4 are
 * copies of mm1/mm3 - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
471 static inline void mrechc(uint8_t *s, uint8_t *d, int lx2, int h)
473 uint8_t *dp=d, *sp=s;
475 movq_m2r(m_(sadd1),mm7);
476 int j; for( j=0; j<h; ++j ) {
477 movq_m2r(m_(sp+0),mm1);
478 movq_m2r(m_(sp+1),mm3);
481 punpcklbw_r2r(mm0,mm1);
482 punpckhbw_r2r(mm0,mm2);
483 punpcklbw_r2r(mm0,mm3);
484 punpckhbw_r2r(mm0,mm4);
485 paddusw_r2r(mm3,mm1);
486 paddusw_r2r(mm4,mm2);
487 paddusw_r2r(mm7,mm1);
488 paddusw_r2r(mm7,mm2);
491 packuswb_r2r(mm2,mm1);
492 movq_r2m(mm1,m_(dp+0));
493 sp += lx2; dp += lx2;
/* recha: scalar reference, 16 bytes wide, horizontal-neighbour average blended
 * into dp.  For each i the inner term (previous byte + sp[i+1] + 1) >> 1 is
 * computed first and then averaged with the existing dp[i], again with +1
 * before the shift.  s1/s2 alternately carry the byte read by the previous
 * statement.  The last statement reads sp[16], so 17 source bytes are touched
 * per row.  sp and dp advance by lx2.
 * NOTE(review): the declaration of s1/s2 and the value s1 holds before the
 * first statement are not visible (numbering jumps 500->502->504); presumably
 * s1 = sp[0] at the top of each row - confirm against the full file. */
498 static inline void recha(uint8_t *s, uint8_t *d,int lx2, int h)
500 uint8_t *dp=d, *sp=s;
502 int j; for( j=0; j<h; ++j ) {
504 dp[0] = (dp[0] + ((uint32_t)(s1 + (s2 = sp[1]) + 1) >> 1) + 1) >> 1;
505 dp[1] = (dp[1] + ((uint32_t)(s2 + (s1 = sp[2]) + 1) >> 1) + 1) >> 1;
506 dp[2] = (dp[2] + ((uint32_t)(s1 + (s2 = sp[3]) + 1) >> 1) + 1) >> 1;
507 dp[3] = (dp[3] + ((uint32_t)(s2 + (s1 = sp[4]) + 1) >> 1) + 1) >> 1;
508 dp[4] = (dp[4] + ((uint32_t)(s1 + (s2 = sp[5]) + 1) >> 1) + 1) >> 1;
509 dp[5] = (dp[5] + ((uint32_t)(s2 + (s1 = sp[6]) + 1) >> 1) + 1) >> 1;
510 dp[6] = (dp[6] + ((uint32_t)(s1 + (s2 = sp[7]) + 1) >> 1) + 1) >> 1;
511 dp[7] = (dp[7] + ((uint32_t)(s2 + (s1 = sp[8]) + 1) >> 1) + 1) >> 1;
512 dp[8] = (dp[8] + ((uint32_t)(s1 + (s2 = sp[9]) + 1) >> 1) + 1) >> 1;
513 dp[9] = (dp[9] + ((uint32_t)(s2 + (s1 = sp[10]) + 1) >> 1) + 1) >> 1;
514 dp[10] = (dp[10] + ((uint32_t)(s1 + (s2 = sp[11]) + 1) >> 1) + 1) >> 1;
515 dp[11] = (dp[11] + ((uint32_t)(s2 + (s1 = sp[12]) + 1) >> 1) + 1) >> 1;
516 dp[12] = (dp[12] + ((uint32_t)(s1 + (s2 = sp[13]) + 1) >> 1) + 1) >> 1;
517 dp[13] = (dp[13] + ((uint32_t)(s2 + (s1 = sp[14]) + 1) >> 1) + 1) >> 1;
518 dp[14] = (dp[14] + ((uint32_t)(s1 + (s2 = sp[15]) + 1) >> 1) + 1) >> 1;
519 dp[15] = (dp[15] + ((uint32_t)(s2 + sp[16] + 1) >> 1) + 1) >> 1;
520 sp += lx2; dp += lx2;
/* mrecha: MMX counterpart of the 16-byte horizontal-neighbour average blended
 * into dp.  Per loop iteration, done twice (offsets +0 and +8): load 8 bytes
 * at sp+k, sp+k+1 and dp+k into mm1, mm3 and mm5; unpack against mm0; add the
 * neighbour halves and mm7 (sadd1) into mm1/mm2; add the dp halves (mm5/mm6)
 * and mm7 again; pack mm2 into mm1 and store mm1 to dp+k.  The load at sp+9
 * covers sp[9..16].  sp and dp advance by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 526->528, 532->536,
 * 545->548, 551->554 and the matching places in the second half).  The setup
 * of mm0, the writes to mm2/mm4/mm6 and the shifts after each add stage are
 * not shown; presumably mm0 is zero and mm2/mm4/mm6 are copies of mm1/mm3/mm5
 * - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
524 static inline void mrecha(uint8_t *s, uint8_t *d,int lx2, int h)
526 uint8_t *dp=d, *sp=s;
528 movq_m2r(m_(sadd1),mm7);
529 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 */
530 movq_m2r(m_(sp+0),mm1);
531 movq_m2r(m_(sp+1),mm3);
532 movq_m2r(m_(dp+0),mm5);
536 punpcklbw_r2r(mm0,mm1);
537 punpckhbw_r2r(mm0,mm2);
538 punpcklbw_r2r(mm0,mm3);
539 punpckhbw_r2r(mm0,mm4);
540 punpcklbw_r2r(mm0,mm5);
541 punpckhbw_r2r(mm0,mm6);
/* stage 1: sp[i] + sp[i+1] + mm7 */
542 paddusw_r2r(mm3,mm1);
543 paddusw_r2r(mm4,mm2);
544 paddusw_r2r(mm7,mm1);
545 paddusw_r2r(mm7,mm2);
/* stage 2: + dp + mm7 */
548 paddusw_r2r(mm5,mm1);
549 paddusw_r2r(mm6,mm2);
550 paddusw_r2r(mm7,mm1);
551 paddusw_r2r(mm7,mm2);
554 packuswb_r2r(mm2,mm1);
555 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
556 movq_m2r(m_(sp+8),mm1);
557 movq_m2r(m_(sp+9),mm3);
558 movq_m2r(m_(dp+8),mm5);
562 punpcklbw_r2r(mm0,mm1);
563 punpckhbw_r2r(mm0,mm2);
564 punpcklbw_r2r(mm0,mm3);
565 punpckhbw_r2r(mm0,mm4);
566 punpcklbw_r2r(mm0,mm5);
567 punpckhbw_r2r(mm0,mm6);
568 paddusw_r2r(mm3,mm1);
569 paddusw_r2r(mm4,mm2);
570 paddusw_r2r(mm7,mm1);
571 paddusw_r2r(mm7,mm2);
574 paddusw_r2r(mm5,mm1);
575 paddusw_r2r(mm6,mm2);
576 paddusw_r2r(mm7,mm1);
577 paddusw_r2r(mm7,mm2);
580 packuswb_r2r(mm2,mm1);
581 movq_r2m(mm1,m_(dp+8));
582 sp += lx2; dp += lx2;
/* rechac: scalar reference, 8 bytes wide, horizontal-neighbour average blended
 * into dp.  For each i the inner term (previous byte + sp[i+1] + 1) >> 1 is
 * computed first and then averaged with the existing dp[i], again with +1
 * before the shift.  The last statement reads sp[8], so 9 source bytes are
 * touched per row.  sp and dp advance by lx2.
 * NOTE(review): the declaration of s1/s2 and the value s1 holds before the
 * first statement are not visible (numbering jumps 590->592->594); presumably
 * s1 = sp[0] at the top of each row - confirm against the full file. */
588 static inline void rechac(uint8_t *s, uint8_t *d, int lx2, int h)
590 uint8_t *dp=d, *sp=s;
592 int j; for( j=0; j<h; ++j ) {
594 dp[0] = (dp[0] + ((uint32_t)(s1 + (s2 = sp[1]) + 1) >> 1) + 1) >> 1;
595 dp[1] = (dp[1] + ((uint32_t)(s2 + (s1 = sp[2]) + 1) >> 1) + 1) >> 1;
596 dp[2] = (dp[2] + ((uint32_t)(s1 + (s2 = sp[3]) + 1) >> 1) + 1) >> 1;
597 dp[3] = (dp[3] + ((uint32_t)(s2 + (s1 = sp[4]) + 1) >> 1) + 1) >> 1;
598 dp[4] = (dp[4] + ((uint32_t)(s1 + (s2 = sp[5]) + 1) >> 1) + 1) >> 1;
599 dp[5] = (dp[5] + ((uint32_t)(s2 + (s1 = sp[6]) + 1) >> 1) + 1) >> 1;
600 dp[6] = (dp[6] + ((uint32_t)(s1 + (s2 = sp[7]) + 1) >> 1) + 1) >> 1;
601 dp[7] = (dp[7] + ((uint32_t)(s2 + sp[8] + 1) >> 1) + 1) >> 1;
602 sp += lx2; dp += lx2;
/* mrechac: MMX counterpart of the 8-byte horizontal-neighbour average blended
 * into dp.  Per loop iteration: load 8 bytes at sp, sp+1 and dp into mm1, mm3
 * and mm5; unpack against mm0; add the neighbour halves and mm7 (sadd1) into
 * mm1/mm2; add the dp halves (mm5/mm6) and mm7 again; pack mm2 into mm1,
 * store mm1 to dp and advance sp and dp by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 608->610, 614->618,
 * 627->630, 633->636).  The setup of mm0, the writes to mm2/mm4/mm6 and the
 * shifts after each add stage are not shown; presumably mm0 is zero and
 * mm2/mm4/mm6 are copies of mm1/mm3/mm5 - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
606 static inline void mrechac(uint8_t *s, uint8_t *d, int lx2, int h)
608 uint8_t *dp=d, *sp=s;
610 movq_m2r(m_(sadd1),mm7);
611 int j; for( j=0; j<h; ++j ) {
612 movq_m2r(m_(sp+0),mm1);
613 movq_m2r(m_(sp+1),mm3);
614 movq_m2r(m_(dp+0),mm5);
618 punpcklbw_r2r(mm0,mm1);
619 punpckhbw_r2r(mm0,mm2);
620 punpcklbw_r2r(mm0,mm3);
621 punpckhbw_r2r(mm0,mm4);
622 punpcklbw_r2r(mm0,mm5);
623 punpckhbw_r2r(mm0,mm6);
/* stage 1: sp[i] + sp[i+1] + mm7 */
624 paddusw_r2r(mm3,mm1);
625 paddusw_r2r(mm4,mm2);
626 paddusw_r2r(mm7,mm1);
627 paddusw_r2r(mm7,mm2);
/* stage 2: + dp + mm7 */
630 paddusw_r2r(mm5,mm1);
631 paddusw_r2r(mm6,mm2);
632 paddusw_r2r(mm7,mm1);
633 paddusw_r2r(mm7,mm2);
636 packuswb_r2r(mm2,mm1);
637 movq_r2m(mm1,m_(dp+0));
638 sp += lx2; dp += lx2;
/* rec4: scalar reference, 16 bytes wide, four-neighbour average.
 * sp starts at s and sp2 at s+lx.  Each dp[i] is set to
 * (top-left + sp[i+1] + bottom-left + sp2[i+1] + 2) >> 2, where s1/s2 carry
 * the previously read byte of the sp row and s3/s4 that of the sp2 row.  The
 * last statement reads sp[16] and sp2[16], so 17 bytes of each source row are
 * touched.  sp, sp2 and dp advance by lx2 per row.
 * NOTE(review): the values of s1 and s3 before the first statement are set on
 * a line not visible here (numbering jumps 648->650); presumably s1 = sp[0]
 * and s3 = sp2[0] - confirm against the full file. */
644 static inline void rec4(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
646 uint8_t *dp=d, *sp=s, *sp2=s+lx;
647 uint32_t s1, s2, s3, s4;
648 int j; for( j=0; j<h; ++j ) {
650 dp[0] = (uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2;
651 dp[1] = (uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2;
652 dp[2] = (uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2;
653 dp[3] = (uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2;
654 dp[4] = (uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2;
655 dp[5] = (uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2;
656 dp[6] = (uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2;
657 dp[7] = (uint32_t)(s2+(s1=sp[8])+s4+(s3=sp2[8])+2)>>2;
658 dp[8] = (uint32_t)(s1+(s2=sp[9])+s3+(s4=sp2[9])+2)>>2;
659 dp[9] = (uint32_t)(s2+(s1=sp[10])+s4+(s3=sp2[10])+2)>>2;
660 dp[10] = (uint32_t)(s1+(s2=sp[11])+s3+(s4=sp2[11])+2)>>2;
661 dp[11] = (uint32_t)(s2+(s1=sp[12])+s4+(s3=sp2[12])+2)>>2;
662 dp[12] = (uint32_t)(s1+(s2=sp[13])+s3+(s4=sp2[13])+2)>>2;
663 dp[13] = (uint32_t)(s2+(s1=sp[14])+s4+(s3=sp2[14])+2)>>2;
664 dp[14] = (uint32_t)(s1+(s2=sp[15])+s3+(s4=sp2[15])+2)>>2;
665 dp[15] = (uint32_t)(s2+sp[16]+s4+sp2[16]+2)>>2;
666 sp += lx2; sp2 += lx2; dp += lx2;
/* mrec4: MMX counterpart of the 16-byte four-neighbour average.
 * sp starts at s and sp2 at s+lx; mm7 holds sadd2.  Per loop iteration, done
 * twice (offsets +0 and +8): load sp+k, sp+k+1 and sp2+k into mm1, mm3 and
 * mm5 and unpack them against mm0; add the sp+k+1 halves into mm1/mm2; reload
 * mm3 from sp2+k+1, unpack it and add the sp2+k halves (mm5/mm6) into mm3/mm4;
 * add mm3/mm4 and mm7 into mm1/mm2; pack mm2 into mm1 and store mm1 to dp+k.
 * The loads at sp+9 and sp2+9 cover bytes 9..16 of each row.  All three
 * pointers advance by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 672->674, 678->682,
 * 690->692, 699->702 and the matching places in the second half).  The setup
 * of mm0, the writes to mm2/mm4/mm6 and the final shift are not shown;
 * presumably mm0 is zero and the even registers are copies of the odd ones
 * made right after each load - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
670 static inline void mrec4(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
672 uint8_t *dp=d, *sp=s, *sp2=s+lx;
674 movq_m2r(m_(sadd2),mm7);
675 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 */
676 movq_m2r(m_(sp +0),mm1);
677 movq_m2r(m_(sp +1),mm3);
678 movq_m2r(m_(sp2+0),mm5);
682 punpcklbw_r2r(mm0,mm1);
683 punpckhbw_r2r(mm0,mm2);
684 punpcklbw_r2r(mm0,mm3);
685 punpckhbw_r2r(mm0,mm4);
686 punpcklbw_r2r(mm0,mm5);
687 punpckhbw_r2r(mm0,mm6);
688 paddusw_r2r(mm3,mm1);
/* mm3 is reused for the fourth neighbour while mm4 is still being consumed */
689 movq_m2r(m_(sp2+1),mm3);
690 paddusw_r2r(mm4,mm2);
692 punpcklbw_r2r(mm0,mm3);
693 punpckhbw_r2r(mm0,mm4);
694 paddusw_r2r(mm5,mm3);
695 paddusw_r2r(mm6,mm4);
696 paddusw_r2r(mm3,mm1);
697 paddusw_r2r(mm4,mm2);
698 paddusw_r2r(mm7,mm1);
699 paddusw_r2r(mm7,mm2);
702 packuswb_r2r(mm2,mm1);
703 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
704 movq_m2r(m_(sp +8),mm1);
705 movq_m2r(m_(sp +9),mm3);
706 movq_m2r(m_(sp2+8),mm5);
710 punpcklbw_r2r(mm0,mm1);
711 punpckhbw_r2r(mm0,mm2);
712 punpcklbw_r2r(mm0,mm3);
713 punpckhbw_r2r(mm0,mm4);
714 punpcklbw_r2r(mm0,mm5);
715 punpckhbw_r2r(mm0,mm6);
716 paddusw_r2r(mm3,mm1);
717 movq_m2r(m_(sp2+9),mm3);
718 paddusw_r2r(mm4,mm2);
720 punpcklbw_r2r(mm0,mm3);
721 punpckhbw_r2r(mm0,mm4);
722 paddusw_r2r(mm5,mm3);
723 paddusw_r2r(mm6,mm4);
724 paddusw_r2r(mm3,mm1);
725 paddusw_r2r(mm4,mm2);
726 paddusw_r2r(mm7,mm1);
727 paddusw_r2r(mm7,mm2);
730 packuswb_r2r(mm2,mm1);
731 movq_r2m(mm1,m_(dp+8));
732 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4c: scalar reference, 8 bytes wide, four-neighbour average.
 * sp starts at s and sp2 at s+lx.  Each dp[i] is set to
 * (top-left + sp[i+1] + bottom-left + sp2[i+1] + 2) >> 2, where s1/s2 carry
 * the previously read byte of the sp row and s3/s4 that of the sp2 row.  The
 * last statement reads sp[8] and sp2[8], so 9 bytes of each source row are
 * touched.  sp, sp2 and dp advance by lx2 per row.
 * NOTE(review): the values of s1 and s3 before the first statement are set on
 * a line not visible here (numbering jumps 742->744); presumably s1 = sp[0]
 * and s3 = sp2[0] - confirm against the full file. */
738 static inline void rec4c(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
740 uint8_t *dp=d, *sp=s, *sp2=s+lx;
741 uint32_t s1, s2, s3, s4;
742 int j; for( j=0; j<h; ++j ) {
744 dp[0] = (uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2;
745 dp[1] = (uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2;
746 dp[2] = (uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2;
747 dp[3] = (uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2;
748 dp[4] = (uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2;
749 dp[5] = (uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2;
750 dp[6] = (uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2;
751 dp[7] = (uint32_t)(s2+sp[8]+s4+sp2[8]+2)>>2;
752 sp += lx2; sp2 += lx2; dp += lx2;
/* mrec4c: MMX counterpart of the 8-byte four-neighbour average.
 * sp starts at s and sp2 at s+lx; mm7 holds sadd2.  Per loop iteration: load
 * sp, sp+1 and sp2 into mm1, mm3 and mm5 and unpack them against mm0; add the
 * sp+1 halves into mm1/mm2; reload mm3 from sp2+1, unpack it and add the sp2
 * halves (mm5/mm6) into mm3/mm4; add mm3/mm4 and mm7 into mm1/mm2; pack mm2
 * into mm1, store mm1 to dp and advance all three pointers by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 758->760, 764->768,
 * 776->778, 785->788).  The setup of mm0, the writes to mm2/mm4/mm6 and the
 * final shift are not shown; presumably mm0 is zero and the even registers are
 * copies of the odd ones made right after each load - confirm against the
 * full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
756 static inline void mrec4c(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
758 uint8_t *dp=d, *sp=s, *sp2=s+lx;
760 movq_m2r(m_(sadd2),mm7);
761 int j; for( j=0; j<h; ++j ) {
762 movq_m2r(m_(sp +0),mm1);
763 movq_m2r(m_(sp +1),mm3);
764 movq_m2r(m_(sp2+0),mm5);
768 punpcklbw_r2r(mm0,mm1);
769 punpckhbw_r2r(mm0,mm2);
770 punpcklbw_r2r(mm0,mm3);
771 punpckhbw_r2r(mm0,mm4);
772 punpcklbw_r2r(mm0,mm5);
773 punpckhbw_r2r(mm0,mm6);
774 paddusw_r2r(mm3,mm1);
/* mm3 is reused for the fourth neighbour while mm4 is still being consumed */
775 movq_m2r(m_(sp2+1),mm3);
776 paddusw_r2r(mm4,mm2);
778 punpcklbw_r2r(mm0,mm3);
779 punpckhbw_r2r(mm0,mm4);
780 paddusw_r2r(mm5,mm3);
781 paddusw_r2r(mm6,mm4);
782 paddusw_r2r(mm3,mm1);
783 paddusw_r2r(mm4,mm2);
784 paddusw_r2r(mm7,mm1);
785 paddusw_r2r(mm7,mm2);
788 packuswb_r2r(mm2,mm1);
789 movq_r2m(mm1,m_(dp+0));
790 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4a: scalar reference, 16 bytes wide, four-neighbour average blended into
 * dp.  sp starts at s and sp2 at s+lx.  At the top of each row s1 and s3 are
 * seeded with sp[0] and sp2[0].  For each i the inner term
 * (sp[i] + sp[i+1] + sp2[i] + sp2[i+1] + 2) >> 2 is computed first and then
 * averaged with the existing dp[i] using +1 before the final shift.  s1/s2
 * and s3/s4 alternately carry the previously read byte of each row.  The last
 * statement reads sp[16] and sp2[16], so 17 bytes of each source row are
 * touched.  sp, sp2 and dp advance by lx2 per row. */
796 static inline void rec4a(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
798 uint8_t *dp=d, *sp=s, *sp2=s+lx;
799 uint32_t s1, s2, s3, s4;
800 int j; for( j=0; j<h; ++j ) {
801 s1 = sp[0]; s3 = sp2[0];
802 dp[0] = (dp[0] + ((uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2) + 1)>>1;
803 dp[1] = (dp[1] + ((uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2) + 1)>>1;
804 dp[2] = (dp[2] + ((uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2) + 1)>>1;
805 dp[3] = (dp[3] + ((uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2) + 1)>>1;
806 dp[4] = (dp[4] + ((uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2) + 1)>>1;
807 dp[5] = (dp[5] + ((uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2) + 1)>>1;
808 dp[6] = (dp[6] + ((uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2) + 1)>>1;
809 dp[7] = (dp[7] + ((uint32_t)(s2+(s1=sp[8])+s4+(s3=sp2[8])+2)>>2) + 1)>>1;
810 dp[8] = (dp[8] + ((uint32_t)(s1+(s2=sp[9])+s3+(s4=sp2[9])+2)>>2) + 1)>>1;
811 dp[9] = (dp[9] + ((uint32_t)(s2+(s1=sp[10])+s4+(s3=sp2[10])+2)>>2) + 1)>>1;
812 dp[10] = (dp[10] + ((uint32_t)(s1+(s2=sp[11])+s3+(s4=sp2[11])+2)>>2) + 1)>>1;
813 dp[11] = (dp[11] + ((uint32_t)(s2+(s1=sp[12])+s4+(s3=sp2[12])+2)>>2) + 1)>>1;
814 dp[12] = (dp[12] + ((uint32_t)(s1+(s2=sp[13])+s3+(s4=sp2[13])+2)>>2) + 1)>>1;
815 dp[13] = (dp[13] + ((uint32_t)(s2+(s1=sp[14])+s4+(s3=sp2[14])+2)>>2) + 1)>>1;
816 dp[14] = (dp[14] + ((uint32_t)(s1+(s2=sp[15])+s3+(s4=sp2[15])+2)>>2) + 1)>>1;
817 dp[15] = (dp[15] + ((uint32_t)(s2+sp[16]+s4+sp2[16]+2)>>2) + 1)>>1;
818 sp += lx2; sp2 += lx2; dp += lx2;
/* mrec4a: MMX counterpart of the 16-byte four-neighbour average blended into
 * dp.  sp starts at s and sp2 at s+lx; mm7 holds sadd2.  Per loop iteration,
 * done twice (offsets +0 and +8): sum the unpacked halves of sp+k, sp+k+1,
 * sp2+k and sp2+k+1 into mm1/mm2 and add mm7; load dp+k into mm3 and unpack
 * it; load sadd1 into mm5; add the dp halves and mm5 into mm1/mm2; pack mm2
 * into mm1 and store mm1 to dp+k.  The loads at sp+9 and sp2+9 cover bytes
 * 9..16 of each row.  All three pointers advance by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 824->826, 830->834,
 * 842->844, 851->853, 855->858, 862->865 and the matching places in the
 * second half).  The setup of mm0, the writes to mm2/mm4/mm6 and the shifts
 * after each add stage are not shown; presumably mm0 is zero and the even
 * registers are copies of the odd ones made after each load - confirm against
 * the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
822 static inline void mrec4a(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
824 uint8_t *dp=d, *sp=s, *sp2=s+lx;
826 movq_m2r(m_(sadd2),mm7);
827 int j; for( j=0; j<h; ++j ) {
/* bytes 0..7 */
828 movq_m2r(m_(sp +0),mm1);
829 movq_m2r(m_(sp +1),mm3);
830 movq_m2r(m_(sp2+0),mm5);
834 punpcklbw_r2r(mm0,mm1);
835 punpckhbw_r2r(mm0,mm2);
836 punpcklbw_r2r(mm0,mm3);
837 punpckhbw_r2r(mm0,mm4);
838 punpcklbw_r2r(mm0,mm5);
839 punpckhbw_r2r(mm0,mm6);
840 paddusw_r2r(mm3,mm1);
/* mm3 is reused for the fourth neighbour while mm4 is still being consumed */
841 movq_m2r(m_(sp2+1),mm3);
842 paddusw_r2r(mm4,mm2);
844 punpcklbw_r2r(mm0,mm3);
845 punpckhbw_r2r(mm0,mm4);
846 paddusw_r2r(mm5,mm3);
847 paddusw_r2r(mm6,mm4);
848 paddusw_r2r(mm3,mm1);
849 paddusw_r2r(mm4,mm2);
/* mm3 is reused again, now for the destination bytes */
850 movq_m2r(m_(dp +0),mm3);
851 paddusw_r2r(mm7,mm1);
853 paddusw_r2r(mm7,mm2);
854 punpcklbw_r2r(mm0,mm3);
855 punpckhbw_r2r(mm0,mm4);
/* mm5 now holds sadd1 for the blend with dp; it is reloaded every iteration */
858 movq_m2r(m_(sadd1),mm5);
859 paddusw_r2r(mm3,mm1);
860 paddusw_r2r(mm4,mm2);
861 paddusw_r2r(mm5,mm1);
862 paddusw_r2r(mm5,mm2);
865 packuswb_r2r(mm2,mm1);
866 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
867 movq_m2r(m_(sp +8),mm1);
868 movq_m2r(m_(sp +9),mm3);
869 movq_m2r(m_(sp2+8),mm5);
873 punpcklbw_r2r(mm0,mm1);
874 punpckhbw_r2r(mm0,mm2);
875 punpcklbw_r2r(mm0,mm3);
876 punpckhbw_r2r(mm0,mm4);
877 punpcklbw_r2r(mm0,mm5);
878 punpckhbw_r2r(mm0,mm6);
879 paddusw_r2r(mm3,mm1);
880 movq_m2r(m_(sp2+9),mm3);
881 paddusw_r2r(mm4,mm2);
883 punpcklbw_r2r(mm0,mm3);
884 punpckhbw_r2r(mm0,mm4);
885 paddusw_r2r(mm5,mm3);
886 paddusw_r2r(mm6,mm4);
887 paddusw_r2r(mm3,mm1);
888 paddusw_r2r(mm4,mm2);
889 movq_m2r(m_(dp +8),mm3);
890 paddusw_r2r(mm7,mm1);
892 paddusw_r2r(mm7,mm2);
893 punpcklbw_r2r(mm0,mm3);
894 punpckhbw_r2r(mm0,mm4);
897 movq_m2r(m_(sadd1),mm5);
898 paddusw_r2r(mm3,mm1);
899 paddusw_r2r(mm4,mm2);
900 paddusw_r2r(mm5,mm1);
901 paddusw_r2r(mm5,mm2);
904 packuswb_r2r(mm2,mm1);
905 movq_r2m(mm1,m_(dp+8));
906 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4ac: scalar reference, 8 bytes wide, four-neighbour average blended into
 * dp.  sp starts at s and sp2 at s+lx.  For each i the inner term
 * (top-left + sp[i+1] + bottom-left + sp2[i+1] + 2) >> 2 is computed first and
 * then averaged with the existing dp[i] using +1 before the final shift.  The
 * last statement reads sp[8] and sp2[8], so 9 bytes of each source row are
 * touched.  sp, sp2 and dp advance by lx2 per row.
 * NOTE(review): the values of s1 and s3 before the first statement are set on
 * a line not visible here (numbering jumps 916->918); presumably s1 = sp[0]
 * and s3 = sp2[0] - confirm against the full file. */
912 static inline void rec4ac(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
914 uint8_t *dp=d, *sp=s, *sp2=s+lx;
915 uint32_t s1,s2,s3,s4;
916 int j; for( j=0; j<h; ++j ) {
918 dp[0] = (dp[0] + ((uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2) + 1)>>1;
919 dp[1] = (dp[1] + ((uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2) + 1)>>1;
920 dp[2] = (dp[2] + ((uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2) + 1)>>1;
921 dp[3] = (dp[3] + ((uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2) + 1)>>1;
922 dp[4] = (dp[4] + ((uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2) + 1)>>1;
923 dp[5] = (dp[5] + ((uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2) + 1)>>1;
924 dp[6] = (dp[6] + ((uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2) + 1)>>1;
925 dp[7] = (dp[7] + ((uint32_t)(s2+sp[8]+s4+sp2[8]+2)>>2) + 1)>>1;
926 sp += lx2; sp2 += lx2; dp += lx2;
/* mrec4ac: MMX counterpart of the 8-byte four-neighbour average blended into
 * dp.  sp starts at s and sp2 at s+lx; mm7 holds sadd2.  Per loop iteration:
 * sum the unpacked halves of sp, sp+1, sp2 and sp2+1 into mm1/mm2 and add mm7;
 * load dp into mm3 and unpack it; load sadd1 into mm5; add the dp halves and
 * mm5 into mm1/mm2; pack mm2 into mm1, store mm1 to dp and advance all three
 * pointers by lx2.
 * NOTE(review): the listing omits lines (numbering jumps 932->934, 938->942,
 * 950->952, 959->961, 963->966, 970->973).  The setup of mm0, the writes to
 * mm2/mm4/mm6 and the shifts after each add stage are not shown; presumably
 * mm0 is zero and the even registers are copies of the odd ones made after
 * each load - confirm against the full file.
 * Assumes the x_r2r(a,b) macros take the destination as the second operand. */
930 static inline void mrec4ac(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
932 uint8_t *dp=d, *sp=s, *sp2=s+lx;
934 movq_m2r(m_(sadd2),mm7);
935 int j; for( j=0; j<h; ++j ) {
936 movq_m2r(m_(sp +0),mm1);
937 movq_m2r(m_(sp +1),mm3);
938 movq_m2r(m_(sp2+0),mm5);
942 punpcklbw_r2r(mm0,mm1);
943 punpckhbw_r2r(mm0,mm2);
944 punpcklbw_r2r(mm0,mm3);
945 punpckhbw_r2r(mm0,mm4);
946 punpcklbw_r2r(mm0,mm5);
947 punpckhbw_r2r(mm0,mm6);
948 paddusw_r2r(mm3,mm1);
/* mm3 is reused for the fourth neighbour while mm4 is still being consumed */
949 movq_m2r(m_(sp2+1),mm3);
950 paddusw_r2r(mm4,mm2);
952 punpcklbw_r2r(mm0,mm3);
953 punpckhbw_r2r(mm0,mm4);
954 paddusw_r2r(mm5,mm3);
955 paddusw_r2r(mm6,mm4);
956 paddusw_r2r(mm3,mm1);
957 paddusw_r2r(mm4,mm2);
/* mm3 is reused again, now for the destination bytes */
958 movq_m2r(m_(dp +0),mm3);
959 paddusw_r2r(mm7,mm1);
961 paddusw_r2r(mm7,mm2);
962 punpcklbw_r2r(mm0,mm3);
963 punpckhbw_r2r(mm0,mm4);
/* mm5 now holds sadd1 for the blend with dp; it is reloaded every iteration */
966 movq_m2r(m_(sadd1),mm5);
967 paddusw_r2r(mm3,mm1);
968 paddusw_r2r(mm4,mm2);
969 paddusw_r2r(mm5,mm1);
970 paddusw_r2r(mm5,mm2);
973 packuswb_r2r(mm2,mm1);
974 movq_r2m(mm1,m_(dp+0));
975 sp += lx2; sp2 += lx2; dp += lx2;
982 int main(int ac, char **av)
984 int i, j, k, l, m, n, done;
985 uint8_t dat0[32], dat1[32], dat2[32];
986 for( m=0; m<14; ++m ) {
990 for( i=0; i<256 && !done ; ++i ) {
991 for( j=0; j<256 && !done; j+=n ) {
993 for( k=0; k<256; k+=n ) {
994 for( l=0; l<n; ++l ) {
996 dat1[l] = dat2[l] = j+l;
997 dat0[l+16] = dat1[l+16] = dat2[l+16] = k;
1001 reca (&dat0[0], &dat1[0], 16, 1);
1002 mreca (&dat0[0], &dat2[0], 16, 1);
1005 recac (&dat0[0], &dat1[0], 16, 1);
1006 mrecac (&dat0[0], &dat2[0], 16, 1);
1009 recv (&dat0[0], &dat1[0], 0x10, 16, 1);
1010 mrecv (&dat0[0], &dat2[0], 0x10, 16, 1);
1013 recvc (&dat0[0], &dat1[0], 0x10, 16, 1);
1014 mrecvc (&dat0[0], &dat2[0], 0x10, 16, 1);
1017 recva (&dat0[0], &dat1[0], 0x10, 16, 1);
1018 mrecva (&dat0[0], &dat2[0], 0x10, 16, 1);
1021 recvac (&dat0[0], &dat1[0], 0x10, 16, 1);
1022 mrecvac(&dat0[0], &dat2[0], 0x10, 16, 1);
1025 rech (&dat0[0], &dat1[0], 16, 1);
1026 mrech (&dat0[0], &dat2[0], 16, 1);
1029 rechc (&dat0[0], &dat1[0], 16, 1);
1030 mrechc (&dat0[0], &dat2[0], 16, 1);
1033 recha (&dat0[0], &dat1[0], 16, 1);
1034 mrecha (&dat0[0], &dat2[0], 16, 1);
1037 rechac (&dat0[0], &dat1[0], 16, 1);
1038 mrechac(&dat0[0], &dat2[0], 16, 1);
1041 rec4 (&dat0[0], &dat1[0], 0x10, 16, 1);
1042 mrec4 (&dat0[0], &dat2[0], 0x10, 16, 1);
1045 rec4c (&dat0[0], &dat1[0], 0x10, 16, 1);
1046 mrec4c (&dat0[0], &dat2[0], 0x10, 16, 1);
1049 rec4a (&dat0[0], &dat1[0], 0x10, 16, 1);
1050 mrec4a (&dat0[0], &dat2[0], 0x10, 16, 1);
1053 rec4ac (&dat0[0], &dat1[0], 0x10, 16, 1);
1054 mrec4ac(&dat0[0], &dat2[0], 0x10, 16, 1);
1059 if( *(uint64_t *)&dat1[0] != *(uint64_t *)&dat2[0] )
1062 printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", i,
1063 dat1[0], dat1[1], dat1[2], dat1[3], dat1[4], dat1[5], dat1[6], dat1[7]);
1064 printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x\n", j,
1065 dat2[0], dat2[1], dat2[2], dat2[3], dat2[4], dat2[5], dat2[6], dat2[7]);
1071 if( *(uint64_t *)&dat1[0] != *(uint64_t *)&dat2[0] ||
1072 *(uint64_t *)&dat1[8] != *(uint64_t *)&dat2[8] )
1075 printf(" i=%5d %02x %02x %02x %02x %02x %02x %02x %02x", i,
1076 dat1[0], dat1[1], dat1[2], dat1[3], dat1[4], dat1[5], dat1[6], dat1[7]);
1077 printf(" %02x %02x %02x %02x %02x %02x %02x %02x\n", dat1[8],
1078 dat1[9], dat1[10], dat1[11], dat1[12], dat1[13], dat1[14], dat1[15]);
1079 printf(" j=%5d %02x %02x %02x %02x %02x %02x %02x %02x", j,
1080 dat2[0], dat2[1], dat2[2], dat2[3], dat2[4], dat2[5], dat2[6], dat2[7]);
1081 printf(" %02x %02x %02x %02x %02x %02x %02x %02x\n", dat2[8],
1082 dat2[9], dat2[10], dat2[11], dat2[12], dat2[13], dat2[14], dat2[15]);