1 #include "../libzmpeg3.h"
/* m_(v) wraps an address so it can be handed to the movq_* register macros.
 * NOTE(review): two definitions of m_ appear below; the directives that
 * separate them are not all visible in this excerpt. */
12 #if defined(__x86_64__)
/* Treat the memory at v as an mmx_t object. */
13 #define m_(v) (*(mmx_t*)(v))
/* Treat the memory at v as a char object. */
15 #define m_(v) (*(char*)(v))
/* 0x0001 in each 16-bit lane; loaded into mm7 (or mm5) and added with paddusw
 * by the word-lane loops of the averaging routines. */
17 static uint32_t sadd1[2] = { 0x00010001, 0x00010001 };
/* 0x0002 in each 16-bit lane; loaded into mm7 by the rec4* routines. */
18 static uint32_t sadd2[2] = { 0x00020002, 0x00020002 };
/* 0x7f in each byte; loaded into mm6 by the packed-byte loops.
 * NOTE(review): the instructions that consume mm6 are not visible here. */
20 static uint32_t bmask[2] = { 0x7f7f7f7f, 0x7f7f7f7f };
/* 0x01 in each byte; loaded into mm7 and added with paddusb by the
 * packed-byte loops. */
21 static uint32_t badd1[2] = { 0x01010101, 0x01010101 };
/* rec: copy a 16-byte-wide block of h rows from s to d.
 *   s, d - source and destination row pointers
 *   lx2  - byte step added to both s and d after every row
 *   h    - number of rows
 * Two forms of the row copy appear below: sixteen single-byte stores and two
 * 64-bit stores.
 * NOTE(review): the directives choosing between the two forms are not
 * visible in this excerpt. */
26 static inline void rec(uint8_t *s, uint8_t *d, int lx2, int h)
28 for( int j=0; j<h; ++j, s+=lx2, d+=lx2 ) {
/* byte-by-byte copy of bytes 0..15 */
30 d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
31 d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
32 d[8] = s[8]; d[9] = s[9]; d[10] = s[10]; d[11] = s[11];
33 d[12] = s[12]; d[13] = s[13]; d[14] = s[14]; d[15] = s[15];
/* same 16 bytes moved as two 64-bit words */
35 *(uint64_t*)(d+0) = *(uint64_t*)(s+0);
36 *(uint64_t*)(d+8) = *(uint64_t*)(s+8);
/* recc: copy an 8-byte-wide block of h rows from s to d.
 *   s, d - source and destination row pointers
 *   lx2  - byte step added to both s and d after every row
 *   h    - number of rows
 * Two forms of the row copy appear below: eight single-byte stores and one
 * 64-bit store.
 * NOTE(review): the directives choosing between the two forms are not
 * visible in this excerpt. */
42 static inline void recc(uint8_t *s, uint8_t *d, int lx2, int h)
44 for( int j=0; j<h; ++j, s+=lx2, d+=lx2) {
/* byte-by-byte copy of bytes 0..7 */
46 d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
47 d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
/* same 8 bytes moved as one 64-bit word */
49 *(uint64_t*)d = *(uint64_t*)s;
/* reca: blend a 16-byte-wide source block into the destination.
 * In the plain C form every destination byte becomes (d + s + 1) >> 1.
 *   s, d - source and destination blocks
 *   lx2  - byte step to the next row
 *   h    - number of rows
 * Three forms of the row loop appear: plain C, MMX on 16-bit lanes, and MMX
 * on packed bytes.
 * NOTE(review): the directives selecting a form, the braces, and some
 * instructions (register copies, shifts) are not visible in this excerpt. */
54 static inline void reca(uint8_t *s, uint8_t *d, int lx2, int h)
/* plain C form: rounded average of destination and source, bytes 0..15 */
58 for( int j=0; j<h; ++j ) {
59 dp[0] = (uint32_t)(dp[0] + sp[0] + 1) >> 1;
60 dp[1] = (uint32_t)(dp[1] + sp[1] + 1) >> 1;
61 dp[2] = (uint32_t)(dp[2] + sp[2] + 1) >> 1;
62 dp[3] = (uint32_t)(dp[3] + sp[3] + 1) >> 1;
63 dp[4] = (uint32_t)(dp[4] + sp[4] + 1) >> 1;
64 dp[5] = (uint32_t)(dp[5] + sp[5] + 1) >> 1;
65 dp[6] = (uint32_t)(dp[6] + sp[6] + 1) >> 1;
66 dp[7] = (uint32_t)(dp[7] + sp[7] + 1) >> 1;
67 dp[8] = (uint32_t)(dp[8] + sp[8] + 1) >> 1;
68 dp[9] = (uint32_t)(dp[9] + sp[9] + 1) >> 1;
69 dp[10] = (uint32_t)(dp[10] + sp[10] + 1) >> 1;
70 dp[11] = (uint32_t)(dp[11] + sp[11] + 1) >> 1;
71 dp[12] = (uint32_t)(dp[12] + sp[12] + 1) >> 1;
72 dp[13] = (uint32_t)(dp[13] + sp[13] + 1) >> 1;
73 dp[14] = (uint32_t)(dp[14] + sp[14] + 1) >> 1;
74 dp[15] = (uint32_t)(dp[15] + sp[15] + 1) >> 1;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word).
 * NOTE(review): mm0 is the unpack partner and is presumably zeroed on a
 * line not visible here - confirm. */
81 movq_m2r(m_(sadd1),mm7);
82 for( int j=0; j<h; ++j ) {
/* bytes 0..7: load source and destination, widen to words */
83 movq_m2r(m_(sp+0),mm1);
84 movq_m2r(m_(dp+0),mm3);
87 punpcklbw_r2r(mm0,mm1);
88 punpckhbw_r2r(mm0,mm2);
89 punpcklbw_r2r(mm0,mm3);
90 punpckhbw_r2r(mm0,mm4);
/* pack the two word halves back to 8 bytes and store */
97 packuswb_r2r(mm2,mm1);
98 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
99 movq_m2r(m_(sp+8),mm1);
100 movq_m2r(m_(dp+8),mm3);
103 punpcklbw_r2r(mm0,mm1);
104 punpckhbw_r2r(mm0,mm2);
105 punpcklbw_r2r(mm0,mm3);
106 punpckhbw_r2r(mm0,mm4);
/* source + destination, then + sadd1, per word */
107 paddusw_r2r(mm3,mm1);
108 paddusw_r2r(mm4,mm2);
109 paddusw_r2r(mm7,mm1);
110 paddusw_r2r(mm7,mm2);
113 packuswb_r2r(mm2,mm1);
114 movq_r2m(mm1,m_(dp+8));
115 sp += lx2; dp += lx2;
/* MMX form on packed bytes: mm6 = bmask, mm7 = badd1.
 * NOTE(review): the steps between the loads and the byte adds are not
 * visible in this excerpt. */
118 uint8_t *dp=d, *sp=s;
119 movq_m2r(m_(bmask),mm6);
120 movq_m2r(m_(badd1),mm7);
121 for( int j=0; j<h; ++j ) {
122 movq_m2r(m_(sp+0),mm0);
123 movq_m2r(m_(dp+0),mm1);
124 movq_m2r(m_(sp+8),mm2);
125 movq_m2r(m_(dp+8),mm3);
134 paddusb_r2r(mm1,mm0);
135 paddusb_r2r(mm3,mm2);
136 paddusb_r2r(mm7,mm0);
137 paddusb_r2r(mm7,mm2);
138 movq_r2m(mm0,m_(dp+0));
139 movq_r2m(mm2,m_(dp+8));
140 sp += lx2; dp += lx2;
/* recac: blend an 8-byte-wide source block into the destination.
 * In the plain C form every destination byte becomes (d + s + 1) >> 1.
 *   s, d - source and destination blocks
 *   lx2  - byte step to the next row
 *   h    - number of rows
 * Three forms of the row loop appear: plain C, MMX on 16-bit lanes, and MMX
 * on packed bytes.
 * NOTE(review): the directives selecting a form, the braces, and some
 * instructions are not visible in this excerpt. */
147 static inline void recac(uint8_t *s, uint8_t *d, int lx2, int h)
/* plain C form */
150 uint8_t *dp=d, *sp=s;
151 for( int j=0; j<h; ++j ) {
152 dp[0] = (uint32_t)(dp[0] + sp[0] + 1)>>1;
153 dp[1] = (uint32_t)(dp[1] + sp[1] + 1)>>1;
154 dp[2] = (uint32_t)(dp[2] + sp[2] + 1)>>1;
155 dp[3] = (uint32_t)(dp[3] + sp[3] + 1)>>1;
156 dp[4] = (uint32_t)(dp[4] + sp[4] + 1)>>1;
157 dp[5] = (uint32_t)(dp[5] + sp[5] + 1)>>1;
158 dp[6] = (uint32_t)(dp[6] + sp[6] + 1)>>1;
159 dp[7] = (uint32_t)(dp[7] + sp[7] + 1)>>1;
160 sp += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
164 uint8_t *dp=d, *sp=s;
166 movq_m2r(m_(sadd1),mm7);
167 for( int j=0; j<h; ++j ) {
168 movq_m2r(m_(sp+0),mm1);
169 movq_m2r(m_(dp+0),mm3);
/* widen bytes to words */
172 punpcklbw_r2r(mm0,mm1);
173 punpckhbw_r2r(mm0,mm2);
174 punpcklbw_r2r(mm0,mm3);
175 punpckhbw_r2r(mm0,mm4);
/* source + destination, then + sadd1, per word */
176 paddusw_r2r(mm3,mm1);
177 paddusw_r2r(mm4,mm2);
178 paddusw_r2r(mm7,mm1);
179 paddusw_r2r(mm7,mm2);
/* pack back to 8 bytes and store */
182 packuswb_r2r(mm2,mm1);
183 movq_r2m(mm1,m_(dp+0));
184 sp += lx2; dp += lx2;
/* MMX form on packed bytes: mm6 = bmask, mm7 = badd1.
 * NOTE(review): the steps between the loads and the byte adds are not
 * visible in this excerpt. */
187 uint8_t *dp=d, *sp=s;
188 movq_m2r(m_(bmask),mm6);
189 movq_m2r(m_(badd1),mm7);
190 for( int j=0; j<h; ++j ) {
191 movq_m2r(m_(sp),mm0);
192 movq_m2r(m_(dp),mm1);
197 paddusb_r2r(mm1,mm0);
198 paddusb_r2r(mm7,mm0);
199 movq_r2m(mm0,m_(dp));
200 sp += lx2; dp += lx2;
/* recv: write the rounded average of two source lines into a 16-byte-wide
 * destination block.
 * In the plain C form d[i] = (s[i] + s[lx + i] + 1) >> 1.
 *   s    - source block; the second line read starts at s + lx
 *   d    - destination block
 *   lx   - byte offset from s to the second source line
 *   lx2  - byte step to the next row for all three pointers
 *   h    - number of rows
 * Three forms of the row loop appear: plain C, MMX on 16-bit lanes, and MMX
 * on packed bytes.
 * NOTE(review): the directives selecting a form, the braces, and some
 * instructions are not visible in this excerpt. */
207 static inline void recv(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
/* plain C form */
210 uint8_t *dp=d, *sp=s, *sp2=s+lx;
211 for( int j=0; j<h; ++j ) {
212 dp[0] = (uint32_t)(sp[0] + sp2[0] + 1) >> 1;
213 dp[1] = (uint32_t)(sp[1] + sp2[1] + 1) >> 1;
214 dp[2] = (uint32_t)(sp[2] + sp2[2] + 1) >> 1;
215 dp[3] = (uint32_t)(sp[3] + sp2[3] + 1) >> 1;
216 dp[4] = (uint32_t)(sp[4] + sp2[4] + 1) >> 1;
217 dp[5] = (uint32_t)(sp[5] + sp2[5] + 1) >> 1;
218 dp[6] = (uint32_t)(sp[6] + sp2[6] + 1) >> 1;
219 dp[7] = (uint32_t)(sp[7] + sp2[7] + 1) >> 1;
220 dp[8] = (uint32_t)(sp[8] + sp2[8] + 1) >> 1;
221 dp[9] = (uint32_t)(sp[9] + sp2[9] + 1) >> 1;
222 dp[10] = (uint32_t)(sp[10] + sp2[10] + 1) >> 1;
223 dp[11] = (uint32_t)(sp[11] + sp2[11] + 1) >> 1;
224 dp[12] = (uint32_t)(sp[12] + sp2[12] + 1) >> 1;
225 dp[13] = (uint32_t)(sp[13] + sp2[13] + 1) >> 1;
226 dp[14] = (uint32_t)(sp[14] + sp2[14] + 1) >> 1;
227 dp[15] = (uint32_t)(sp[15] + sp2[15] + 1) >> 1;
228 sp += lx2; sp2 += lx2 ; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
232 uint8_t *dp=d, *sp=s, *sp2=s+lx;
234 movq_m2r(m_(sadd1),mm7);
235 for( int j=0; j<h; ++j ) {
/* bytes 0..7 of both source lines */
236 movq_m2r(m_(sp +0),mm1);
237 movq_m2r(m_(sp2+0),mm3);
240 punpcklbw_r2r(mm0,mm1);
241 punpckhbw_r2r(mm0,mm2);
242 punpcklbw_r2r(mm0,mm3);
243 punpckhbw_r2r(mm0,mm4);
244 paddusw_r2r(mm3,mm1);
245 paddusw_r2r(mm4,mm2);
246 paddusw_r2r(mm7,mm1);
247 paddusw_r2r(mm7,mm2);
250 packuswb_r2r(mm2,mm1);
251 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 of both source lines */
252 movq_m2r(m_(sp +8),mm1);
253 movq_m2r(m_(sp2+8),mm3);
256 punpcklbw_r2r(mm0,mm1);
257 punpckhbw_r2r(mm0,mm2);
258 punpcklbw_r2r(mm0,mm3);
259 punpckhbw_r2r(mm0,mm4);
260 paddusw_r2r(mm3,mm1);
261 paddusw_r2r(mm4,mm2);
262 paddusw_r2r(mm7,mm1);
263 paddusw_r2r(mm7,mm2);
266 packuswb_r2r(mm2,mm1);
267 movq_r2m(mm1,m_(dp+8));
268 sp += lx2; sp2 += lx2 ; dp += lx2;
/* MMX form on packed bytes: mm6 = bmask, mm7 = badd1.
 * NOTE(review): the steps between the loads and the byte adds are not
 * visible in this excerpt. */
271 uint8_t *dp=d, *sp=s, *sp2=s+lx;
272 movq_m2r(m_(bmask),mm6);
273 movq_m2r(m_(badd1),mm7);
274 for( int j=0; j<h; ++j ) {
275 movq_m2r(m_(sp +0),mm0);
276 movq_m2r(m_(sp2+0),mm1);
277 movq_m2r(m_(sp +8),mm2);
278 movq_m2r(m_(sp2+8),mm3);
287 paddusb_r2r(mm1,mm0);
288 paddusb_r2r(mm3,mm2);
289 paddusb_r2r(mm7,mm0);
290 paddusb_r2r(mm7,mm2);
291 movq_r2m(mm0,m_(dp+0));
292 movq_r2m(mm2,m_(dp+8));
293 sp += lx2; sp2 += lx2 ; dp += lx2;
/* recvc: write the rounded average of two source lines into an 8-byte-wide
 * destination block.
 * In the plain C form d[i] = (s[i] + s[lx + i] + 1) >> 1.
 *   s    - source block; the second line read starts at s + lx
 *   d    - destination block
 *   lx   - byte offset from s to the second source line
 *   lx2  - byte step to the next row for all three pointers
 *   h    - number of rows
 * Three forms of the row loop appear: plain C, MMX on 16-bit lanes, and MMX
 * on packed bytes.
 * NOTE(review): the directives selecting a form, the braces, and some
 * instructions are not visible in this excerpt. */
300 static inline void recvc(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
/* plain C form */
303 uint8_t *dp=d, *sp=s, *sp2=s+lx;
304 for( int j=0; j<h; ++j ) {
305 dp[0] = (uint32_t)(sp[0]+sp2[0]+1)>>1;
306 dp[1] = (uint32_t)(sp[1]+sp2[1]+1)>>1;
307 dp[2] = (uint32_t)(sp[2]+sp2[2]+1)>>1;
308 dp[3] = (uint32_t)(sp[3]+sp2[3]+1)>>1;
309 dp[4] = (uint32_t)(sp[4]+sp2[4]+1)>>1;
310 dp[5] = (uint32_t)(sp[5]+sp2[5]+1)>>1;
311 dp[6] = (uint32_t)(sp[6]+sp2[6]+1)>>1;
312 dp[7] = (uint32_t)(sp[7]+sp2[7]+1)>>1;
313 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
317 uint8_t *dp=d, *sp=s, *sp2=s+lx;
319 movq_m2r(m_(sadd1),mm7);
320 for( int j=0; j<h; ++j ) {
321 movq_m2r(m_(sp +0),mm1);
322 movq_m2r(m_(sp2+0),mm3);
/* widen bytes to words */
325 punpcklbw_r2r(mm0,mm1);
326 punpckhbw_r2r(mm0,mm2);
327 punpcklbw_r2r(mm0,mm3);
328 punpckhbw_r2r(mm0,mm4);
/* line 1 + line 2, then + sadd1, per word */
329 paddusw_r2r(mm3,mm1);
330 paddusw_r2r(mm4,mm2);
331 paddusw_r2r(mm7,mm1);
332 paddusw_r2r(mm7,mm2);
/* pack back to 8 bytes and store */
335 packuswb_r2r(mm2,mm1);
336 movq_r2m(mm1,m_(dp+0));
337 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on packed bytes: mm6 = bmask, mm7 = badd1.
 * NOTE(review): the steps between the loads and the byte adds are not
 * visible in this excerpt. */
340 uint8_t *dp=d, *sp=s, *sp2=s+lx;
341 movq_m2r(m_(bmask),mm6);
342 movq_m2r(m_(badd1),mm7);
343 for( int j=0; j<h; ++j ) {
344 movq_m2r(m_(sp),mm0);
345 movq_m2r(m_(sp2),mm1);
350 paddusb_r2r(mm1,mm0);
351 paddusb_r2r(mm7,mm0);
352 movq_r2m(mm0,m_(dp));
353 sp += lx2; sp2 += lx2; dp += lx2;
/* recva: average two source lines, then blend the result into a
 * 16-byte-wide destination block.
 * In the plain C form d[i] = (d[i] + ((s[i] + s[lx + i] + 1) >> 1) + 1) >> 1.
 *   s    - source block; the second line read starts at s + lx
 *   d    - destination block, read and rewritten
 *   lx   - byte offset from s to the second source line
 *   lx2  - byte step to the next row for all three pointers
 *   h    - number of rows
 * Three forms of the row loop appear: plain C, MMX on 16-bit lanes, and MMX
 * on packed bytes.
 * NOTE(review): the directives selecting a form, the braces, and some
 * instructions are not visible in this excerpt. */
361 static inline void recva(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
/* plain C form */
364 uint8_t *dp=d, *sp=s, *sp2=s+lx;
365 for( int j=0; j<h; ++j ) {
366 dp[0] = (dp[0] + ((uint32_t)(sp[0]+sp2[0]+1)>>1) + 1)>>1;
367 dp[1] = (dp[1] + ((uint32_t)(sp[1]+sp2[1]+1)>>1) + 1)>>1;
368 dp[2] = (dp[2] + ((uint32_t)(sp[2]+sp2[2]+1)>>1) + 1)>>1;
369 dp[3] = (dp[3] + ((uint32_t)(sp[3]+sp2[3]+1)>>1) + 1)>>1;
370 dp[4] = (dp[4] + ((uint32_t)(sp[4]+sp2[4]+1)>>1) + 1)>>1;
371 dp[5] = (dp[5] + ((uint32_t)(sp[5]+sp2[5]+1)>>1) + 1)>>1;
372 dp[6] = (dp[6] + ((uint32_t)(sp[6]+sp2[6]+1)>>1) + 1)>>1;
373 dp[7] = (dp[7] + ((uint32_t)(sp[7]+sp2[7]+1)>>1) + 1)>>1;
374 dp[8] = (dp[8] + ((uint32_t)(sp[8]+sp2[8]+1)>>1) + 1)>>1;
375 dp[9] = (dp[9] + ((uint32_t)(sp[9]+sp2[9]+1)>>1) + 1)>>1;
376 dp[10] = (dp[10] + ((uint32_t)(sp[10]+sp2[10]+1)>>1) + 1)>>1;
377 dp[11] = (dp[11] + ((uint32_t)(sp[11]+sp2[11]+1)>>1) + 1)>>1;
378 dp[12] = (dp[12] + ((uint32_t)(sp[12]+sp2[12]+1)>>1) + 1)>>1;
379 dp[13] = (dp[13] + ((uint32_t)(sp[13]+sp2[13]+1)>>1) + 1)>>1;
380 dp[14] = (dp[14] + ((uint32_t)(sp[14]+sp2[14]+1)>>1) + 1)>>1;
381 dp[15] = (dp[15] + ((uint32_t)(sp[15]+sp2[15]+1)>>1) + 1)>>1;
382 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
386 uint8_t *dp=d, *sp=s, *sp2=s+lx;
388 movq_m2r(m_(sadd1),mm7);
389 for( int j=0; j<h; ++j ) {
/* bytes 0..7: both source lines and the destination */
390 movq_m2r(m_(sp +0),mm1);
391 movq_m2r(m_(sp2+0),mm3);
392 movq_m2r(m_(dp +0),mm5);
396 punpcklbw_r2r(mm0,mm1);
397 punpckhbw_r2r(mm0,mm2);
398 punpcklbw_r2r(mm0,mm3);
399 punpckhbw_r2r(mm0,mm4);
400 punpcklbw_r2r(mm0,mm5);
401 punpckhbw_r2r(mm0,mm6);
/* first stage: line 1 + line 2 + sadd1 */
402 paddusw_r2r(mm3,mm1);
403 paddusw_r2r(mm4,mm2);
404 paddusw_r2r(mm7,mm1);
405 paddusw_r2r(mm7,mm2);
/* second stage: + destination + sadd1 */
408 paddusw_r2r(mm5,mm1);
409 paddusw_r2r(mm6,mm2);
410 paddusw_r2r(mm7,mm1);
411 paddusw_r2r(mm7,mm2);
414 packuswb_r2r(mm2,mm1);
415 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
416 movq_m2r(m_(sp +8),mm1);
417 movq_m2r(m_(sp2+8),mm3);
418 movq_m2r(m_(dp +8),mm5);
422 punpcklbw_r2r(mm0,mm1);
423 punpckhbw_r2r(mm0,mm2);
424 punpcklbw_r2r(mm0,mm3);
425 punpckhbw_r2r(mm0,mm4);
426 punpcklbw_r2r(mm0,mm5);
427 punpckhbw_r2r(mm0,mm6);
428 paddusw_r2r(mm3,mm1);
429 paddusw_r2r(mm4,mm2);
430 paddusw_r2r(mm7,mm1);
431 paddusw_r2r(mm7,mm2);
434 paddusw_r2r(mm5,mm1);
435 paddusw_r2r(mm6,mm2);
436 paddusw_r2r(mm7,mm1);
437 paddusw_r2r(mm7,mm2);
440 packuswb_r2r(mm2,mm1);
441 movq_r2m(mm1,m_(dp+8));
442 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on packed bytes: mm6 = bmask, mm7 = badd1.
 * NOTE(review): the steps between the loads and the byte adds are not
 * visible in this excerpt. */
445 uint8_t *dp=d, *sp=s, *sp2=s+lx;
446 movq_m2r(m_(bmask),mm6);
447 movq_m2r(m_(badd1),mm7);
448 for( int j=0; j<h; ++j ) {
449 movq_m2r(m_(sp +0),mm0);
450 movq_m2r(m_(sp2+0),mm1);
451 movq_m2r(m_(sp +8),mm2);
452 movq_m2r(m_(sp2+8),mm3);
453 movq_m2r(m_(dp +0),mm4);
454 movq_m2r(m_(dp +8),mm5);
/* first stage: combine the two source lines */
467 paddusb_r2r(mm1,mm0);
468 paddusb_r2r(mm3,mm2);
469 paddusb_r2r(mm7,mm0);
470 paddusb_r2r(mm7,mm2);
/* second stage: combine with the destination */
475 paddusb_r2r(mm0,mm4);
476 paddusb_r2r(mm2,mm5);
477 paddusb_r2r(mm7,mm4);
478 paddusb_r2r(mm7,mm5);
479 movq_r2m(mm4,m_(dp+0));
480 movq_r2m(mm5,m_(dp+8));
481 sp += lx2; sp2 += lx2; dp += lx2;
/* recvac: average two source lines, then blend the result into an
 * 8-byte-wide destination block.
 * In the plain C form d[i] = (d[i] + ((s[i] + s[lx + i] + 1) >> 1) + 1) >> 1.
 *   s    - source block; the second line read starts at s + lx
 *   d    - destination block, read and rewritten
 *   lx   - byte offset from s to the second source line
 *   lx2  - byte step to the next row for all three pointers
 *   h    - number of rows
 * Three forms of the row loop appear: plain C, MMX on 16-bit lanes, and MMX
 * on packed bytes.
 * NOTE(review): the directives selecting a form, the braces, and some
 * instructions are not visible in this excerpt. */
489 static inline void recvac(uint8_t *s, uint8_t *d, int lx,int lx2, int h)
/* plain C form */
492 uint8_t *dp=d, *sp=s, *sp2=s+lx;
493 for( int j=0; j<h; ++j ) {
494 dp[0] = (dp[0] + ((uint32_t)(sp[0]+sp2[0]+1)>>1) + 1)>>1;
495 dp[1] = (dp[1] + ((uint32_t)(sp[1]+sp2[1]+1)>>1) + 1)>>1;
496 dp[2] = (dp[2] + ((uint32_t)(sp[2]+sp2[2]+1)>>1) + 1)>>1;
497 dp[3] = (dp[3] + ((uint32_t)(sp[3]+sp2[3]+1)>>1) + 1)>>1;
498 dp[4] = (dp[4] + ((uint32_t)(sp[4]+sp2[4]+1)>>1) + 1)>>1;
499 dp[5] = (dp[5] + ((uint32_t)(sp[5]+sp2[5]+1)>>1) + 1)>>1;
500 dp[6] = (dp[6] + ((uint32_t)(sp[6]+sp2[6]+1)>>1) + 1)>>1;
501 dp[7] = (dp[7] + ((uint32_t)(sp[7]+sp2[7]+1)>>1) + 1)>>1;
502 sp += lx2; sp2 += lx2; dp+= lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
506 uint8_t *dp=d, *sp=s, *sp2=s+lx;
508 movq_m2r(m_(sadd1),mm7);
509 for( int j=0; j<h; ++j ) {
510 movq_m2r(m_(sp +0),mm1);
511 movq_m2r(m_(sp2+0),mm3);
512 movq_m2r(m_(dp +0),mm5);
/* widen bytes to words */
516 punpcklbw_r2r(mm0,mm1);
517 punpckhbw_r2r(mm0,mm2);
518 punpcklbw_r2r(mm0,mm3);
519 punpckhbw_r2r(mm0,mm4);
520 punpcklbw_r2r(mm0,mm5);
521 punpckhbw_r2r(mm0,mm6);
/* first stage: line 1 + line 2 + sadd1 */
522 paddusw_r2r(mm3,mm1);
523 paddusw_r2r(mm4,mm2);
524 paddusw_r2r(mm7,mm1);
525 paddusw_r2r(mm7,mm2);
/* second stage: + destination + sadd1 */
528 paddusw_r2r(mm5,mm1);
529 paddusw_r2r(mm6,mm2);
530 paddusw_r2r(mm7,mm1);
531 paddusw_r2r(mm7,mm2);
534 packuswb_r2r(mm2,mm1);
535 movq_r2m(mm1,m_(dp+0));
536 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on packed bytes: mm6 = bmask, mm7 = badd1.
 * NOTE(review): the steps between the loads and the byte adds are not
 * visible in this excerpt. */
539 uint8_t *dp=d, *sp=s, *sp2=s+lx;
540 movq_m2r(m_(bmask),mm6);
541 movq_m2r(m_(badd1),mm7);
542 for( int j=0; j<h; ++j ) {
543 movq_m2r(m_(sp),mm0);
544 movq_m2r(m_(sp2),mm1);
545 movq_m2r(m_(dp),mm4);
552 paddusb_r2r(mm1,mm0);
553 paddusb_r2r(mm7,mm0);
556 paddusb_r2r(mm0,mm4);
557 paddusb_r2r(mm7,mm4);
558 movq_r2m(mm4,m_(dp));
559 sp += lx2; sp2 += lx2; dp += lx2;
/* rech: write the rounded average of each source byte and its right-hand
 * neighbour into a 16-byte-wide destination block.
 * The plain C form computes (left + right + 1) >> 1 and reads sp[1]..sp[16],
 * carrying the previous byte in s1/s2 so each source byte is loaded once.
 *   s, d - source and destination blocks
 *   lx2  - byte step to the next row
 *   h    - number of rows
 * Three forms of the row loop appear: plain C, MMX on 16-bit lanes, and MMX
 * on packed bytes.
 * NOTE(review): the declaration and per-row seeding of s1 (presumably from
 * sp[0]), the directives selecting a form, and some instructions are not
 * visible in this excerpt. */
567 static inline void rech(uint8_t *s, uint8_t *d, int lx2, int h)
/* plain C form */
570 uint8_t *dp=d, *sp=s;
572 for( int j=0; j<h; ++j ) {
574 dp[0] = (uint32_t)(s1+(s2=sp[1])+1)>>1;
575 dp[1] = (uint32_t)(s2+(s1=sp[2])+1)>>1;
576 dp[2] = (uint32_t)(s1+(s2=sp[3])+1)>>1;
577 dp[3] = (uint32_t)(s2+(s1=sp[4])+1)>>1;
578 dp[4] = (uint32_t)(s1+(s2=sp[5])+1)>>1;
579 dp[5] = (uint32_t)(s2+(s1=sp[6])+1)>>1;
580 dp[6] = (uint32_t)(s1+(s2=sp[7])+1)>>1;
581 dp[7] = (uint32_t)(s2+(s1=sp[8])+1)>>1;
582 dp[8] = (uint32_t)(s1+(s2=sp[9])+1)>>1;
583 dp[9] = (uint32_t)(s2+(s1=sp[10])+1)>>1;
584 dp[10] = (uint32_t)(s1+(s2=sp[11])+1)>>1;
585 dp[11] = (uint32_t)(s2+(s1=sp[12])+1)>>1;
586 dp[12] = (uint32_t)(s1+(s2=sp[13])+1)>>1;
587 dp[13] = (uint32_t)(s2+(s1=sp[14])+1)>>1;
588 dp[14] = (uint32_t)(s1+(s2=sp[15])+1)>>1;
589 dp[15] = (uint32_t)(s2+sp[16]+1)>>1;
590 sp += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word); the neighbour
 * bytes come from a second load one byte further along (sp+1, sp+9).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
594 uint8_t *dp=d, *sp=s;
596 movq_m2r(m_(sadd1),mm7);
597 for( int j=0; j<h; ++j ) {
/* bytes 0..7 */
598 movq_m2r(m_(sp+0),mm1);
599 movq_m2r(m_(sp+1),mm3);
602 punpcklbw_r2r(mm0,mm1);
603 punpckhbw_r2r(mm0,mm2);
604 punpcklbw_r2r(mm0,mm3);
605 punpckhbw_r2r(mm0,mm4);
606 paddusw_r2r(mm3,mm1);
607 paddusw_r2r(mm4,mm2);
608 paddusw_r2r(mm7,mm1);
609 paddusw_r2r(mm7,mm2);
612 packuswb_r2r(mm2,mm1);
613 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
614 movq_m2r(m_(sp+8),mm1);
615 movq_m2r(m_(sp+9),mm3);
618 punpcklbw_r2r(mm0,mm1);
619 punpckhbw_r2r(mm0,mm2);
620 punpcklbw_r2r(mm0,mm3);
621 punpckhbw_r2r(mm0,mm4);
622 paddusw_r2r(mm3,mm1);
623 paddusw_r2r(mm4,mm2);
624 paddusw_r2r(mm7,mm1);
625 paddusw_r2r(mm7,mm2);
628 packuswb_r2r(mm2,mm1);
629 movq_r2m(mm1,m_(dp+8));
630 sp += lx2; dp += lx2;
/* MMX form on packed bytes: mm6 = bmask, mm7 = badd1.
 * NOTE(review): the steps between the loads and the byte adds are not
 * visible in this excerpt. */
633 uint8_t *dp=d, *sp=s;
634 movq_m2r(m_(bmask),mm6);
635 movq_m2r(m_(badd1),mm7);
636 for( int j=0; j<h; ++j ) {
637 movq_m2r(m_(sp+0),mm0);
638 movq_m2r(m_(sp+1),mm1);
639 movq_m2r(m_(sp+8),mm2);
640 movq_m2r(m_(sp+9),mm3);
649 paddusb_r2r(mm1,mm0);
650 paddusb_r2r(mm3,mm2);
651 paddusb_r2r(mm7,mm0);
652 paddusb_r2r(mm7,mm2);
653 movq_r2m(mm0,m_(dp+0));
654 movq_r2m(mm2,m_(dp+8));
655 sp += lx2; dp += lx2;
/* rechc: write the rounded average of each source byte and its right-hand
 * neighbour into an 8-byte-wide destination block.
 * The plain C form computes (left + right + 1) >> 1 and reads sp[1]..sp[8],
 * carrying the previous byte in s1/s2.
 *   s, d - source and destination blocks
 *   lx2  - byte step to the next row
 *   h    - number of rows
 * Three forms of the row loop appear: plain C, MMX on 16-bit lanes, and MMX
 * on packed bytes.
 * NOTE(review): the declaration and per-row seeding of s1 (presumably from
 * sp[0]), the directives selecting a form, and some instructions are not
 * visible in this excerpt. */
663 static inline void rechc(uint8_t *s,uint8_t *d, int lx2, int h)
/* plain C form */
666 uint8_t *dp=d, *sp=s;
668 for( int j=0; j<h; ++j ) {
670 dp[0] = (uint32_t)(s1+(s2=sp[1])+1)>>1;
671 dp[1] = (uint32_t)(s2+(s1=sp[2])+1)>>1;
672 dp[2] = (uint32_t)(s1+(s2=sp[3])+1)>>1;
673 dp[3] = (uint32_t)(s2+(s1=sp[4])+1)>>1;
674 dp[4] = (uint32_t)(s1+(s2=sp[5])+1)>>1;
675 dp[5] = (uint32_t)(s2+(s1=sp[6])+1)>>1;
676 dp[6] = (uint32_t)(s1+(s2=sp[7])+1)>>1;
677 dp[7] = (uint32_t)(s2+sp[8]+1)>>1;
678 sp += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word); the neighbour
 * bytes come from a second load at sp+1.
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
682 uint8_t *dp=d, *sp=s;
684 movq_m2r(m_(sadd1),mm7);
685 for( int j=0; j<h; ++j ) {
686 movq_m2r(m_(sp+0),mm1);
687 movq_m2r(m_(sp+1),mm3);
690 punpcklbw_r2r(mm0,mm1);
691 punpckhbw_r2r(mm0,mm2);
692 punpcklbw_r2r(mm0,mm3);
693 punpckhbw_r2r(mm0,mm4);
694 paddusw_r2r(mm3,mm1);
695 paddusw_r2r(mm4,mm2);
696 paddusw_r2r(mm7,mm1);
697 paddusw_r2r(mm7,mm2);
700 packuswb_r2r(mm2,mm1);
701 movq_r2m(mm1,m_(dp+0));
702 sp += lx2; dp += lx2;
/* MMX form on packed bytes: mm6 = bmask, mm7 = badd1.
 * NOTE(review): the steps between the loads and the byte adds are not
 * visible in this excerpt. */
705 uint8_t *dp=d, *sp=s;
706 movq_m2r(m_(bmask),mm6);
707 movq_m2r(m_(badd1),mm7);
708 for( int j=0; j<h; ++j ) {
709 movq_m2r(m_(sp+0),mm0);
710 movq_m2r(m_(sp+1),mm1);
715 paddusb_r2r(mm1,mm0);
716 paddusb_r2r(mm7,mm0);
717 movq_r2m(mm0,m_(dp+0));
718 sp += lx2; dp += lx2;
/* recha: average each source byte with its right-hand neighbour, then blend
 * the result into a 16-byte-wide destination block.
 * The plain C form computes
 *   d[i] = (d[i] + ((left + right + 1) >> 1) + 1) >> 1
 * and reads sp[1]..sp[16], carrying the previous byte in s1/s2.
 *   s, d - source block and destination block (read and rewritten)
 *   lx2  - byte step to the next row
 *   h    - number of rows
 * Two forms of the row loop appear: plain C and MMX on 16-bit lanes.
 * NOTE(review): the declaration and per-row seeding of s1 (presumably from
 * sp[0]), the directives selecting a form, and some instructions are not
 * visible in this excerpt. */
725 static inline void recha(uint8_t *s, uint8_t *d,int lx2, int h)
/* plain C form */
728 uint8_t *dp=d, *sp=s;
730 for( int j=0; j<h; ++j ) {
732 dp[0] = (dp[0] + ((uint32_t)(s1 + (s2 = sp[1]) + 1) >> 1) + 1) >> 1;
733 dp[1] = (dp[1] + ((uint32_t)(s2 + (s1 = sp[2]) + 1) >> 1) + 1) >> 1;
734 dp[2] = (dp[2] + ((uint32_t)(s1 + (s2 = sp[3]) + 1) >> 1) + 1) >> 1;
735 dp[3] = (dp[3] + ((uint32_t)(s2 + (s1 = sp[4]) + 1) >> 1) + 1) >> 1;
736 dp[4] = (dp[4] + ((uint32_t)(s1 + (s2 = sp[5]) + 1) >> 1) + 1) >> 1;
737 dp[5] = (dp[5] + ((uint32_t)(s2 + (s1 = sp[6]) + 1) >> 1) + 1) >> 1;
738 dp[6] = (dp[6] + ((uint32_t)(s1 + (s2 = sp[7]) + 1) >> 1) + 1) >> 1;
739 dp[7] = (dp[7] + ((uint32_t)(s2 + (s1 = sp[8]) + 1) >> 1) + 1) >> 1;
740 dp[8] = (dp[8] + ((uint32_t)(s1 + (s2 = sp[9]) + 1) >> 1) + 1) >> 1;
741 dp[9] = (dp[9] + ((uint32_t)(s2 + (s1 = sp[10]) + 1) >> 1) + 1) >> 1;
742 dp[10] = (dp[10] + ((uint32_t)(s1 + (s2 = sp[11]) + 1) >> 1) + 1) >> 1;
743 dp[11] = (dp[11] + ((uint32_t)(s2 + (s1 = sp[12]) + 1) >> 1) + 1) >> 1;
744 dp[12] = (dp[12] + ((uint32_t)(s1 + (s2 = sp[13]) + 1) >> 1) + 1) >> 1;
745 dp[13] = (dp[13] + ((uint32_t)(s2 + (s1 = sp[14]) + 1) >> 1) + 1) >> 1;
746 dp[14] = (dp[14] + ((uint32_t)(s1 + (s2 = sp[15]) + 1) >> 1) + 1) >> 1;
747 dp[15] = (dp[15] + ((uint32_t)(s2 + sp[16] + 1) >> 1) + 1) >> 1;
748 sp += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
751 uint8_t *dp=d, *sp=s;
753 movq_m2r(m_(sadd1),mm7);
754 for( int j=0; j<h; ++j ) {
/* bytes 0..7: source, source shifted by one byte, destination */
755 movq_m2r(m_(sp+0),mm1);
756 movq_m2r(m_(sp+1),mm3);
757 movq_m2r(m_(dp+0),mm5);
761 punpcklbw_r2r(mm0,mm1);
762 punpckhbw_r2r(mm0,mm2);
763 punpcklbw_r2r(mm0,mm3);
764 punpckhbw_r2r(mm0,mm4);
765 punpcklbw_r2r(mm0,mm5);
766 punpckhbw_r2r(mm0,mm6);
/* first stage: left + right + sadd1 */
767 paddusw_r2r(mm3,mm1);
768 paddusw_r2r(mm4,mm2);
769 paddusw_r2r(mm7,mm1);
770 paddusw_r2r(mm7,mm2);
/* second stage: + destination + sadd1 */
773 paddusw_r2r(mm5,mm1);
774 paddusw_r2r(mm6,mm2);
775 paddusw_r2r(mm7,mm1);
776 paddusw_r2r(mm7,mm2);
779 packuswb_r2r(mm2,mm1);
780 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
781 movq_m2r(m_(sp+8),mm1);
782 movq_m2r(m_(sp+9),mm3);
783 movq_m2r(m_(dp+8),mm5);
787 punpcklbw_r2r(mm0,mm1);
788 punpckhbw_r2r(mm0,mm2);
789 punpcklbw_r2r(mm0,mm3);
790 punpckhbw_r2r(mm0,mm4);
791 punpcklbw_r2r(mm0,mm5);
792 punpckhbw_r2r(mm0,mm6);
793 paddusw_r2r(mm3,mm1);
794 paddusw_r2r(mm4,mm2);
795 paddusw_r2r(mm7,mm1);
796 paddusw_r2r(mm7,mm2);
799 paddusw_r2r(mm5,mm1);
800 paddusw_r2r(mm6,mm2);
801 paddusw_r2r(mm7,mm1);
802 paddusw_r2r(mm7,mm2);
805 packuswb_r2r(mm2,mm1);
806 movq_r2m(mm1,m_(dp+8));
807 sp += lx2; dp += lx2;
/* rechac: average each source byte with its right-hand neighbour, then blend
 * the result into an 8-byte-wide destination block.
 * The plain C form computes
 *   d[i] = (d[i] + ((left + right + 1) >> 1) + 1) >> 1
 * and reads sp[1]..sp[8], carrying the previous byte in s1/s2.
 *   s, d - source block and destination block (read and rewritten)
 *   lx2  - byte step to the next row
 *   h    - number of rows
 * Two forms of the row loop appear: plain C and MMX on 16-bit lanes.
 * NOTE(review): the declaration and per-row seeding of s1 (presumably from
 * sp[0]), the directives selecting a form, and some instructions are not
 * visible in this excerpt. */
814 static inline void rechac(uint8_t *s,uint8_t *d, int lx2, int h)
/* plain C form */
817 uint8_t *dp=d, *sp=s;
819 for( int j=0; j<h; ++j ) {
821 dp[0] = (dp[0] + ((uint32_t)(s1 + (s2 = sp[1]) + 1) >> 1) + 1) >> 1;
822 dp[1] = (dp[1] + ((uint32_t)(s2 + (s1 = sp[2]) + 1) >> 1) + 1) >> 1;
823 dp[2] = (dp[2] + ((uint32_t)(s1 + (s2 = sp[3]) + 1) >> 1) + 1) >> 1;
824 dp[3] = (dp[3] + ((uint32_t)(s2 + (s1 = sp[4]) + 1) >> 1) + 1) >> 1;
825 dp[4] = (dp[4] + ((uint32_t)(s1 + (s2 = sp[5]) + 1) >> 1) + 1) >> 1;
826 dp[5] = (dp[5] + ((uint32_t)(s2 + (s1 = sp[6]) + 1) >> 1) + 1) >> 1;
827 dp[6] = (dp[6] + ((uint32_t)(s1 + (s2 = sp[7]) + 1) >> 1) + 1) >> 1;
828 dp[7] = (dp[7] + ((uint32_t)(s2 + sp[8] + 1) >> 1) + 1) >> 1;
829 sp += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd1 (1 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
832 uint8_t *dp=d, *sp=s;
834 movq_m2r(m_(sadd1),mm7);
835 for( int j=0; j<h; ++j ) {
836 movq_m2r(m_(sp+0),mm1);
837 movq_m2r(m_(sp+1),mm3);
838 movq_m2r(m_(dp+0),mm5);
842 punpcklbw_r2r(mm0,mm1);
843 punpckhbw_r2r(mm0,mm2);
844 punpcklbw_r2r(mm0,mm3);
845 punpckhbw_r2r(mm0,mm4);
846 punpcklbw_r2r(mm0,mm5);
847 punpckhbw_r2r(mm0,mm6);
/* first stage: left + right + sadd1 */
848 paddusw_r2r(mm3,mm1);
849 paddusw_r2r(mm4,mm2);
850 paddusw_r2r(mm7,mm1);
851 paddusw_r2r(mm7,mm2);
/* second stage: + destination + sadd1 */
854 paddusw_r2r(mm5,mm1);
855 paddusw_r2r(mm6,mm2);
856 paddusw_r2r(mm7,mm1);
857 paddusw_r2r(mm7,mm2);
860 packuswb_r2r(mm2,mm1);
861 movq_r2m(mm1,m_(dp+0));
862 sp += lx2; dp += lx2;
/* rec4: write the rounded four-sample average into a 16-byte-wide
 * destination block. Each output combines a source byte, its right-hand
 * neighbour, and the same two bytes on the line at s + lx:
 *   d[i] = (a + b + c + e + 2) >> 2   (plain C form)
 * The plain C form reads sp[1]..sp[16] and sp2[1]..sp2[16], carrying the
 * previous column in s1..s4.
 *   s    - source block; the second line read starts at s + lx
 *   d    - destination block
 *   lx   - byte offset from s to the second source line
 *   lx2  - byte step to the next row for all three pointers
 *   h    - number of rows
 * Two forms of the row loop appear: plain C and MMX on 16-bit lanes.
 * NOTE(review): the per-row seeding of s1 and s3 (presumably from sp[0] and
 * sp2[0]), the directives selecting a form, and some instructions are not
 * visible in this excerpt. */
869 static inline void rec4(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
/* plain C form */
872 uint8_t *dp=d, *sp=s, *sp2=s+lx;
873 uint32_t s1, s2, s3, s4;
874 for( int j=0; j<h; ++j ) {
876 dp[0] = (uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2;
877 dp[1] = (uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2;
878 dp[2] = (uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2;
879 dp[3] = (uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2;
880 dp[4] = (uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2;
881 dp[5] = (uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2;
882 dp[6] = (uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2;
883 dp[7] = (uint32_t)(s2+(s1=sp[8])+s4+(s3=sp2[8])+2)>>2;
884 dp[8] = (uint32_t)(s1+(s2=sp[9])+s3+(s4=sp2[9])+2)>>2;
885 dp[9] = (uint32_t)(s2+(s1=sp[10])+s4+(s3=sp2[10])+2)>>2;
886 dp[10] = (uint32_t)(s1+(s2=sp[11])+s3+(s4=sp2[11])+2)>>2;
887 dp[11] = (uint32_t)(s2+(s1=sp[12])+s4+(s3=sp2[12])+2)>>2;
888 dp[12] = (uint32_t)(s1+(s2=sp[13])+s3+(s4=sp2[13])+2)>>2;
889 dp[13] = (uint32_t)(s2+(s1=sp[14])+s4+(s3=sp2[14])+2)>>2;
890 dp[14] = (uint32_t)(s1+(s2=sp[15])+s3+(s4=sp2[15])+2)>>2;
891 dp[15] = (uint32_t)(s2+sp[16]+s4+sp2[16]+2)>>2;
892 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd2 (2 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
895 uint8_t *dp=d, *sp=s, *sp2=s+lx;
897 movq_m2r(m_(sadd2),mm7);
898 for( int j=0; j<h; ++j ) {
/* bytes 0..7: top-left, top-right and bottom-left operands */
899 movq_m2r(m_(sp +0),mm1);
900 movq_m2r(m_(sp +1),mm3);
901 movq_m2r(m_(sp2+0),mm5);
905 punpcklbw_r2r(mm0,mm1);
906 punpckhbw_r2r(mm0,mm2);
907 punpcklbw_r2r(mm0,mm3);
908 punpckhbw_r2r(mm0,mm4);
909 punpcklbw_r2r(mm0,mm5);
910 punpckhbw_r2r(mm0,mm6);
/* sum the top pair; mm3 is then reused for the bottom-right operand */
911 paddusw_r2r(mm3,mm1);
912 movq_m2r(m_(sp2+1),mm3);
913 paddusw_r2r(mm4,mm2);
915 punpcklbw_r2r(mm0,mm3);
916 punpckhbw_r2r(mm0,mm4);
/* sum the bottom pair, add it to the top pair, then add sadd2 */
917 paddusw_r2r(mm5,mm3);
918 paddusw_r2r(mm6,mm4);
919 paddusw_r2r(mm3,mm1);
920 paddusw_r2r(mm4,mm2);
921 paddusw_r2r(mm7,mm1);
922 paddusw_r2r(mm7,mm2);
925 packuswb_r2r(mm2,mm1);
926 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
927 movq_m2r(m_(sp +8),mm1);
928 movq_m2r(m_(sp +9),mm3);
929 movq_m2r(m_(sp2+8),mm5);
933 punpcklbw_r2r(mm0,mm1);
934 punpckhbw_r2r(mm0,mm2);
935 punpcklbw_r2r(mm0,mm3);
936 punpckhbw_r2r(mm0,mm4);
937 punpcklbw_r2r(mm0,mm5);
938 punpckhbw_r2r(mm0,mm6);
939 paddusw_r2r(mm3,mm1);
940 movq_m2r(m_(sp2+9),mm3);
941 paddusw_r2r(mm4,mm2);
943 punpcklbw_r2r(mm0,mm3);
944 punpckhbw_r2r(mm0,mm4);
945 paddusw_r2r(mm5,mm3);
946 paddusw_r2r(mm6,mm4);
947 paddusw_r2r(mm3,mm1);
948 paddusw_r2r(mm4,mm2);
949 paddusw_r2r(mm7,mm1);
950 paddusw_r2r(mm7,mm2);
953 packuswb_r2r(mm2,mm1);
954 movq_r2m(mm1,m_(dp+8));
955 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4c: write the rounded four-sample average into an 8-byte-wide
 * destination block. Each output combines a source byte, its right-hand
 * neighbour, and the same two bytes on the line at s + lx:
 *   d[i] = (a + b + c + e + 2) >> 2   (plain C form)
 * The plain C form reads sp[1]..sp[8] and sp2[1]..sp2[8], carrying the
 * previous column in s1..s4.
 *   s    - source block; the second line read starts at s + lx
 *   d    - destination block
 *   lx   - byte offset from s to the second source line
 *   lx2  - byte step to the next row for all three pointers
 *   h    - number of rows
 * Two forms of the row loop appear: plain C and MMX on 16-bit lanes.
 * NOTE(review): the per-row seeding of s1 and s3 (presumably from sp[0] and
 * sp2[0]), the directives selecting a form, and some instructions are not
 * visible in this excerpt. */
962 static inline void rec4c(uint8_t *s,uint8_t *d, int lx, int lx2, int h)
/* plain C form */
965 uint8_t *dp=d, *sp=s, *sp2=s+lx;
966 uint32_t s1, s2, s3, s4;
967 for( int j=0; j<h; ++j ) {
969 dp[0] = (uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2;
970 dp[1] = (uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2;
971 dp[2] = (uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2;
972 dp[3] = (uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2;
973 dp[4] = (uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2;
974 dp[5] = (uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2;
975 dp[6] = (uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2;
976 dp[7] = (uint32_t)(s2+sp[8]+s4+sp2[8]+2)>>2;
977 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd2 (2 per word).
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
980 uint8_t *dp=d, *sp=s, *sp2=s+lx;
982 movq_m2r(m_(sadd2),mm7);
983 for( int j=0; j<h; ++j ) {
984 movq_m2r(m_(sp +0),mm1);
985 movq_m2r(m_(sp +1),mm3);
986 movq_m2r(m_(sp2+0),mm5);
990 punpcklbw_r2r(mm0,mm1);
991 punpckhbw_r2r(mm0,mm2);
992 punpcklbw_r2r(mm0,mm3);
993 punpckhbw_r2r(mm0,mm4);
994 punpcklbw_r2r(mm0,mm5);
995 punpckhbw_r2r(mm0,mm6);
/* sum the top pair; mm3 is then reused for the bottom-right operand */
996 paddusw_r2r(mm3,mm1);
997 movq_m2r(m_(sp2+1),mm3);
998 paddusw_r2r(mm4,mm2);
1000 punpcklbw_r2r(mm0,mm3);
1001 punpckhbw_r2r(mm0,mm4);
/* sum the bottom pair, add it to the top pair, then add sadd2 */
1002 paddusw_r2r(mm5,mm3);
1003 paddusw_r2r(mm6,mm4);
1004 paddusw_r2r(mm3,mm1);
1005 paddusw_r2r(mm4,mm2);
1006 paddusw_r2r(mm7,mm1);
1007 paddusw_r2r(mm7,mm2);
1010 packuswb_r2r(mm2,mm1);
1011 movq_r2m(mm1,m_(dp+0));
1012 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4a: compute the rounded four-sample average of the source, then blend
 * it into a 16-byte-wide destination block:
 *   q    = (a + b + c + e + 2) >> 2
 *   d[i] = (d[i] + q + 1) >> 1          (plain C form)
 * where a, b are a source byte and its right-hand neighbour and c, e are the
 * same two bytes on the line at s + lx.
 *   s    - source block; the second line read starts at s + lx
 *   d    - destination block, read and rewritten
 *   lx   - byte offset from s to the second source line
 *   lx2  - byte step to the next row for all three pointers
 *   h    - number of rows
 * Two forms of the row loop appear: plain C and MMX on 16-bit lanes.
 * NOTE(review): the directives selecting a form, the braces, and some
 * instructions are not visible in this excerpt. */
1019 static inline void rec4a(uint8_t *s,uint8_t *d, int lx, int lx2, int h)
/* plain C form: s1..s4 carry the previous column between outputs */
1022 uint8_t *dp=d, *sp=s, *sp2=s+lx;
1023 uint32_t s1, s2, s3, s4;
1024 for( int j=0; j<h; ++j ) {
1025 s1 = sp[0]; s3 = sp2[0];
1026 dp[0] = (dp[0] + ((uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2) + 1)>>1;
1027 dp[1] = (dp[1] + ((uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2) + 1)>>1;
1028 dp[2] = (dp[2] + ((uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2) + 1)>>1;
1029 dp[3] = (dp[3] + ((uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2) + 1)>>1;
1030 dp[4] = (dp[4] + ((uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2) + 1)>>1;
1031 dp[5] = (dp[5] + ((uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2) + 1)>>1;
1032 dp[6] = (dp[6] + ((uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2) + 1)>>1;
1033 dp[7] = (dp[7] + ((uint32_t)(s2+(s1=sp[8])+s4+(s3=sp2[8])+2)>>2) + 1)>>1;
1034 dp[8] = (dp[8] + ((uint32_t)(s1+(s2=sp[9])+s3+(s4=sp2[9])+2)>>2) + 1)>>1;
1035 dp[9] = (dp[9] + ((uint32_t)(s2+(s1=sp[10])+s4+(s3=sp2[10])+2)>>2) + 1)>>1;
1036 dp[10] = (dp[10] + ((uint32_t)(s1+(s2=sp[11])+s3+(s4=sp2[11])+2)>>2) + 1)>>1;
1037 dp[11] = (dp[11] + ((uint32_t)(s2+(s1=sp[12])+s4+(s3=sp2[12])+2)>>2) + 1)>>1;
1038 dp[12] = (dp[12] + ((uint32_t)(s1+(s2=sp[13])+s3+(s4=sp2[13])+2)>>2) + 1)>>1;
1039 dp[13] = (dp[13] + ((uint32_t)(s2+(s1=sp[14])+s4+(s3=sp2[14])+2)>>2) + 1)>>1;
1040 dp[14] = (dp[14] + ((uint32_t)(s1+(s2=sp[15])+s3+(s4=sp2[15])+2)>>2) + 1)>>1;
1041 dp[15] = (dp[15] + ((uint32_t)(s2+sp[16]+s4+sp2[16]+2)>>2) + 1)>>1;
1042 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd2 (2 per word); sadd1 is loaded
 * into mm5 inside the loop for the second stage.
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
1045 uint8_t *dp=d, *sp=s, *sp2=s+lx;
1047 movq_m2r(m_(sadd2),mm7);
1048 for( int j=0; j<h; ++j ) {
/* bytes 0..7: top-left, top-right and bottom-left operands */
1049 movq_m2r(m_(sp +0),mm1);
1050 movq_m2r(m_(sp +1),mm3);
1051 movq_m2r(m_(sp2+0),mm5);
1055 punpcklbw_r2r(mm0,mm1);
1056 punpckhbw_r2r(mm0,mm2);
1057 punpcklbw_r2r(mm0,mm3);
1058 punpckhbw_r2r(mm0,mm4);
1059 punpcklbw_r2r(mm0,mm5);
1060 punpckhbw_r2r(mm0,mm6);
/* sum the top pair; mm3 is then reused for the bottom-right operand */
1061 paddusw_r2r(mm3,mm1);
1062 movq_m2r(m_(sp2+1),mm3);
1063 paddusw_r2r(mm4,mm2);
1065 punpcklbw_r2r(mm0,mm3);
1066 punpckhbw_r2r(mm0,mm4);
1067 paddusw_r2r(mm5,mm3);
1068 paddusw_r2r(mm6,mm4);
1069 paddusw_r2r(mm3,mm1);
1070 paddusw_r2r(mm4,mm2);
/* load the destination while adding sadd2 to the four-sample sum */
1071 movq_m2r(m_(dp +0),mm3);
1072 paddusw_r2r(mm7,mm1);
1074 paddusw_r2r(mm7,mm2);
1075 punpcklbw_r2r(mm0,mm3);
1076 punpckhbw_r2r(mm0,mm4);
/* second stage: + destination + sadd1 */
1079 movq_m2r(m_(sadd1),mm5);
1080 paddusw_r2r(mm3,mm1);
1081 paddusw_r2r(mm4,mm2);
1082 paddusw_r2r(mm5,mm1);
1083 paddusw_r2r(mm5,mm2);
1086 packuswb_r2r(mm2,mm1);
1087 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence */
1088 movq_m2r(m_(sp +8),mm1);
1089 movq_m2r(m_(sp +9),mm3);
1090 movq_m2r(m_(sp2+8),mm5);
1094 punpcklbw_r2r(mm0,mm1);
1095 punpckhbw_r2r(mm0,mm2);
1096 punpcklbw_r2r(mm0,mm3);
1097 punpckhbw_r2r(mm0,mm4);
1098 punpcklbw_r2r(mm0,mm5);
1099 punpckhbw_r2r(mm0,mm6);
1100 paddusw_r2r(mm3,mm1);
1101 movq_m2r(m_(sp2+9),mm3);
1102 paddusw_r2r(mm4,mm2);
1104 punpcklbw_r2r(mm0,mm3);
1105 punpckhbw_r2r(mm0,mm4);
1106 paddusw_r2r(mm5,mm3);
1107 paddusw_r2r(mm6,mm4);
1108 paddusw_r2r(mm3,mm1);
1109 paddusw_r2r(mm4,mm2);
1110 movq_m2r(m_(dp +8),mm3);
1111 paddusw_r2r(mm7,mm1);
1113 paddusw_r2r(mm7,mm2);
1114 punpcklbw_r2r(mm0,mm3);
1115 punpckhbw_r2r(mm0,mm4);
1118 movq_m2r(m_(sadd1),mm5);
1119 paddusw_r2r(mm3,mm1);
1120 paddusw_r2r(mm4,mm2);
1121 paddusw_r2r(mm5,mm1);
1122 paddusw_r2r(mm5,mm2);
1125 packuswb_r2r(mm2,mm1);
1126 movq_r2m(mm1,m_(dp+8));
1127 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4ac: compute the rounded four-sample average of the source, then blend
 * it into an 8-byte-wide destination block:
 *   q    = (a + b + c + e + 2) >> 2
 *   d[i] = (d[i] + q + 1) >> 1          (plain C form)
 * where a, b are a source byte and its right-hand neighbour and c, e are the
 * same two bytes on the line at s + lx.
 *   s    - source block; the second line read starts at s + lx
 *   d    - destination block, read and rewritten
 *   lx   - byte offset from s to the second source line
 *   lx2  - byte step to the next row for all three pointers
 *   h    - number of rows
 * Two forms of the row loop appear: plain C and MMX on 16-bit lanes.
 * NOTE(review): the directives selecting a form, the braces, and some
 * instructions are not visible in this excerpt. */
1134 static inline void rec4ac(uint8_t *s,uint8_t *d, int lx, int lx2, int h)
/* plain C form: s1..s4 carry the previous column between outputs */
1137 uint8_t *dp=d, *sp=s, *sp2=s+lx;
1138 uint32_t s1,s2,s3,s4;
1139 for( int j=0; j<h; ++j ) {
1140 s1=sp[0]; s3=sp2[0];
1141 dp[0] = (dp[0] + ((uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2) + 1)>>1;
1142 dp[1] = (dp[1] + ((uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2) + 1)>>1;
1143 dp[2] = (dp[2] + ((uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2) + 1)>>1;
1144 dp[3] = (dp[3] + ((uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2) + 1)>>1;
1145 dp[4] = (dp[4] + ((uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2) + 1)>>1;
1146 dp[5] = (dp[5] + ((uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2) + 1)>>1;
1147 dp[6] = (dp[6] + ((uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2) + 1)>>1;
1148 dp[7] = (dp[7] + ((uint32_t)(s2+sp[8]+s4+sp2[8]+2)>>2) + 1)>>1;
1149 sp += lx2; sp2 += lx2; dp += lx2;
/* MMX form on 16-bit lanes: mm7 holds sadd2 (2 per word); sadd1 is loaded
 * into mm5 inside the loop for the second stage.
 * NOTE(review): mm0 is presumably zero; its setup is not visible here. */
1152 uint8_t *dp=d, *sp=s, *sp2=s+lx;
1154 movq_m2r(m_(sadd2),mm7);
1155 for( int j=0; j<h; ++j ) {
1156 movq_m2r(m_(sp +0),mm1);
1157 movq_m2r(m_(sp +1),mm3);
1158 movq_m2r(m_(sp2+0),mm5);
1162 punpcklbw_r2r(mm0,mm1);
1163 punpckhbw_r2r(mm0,mm2);
1164 punpcklbw_r2r(mm0,mm3);
1165 punpckhbw_r2r(mm0,mm4);
1166 punpcklbw_r2r(mm0,mm5);
1167 punpckhbw_r2r(mm0,mm6);
/* sum the top pair; mm3 is then reused for the bottom-right operand */
1168 paddusw_r2r(mm3,mm1);
1169 movq_m2r(m_(sp2+1),mm3);
1170 paddusw_r2r(mm4,mm2);
1172 punpcklbw_r2r(mm0,mm3);
1173 punpckhbw_r2r(mm0,mm4);
1174 paddusw_r2r(mm5,mm3);
1175 paddusw_r2r(mm6,mm4);
1176 paddusw_r2r(mm3,mm1);
1177 paddusw_r2r(mm4,mm2);
/* load the destination while adding sadd2 to the four-sample sum */
1178 movq_m2r(m_(dp +0),mm3);
1179 paddusw_r2r(mm7,mm1);
1181 paddusw_r2r(mm7,mm2);
1182 punpcklbw_r2r(mm0,mm3);
1183 punpckhbw_r2r(mm0,mm4);
/* second stage: + destination + sadd1 */
1186 movq_m2r(m_(sadd1),mm5);
1187 paddusw_r2r(mm3,mm1);
1188 paddusw_r2r(mm4,mm2);
1189 paddusw_r2r(mm5,mm1);
1190 paddusw_r2r(mm5,mm2);
1193 packuswb_r2r(mm2,mm1);
1194 movq_r2m(mm1,m_(dp+0));
1195 sp += lx2; sp2 += lx2; dp += lx2;
/* stats: sixteen counters, one per rec* routine, indexed by the names in the
 * fn[] table below.
 * NOTE(review): the member declarations, the header of the reporting member
 * and the closing of the class are not visible in this excerpt. */
1203 static class stats {
/* start with every counter at zero */
1206 stats() { for( int i=0; i<16; ++i ) totals[i] = 0; }
/* report: print one "name count" line per counter.
 * NOTE(review): which member this loop belongs to is not visible here. */
1208 for( int i=0; i<16; ++i ) {
/* routine names in counter-index order; same numbering as the case labels
 * in recon_comp (0 = recc, 1 = rec, ... 15 = rec4a) */
1209 static const char *fn[16] = {
1210 "recc", "rec", "recac", "reca",
1211 "recvc", "recv", "recvac", "recva",
1212 "rechc", "rech", "rechac", "recha",
1213 "rec4c", "rec4", "rec4ac", "rec4a"
1215 printf("%-8s %d\n",fn[i],totals[i]);
1218 void incr(int i) { if( i>0 && i<16 ) ++totals[i]; }
/* recon_comp: run one prediction routine on a single plane, chosen by type.
 *   s, d  - source and destination blocks, passed through unchanged
 *   lx    - passed only to the routines that read a second source line
 *           (the recv* and rec4* families)
 *   lx2   - row step, passed to every routine
 *   h     - number of rows
 *   type  - 4-bit selector; going by the case labels below:
 *             bit 0 set   -> 16-byte-wide routine, clear -> 8-byte ("c") one
 *             bit 1 set   -> "a" routine (blends into the destination)
 *             bit 2 set   -> "v" routine (second source line at s + lx)
 *             bit 3 set   -> "h" routine (right-hand neighbour)
 *             bits 2+3    -> rec4 family (both)
 * NOTE(review): the switch header and any default handling are not visible
 * in this excerpt. */
1223 inline void zvideo_t::
1224 recon_comp(uint8_t *s, uint8_t *d, int lx, int lx2, int h, int type)
1226 /* probably Accelerated functions */
1228 case 0x3: reca(s, d, lx2, h); break;
1229 case 0x2: recac(s, d, lx2, h); break;
1230 case 0x1: rec(s, d, lx2, h); break;
1231 case 0x0: recc(s, d, lx2, h); break;
1232 case 0x7: recva(s, d, lx, lx2, h); break;
1233 case 0x6: recvac(s, d, lx, lx2, h); break;
1234 case 0x5: recv(s, d, lx, lx2, h); break;
1235 case 0x4: recvc(s, d, lx, lx2, h); break;
1236 case 0x9: rech(s, d, lx2, h); break;
1237 case 0x8: rechc(s, d, lx2, h); break;
1238 /* maybe Unaccelerated functions */
1239 case 0xb: recha(s, d, lx2, h); break;
1240 case 0xa: rechac(s, d, lx2, h); break;
1241 case 0xf: rec4a(s, d, lx, lx2, h); break;
1242 case 0xe: rec4ac(s, d, lx, lx2, h); break;
1243 case 0xd: rec4(s, d, lx, lx2, h); break;
1244 case 0xc: rec4c(s, d, lx, lx2, h); break;
1252 uint8_t *src[]; * prediction source buffer *
1253 int sfield; * prediction source field number (0 or 1) *
1254 uint8_t *dst[]; * prediction destination buffer *
1255 int dfield; * prediction destination field number (0 or 1)*
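1256 int lx,lx2; * lx: offset to the vertically adjacent source line, lx2: offset between successive block lines *
1257 int w,h; * w: width selector (block is 8+w*8 pixels wide), h: block height in lines *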
1258 int x,y; * pixel co-ordinates of top-left sample in current MB *
1259 int dx,dy; * horizontal, vertical motion vector *
1260 int addflag; * add prediction error to prediction ? *
/* recon: form the prediction for one block on the luma plane and then on the
 * two chroma planes by calling recon_comp on each.
 * Steps visible below: compute source and destination offsets, range-check
 * them against the coded picture size, optionally log a failure, build the
 * 4-bit routine selector, and run the routine for Y, Cb and Cr.
 * NOTE(review): the return type and class qualifier, the statements taken
 * when validation fails, and the chroma coordinate adjustments are not
 * visible in this excerpt. */
1263 recon( uint8_t *src[], int sfield,
1264 uint8_t *dst[], int dfield, int lx, int lx2,
1265 int w, int h, int x, int y, int dx, int dy, int addflag)
1267 /* validate parameters */
/* offsets use the integer part of the vector (dx>>1, dy>>1); a non-zero
 * field number adds half of lx2 */
1269 int sofs = (y+(dy>>1))*lx + x+(dx>>1);
1270 int dofs = y*lx + x;
1271 if( sfield ) sofs += lx2 >> 1;
1272 if( dfield ) dofs += lx2 >> 1;
1273 if( sofs >= 0 && dofs >= 0 ) {
/* dsz: bytes in one picture plane; ssz allows the source 16 extra lines
 * plus 16 bytes beyond that */
1274 int dsz = coded_picture_width * coded_picture_height;
1275 int ssz = dsz + 16*coded_picture_width + 16;
/* dlen: offset of the last byte touched in the destination block;
 * slen adds one line and/or one byte when a half-pel flag is set */
1276 int dlen = (h-1)*lx2 + w*8+8-1;
1277 int slen = dlen + (dy&1)*lx + (dx&1);
1278 if( sofs+slen >= ssz || dofs+dlen >= dsz )
/* report the rejected block when error logging is enabled */
1284 if( this->src->log_errs ) {
1285 zmsgs("err%c frm %dx%d @ %d,%d %dx%d dx=%d, dy=%d, sofs=%d, dofs=%d\n",
1286 err, coded_picture_width, coded_picture_height,
1287 x, y, 8+w*8, h, dx, dy, sofs, dofs);
1292 /* half pel scaling */
/* selector bits: 3 = dx odd, 2 = dy odd, 1 = addflag, 0 = w */
1293 int type = ((dx & 1) << 3) | ((dy & 1) << 2) | w;
1294 if( addflag ) type |= 2;
1296 recon_comp(src[0]+sofs, dst[0]+dofs, lx, lx2, h, type); /* Y */
/* chroma: NOTE(review): the bodies of these two conditionals are not
 * visible; presumably they scale the coordinates for subsampled chroma */
1298 if( chroma_format != cfmt_444 ) {
1303 if( chroma_format == cfmt_420 ) {
/* recompute offsets and selector for the chroma planes */
1308 sofs = (y+(dy>>1))*lx + x+(dx>>1);
1310 if( sfield ) sofs += lx2 >> 1;
1311 if( dfield ) dofs += lx2 >> 1;
1312 type = ((dx & 1) << 3) | ((dy & 1) << 2) | w;
1313 if( addflag ) type |= 2;
1316 recon_comp(src[1]+sofs, dst[1]+dofs, lx, lx2, h, type); /* Cb */
1317 recon_comp(src[2]+sofs, dst[2]+dofs, lx, lx2, h, type); /* Cr */
/*
 * reconstruct: issue the recon() calls that build the prediction for
 * the macroblock at (bx,by) into newframe.
 *
 * The sequence of calls is chosen from mb_type (forward / backward
 * bits), pict_type, pict_struct (frame or field picture) and
 * motion_type (frame, field, 16x8, dual prime).
 *
 *   bx, by        passed to recon as x and y (by is halved for
 *                 field-based prediction inside frame pictures)
 *   mb_type       tested against mb_FORWARD and mb_BACKWARD
 *   motion_type   one of the slice_decoder_t::mc_* values
 *   PMV           PMV[r][s][t]: s is 0 in the forward section and 1 in
 *                 the backward section; t = 0 is passed as dx, t = 1 as dy
 *   mv_field_sel  mv_field_sel[r][s]: passed as recon's sfield
 *   dmvector      handed to calc_dmv for dual prime prediction
 *   stwtype       split into stwtop (stwtype % 3) and stwbot
 *                 (stwtype / 3), which are passed as recon's addflag
 *
 * NOTE(review): this listing omits a number of original lines (return
 * type, braces, `else` lines and some conditions); comments below only
 * describe what the visible lines do.
 */
1323 reconstruct( int bx, int by, int mb_type, int motion_type,
1324 int PMV[2][2][2], int mv_field_sel[2][2],
1325 int dmvector[2], int stwtype)
1328 uint8_t **predframe;
1332 stwtop = stwtype % 3; /* 0:temporal, 1:(spat+temp), 2:spatial */
1333 stwbot = stwtype / 3;
/* forward section: also entered for every macroblock of a P picture */
1335 if( (mb_type & slice_decoder_t::mb_FORWARD) || (pict_type == pic_type_P) ) {
1336 if( pict_struct == pics_FRAME_PICTURE ) {
/* a macroblock without the forward bit is handled as frame-based */
1337 if( (motion_type == slice_decoder_t::mc_FRAME) ||
1338 !(mb_type & slice_decoder_t::mb_FORWARD) ) {
1339 /* frame-based prediction */
/* two calls of 8 rows each with lx2 twice lx; the second call uses
   field selector 1 for both source and destination */
1341 recon(oldrefframe, 0, newframe, 0,
1342 coded_picture_width, coded_picture_width<<1,
1343 WIDTH, 8, bx, by, PMV[0][0][0], PMV[0][0][1], stwtop);
1346 recon(oldrefframe, 1, newframe, 1,
1347 coded_picture_width, coded_picture_width<<1,
1348 WIDTH, 8, bx, by, PMV[0][0][0], PMV[0][0][1], stwbot);
1350 else if(motion_type == slice_decoder_t::mc_FIELD) { /* field-based prediction */
/* lx is the doubled width here, so by and the vertical vector are halved */
1351 /* top field prediction */
1353 recon(oldrefframe, mv_field_sel[0][0], newframe, 0,
1354 coded_picture_width<<1, coded_picture_width<<1,
1355 WIDTH, 8, bx, by>>1, PMV[0][0][0], PMV[0][0][1]>>1, stwtop);
1357 /* bottom field prediction */
1359 recon(oldrefframe, mv_field_sel[1][0], newframe, 1,
1360 coded_picture_width<<1, coded_picture_width<<1,
1361 WIDTH, 8, bx, by>>1, PMV[1][0][0], PMV[1][0][1]>>1, stwbot);
1363 else if( motion_type == slice_decoder_t::mc_DMV ) {
1364 /* dual prime prediction */
1365 /* calculate derived motion vectors */
1366 calc_dmv(DMV, dmvector, PMV[0][0][0], PMV[0][0][1]>>1);
/* each destination field gets two calls: addflag 0 with the PMV vector,
   then addflag 1 with a DMV vector from the other source field */
1369 /* predict top field from top field */
1370 recon(oldrefframe, 0, newframe, 0,
1371 coded_picture_width<<1, coded_picture_width<<1,
1372 WIDTH, 8, bx, by>>1, PMV[0][0][0], PMV[0][0][1]>>1, 0);
1374 /* predict and add to top field from bottom field */
1375 recon(oldrefframe, 1, newframe, 0,
1376 coded_picture_width<<1, coded_picture_width<<1,
1377 WIDTH, 8, bx, by>>1, DMV[0][0], DMV[0][1], 1);
1381 /* predict bottom field from bottom field */
1382 recon(oldrefframe, 1, newframe, 1,
1383 coded_picture_width<<1, coded_picture_width<<1,
1384 WIDTH, 8, bx, by>>1, PMV[0][0][0], PMV[0][0][1]>>1, 0);
1386 /* predict and add to bottom field from top field */
1387 recon(oldrefframe, 0, newframe, 1,
1388 coded_picture_width<<1, coded_picture_width<<1,
1389 WIDTH, 8, bx, by>>1, DMV[1][0], DMV[1][1], 1);
/* unknown motion_type: reported only when error logging is enabled */
1392 else if( src->log_errs ) {
1393 /* invalid motion_type */
1394 zerrs("invalid motion_type 1 (%d)\n",motion_type);
1398 /* pics_TOP_FIELD or pics_BOTTOM_FIELD */
/* currentfield is 1 for a bottom field picture, 0 otherwise */
1400 currentfield = (pict_struct == pics_BOTTOM_FIELD);
1402 /* determine which frame to use for prediction */
/* NOTE(review): the `else` between the two assignments below is not
   visible in this listing */
1403 if( (pict_type == pic_type_P) && secondfield &&
1404 (currentfield != mv_field_sel[0][0]) )
1405 predframe = refframe; /* same frame */
1407 predframe = oldrefframe; /* previous frame */
/* a macroblock without the forward bit is handled as field-based */
1409 if( (motion_type == slice_decoder_t::mc_FIELD) ||
1410 !(mb_type & slice_decoder_t::mb_FORWARD) ) {
1411 /* field-based prediction */
/* one call covering 16 rows; the destination field selector is 0 */
1413 recon(predframe,mv_field_sel[0][0],newframe,0,
1414 coded_picture_width<<1,coded_picture_width<<1,
1415 WIDTH, 16, bx, by, PMV[0][0][0], PMV[0][0][1], stwtop);
1417 else if(motion_type == slice_decoder_t::mc_16X8) {
/* upper 8 rows use PMV[0] and mv_field_sel[0] */
1419 recon(predframe, mv_field_sel[0][0], newframe, 0,
1420 coded_picture_width<<1, coded_picture_width<<1,
1421 WIDTH, 8, bx, by, PMV[0][0][0], PMV[0][0][1], stwtop);
1423 /* determine which frame to use for lower half prediction */
1424 if( (pict_type == pic_type_P) && secondfield &&
1425 (currentfield != mv_field_sel[1][0]) )
1426 predframe = refframe; /* same frame */
1428 predframe = oldrefframe; /* previous frame */
/* lower 8 rows (by+8) use PMV[1] and mv_field_sel[1] */
1430 recon(predframe, mv_field_sel[1][0], newframe, 0,
1431 coded_picture_width<<1, coded_picture_width<<1,
1432 WIDTH, 8, bx, by+8, PMV[1][0][0], PMV[1][0][1], stwtop);
1435 else if(motion_type == slice_decoder_t::mc_DMV) { /* dual prime prediction */
/* NOTE(review): the condition selecting between the next two
   assignments is in a line missing from this listing */
1437 predframe = refframe; /* same frame */
1439 predframe = oldrefframe; /* previous frame */
1441 /* calculate derived motion vectors */
1442 calc_dmv(DMV, dmvector, PMV[0][0][0], PMV[0][0][1]);
1444 /* predict from field of same parity */
1445 recon(oldrefframe, currentfield, newframe, 0,
1446 coded_picture_width<<1, coded_picture_width<<1,
1447 WIDTH, 16, bx, by, PMV[0][0][0], PMV[0][0][1], 0);
1449 /* predict from field of opposite parity */
1450 recon(predframe, !currentfield, newframe, 0,
1451 coded_picture_width<<1, coded_picture_width<<1,
1452 WIDTH, 16, bx, by, DMV[0][0], DMV[0][1], 1);
1454 else if( src->log_errs ) {
1455 /* invalid motion_type */
1456 zerrs("invalid motion_type 2 (%d)\n",motion_type);
/* once this runs, the recon calls that follow receive addflag = 1 */
1459 stwtop = stwbot = 1;
/* backward section: same structure as above, reading refframe and the
   PMV[..][1][..] / mv_field_sel[..][1] entries */
1462 if( (mb_type & slice_decoder_t::mb_BACKWARD) ) {
1463 if( pict_struct == pics_FRAME_PICTURE ) {
1464 if( motion_type == slice_decoder_t::mc_FRAME ) {
1465 /* frame-based prediction */
1467 recon(refframe, 0, newframe, 0,
1468 coded_picture_width, coded_picture_width<<1,
1469 WIDTH, 8, bx, by, PMV[0][1][0], PMV[0][1][1], stwtop);
1471 recon(refframe, 1, newframe, 1,
1472 coded_picture_width, coded_picture_width<<1,
1473 WIDTH, 8, bx, by, PMV[0][1][0], PMV[0][1][1], stwbot);
1476 /* field-based prediction */
1477 /* top field prediction */
1479 recon(refframe, mv_field_sel[0][1], newframe, 0,
1480 coded_picture_width<<1,coded_picture_width<<1,
1481 WIDTH, 8, bx, (by>>1), PMV[0][1][0], PMV[0][1][1]>>1, stwtop);
1482 /* bottom field prediction */
1484 recon(refframe, mv_field_sel[1][1], newframe, 1,
1485 coded_picture_width<<1, coded_picture_width<<1,
1486 WIDTH, 8, bx, (by>>1), PMV[1][1][0], PMV[1][1][1]>>1, stwbot);
1490 /* pics_TOP_FIELD or pics_BOTTOM_FIELD */
1492 if( motion_type == slice_decoder_t::mc_FIELD ) {
1493 /* field-based prediction */
1494 recon(refframe, mv_field_sel[0][1], newframe, 0,
1495 coded_picture_width<<1, coded_picture_width<<1,
1496 WIDTH, 16, bx, by, PMV[0][1][0], PMV[0][1][1], stwtop);
1498 else if( motion_type==slice_decoder_t::mc_16X8 ) {
/* upper 8 rows, then lower 8 rows at by+8 with the second vector */
1499 recon(refframe, mv_field_sel[0][1], newframe, 0,
1500 coded_picture_width<<1, coded_picture_width<<1,
1501 WIDTH, 8, bx, by, PMV[0][1][0], PMV[0][1][1], stwtop);
1503 recon(refframe, mv_field_sel[1][1], newframe, 0,
1504 coded_picture_width<<1, coded_picture_width<<1,
1505 WIDTH, 8, bx, by+8, PMV[1][1][0], PMV[1][1][1], stwtop);
1507 else if( src->log_errs ) {
1508 /* invalid motion_type */
1509 zerrs("invalid motion_type 3 (%d)\n",motion_type);
1512 } /* mb_type & slice_decoder_t::mb_BACKWARD */