1 #include "../libzmpeg3.h"
/* m_(v): reinterpret the address v as an lvalue so it can be handed to the
 * MMX load/store macros used in this file, e.g. movq_m2r(m_(sadd1),mm7).
 * NOTE(review): two alternative definitions are listed after the x86-64
 * test; the preprocessor lines that separate them are not visible in this
 * excerpt. */
10 #if defined(__x86_64__)
11 #define m_(v) (*(mmx_t*)(v))
13 #define m_(v) (*(char*)(v))
/* 64-bit packed constants (two uint32_t each), loaded into MMX registers
 * through m_() by the kernels below. */
/* 0x0001 in each of the four 16-bit lanes */
15 static uint32_t sadd1[2] = { 0x00010001, 0x00010001 };
/* 0x0002 in each of the four 16-bit lanes */
16 static uint32_t sadd2[2] = { 0x00020002, 0x00020002 };
/* 0x7f in each of the eight bytes */
18 static uint32_t bmask[2] = { 0x7f7f7f7f, 0x7f7f7f7f };
/* 0x01 in each of the eight bytes */
19 static uint32_t badd1[2] = { 0x01010101, 0x01010101 };
/* rec: copy a block 16 bytes wide and h rows high from s to d.
 *   s, d  source / destination pointers for the first row
 *   lx2   byte step applied to both pointers after each row
 *   h     number of rows
 * Two row-copy forms are listed (sixteen byte assignments, and two 64-bit
 * load/stores).  NOTE(review): the preprocessor lines that select between
 * them, and the braces, are not visible in this excerpt. */
24 static inline void rec(uint8_t *s, uint8_t *d, int lx2, int h)
26 for( int j=0; j<h; ++j, s+=lx2, d+=lx2 ) {
/* byte-at-a-time form */
28 d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
29 d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
30 d[8] = s[8]; d[9] = s[9]; d[10] = s[10]; d[11] = s[11];
31 d[12] = s[12]; d[13] = s[13]; d[14] = s[14]; d[15] = s[15];
/* 64-bit form: bytes 0..7 then bytes 8..15 */
33 *(uint64_t*)(d+0) = *(uint64_t*)(s+0);
34 *(uint64_t*)(d+8) = *(uint64_t*)(s+8);
/* recc: copy a block 8 bytes wide and h rows high from s to d; both
 * pointers step by lx2 bytes per row.
 * Two row-copy forms are listed (eight byte assignments, and one 64-bit
 * load/store).  NOTE(review): the preprocessor lines that select between
 * them are not visible in this excerpt. */
40 static inline void recc(uint8_t *s, uint8_t *d, int lx2, int h)
42 for( int j=0; j<h; ++j, s+=lx2, d+=lx2) {
/* byte-at-a-time form */
44 d[0] = s[0]; d[1] = s[1]; d[2] = s[2]; d[3] = s[3];
45 d[4] = s[4]; d[5] = s[5]; d[6] = s[6]; d[7] = s[7];
/* 64-bit form */
47 *(uint64_t*)d = *(uint64_t*)s;
/* reca: average a 16-byte-wide, h-row source block into the destination.
 * Scalar form per byte: d = (d + s + 1) >> 1.
 *   s, d  source / destination pointers for the first row
 *   lx2   byte step between rows (applied to sp and dp)
 *   h     number of rows
 * Three implementations are listed: scalar C, MMX on 16-bit lanes, and MMX
 * on byte lanes.  NOTE(review): the preprocessor lines choosing between
 * them, several local declarations and some MMX instructions are not
 * visible in this excerpt. */
52 static inline void reca(uint8_t *s, uint8_t *d, int lx2, int h)
/* --- scalar variant --- */
56 for( int j=0; j<h; ++j ) {
57 dp[0] = (uint32_t)(dp[0] + sp[0] + 1) >> 1;
58 dp[1] = (uint32_t)(dp[1] + sp[1] + 1) >> 1;
59 dp[2] = (uint32_t)(dp[2] + sp[2] + 1) >> 1;
60 dp[3] = (uint32_t)(dp[3] + sp[3] + 1) >> 1;
61 dp[4] = (uint32_t)(dp[4] + sp[4] + 1) >> 1;
62 dp[5] = (uint32_t)(dp[5] + sp[5] + 1) >> 1;
63 dp[6] = (uint32_t)(dp[6] + sp[6] + 1) >> 1;
64 dp[7] = (uint32_t)(dp[7] + sp[7] + 1) >> 1;
65 dp[8] = (uint32_t)(dp[8] + sp[8] + 1) >> 1;
66 dp[9] = (uint32_t)(dp[9] + sp[9] + 1) >> 1;
67 dp[10] = (uint32_t)(dp[10] + sp[10] + 1) >> 1;
68 dp[11] = (uint32_t)(dp[11] + sp[11] + 1) >> 1;
69 dp[12] = (uint32_t)(dp[12] + sp[12] + 1) >> 1;
70 dp[13] = (uint32_t)(dp[13] + sp[13] + 1) >> 1;
71 dp[14] = (uint32_t)(dp[14] + sp[14] + 1) >> 1;
72 dp[15] = (uint32_t)(dp[15] + sp[15] + 1) >> 1;
/* --- MMX variant, 16-bit lanes: mm7 holds sadd1 (1 per word) ---
 * mm0 is the second operand of every unpack; presumably it is zeroed and
 * mm2/mm4 are copies of mm1/mm3 made on lines not shown — TODO confirm. */
79 movq_m2r(m_(sadd1),mm7);
80 for( int j=0; j<h; ++j ) {
/* bytes 0..7: load source and destination, unpack, pack, store */
81 movq_m2r(m_(sp+0),mm1);
82 movq_m2r(m_(dp+0),mm3);
85 punpcklbw_r2r(mm0,mm1);
86 punpckhbw_r2r(mm0,mm2);
87 punpcklbw_r2r(mm0,mm3);
88 punpckhbw_r2r(mm0,mm4);
95 packuswb_r2r(mm2,mm1);
96 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15: same sequence; here the saturating word adds are visible */
97 movq_m2r(m_(sp+8),mm1);
98 movq_m2r(m_(dp+8),mm3);
101 punpcklbw_r2r(mm0,mm1);
102 punpckhbw_r2r(mm0,mm2);
103 punpcklbw_r2r(mm0,mm3);
104 punpckhbw_r2r(mm0,mm4);
105 paddusw_r2r(mm3,mm1);
106 paddusw_r2r(mm4,mm2);
107 paddusw_r2r(mm7,mm1);
108 paddusw_r2r(mm7,mm2);
111 packuswb_r2r(mm2,mm1);
112 movq_r2m(mm1,m_(dp+8));
113 sp += lx2; dp += lx2;
/* --- MMX variant, byte lanes: mm6 = bmask, mm7 = badd1 ---
 * NOTE(review): the instructions between the loads and the byte adds
 * (where mm6 would be used) are not visible in this excerpt. */
116 uint8_t *dp=d, *sp=s;
117 movq_m2r(m_(bmask),mm6);
118 movq_m2r(m_(badd1),mm7);
119 for( int j=0; j<h; ++j ) {
120 movq_m2r(m_(sp+0),mm0);
121 movq_m2r(m_(dp+0),mm1);
122 movq_m2r(m_(sp+8),mm2);
123 movq_m2r(m_(dp+8),mm3);
132 paddusb_r2r(mm1,mm0);
133 paddusb_r2r(mm3,mm2);
134 paddusb_r2r(mm7,mm0);
135 paddusb_r2r(mm7,mm2);
136 movq_r2m(mm0,m_(dp+0));
137 movq_r2m(mm2,m_(dp+8));
138 sp += lx2; dp += lx2;
/* recac: 8-byte-wide counterpart of the 16-wide averaging kernel.
 * Scalar form per byte: d = (d + s + 1) >> 1, for h rows, stepping both
 * pointers by lx2 per row.
 * Three implementations are listed (scalar C, MMX 16-bit lanes, MMX byte
 * lanes).  NOTE(review): the preprocessor lines choosing between them and
 * some MMX instructions are not visible in this excerpt. */
145 static inline void recac(uint8_t *s, uint8_t *d, int lx2, int h)
/* --- scalar variant --- */
148 uint8_t *dp=d, *sp=s;
149 for( int j=0; j<h; ++j ) {
150 dp[0] = (uint32_t)(dp[0] + sp[0] + 1)>>1;
151 dp[1] = (uint32_t)(dp[1] + sp[1] + 1)>>1;
152 dp[2] = (uint32_t)(dp[2] + sp[2] + 1)>>1;
153 dp[3] = (uint32_t)(dp[3] + sp[3] + 1)>>1;
154 dp[4] = (uint32_t)(dp[4] + sp[4] + 1)>>1;
155 dp[5] = (uint32_t)(dp[5] + sp[5] + 1)>>1;
156 dp[6] = (uint32_t)(dp[6] + sp[6] + 1)>>1;
157 dp[7] = (uint32_t)(dp[7] + sp[7] + 1)>>1;
158 sp += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * Source and destination are unpacked against mm0 (presumably zero —
 * TODO confirm), summed with saturating word adds, biased by mm7, packed
 * back to bytes and stored to dp. */
162 uint8_t *dp=d, *sp=s;
164 movq_m2r(m_(sadd1),mm7);
165 for( int j=0; j<h; ++j ) {
166 movq_m2r(m_(sp+0),mm1);
167 movq_m2r(m_(dp+0),mm3);
170 punpcklbw_r2r(mm0,mm1);
171 punpckhbw_r2r(mm0,mm2);
172 punpcklbw_r2r(mm0,mm3);
173 punpckhbw_r2r(mm0,mm4);
174 paddusw_r2r(mm3,mm1);
175 paddusw_r2r(mm4,mm2);
176 paddusw_r2r(mm7,mm1);
177 paddusw_r2r(mm7,mm2);
180 packuswb_r2r(mm2,mm1);
181 movq_r2m(mm1,m_(dp+0));
182 sp += lx2; dp += lx2;
/* --- MMX variant, byte lanes: mm6 = bmask, mm7 = badd1 ---
 * NOTE(review): the instructions between the loads and the byte adds are
 * not visible in this excerpt. */
185 uint8_t *dp=d, *sp=s;
186 movq_m2r(m_(bmask),mm6);
187 movq_m2r(m_(badd1),mm7);
188 for( int j=0; j<h; ++j ) {
189 movq_m2r(m_(sp),mm0);
190 movq_m2r(m_(dp),mm1);
195 paddusb_r2r(mm1,mm0);
196 paddusb_r2r(mm7,mm0);
197 movq_r2m(mm0,m_(dp));
198 sp += lx2; dp += lx2;
/* recv: vertical two-tap average, 16 bytes wide, h rows.
 * Scalar form per byte: d = (s[i] + s[i+lx] + 1) >> 1, where the second
 * tap is read through sp2 = s + lx.
 *   s, d  source / destination pointers for the first row
 *   lx    byte offset from a source row to its second tap
 *   lx2   byte step between processed rows (sp, sp2 and dp)
 *   h     number of rows
 * Three implementations are listed (scalar C, MMX 16-bit lanes, MMX byte
 * lanes).  NOTE(review): the preprocessor lines choosing between them and
 * some MMX instructions are not visible in this excerpt. */
205 static inline void recv(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
/* --- scalar variant --- */
208 uint8_t *dp=d, *sp=s, *sp2=s+lx;
209 for( int j=0; j<h; ++j ) {
210 dp[0] = (uint32_t)(sp[0] + sp2[0] + 1) >> 1;
211 dp[1] = (uint32_t)(sp[1] + sp2[1] + 1) >> 1;
212 dp[2] = (uint32_t)(sp[2] + sp2[2] + 1) >> 1;
213 dp[3] = (uint32_t)(sp[3] + sp2[3] + 1) >> 1;
214 dp[4] = (uint32_t)(sp[4] + sp2[4] + 1) >> 1;
215 dp[5] = (uint32_t)(sp[5] + sp2[5] + 1) >> 1;
216 dp[6] = (uint32_t)(sp[6] + sp2[6] + 1) >> 1;
217 dp[7] = (uint32_t)(sp[7] + sp2[7] + 1) >> 1;
218 dp[8] = (uint32_t)(sp[8] + sp2[8] + 1) >> 1;
219 dp[9] = (uint32_t)(sp[9] + sp2[9] + 1) >> 1;
220 dp[10] = (uint32_t)(sp[10] + sp2[10] + 1) >> 1;
221 dp[11] = (uint32_t)(sp[11] + sp2[11] + 1) >> 1;
222 dp[12] = (uint32_t)(sp[12] + sp2[12] + 1) >> 1;
223 dp[13] = (uint32_t)(sp[13] + sp2[13] + 1) >> 1;
224 dp[14] = (uint32_t)(sp[14] + sp2[14] + 1) >> 1;
225 dp[15] = (uint32_t)(sp[15] + sp2[15] + 1) >> 1;
226 sp += lx2; sp2 += lx2 ; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * Each 8-byte half: load both taps, unpack against mm0 (presumably zero —
 * TODO confirm), add as saturating words, add mm7, pack, store. */
230 uint8_t *dp=d, *sp=s, *sp2=s+lx;
232 movq_m2r(m_(sadd1),mm7);
233 for( int j=0; j<h; ++j ) {
/* bytes 0..7 */
234 movq_m2r(m_(sp +0),mm1);
235 movq_m2r(m_(sp2+0),mm3);
238 punpcklbw_r2r(mm0,mm1);
239 punpckhbw_r2r(mm0,mm2);
240 punpcklbw_r2r(mm0,mm3);
241 punpckhbw_r2r(mm0,mm4);
242 paddusw_r2r(mm3,mm1);
243 paddusw_r2r(mm4,mm2);
244 paddusw_r2r(mm7,mm1);
245 paddusw_r2r(mm7,mm2);
248 packuswb_r2r(mm2,mm1);
249 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
250 movq_m2r(m_(sp +8),mm1);
251 movq_m2r(m_(sp2+8),mm3);
254 punpcklbw_r2r(mm0,mm1);
255 punpckhbw_r2r(mm0,mm2);
256 punpcklbw_r2r(mm0,mm3);
257 punpckhbw_r2r(mm0,mm4);
258 paddusw_r2r(mm3,mm1);
259 paddusw_r2r(mm4,mm2);
260 paddusw_r2r(mm7,mm1);
261 paddusw_r2r(mm7,mm2);
264 packuswb_r2r(mm2,mm1);
265 movq_r2m(mm1,m_(dp+8));
266 sp += lx2; sp2 += lx2 ; dp += lx2;
/* --- MMX variant, byte lanes: mm6 = bmask, mm7 = badd1 ---
 * NOTE(review): the instructions between the loads and the byte adds are
 * not visible in this excerpt. */
269 uint8_t *dp=d, *sp=s, *sp2=s+lx;
270 movq_m2r(m_(bmask),mm6);
271 movq_m2r(m_(badd1),mm7);
272 for( int j=0; j<h; ++j ) {
273 movq_m2r(m_(sp +0),mm0);
274 movq_m2r(m_(sp2+0),mm1);
275 movq_m2r(m_(sp +8),mm2);
276 movq_m2r(m_(sp2+8),mm3);
285 paddusb_r2r(mm1,mm0);
286 paddusb_r2r(mm3,mm2);
287 paddusb_r2r(mm7,mm0);
288 paddusb_r2r(mm7,mm2);
289 movq_r2m(mm0,m_(dp+0));
290 movq_r2m(mm2,m_(dp+8));
291 sp += lx2; sp2 += lx2 ; dp += lx2;
/* recvc: vertical two-tap average, 8 bytes wide, h rows.
 * Scalar form per byte: d = (s[i] + s[i+lx] + 1) >> 1, second tap read
 * through sp2 = s + lx; sp, sp2 and dp all step by lx2 per row.
 * Three implementations are listed (scalar C, MMX 16-bit lanes, MMX byte
 * lanes).  NOTE(review): the preprocessor lines choosing between them and
 * some MMX instructions are not visible in this excerpt. */
298 static inline void recvc(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
/* --- scalar variant --- */
301 uint8_t *dp=d, *sp=s, *sp2=s+lx;
302 for( int j=0; j<h; ++j ) {
303 dp[0] = (uint32_t)(sp[0]+sp2[0]+1)>>1;
304 dp[1] = (uint32_t)(sp[1]+sp2[1]+1)>>1;
305 dp[2] = (uint32_t)(sp[2]+sp2[2]+1)>>1;
306 dp[3] = (uint32_t)(sp[3]+sp2[3]+1)>>1;
307 dp[4] = (uint32_t)(sp[4]+sp2[4]+1)>>1;
308 dp[5] = (uint32_t)(sp[5]+sp2[5]+1)>>1;
309 dp[6] = (uint32_t)(sp[6]+sp2[6]+1)>>1;
310 dp[7] = (uint32_t)(sp[7]+sp2[7]+1)>>1;
311 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * Load both taps, unpack against mm0 (presumably zero — TODO confirm),
 * add as saturating words, add mm7, pack, store. */
315 uint8_t *dp=d, *sp=s, *sp2=s+lx;
317 movq_m2r(m_(sadd1),mm7);
318 for( int j=0; j<h; ++j ) {
319 movq_m2r(m_(sp +0),mm1);
320 movq_m2r(m_(sp2+0),mm3);
323 punpcklbw_r2r(mm0,mm1);
324 punpckhbw_r2r(mm0,mm2);
325 punpcklbw_r2r(mm0,mm3);
326 punpckhbw_r2r(mm0,mm4);
327 paddusw_r2r(mm3,mm1);
328 paddusw_r2r(mm4,mm2);
329 paddusw_r2r(mm7,mm1);
330 paddusw_r2r(mm7,mm2);
333 packuswb_r2r(mm2,mm1);
334 movq_r2m(mm1,m_(dp+0));
335 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, byte lanes: mm6 = bmask, mm7 = badd1 ---
 * NOTE(review): the instructions between the loads and the byte adds are
 * not visible in this excerpt. */
338 uint8_t *dp=d, *sp=s, *sp2=s+lx;
339 movq_m2r(m_(bmask),mm6);
340 movq_m2r(m_(badd1),mm7);
341 for( int j=0; j<h; ++j ) {
342 movq_m2r(m_(sp),mm0);
343 movq_m2r(m_(sp2),mm1);
348 paddusb_r2r(mm1,mm0);
349 paddusb_r2r(mm7,mm0);
350 movq_r2m(mm0,m_(dp));
351 sp += lx2; sp2 += lx2; dp += lx2;
/* recva: vertical two-tap average, then average with the destination;
 * 16 bytes wide, h rows.
 * Scalar form per byte:
 *   d = (d + ((s[i] + s[i+lx] + 1) >> 1) + 1) >> 1
 * with the second tap read through sp2 = s + lx; sp, sp2 and dp step by
 * lx2 per row.
 * Three implementations are listed (scalar C, MMX 16-bit lanes, MMX byte
 * lanes).  NOTE(review): the preprocessor lines choosing between them and
 * some MMX instructions are not visible in this excerpt. */
359 static inline void recva(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
/* --- scalar variant --- */
362 uint8_t *dp=d, *sp=s, *sp2=s+lx;
363 for( int j=0; j<h; ++j ) {
364 dp[0] = (dp[0] + ((uint32_t)(sp[0]+sp2[0]+1)>>1) + 1)>>1;
365 dp[1] = (dp[1] + ((uint32_t)(sp[1]+sp2[1]+1)>>1) + 1)>>1;
366 dp[2] = (dp[2] + ((uint32_t)(sp[2]+sp2[2]+1)>>1) + 1)>>1;
367 dp[3] = (dp[3] + ((uint32_t)(sp[3]+sp2[3]+1)>>1) + 1)>>1;
368 dp[4] = (dp[4] + ((uint32_t)(sp[4]+sp2[4]+1)>>1) + 1)>>1;
369 dp[5] = (dp[5] + ((uint32_t)(sp[5]+sp2[5]+1)>>1) + 1)>>1;
370 dp[6] = (dp[6] + ((uint32_t)(sp[6]+sp2[6]+1)>>1) + 1)>>1;
371 dp[7] = (dp[7] + ((uint32_t)(sp[7]+sp2[7]+1)>>1) + 1)>>1;
372 dp[8] = (dp[8] + ((uint32_t)(sp[8]+sp2[8]+1)>>1) + 1)>>1;
373 dp[9] = (dp[9] + ((uint32_t)(sp[9]+sp2[9]+1)>>1) + 1)>>1;
374 dp[10] = (dp[10] + ((uint32_t)(sp[10]+sp2[10]+1)>>1) + 1)>>1;
375 dp[11] = (dp[11] + ((uint32_t)(sp[11]+sp2[11]+1)>>1) + 1)>>1;
376 dp[12] = (dp[12] + ((uint32_t)(sp[12]+sp2[12]+1)>>1) + 1)>>1;
377 dp[13] = (dp[13] + ((uint32_t)(sp[13]+sp2[13]+1)>>1) + 1)>>1;
378 dp[14] = (dp[14] + ((uint32_t)(sp[14]+sp2[14]+1)>>1) + 1)>>1;
379 dp[15] = (dp[15] + ((uint32_t)(sp[15]+sp2[15]+1)>>1) + 1)>>1;
380 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * Per 8-byte half: the two source taps (mm1/mm2, mm3/mm4) and the
 * destination (mm5/mm6) are unpacked against mm0 (presumably zero — TODO
 * confirm); taps are summed and biased by mm7, then the destination words
 * are added and biased by mm7 again before packing. */
384 uint8_t *dp=d, *sp=s, *sp2=s+lx;
386 movq_m2r(m_(sadd1),mm7);
387 for( int j=0; j<h; ++j ) {
/* bytes 0..7 */
388 movq_m2r(m_(sp +0),mm1);
389 movq_m2r(m_(sp2+0),mm3);
390 movq_m2r(m_(dp +0),mm5);
394 punpcklbw_r2r(mm0,mm1);
395 punpckhbw_r2r(mm0,mm2);
396 punpcklbw_r2r(mm0,mm3);
397 punpckhbw_r2r(mm0,mm4);
398 punpcklbw_r2r(mm0,mm5);
399 punpckhbw_r2r(mm0,mm6);
400 paddusw_r2r(mm3,mm1);
401 paddusw_r2r(mm4,mm2);
402 paddusw_r2r(mm7,mm1);
403 paddusw_r2r(mm7,mm2);
406 paddusw_r2r(mm5,mm1);
407 paddusw_r2r(mm6,mm2);
408 paddusw_r2r(mm7,mm1);
409 paddusw_r2r(mm7,mm2);
412 packuswb_r2r(mm2,mm1);
413 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
414 movq_m2r(m_(sp +8),mm1);
415 movq_m2r(m_(sp2+8),mm3);
416 movq_m2r(m_(dp +8),mm5);
420 punpcklbw_r2r(mm0,mm1);
421 punpckhbw_r2r(mm0,mm2);
422 punpcklbw_r2r(mm0,mm3);
423 punpckhbw_r2r(mm0,mm4);
424 punpcklbw_r2r(mm0,mm5);
425 punpckhbw_r2r(mm0,mm6);
426 paddusw_r2r(mm3,mm1);
427 paddusw_r2r(mm4,mm2);
428 paddusw_r2r(mm7,mm1);
429 paddusw_r2r(mm7,mm2);
432 paddusw_r2r(mm5,mm1);
433 paddusw_r2r(mm6,mm2);
434 paddusw_r2r(mm7,mm1);
435 paddusw_r2r(mm7,mm2);
438 packuswb_r2r(mm2,mm1);
439 movq_r2m(mm1,m_(dp+8));
440 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, byte lanes: mm6 = bmask, mm7 = badd1 ---
 * Taps are combined into mm0/mm2, then combined with the destination in
 * mm4/mm5.  NOTE(review): the instructions between the visible byte adds
 * are not shown in this excerpt. */
443 uint8_t *dp=d, *sp=s, *sp2=s+lx;
444 movq_m2r(m_(bmask),mm6);
445 movq_m2r(m_(badd1),mm7);
446 for( int j=0; j<h; ++j ) {
447 movq_m2r(m_(sp +0),mm0);
448 movq_m2r(m_(sp2+0),mm1);
449 movq_m2r(m_(sp +8),mm2);
450 movq_m2r(m_(sp2+8),mm3);
451 movq_m2r(m_(dp +0),mm4);
452 movq_m2r(m_(dp +8),mm5);
465 paddusb_r2r(mm1,mm0);
466 paddusb_r2r(mm3,mm2);
467 paddusb_r2r(mm7,mm0);
468 paddusb_r2r(mm7,mm2);
473 paddusb_r2r(mm0,mm4);
474 paddusb_r2r(mm2,mm5);
475 paddusb_r2r(mm7,mm4);
476 paddusb_r2r(mm7,mm5);
477 movq_r2m(mm4,m_(dp+0));
478 movq_r2m(mm5,m_(dp+8));
479 sp += lx2; sp2 += lx2; dp += lx2;
/* recvac: vertical two-tap average, then average with the destination;
 * 8 bytes wide, h rows.
 * Scalar form per byte:
 *   d = (d + ((s[i] + s[i+lx] + 1) >> 1) + 1) >> 1
 * with the second tap read through sp2 = s + lx; sp, sp2 and dp step by
 * lx2 per row.
 * Three implementations are listed (scalar C, MMX 16-bit lanes, MMX byte
 * lanes).  NOTE(review): the preprocessor lines choosing between them and
 * some MMX instructions are not visible in this excerpt. */
487 static inline void recvac(uint8_t *s, uint8_t *d, int lx,int lx2, int h)
/* --- scalar variant --- */
490 uint8_t *dp=d, *sp=s, *sp2=s+lx;
491 for( int j=0; j<h; ++j ) {
492 dp[0] = (dp[0] + ((uint32_t)(sp[0]+sp2[0]+1)>>1) + 1)>>1;
493 dp[1] = (dp[1] + ((uint32_t)(sp[1]+sp2[1]+1)>>1) + 1)>>1;
494 dp[2] = (dp[2] + ((uint32_t)(sp[2]+sp2[2]+1)>>1) + 1)>>1;
495 dp[3] = (dp[3] + ((uint32_t)(sp[3]+sp2[3]+1)>>1) + 1)>>1;
496 dp[4] = (dp[4] + ((uint32_t)(sp[4]+sp2[4]+1)>>1) + 1)>>1;
497 dp[5] = (dp[5] + ((uint32_t)(sp[5]+sp2[5]+1)>>1) + 1)>>1;
498 dp[6] = (dp[6] + ((uint32_t)(sp[6]+sp2[6]+1)>>1) + 1)>>1;
499 dp[7] = (dp[7] + ((uint32_t)(sp[7]+sp2[7]+1)>>1) + 1)>>1;
500 sp += lx2; sp2 += lx2; dp+= lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * Taps and destination are unpacked against mm0 (presumably zero — TODO
 * confirm); taps are summed and biased by mm7, then the destination words
 * are added and biased by mm7 again before packing. */
504 uint8_t *dp=d, *sp=s, *sp2=s+lx;
506 movq_m2r(m_(sadd1),mm7);
507 for( int j=0; j<h; ++j ) {
508 movq_m2r(m_(sp +0),mm1);
509 movq_m2r(m_(sp2+0),mm3);
510 movq_m2r(m_(dp +0),mm5);
514 punpcklbw_r2r(mm0,mm1);
515 punpckhbw_r2r(mm0,mm2);
516 punpcklbw_r2r(mm0,mm3);
517 punpckhbw_r2r(mm0,mm4);
518 punpcklbw_r2r(mm0,mm5);
519 punpckhbw_r2r(mm0,mm6);
520 paddusw_r2r(mm3,mm1);
521 paddusw_r2r(mm4,mm2);
522 paddusw_r2r(mm7,mm1);
523 paddusw_r2r(mm7,mm2);
526 paddusw_r2r(mm5,mm1);
527 paddusw_r2r(mm6,mm2);
528 paddusw_r2r(mm7,mm1);
529 paddusw_r2r(mm7,mm2);
532 packuswb_r2r(mm2,mm1);
533 movq_r2m(mm1,m_(dp+0));
534 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, byte lanes: mm6 = bmask, mm7 = badd1 ---
 * NOTE(review): the instructions between the visible byte adds are not
 * shown in this excerpt. */
537 uint8_t *dp=d, *sp=s, *sp2=s+lx;
538 movq_m2r(m_(bmask),mm6);
539 movq_m2r(m_(badd1),mm7);
540 for( int j=0; j<h; ++j ) {
541 movq_m2r(m_(sp),mm0);
542 movq_m2r(m_(sp2),mm1);
543 movq_m2r(m_(dp),mm4);
550 paddusb_r2r(mm1,mm0);
551 paddusb_r2r(mm7,mm0);
554 paddusb_r2r(mm0,mm4);
555 paddusb_r2r(mm7,mm4);
556 movq_r2m(mm4,m_(dp));
557 sp += lx2; sp2 += lx2; dp += lx2;
/* rech: horizontal two-tap average, 16 bytes wide, h rows.
 * Scalar form per byte: d[i] = (s[i] + s[i+1] + 1) >> 1, so each row reads
 * source bytes 0..16.  sp and dp step by lx2 per row.
 * Three implementations are listed (scalar C, MMX 16-bit lanes, MMX byte
 * lanes).  NOTE(review): the preprocessor lines choosing between them and
 * some MMX instructions are not visible in this excerpt. */
565 static inline void rech(uint8_t *s, uint8_t *d, int lx2, int h)
/* --- scalar variant ---
 * s1 and s2 alternate as a one-sample carry so each source byte is read
 * once.  NOTE(review): their declaration and the initial load of s1 are
 * presumably on lines not shown here. */
568 uint8_t *dp=d, *sp=s;
570 for( int j=0; j<h; ++j ) {
572 dp[0] = (uint32_t)(s1+(s2=sp[1])+1)>>1;
573 dp[1] = (uint32_t)(s2+(s1=sp[2])+1)>>1;
574 dp[2] = (uint32_t)(s1+(s2=sp[3])+1)>>1;
575 dp[3] = (uint32_t)(s2+(s1=sp[4])+1)>>1;
576 dp[4] = (uint32_t)(s1+(s2=sp[5])+1)>>1;
577 dp[5] = (uint32_t)(s2+(s1=sp[6])+1)>>1;
578 dp[6] = (uint32_t)(s1+(s2=sp[7])+1)>>1;
579 dp[7] = (uint32_t)(s2+(s1=sp[8])+1)>>1;
580 dp[8] = (uint32_t)(s1+(s2=sp[9])+1)>>1;
581 dp[9] = (uint32_t)(s2+(s1=sp[10])+1)>>1;
582 dp[10] = (uint32_t)(s1+(s2=sp[11])+1)>>1;
583 dp[11] = (uint32_t)(s2+(s1=sp[12])+1)>>1;
584 dp[12] = (uint32_t)(s1+(s2=sp[13])+1)>>1;
585 dp[13] = (uint32_t)(s2+(s1=sp[14])+1)>>1;
586 dp[14] = (uint32_t)(s1+(s2=sp[15])+1)>>1;
587 dp[15] = (uint32_t)(s2+sp[16]+1)>>1;
588 sp += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * The second tap is the same row loaded one byte later (sp+1, sp+9). */
592 uint8_t *dp=d, *sp=s;
594 movq_m2r(m_(sadd1),mm7);
595 for( int j=0; j<h; ++j ) {
/* bytes 0..7 */
596 movq_m2r(m_(sp+0),mm1);
597 movq_m2r(m_(sp+1),mm3);
600 punpcklbw_r2r(mm0,mm1);
601 punpckhbw_r2r(mm0,mm2);
602 punpcklbw_r2r(mm0,mm3);
603 punpckhbw_r2r(mm0,mm4);
604 paddusw_r2r(mm3,mm1);
605 paddusw_r2r(mm4,mm2);
606 paddusw_r2r(mm7,mm1);
607 paddusw_r2r(mm7,mm2);
610 packuswb_r2r(mm2,mm1);
611 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
612 movq_m2r(m_(sp+8),mm1);
613 movq_m2r(m_(sp+9),mm3);
616 punpcklbw_r2r(mm0,mm1);
617 punpckhbw_r2r(mm0,mm2);
618 punpcklbw_r2r(mm0,mm3);
619 punpckhbw_r2r(mm0,mm4);
620 paddusw_r2r(mm3,mm1);
621 paddusw_r2r(mm4,mm2);
622 paddusw_r2r(mm7,mm1);
623 paddusw_r2r(mm7,mm2);
626 packuswb_r2r(mm2,mm1);
627 movq_r2m(mm1,m_(dp+8));
628 sp += lx2; dp += lx2;
/* --- MMX variant, byte lanes: mm6 = bmask, mm7 = badd1 ---
 * NOTE(review): the instructions between the loads and the byte adds are
 * not visible in this excerpt. */
631 uint8_t *dp=d, *sp=s;
632 movq_m2r(m_(bmask),mm6);
633 movq_m2r(m_(badd1),mm7);
634 for( int j=0; j<h; ++j ) {
635 movq_m2r(m_(sp+0),mm0);
636 movq_m2r(m_(sp+1),mm1);
637 movq_m2r(m_(sp+8),mm2);
638 movq_m2r(m_(sp+9),mm3);
647 paddusb_r2r(mm1,mm0);
648 paddusb_r2r(mm3,mm2);
649 paddusb_r2r(mm7,mm0);
650 paddusb_r2r(mm7,mm2);
651 movq_r2m(mm0,m_(dp+0));
652 movq_r2m(mm2,m_(dp+8));
653 sp += lx2; dp += lx2;
/* rechc: horizontal two-tap average, 8 bytes wide, h rows.
 * Scalar form per byte: d[i] = (s[i] + s[i+1] + 1) >> 1, so each row reads
 * source bytes 0..8.  sp and dp step by lx2 per row.
 * Three implementations are listed (scalar C, MMX 16-bit lanes, MMX byte
 * lanes).  NOTE(review): the preprocessor lines choosing between them and
 * some MMX instructions are not visible in this excerpt. */
661 static inline void rechc(uint8_t *s,uint8_t *d, int lx2, int h)
/* --- scalar variant ---
 * s1/s2 alternate as a one-sample carry.  NOTE(review): their declaration
 * and the initial load of s1 are presumably on lines not shown here. */
664 uint8_t *dp=d, *sp=s;
666 for( int j=0; j<h; ++j ) {
668 dp[0] = (uint32_t)(s1+(s2=sp[1])+1)>>1;
669 dp[1] = (uint32_t)(s2+(s1=sp[2])+1)>>1;
670 dp[2] = (uint32_t)(s1+(s2=sp[3])+1)>>1;
671 dp[3] = (uint32_t)(s2+(s1=sp[4])+1)>>1;
672 dp[4] = (uint32_t)(s1+(s2=sp[5])+1)>>1;
673 dp[5] = (uint32_t)(s2+(s1=sp[6])+1)>>1;
674 dp[6] = (uint32_t)(s1+(s2=sp[7])+1)>>1;
675 dp[7] = (uint32_t)(s2+sp[8]+1)>>1;
676 sp += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * The second tap is the same row loaded one byte later (sp+1). */
680 uint8_t *dp=d, *sp=s;
682 movq_m2r(m_(sadd1),mm7);
683 for( int j=0; j<h; ++j ) {
684 movq_m2r(m_(sp+0),mm1);
685 movq_m2r(m_(sp+1),mm3);
688 punpcklbw_r2r(mm0,mm1);
689 punpckhbw_r2r(mm0,mm2);
690 punpcklbw_r2r(mm0,mm3);
691 punpckhbw_r2r(mm0,mm4);
692 paddusw_r2r(mm3,mm1);
693 paddusw_r2r(mm4,mm2);
694 paddusw_r2r(mm7,mm1);
695 paddusw_r2r(mm7,mm2);
698 packuswb_r2r(mm2,mm1);
699 movq_r2m(mm1,m_(dp+0));
700 sp += lx2; dp += lx2;
/* --- MMX variant, byte lanes: mm6 = bmask, mm7 = badd1 ---
 * NOTE(review): the instructions between the loads and the byte adds are
 * not visible in this excerpt. */
703 uint8_t *dp=d, *sp=s;
704 movq_m2r(m_(bmask),mm6);
705 movq_m2r(m_(badd1),mm7);
706 for( int j=0; j<h; ++j ) {
707 movq_m2r(m_(sp+0),mm0);
708 movq_m2r(m_(sp+1),mm1);
713 paddusb_r2r(mm1,mm0);
714 paddusb_r2r(mm7,mm0);
715 movq_r2m(mm0,m_(dp+0));
716 sp += lx2; dp += lx2;
/* recha: horizontal two-tap average, then average with the destination;
 * 16 bytes wide, h rows.
 * Scalar form per byte:
 *   d[i] = (d[i] + ((s[i] + s[i+1] + 1) >> 1) + 1) >> 1
 * so each row reads source bytes 0..16.  sp and dp step by lx2 per row.
 * Two implementations are listed (scalar C and MMX 16-bit lanes).
 * NOTE(review): the preprocessor lines choosing between them and some MMX
 * instructions are not visible in this excerpt. */
723 static inline void recha(uint8_t *s, uint8_t *d,int lx2, int h)
/* --- scalar variant ---
 * s1/s2 alternate as a one-sample carry.  NOTE(review): their declaration
 * and the initial load of s1 are presumably on lines not shown here. */
726 uint8_t *dp=d, *sp=s;
728 for( int j=0; j<h; ++j ) {
730 dp[0] = (dp[0] + ((uint32_t)(s1 + (s2 = sp[1]) + 1) >> 1) + 1) >> 1;
731 dp[1] = (dp[1] + ((uint32_t)(s2 + (s1 = sp[2]) + 1) >> 1) + 1) >> 1;
732 dp[2] = (dp[2] + ((uint32_t)(s1 + (s2 = sp[3]) + 1) >> 1) + 1) >> 1;
733 dp[3] = (dp[3] + ((uint32_t)(s2 + (s1 = sp[4]) + 1) >> 1) + 1) >> 1;
734 dp[4] = (dp[4] + ((uint32_t)(s1 + (s2 = sp[5]) + 1) >> 1) + 1) >> 1;
735 dp[5] = (dp[5] + ((uint32_t)(s2 + (s1 = sp[6]) + 1) >> 1) + 1) >> 1;
736 dp[6] = (dp[6] + ((uint32_t)(s1 + (s2 = sp[7]) + 1) >> 1) + 1) >> 1;
737 dp[7] = (dp[7] + ((uint32_t)(s2 + (s1 = sp[8]) + 1) >> 1) + 1) >> 1;
738 dp[8] = (dp[8] + ((uint32_t)(s1 + (s2 = sp[9]) + 1) >> 1) + 1) >> 1;
739 dp[9] = (dp[9] + ((uint32_t)(s2 + (s1 = sp[10]) + 1) >> 1) + 1) >> 1;
740 dp[10] = (dp[10] + ((uint32_t)(s1 + (s2 = sp[11]) + 1) >> 1) + 1) >> 1;
741 dp[11] = (dp[11] + ((uint32_t)(s2 + (s1 = sp[12]) + 1) >> 1) + 1) >> 1;
742 dp[12] = (dp[12] + ((uint32_t)(s1 + (s2 = sp[13]) + 1) >> 1) + 1) >> 1;
743 dp[13] = (dp[13] + ((uint32_t)(s2 + (s1 = sp[14]) + 1) >> 1) + 1) >> 1;
744 dp[14] = (dp[14] + ((uint32_t)(s1 + (s2 = sp[15]) + 1) >> 1) + 1) >> 1;
745 dp[15] = (dp[15] + ((uint32_t)(s2 + sp[16] + 1) >> 1) + 1) >> 1;
746 sp += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * Per 8-byte half: taps (sp, sp+1) and destination are unpacked against
 * mm0 (presumably zero — TODO confirm); taps are summed and biased by mm7,
 * then the destination words are added and biased again before packing. */
749 uint8_t *dp=d, *sp=s;
751 movq_m2r(m_(sadd1),mm7);
752 for( int j=0; j<h; ++j ) {
/* bytes 0..7 */
753 movq_m2r(m_(sp+0),mm1);
754 movq_m2r(m_(sp+1),mm3);
755 movq_m2r(m_(dp+0),mm5);
759 punpcklbw_r2r(mm0,mm1);
760 punpckhbw_r2r(mm0,mm2);
761 punpcklbw_r2r(mm0,mm3);
762 punpckhbw_r2r(mm0,mm4);
763 punpcklbw_r2r(mm0,mm5);
764 punpckhbw_r2r(mm0,mm6);
765 paddusw_r2r(mm3,mm1);
766 paddusw_r2r(mm4,mm2);
767 paddusw_r2r(mm7,mm1);
768 paddusw_r2r(mm7,mm2);
771 paddusw_r2r(mm5,mm1);
772 paddusw_r2r(mm6,mm2);
773 paddusw_r2r(mm7,mm1);
774 paddusw_r2r(mm7,mm2);
777 packuswb_r2r(mm2,mm1);
778 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
779 movq_m2r(m_(sp+8),mm1);
780 movq_m2r(m_(sp+9),mm3);
781 movq_m2r(m_(dp+8),mm5);
785 punpcklbw_r2r(mm0,mm1);
786 punpckhbw_r2r(mm0,mm2);
787 punpcklbw_r2r(mm0,mm3);
788 punpckhbw_r2r(mm0,mm4);
789 punpcklbw_r2r(mm0,mm5);
790 punpckhbw_r2r(mm0,mm6);
791 paddusw_r2r(mm3,mm1);
792 paddusw_r2r(mm4,mm2);
793 paddusw_r2r(mm7,mm1);
794 paddusw_r2r(mm7,mm2);
797 paddusw_r2r(mm5,mm1);
798 paddusw_r2r(mm6,mm2);
799 paddusw_r2r(mm7,mm1);
800 paddusw_r2r(mm7,mm2);
803 packuswb_r2r(mm2,mm1);
804 movq_r2m(mm1,m_(dp+8));
805 sp += lx2; dp += lx2;
/* rechac: horizontal two-tap average, then average with the destination;
 * 8 bytes wide, h rows.
 * Scalar form per byte:
 *   d[i] = (d[i] + ((s[i] + s[i+1] + 1) >> 1) + 1) >> 1
 * so each row reads source bytes 0..8.  sp and dp step by lx2 per row.
 * Two implementations are listed (scalar C and MMX 16-bit lanes).
 * NOTE(review): the preprocessor lines choosing between them and some MMX
 * instructions are not visible in this excerpt. */
812 static inline void rechac(uint8_t *s,uint8_t *d, int lx2, int h)
/* --- scalar variant ---
 * s1/s2 alternate as a one-sample carry.  NOTE(review): their declaration
 * and the initial load of s1 are presumably on lines not shown here. */
815 uint8_t *dp=d, *sp=s;
817 for( int j=0; j<h; ++j ) {
819 dp[0] = (dp[0] + ((uint32_t)(s1 + (s2 = sp[1]) + 1) >> 1) + 1) >> 1;
820 dp[1] = (dp[1] + ((uint32_t)(s2 + (s1 = sp[2]) + 1) >> 1) + 1) >> 1;
821 dp[2] = (dp[2] + ((uint32_t)(s1 + (s2 = sp[3]) + 1) >> 1) + 1) >> 1;
822 dp[3] = (dp[3] + ((uint32_t)(s2 + (s1 = sp[4]) + 1) >> 1) + 1) >> 1;
823 dp[4] = (dp[4] + ((uint32_t)(s1 + (s2 = sp[5]) + 1) >> 1) + 1) >> 1;
824 dp[5] = (dp[5] + ((uint32_t)(s2 + (s1 = sp[6]) + 1) >> 1) + 1) >> 1;
825 dp[6] = (dp[6] + ((uint32_t)(s1 + (s2 = sp[7]) + 1) >> 1) + 1) >> 1;
826 dp[7] = (dp[7] + ((uint32_t)(s2 + sp[8] + 1) >> 1) + 1) >> 1;
827 sp += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd1 ---
 * Taps (sp, sp+1) and destination are unpacked against mm0 (presumably
 * zero — TODO confirm); taps are summed and biased by mm7, then the
 * destination words are added and biased again before packing. */
830 uint8_t *dp=d, *sp=s;
832 movq_m2r(m_(sadd1),mm7);
833 for( int j=0; j<h; ++j ) {
834 movq_m2r(m_(sp+0),mm1);
835 movq_m2r(m_(sp+1),mm3);
836 movq_m2r(m_(dp+0),mm5);
840 punpcklbw_r2r(mm0,mm1);
841 punpckhbw_r2r(mm0,mm2);
842 punpcklbw_r2r(mm0,mm3);
843 punpckhbw_r2r(mm0,mm4);
844 punpcklbw_r2r(mm0,mm5);
845 punpckhbw_r2r(mm0,mm6);
846 paddusw_r2r(mm3,mm1);
847 paddusw_r2r(mm4,mm2);
848 paddusw_r2r(mm7,mm1);
849 paddusw_r2r(mm7,mm2);
852 paddusw_r2r(mm5,mm1);
853 paddusw_r2r(mm6,mm2);
854 paddusw_r2r(mm7,mm1);
855 paddusw_r2r(mm7,mm2);
858 packuswb_r2r(mm2,mm1);
859 movq_r2m(mm1,m_(dp+0));
860 sp += lx2; dp += lx2;
/* rec4: four-tap (2x2) average, 16 bytes wide, h rows.
 * Scalar form per byte:
 *   d[i] = (s[i] + s[i+1] + s[i+lx] + s[i+lx+1] + 2) >> 2
 * with the lower pair read through sp2 = s + lx; each row reads bytes
 * 0..16 of both source rows.  sp, sp2 and dp step by lx2 per row.
 * Two implementations are listed (scalar C and MMX 16-bit lanes).
 * NOTE(review): the preprocessor lines choosing between them and some MMX
 * instructions are not visible in this excerpt. */
867 static inline void rec4(uint8_t *s, uint8_t *d, int lx, int lx2, int h)
/* --- scalar variant ---
 * s1/s2 carry the upper-row samples and s3/s4 the lower-row samples from
 * one output to the next.  NOTE(review): the per-row initial load of s1
 * and s3 is presumably on a line not shown here. */
870 uint8_t *dp=d, *sp=s, *sp2=s+lx;
871 uint32_t s1, s2, s3, s4;
872 for( int j=0; j<h; ++j ) {
874 dp[0] = (uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2;
875 dp[1] = (uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2;
876 dp[2] = (uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2;
877 dp[3] = (uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2;
878 dp[4] = (uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2;
879 dp[5] = (uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2;
880 dp[6] = (uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2;
881 dp[7] = (uint32_t)(s2+(s1=sp[8])+s4+(s3=sp2[8])+2)>>2;
882 dp[8] = (uint32_t)(s1+(s2=sp[9])+s3+(s4=sp2[9])+2)>>2;
883 dp[9] = (uint32_t)(s2+(s1=sp[10])+s4+(s3=sp2[10])+2)>>2;
884 dp[10] = (uint32_t)(s1+(s2=sp[11])+s3+(s4=sp2[11])+2)>>2;
885 dp[11] = (uint32_t)(s2+(s1=sp[12])+s4+(s3=sp2[12])+2)>>2;
886 dp[12] = (uint32_t)(s1+(s2=sp[13])+s3+(s4=sp2[13])+2)>>2;
887 dp[13] = (uint32_t)(s2+(s1=sp[14])+s4+(s3=sp2[14])+2)>>2;
888 dp[14] = (uint32_t)(s1+(s2=sp[15])+s3+(s4=sp2[15])+2)>>2;
889 dp[15] = (uint32_t)(s2+sp[16]+s4+sp2[16]+2)>>2;
890 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd2 (2 per word) ---
 * Per 8-byte half: sp, sp+1 and sp2 are unpacked against mm0 (presumably
 * zero — TODO confirm); mm3 is then reloaded with sp2+1, and all four taps
 * are accumulated into mm1/mm2 before the bias and pack. */
893 uint8_t *dp=d, *sp=s, *sp2=s+lx;
895 movq_m2r(m_(sadd2),mm7);
896 for( int j=0; j<h; ++j ) {
/* bytes 0..7 */
897 movq_m2r(m_(sp +0),mm1);
898 movq_m2r(m_(sp +1),mm3);
899 movq_m2r(m_(sp2+0),mm5);
903 punpcklbw_r2r(mm0,mm1);
904 punpckhbw_r2r(mm0,mm2);
905 punpcklbw_r2r(mm0,mm3);
906 punpckhbw_r2r(mm0,mm4);
907 punpcklbw_r2r(mm0,mm5);
908 punpckhbw_r2r(mm0,mm6);
909 paddusw_r2r(mm3,mm1);
910 movq_m2r(m_(sp2+1),mm3);
911 paddusw_r2r(mm4,mm2);
913 punpcklbw_r2r(mm0,mm3);
914 punpckhbw_r2r(mm0,mm4);
915 paddusw_r2r(mm5,mm3);
916 paddusw_r2r(mm6,mm4);
917 paddusw_r2r(mm3,mm1);
918 paddusw_r2r(mm4,mm2);
919 paddusw_r2r(mm7,mm1);
920 paddusw_r2r(mm7,mm2);
923 packuswb_r2r(mm2,mm1);
924 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
925 movq_m2r(m_(sp +8),mm1);
926 movq_m2r(m_(sp +9),mm3);
927 movq_m2r(m_(sp2+8),mm5);
931 punpcklbw_r2r(mm0,mm1);
932 punpckhbw_r2r(mm0,mm2);
933 punpcklbw_r2r(mm0,mm3);
934 punpckhbw_r2r(mm0,mm4);
935 punpcklbw_r2r(mm0,mm5);
936 punpckhbw_r2r(mm0,mm6);
937 paddusw_r2r(mm3,mm1);
938 movq_m2r(m_(sp2+9),mm3);
939 paddusw_r2r(mm4,mm2);
941 punpcklbw_r2r(mm0,mm3);
942 punpckhbw_r2r(mm0,mm4);
943 paddusw_r2r(mm5,mm3);
944 paddusw_r2r(mm6,mm4);
945 paddusw_r2r(mm3,mm1);
946 paddusw_r2r(mm4,mm2);
947 paddusw_r2r(mm7,mm1);
948 paddusw_r2r(mm7,mm2);
951 packuswb_r2r(mm2,mm1);
952 movq_r2m(mm1,m_(dp+8));
953 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4c: four-tap (2x2) average, 8 bytes wide, h rows.
 * Scalar form per byte:
 *   d[i] = (s[i] + s[i+1] + s[i+lx] + s[i+lx+1] + 2) >> 2
 * with the lower pair read through sp2 = s + lx; each row reads bytes
 * 0..8 of both source rows.  sp, sp2 and dp step by lx2 per row.
 * Two implementations are listed (scalar C and MMX 16-bit lanes).
 * NOTE(review): the preprocessor lines choosing between them and some MMX
 * instructions are not visible in this excerpt. */
960 static inline void rec4c(uint8_t *s,uint8_t *d, int lx, int lx2, int h)
/* --- scalar variant ---
 * s1/s2 carry upper-row samples and s3/s4 lower-row samples between
 * outputs.  NOTE(review): the per-row initial load of s1 and s3 is
 * presumably on a line not shown here. */
963 uint8_t *dp=d, *sp=s, *sp2=s+lx;
964 uint32_t s1, s2, s3, s4;
965 for( int j=0; j<h; ++j ) {
967 dp[0] = (uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2;
968 dp[1] = (uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2;
969 dp[2] = (uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2;
970 dp[3] = (uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2;
971 dp[4] = (uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2;
972 dp[5] = (uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2;
973 dp[6] = (uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2;
974 dp[7] = (uint32_t)(s2+sp[8]+s4+sp2[8]+2)>>2;
975 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd2 (2 per word) ---
 * sp, sp+1 and sp2 are unpacked against mm0 (presumably zero — TODO
 * confirm); mm3 is then reloaded with sp2+1, and all four taps are
 * accumulated into mm1/mm2 before the bias and pack. */
978 uint8_t *dp=d, *sp=s, *sp2=s+lx;
980 movq_m2r(m_(sadd2),mm7);
981 for( int j=0; j<h; ++j ) {
982 movq_m2r(m_(sp +0),mm1);
983 movq_m2r(m_(sp +1),mm3);
984 movq_m2r(m_(sp2+0),mm5);
988 punpcklbw_r2r(mm0,mm1);
989 punpckhbw_r2r(mm0,mm2);
990 punpcklbw_r2r(mm0,mm3);
991 punpckhbw_r2r(mm0,mm4);
992 punpcklbw_r2r(mm0,mm5);
993 punpckhbw_r2r(mm0,mm6);
994 paddusw_r2r(mm3,mm1);
995 movq_m2r(m_(sp2+1),mm3);
996 paddusw_r2r(mm4,mm2);
998 punpcklbw_r2r(mm0,mm3);
999 punpckhbw_r2r(mm0,mm4);
1000 paddusw_r2r(mm5,mm3);
1001 paddusw_r2r(mm6,mm4);
1002 paddusw_r2r(mm3,mm1);
1003 paddusw_r2r(mm4,mm2);
1004 paddusw_r2r(mm7,mm1);
1005 paddusw_r2r(mm7,mm2);
1008 packuswb_r2r(mm2,mm1);
1009 movq_r2m(mm1,m_(dp+0));
1010 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4a: four-tap (2x2) average, then average with the destination;
 * 16 bytes wide, h rows.
 * Scalar form per byte:
 *   p    = (s[i] + s[i+1] + s[i+lx] + s[i+lx+1] + 2) >> 2
 *   d[i] = (d[i] + p + 1) >> 1
 * with the lower pair read through sp2 = s + lx; each row reads bytes
 * 0..16 of both source rows.  sp, sp2 and dp step by lx2 per row.
 * Two implementations are listed (scalar C and MMX 16-bit lanes).
 * NOTE(review): the preprocessor lines choosing between them and some MMX
 * instructions are not visible in this excerpt. */
1017 static inline void rec4a(uint8_t *s,uint8_t *d, int lx, int lx2, int h)
/* --- scalar variant --- */
1020 uint8_t *dp=d, *sp=s, *sp2=s+lx;
1021 uint32_t s1, s2, s3, s4;
1022 for( int j=0; j<h; ++j ) {
/* prime the carries with the first sample of each source row */
1023 s1 = sp[0]; s3 = sp2[0];
1024 dp[0] = (dp[0] + ((uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2) + 1)>>1;
1025 dp[1] = (dp[1] + ((uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2) + 1)>>1;
1026 dp[2] = (dp[2] + ((uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2) + 1)>>1;
1027 dp[3] = (dp[3] + ((uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2) + 1)>>1;
1028 dp[4] = (dp[4] + ((uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2) + 1)>>1;
1029 dp[5] = (dp[5] + ((uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2) + 1)>>1;
1030 dp[6] = (dp[6] + ((uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2) + 1)>>1;
1031 dp[7] = (dp[7] + ((uint32_t)(s2+(s1=sp[8])+s4+(s3=sp2[8])+2)>>2) + 1)>>1;
1032 dp[8] = (dp[8] + ((uint32_t)(s1+(s2=sp[9])+s3+(s4=sp2[9])+2)>>2) + 1)>>1;
1033 dp[9] = (dp[9] + ((uint32_t)(s2+(s1=sp[10])+s4+(s3=sp2[10])+2)>>2) + 1)>>1;
1034 dp[10] = (dp[10] + ((uint32_t)(s1+(s2=sp[11])+s3+(s4=sp2[11])+2)>>2) + 1)>>1;
1035 dp[11] = (dp[11] + ((uint32_t)(s2+(s1=sp[12])+s4+(s3=sp2[12])+2)>>2) + 1)>>1;
1036 dp[12] = (dp[12] + ((uint32_t)(s1+(s2=sp[13])+s3+(s4=sp2[13])+2)>>2) + 1)>>1;
1037 dp[13] = (dp[13] + ((uint32_t)(s2+(s1=sp[14])+s4+(s3=sp2[14])+2)>>2) + 1)>>1;
1038 dp[14] = (dp[14] + ((uint32_t)(s1+(s2=sp[15])+s3+(s4=sp2[15])+2)>>2) + 1)>>1;
1039 dp[15] = (dp[15] + ((uint32_t)(s2+sp[16]+s4+sp2[16]+2)>>2) + 1)>>1;
1040 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd2 (2 per word) ---
 * Per 8-byte half: the four taps are accumulated into mm1/mm2 and biased
 * by mm7; the destination is then loaded into mm3, unpacked, added, and
 * biased by sadd1 (reloaded into mm5) before packing.  mm0 is presumably
 * zero — TODO confirm. */
1043 uint8_t *dp=d, *sp=s, *sp2=s+lx;
1045 movq_m2r(m_(sadd2),mm7);
1046 for( int j=0; j<h; ++j ) {
/* bytes 0..7 */
1047 movq_m2r(m_(sp +0),mm1);
1048 movq_m2r(m_(sp +1),mm3);
1049 movq_m2r(m_(sp2+0),mm5);
1053 punpcklbw_r2r(mm0,mm1);
1054 punpckhbw_r2r(mm0,mm2);
1055 punpcklbw_r2r(mm0,mm3);
1056 punpckhbw_r2r(mm0,mm4);
1057 punpcklbw_r2r(mm0,mm5);
1058 punpckhbw_r2r(mm0,mm6);
1059 paddusw_r2r(mm3,mm1);
1060 movq_m2r(m_(sp2+1),mm3);
1061 paddusw_r2r(mm4,mm2);
1063 punpcklbw_r2r(mm0,mm3);
1064 punpckhbw_r2r(mm0,mm4);
1065 paddusw_r2r(mm5,mm3);
1066 paddusw_r2r(mm6,mm4);
1067 paddusw_r2r(mm3,mm1);
1068 paddusw_r2r(mm4,mm2);
1069 movq_m2r(m_(dp +0),mm3);
1070 paddusw_r2r(mm7,mm1);
1072 paddusw_r2r(mm7,mm2);
1073 punpcklbw_r2r(mm0,mm3);
1074 punpckhbw_r2r(mm0,mm4);
1077 movq_m2r(m_(sadd1),mm5);
1078 paddusw_r2r(mm3,mm1);
1079 paddusw_r2r(mm4,mm2);
1080 paddusw_r2r(mm5,mm1);
1081 paddusw_r2r(mm5,mm2);
1084 packuswb_r2r(mm2,mm1);
1085 movq_r2m(mm1,m_(dp+0));
/* bytes 8..15 */
1086 movq_m2r(m_(sp +8),mm1);
1087 movq_m2r(m_(sp +9),mm3);
1088 movq_m2r(m_(sp2+8),mm5);
1092 punpcklbw_r2r(mm0,mm1);
1093 punpckhbw_r2r(mm0,mm2);
1094 punpcklbw_r2r(mm0,mm3);
1095 punpckhbw_r2r(mm0,mm4);
1096 punpcklbw_r2r(mm0,mm5);
1097 punpckhbw_r2r(mm0,mm6);
1098 paddusw_r2r(mm3,mm1);
1099 movq_m2r(m_(sp2+9),mm3);
1100 paddusw_r2r(mm4,mm2);
1102 punpcklbw_r2r(mm0,mm3);
1103 punpckhbw_r2r(mm0,mm4);
1104 paddusw_r2r(mm5,mm3);
1105 paddusw_r2r(mm6,mm4);
1106 paddusw_r2r(mm3,mm1);
1107 paddusw_r2r(mm4,mm2);
1108 movq_m2r(m_(dp +8),mm3);
1109 paddusw_r2r(mm7,mm1);
1111 paddusw_r2r(mm7,mm2);
1112 punpcklbw_r2r(mm0,mm3);
1113 punpckhbw_r2r(mm0,mm4);
1116 movq_m2r(m_(sadd1),mm5);
1117 paddusw_r2r(mm3,mm1);
1118 paddusw_r2r(mm4,mm2);
1119 paddusw_r2r(mm5,mm1);
1120 paddusw_r2r(mm5,mm2);
1123 packuswb_r2r(mm2,mm1);
1124 movq_r2m(mm1,m_(dp+8));
1125 sp += lx2; sp2 += lx2; dp += lx2;
/* rec4ac: four-tap (2x2) average, then average with the destination;
 * 8 bytes wide, h rows.
 * Scalar form per byte:
 *   p    = (s[i] + s[i+1] + s[i+lx] + s[i+lx+1] + 2) >> 2
 *   d[i] = (d[i] + p + 1) >> 1
 * with the lower pair read through sp2 = s + lx; each row reads bytes
 * 0..8 of both source rows.  sp, sp2 and dp step by lx2 per row.
 * Two implementations are listed (scalar C and MMX 16-bit lanes).
 * NOTE(review): the preprocessor lines choosing between them and some MMX
 * instructions are not visible in this excerpt. */
1132 static inline void rec4ac(uint8_t *s,uint8_t *d, int lx, int lx2, int h)
/* --- scalar variant --- */
1135 uint8_t *dp=d, *sp=s, *sp2=s+lx;
1136 uint32_t s1,s2,s3,s4;
1137 for( int j=0; j<h; ++j ) {
/* prime the carries with the first sample of each source row */
1138 s1=sp[0]; s3=sp2[0];
1139 dp[0] = (dp[0] + ((uint32_t)(s1+(s2=sp[1])+s3+(s4=sp2[1])+2)>>2) + 1)>>1;
1140 dp[1] = (dp[1] + ((uint32_t)(s2+(s1=sp[2])+s4+(s3=sp2[2])+2)>>2) + 1)>>1;
1141 dp[2] = (dp[2] + ((uint32_t)(s1+(s2=sp[3])+s3+(s4=sp2[3])+2)>>2) + 1)>>1;
1142 dp[3] = (dp[3] + ((uint32_t)(s2+(s1=sp[4])+s4+(s3=sp2[4])+2)>>2) + 1)>>1;
1143 dp[4] = (dp[4] + ((uint32_t)(s1+(s2=sp[5])+s3+(s4=sp2[5])+2)>>2) + 1)>>1;
1144 dp[5] = (dp[5] + ((uint32_t)(s2+(s1=sp[6])+s4+(s3=sp2[6])+2)>>2) + 1)>>1;
1145 dp[6] = (dp[6] + ((uint32_t)(s1+(s2=sp[7])+s3+(s4=sp2[7])+2)>>2) + 1)>>1;
1146 dp[7] = (dp[7] + ((uint32_t)(s2+sp[8]+s4+sp2[8]+2)>>2) + 1)>>1;
1147 sp += lx2; sp2 += lx2; dp += lx2;
/* --- MMX variant, 16-bit lanes: mm7 = sadd2 (2 per word) ---
 * The four taps are accumulated into mm1/mm2 and biased by mm7; the
 * destination is then loaded into mm3, unpacked, added, and biased by
 * sadd1 (reloaded into mm5) before packing.  mm0 is presumably zero —
 * TODO confirm. */
1150 uint8_t *dp=d, *sp=s, *sp2=s+lx;
1152 movq_m2r(m_(sadd2),mm7);
1153 for( int j=0; j<h; ++j ) {
1154 movq_m2r(m_(sp +0),mm1);
1155 movq_m2r(m_(sp +1),mm3);
1156 movq_m2r(m_(sp2+0),mm5);
1160 punpcklbw_r2r(mm0,mm1);
1161 punpckhbw_r2r(mm0,mm2);
1162 punpcklbw_r2r(mm0,mm3);
1163 punpckhbw_r2r(mm0,mm4);
1164 punpcklbw_r2r(mm0,mm5);
1165 punpckhbw_r2r(mm0,mm6);
1166 paddusw_r2r(mm3,mm1);
1167 movq_m2r(m_(sp2+1),mm3);
1168 paddusw_r2r(mm4,mm2);
1170 punpcklbw_r2r(mm0,mm3);
1171 punpckhbw_r2r(mm0,mm4);
1172 paddusw_r2r(mm5,mm3);
1173 paddusw_r2r(mm6,mm4);
1174 paddusw_r2r(mm3,mm1);
1175 paddusw_r2r(mm4,mm2);
1176 movq_m2r(m_(dp +0),mm3);
1177 paddusw_r2r(mm7,mm1);
1179 paddusw_r2r(mm7,mm2);
1180 punpcklbw_r2r(mm0,mm3);
1181 punpckhbw_r2r(mm0,mm4);
1184 movq_m2r(m_(sadd1),mm5);
1185 paddusw_r2r(mm3,mm1);
1186 paddusw_r2r(mm4,mm2);
1187 paddusw_r2r(mm5,mm1);
1188 paddusw_r2r(mm5,mm2);
1191 packuswb_r2r(mm2,mm1);
1192 movq_r2m(mm1,m_(dp+0));
1193 sp += lx2; sp2 += lx2; dp += lx2;
/* stats: sixteen call counters (totals[0..15]) with a printable name per
 * slot.  The names in fn[] line up with the case values handled by
 * recon_comp (0 = "recc" ... 15 = "rec4a").
 * NOTE(review): the member declarations, the header of the routine that
 * prints the table, and the end of the class are not visible in this
 * excerpt. */
1201 static class stats {
/* constructor: zero every counter */
1204 stats() { for( int i=0; i<16; ++i ) totals[i] = 0; }
/* print one "<name> <count>" line per slot */
1206 for( int i=0; i<16; ++i ) {
1207 static const char *fn[16] = {
1208 "recc", "rec", "recac", "reca",
1209 "recvc", "recv", "recvac", "recva",
1210 "rechc", "rech", "rechac", "recha",
1211 "rec4c", "rec4", "rec4ac", "rec4a"
1213 printf("%-8s %d\n",fn[i],totals[i]);
/* bump the counter for slot i; out-of-range values are ignored.
 * NOTE(review): the guard is i>0, so slot 0 ("recc") is never counted —
 * confirm whether i>=0 was intended. */
1216 void incr(int i) { if( i>0 && i<16 ) ++totals[i]; }
/* recon_comp: run the prediction kernel selected by `type` on one block.
 *   s, d   source / destination pointers for the first row
 *   lx     passed only to the kernels that read a second source row
 *   lx2    row step handed to every kernel
 *   h      number of rows
 *   type   4-bit selector.  In recon() it is built as
 *          ((dx&1)<<3) | ((dy&1)<<2) | w, with 2 OR-ed in when addflag is
 *          set.  In the kernel names below, bit 0 set picks the 16-wide
 *          form (clear picks the 8-wide "c" form), bit 1 the "a" forms
 *          that average into d, bit 2 the "v" forms, bit 3 the "h" forms,
 *          and bits 2+3 together the "4" forms.
 * NOTE(review): the switch header and closing lines are not visible in
 * this excerpt, so the handling of other values cannot be confirmed. */
1221 inline void zvideo_t::
1222 recon_comp(uint8_t *s, uint8_t *d, int lx, int lx2, int h, int type)
1224 /* probably Accelerated functions */
1226 case 0x3: reca(s, d, lx2, h); break;
1227 case 0x2: recac(s, d, lx2, h); break;
1228 case 0x1: rec(s, d, lx2, h); break;
1229 case 0x0: recc(s, d, lx2, h); break;
1230 case 0x7: recva(s, d, lx, lx2, h); break;
1231 case 0x6: recvac(s, d, lx, lx2, h); break;
1232 case 0x5: recv(s, d, lx, lx2, h); break;
1233 case 0x4: recvc(s, d, lx, lx2, h); break;
1234 case 0x9: rech(s, d, lx2, h); break;
1235 case 0x8: rechc(s, d, lx2, h); break;
1236 /* maybe Unaccelerated functions */
1237 case 0xb: recha(s, d, lx2, h); break;
1238 case 0xa: rechac(s, d, lx2, h); break;
1239 case 0xf: rec4a(s, d, lx, lx2, h); break;
1240 case 0xe: rec4ac(s, d, lx, lx2, h); break;
1241 case 0xd: rec4(s, d, lx, lx2, h); break;
1242 case 0xc: rec4c(s, d, lx, lx2, h); break;
1250 uint8_t *src[]; * prediction source buffer *
1251 int sfield; * prediction source field number (0 or 1) *
1252 uint8_t *dst[]; * prediction destination buffer *
1253 int dfield; * prediction destination field number (0 or 1)*
1254 int lx,lx2; * horizontal offsets *
1255 int w,h; * prediction block/sub-block width, height *
1256 int x,y; * pixel co-ordinates of top-left sample in current MB *
1257 int dx,dy; * horizontal, vertical motion vector *
1258 int addflag; * add prediction error to prediction ? *
/* recon: form the prediction for one block in all three planes.
 * It computes source and destination byte offsets from the block position
 * and motion vector, range-checks them against the coded picture size,
 * then calls recon_comp for plane 0 (Y) and, after recomputing the
 * offsets and selector, for planes 1 (Cb) and 2 (Cr).
 * NOTE(review): the return type / class qualifier line, the braces, the
 * error-path statements and the chroma scaling statements are not visible
 * in this excerpt. */
1261 recon( uint8_t *src[], int sfield,
1262 uint8_t *dst[], int dfield, int lx, int lx2,
1263 int w, int h, int x, int y, int dx, int dy, int addflag)
1265 /* validate parameters */
/* source offset uses the integer part of the half-pel vector (dx>>1, dy>>1) */
1267 int sofs = (y+(dy>>1))*lx + x+(dx>>1);
1268 int dofs = y*lx + x;
/* a non-zero field selector moves the start down by lx2/2 bytes */
1269 if( sfield ) sofs += lx2 >> 1;
1270 if( dfield ) dofs += lx2 >> 1;
1271 if( sofs >= 0 && dofs >= 0 ) {
/* dsz: destination limit; ssz: source limit, larger by 16 rows + 16 bytes */
1272 int dsz = coded_picture_width * coded_picture_height;
1273 int ssz = dsz + 16*coded_picture_width + 16;
/* dlen: offset of the last byte touched in the block (h rows, w*8+8 wide);
 * slen extends it by the extra row / column read for half-pel taps */
1274 int dlen = (h-1)*lx2 + w*8+8-1;
1275 int slen = dlen + (dy&1)*lx + (dx&1);
1276 if( sofs+slen >= ssz || dofs+dlen >= dsz )
/* optional diagnostic; `err` is set on lines not visible here */
1282 if( this->src->log_errs ) {
1283 zmsgs("err%c frm %dx%d @ %d,%d %dx%d dx=%d, dy=%d, sofs=%d, dofs=%d\n",
1284 err, coded_picture_width, coded_picture_height,
1285 x, y, 8+w*8, h, dx, dy, sofs, dofs);
1290 /* half pel scaling */
/* kernel selector: bit 3 = dx&1, bit 2 = dy&1, bit 1 = addflag, low bit = w */
1291 int type = ((dx & 1) << 3) | ((dy & 1) << 2) | w;
1292 if( addflag ) type |= 2;
1294 recon_comp(src[0]+sofs, dst[0]+dofs, lx, lx2, h, type); /* Y */
/* chroma planes: presumably the geometry is rescaled inside these two
 * tests for subsampled formats — the statements are not visible here */
1296 if( chroma_format != cfmt_444 ) {
1301 if( chroma_format == cfmt_420 ) {
/* recompute the source offset and selector from the (possibly rescaled) values */
1306 sofs = (y+(dy>>1))*lx + x+(dx>>1);
1308 if( sfield ) sofs += lx2 >> 1;
1309 if( dfield ) dofs += lx2 >> 1;
1310 type = ((dx & 1) << 3) | ((dy & 1) << 2) | w;
1311 if( addflag ) type |= 2;
1314 recon_comp(src[1]+sofs, dst[1]+dofs, lx, lx2, h, type); /* Cb */
1315 recon_comp(src[2]+sofs, dst[2]+dofs, lx, lx2, h, type); /* Cr */
1321 reconstruct( int bx, int by, int mb_type, int motion_type,
1322 int PMV[2][2][2], int mv_field_sel[2][2],
1323 int dmvector[2], int stwtype)
1326 uint8_t **predframe;
1330 stwtop = stwtype % 3; /* 0:temporal, 1:(spat+temp), 2:spatial */
1331 stwbot = stwtype / 3;
1333 if( (mb_type & slice_decoder_t::mb_FORWARD) || (pict_type == pic_type_P) ) {
1334 if( pict_struct == pics_FRAME_PICTURE ) {
1335 if( (motion_type == slice_decoder_t::mc_FRAME) ||
1336 !(mb_type & slice_decoder_t::mb_FORWARD) ) {
1337 /* frame-based prediction */
1339 recon(oldrefframe, 0, newframe, 0,
1340 coded_picture_width, coded_picture_width<<1,
1341 WIDTH, 8, bx, by, PMV[0][0][0], PMV[0][0][1], stwtop);
1344 recon(oldrefframe, 1, newframe, 1,
1345 coded_picture_width, coded_picture_width<<1,
1346 WIDTH, 8, bx, by, PMV[0][0][0], PMV[0][0][1], stwbot);
1348 else if(motion_type == slice_decoder_t::mc_FIELD) { /* field-based prediction */
1349 /* top field prediction */
1351 recon(oldrefframe, mv_field_sel[0][0], newframe, 0,
1352 coded_picture_width<<1, coded_picture_width<<1,
1353 WIDTH, 8, bx, by>>1, PMV[0][0][0], PMV[0][0][1]>>1, stwtop);
1355 /* bottom field prediction */
1357 recon(oldrefframe, mv_field_sel[1][0], newframe, 1,
1358 coded_picture_width<<1, coded_picture_width<<1,
1359 WIDTH, 8, bx, by>>1, PMV[1][0][0], PMV[1][0][1]>>1, stwbot);
1361 else if( motion_type == slice_decoder_t::mc_DMV ) {
1362 /* dual prime prediction */
1363 /* calculate derived motion vectors */
1364 calc_dmv(DMV, dmvector, PMV[0][0][0], PMV[0][0][1]>>1);
1367 /* predict top field from top field */
1368 recon(oldrefframe, 0, newframe, 0,
1369 coded_picture_width<<1, coded_picture_width<<1,
1370 WIDTH, 8, bx, by>>1, PMV[0][0][0], PMV[0][0][1]>>1, 0);
1372 /* predict and add to top field from bottom field */
1373 recon(oldrefframe, 1, newframe, 0,
1374 coded_picture_width<<1, coded_picture_width<<1,
1375 WIDTH, 8, bx, by>>1, DMV[0][0], DMV[0][1], 1);
1379 /* predict bottom field from bottom field */
1380 recon(oldrefframe, 1, newframe, 1,
1381 coded_picture_width<<1, coded_picture_width<<1,
1382 WIDTH, 8, bx, by>>1, PMV[0][0][0], PMV[0][0][1]>>1, 0);
1384 /* predict and add to bottom field from top field */
1385 recon(oldrefframe, 0, newframe, 1,
1386 coded_picture_width<<1, coded_picture_width<<1,
1387 WIDTH, 8, bx, by>>1, DMV[1][0], DMV[1][1], 1);
1390 else if( src->log_errs ) {
1391 /* invalid motion_type */
1392 zerrs("invalid motion_type 1 (%d)\n",motion_type);
1396 /* pics_TOP_FIELD or pics_BOTTOM_FIELD */
1398 currentfield = (pict_struct == pics_BOTTOM_FIELD);
1400 /* determine which frame to use for prediction */
1401 if( (pict_type == pic_type_P) && secondfield &&
1402 (currentfield != mv_field_sel[0][0]) )
1403 predframe = refframe; /* same frame */
1405 predframe = oldrefframe; /* previous frame */
1407 if( (motion_type == slice_decoder_t::mc_FIELD) ||
1408 !(mb_type & slice_decoder_t::mb_FORWARD) ) {
1409 /* field-based prediction */
1411 recon(predframe,mv_field_sel[0][0],newframe,0,
1412 coded_picture_width<<1,coded_picture_width<<1,
1413 WIDTH, 16, bx, by, PMV[0][0][0], PMV[0][0][1], stwtop);
1415 else if(motion_type == slice_decoder_t::mc_16X8) {
1417 recon(predframe, mv_field_sel[0][0], newframe, 0,
1418 coded_picture_width<<1, coded_picture_width<<1,
1419 WIDTH, 8, bx, by, PMV[0][0][0], PMV[0][0][1], stwtop);
1421 /* determine which frame to use for lower half prediction */
1422 if( (pict_type == pic_type_P) && secondfield &&
1423 (currentfield != mv_field_sel[1][0]) )
1424 predframe = refframe; /* same frame */
1426 predframe = oldrefframe; /* previous frame */
1428 recon(predframe, mv_field_sel[1][0], newframe, 0,
1429 coded_picture_width<<1, coded_picture_width<<1,
1430 WIDTH, 8, bx, by+8, PMV[1][0][0], PMV[1][0][1], stwtop);
1433 else if(motion_type == slice_decoder_t::mc_DMV) { /* dual prime prediction */
1435 predframe = refframe; /* same frame */
1437 predframe = oldrefframe; /* previous frame */
1439 /* calculate derived motion vectors */
1440 calc_dmv(DMV, dmvector, PMV[0][0][0], PMV[0][0][1]);
1442 /* predict from field of same parity */
1443 recon(oldrefframe, currentfield, newframe, 0,
1444 coded_picture_width<<1, coded_picture_width<<1,
1445 WIDTH, 16, bx, by, PMV[0][0][0], PMV[0][0][1], 0);
1447 /* predict from field of opposite parity */
1448 recon(predframe, !currentfield, newframe, 0,
1449 coded_picture_width<<1, coded_picture_width<<1,
1450 WIDTH, 16, bx, by, DMV[0][0], DMV[0][1], 1);
1452 else if( src->log_errs ) {
1453 /* invalid motion_type */
1454 zerrs("invalid motion_type 2 (%d)\n",motion_type);
1457 stwtop = stwbot = 1;
1460 if( (mb_type & slice_decoder_t::mb_BACKWARD) ) {
1461 if( pict_struct == pics_FRAME_PICTURE ) {
1462 if( motion_type == slice_decoder_t::mc_FRAME ) {
1463 /* frame-based prediction */
1465 recon(refframe, 0, newframe, 0,
1466 coded_picture_width, coded_picture_width<<1,
1467 WIDTH, 8, bx, by, PMV[0][1][0], PMV[0][1][1], stwtop);
1469 recon(refframe, 1, newframe, 1,
1470 coded_picture_width, coded_picture_width<<1,
1471 WIDTH, 8, bx, by, PMV[0][1][0], PMV[0][1][1], stwbot);
1474 /* field-based prediction */
1475 /* top field prediction */
1477 recon(refframe, mv_field_sel[0][1], newframe, 0,
1478 coded_picture_width<<1,coded_picture_width<<1,
1479 WIDTH, 8, bx, (by>>1), PMV[0][1][0], PMV[0][1][1]>>1, stwtop);
1480 /* bottom field prediction */
1482 recon(refframe, mv_field_sel[1][1], newframe, 1,
1483 coded_picture_width<<1, coded_picture_width<<1,
1484 WIDTH, 8, bx, (by>>1), PMV[1][1][0], PMV[1][1][1]>>1, stwbot);
1488 /* pics_TOP_FIELD or pics_BOTTOM_FIELD */
1490 if( motion_type == slice_decoder_t::mc_FIELD ) {
1491 /* field-based prediction */
1492 recon(refframe, mv_field_sel[0][1], newframe, 0,
1493 coded_picture_width<<1, coded_picture_width<<1,
1494 WIDTH, 16, bx, by, PMV[0][1][0], PMV[0][1][1], stwtop);
1496 else if( motion_type==slice_decoder_t::mc_16X8 ) {
1497 recon(refframe, mv_field_sel[0][1], newframe, 0,
1498 coded_picture_width<<1, coded_picture_width<<1,
1499 WIDTH, 8, bx, by, PMV[0][1][0], PMV[0][1][1], stwtop);
1501 recon(refframe, mv_field_sel[1][1], newframe, 0,
1502 coded_picture_width<<1, coded_picture_width<<1,
1503 WIDTH, 8, bx, by+8, PMV[1][1][0], PMV[1][1][1], stwtop);
1505 else if( src->log_errs ) {
1506 /* invalid motion_type */
1507 zerrs("invalid motion_type 3 (%d)\n",motion_type);
1510 } /* mb_type & slice_decoder_t::mb_BACKWARD */