2 * the input data is transposed and each 16 bit element in the 8x8 matrix
4 * for example in 11...1110000 format
5 * If the iDCT is of an I macroblock then 0.5 needs to be added to the DC Component
6 * (element[0][0] of the matrix)
/* preSC: 8x8 table of signed 16-bit prescale coefficients, one row per
 * input row.  Each entry is multiplied against the (transposed) input
 * via pmulhw, i.e. (coef * input) >> 16, so entries encode scale*32768.
 * NOTE(review): rows 5-7 are NOT the symmetric outer-product values --
 * e.g. row 5 column 4 holds 25746 = 2*12873 -- presumably the doubling
 * folds a later shift into the prescale; confirm against the shift
 * schedule before "correcting" any entry. */
16 preSC: .short 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520
17 .short 22725, 31521, 29692, 26722, 22725, 17855, 12299, 6270
18 .short 21407, 29692, 27969, 25172, 21407, 16819, 11585, 5906
19 .short 19266, 26722, 25172, 22654, 19266, 15137, 10426, 5315
20 .short 16384, 22725, 21407, 19266, 16384, 12873, 8867, 4520
21 .short 12873, 17855, 16819, 15137, 25746, 20228, 13933, 7103
22 .short 17734, 24598, 23170, 20853, 17734, 13933, 9597, 4892
23 .short 18081, 25080, 23624, 21261, 18081, 14206, 9785, 4988
/* x0005000200010001: packed words {1, 1, 2, 5} (little-endian dwords
 * below).  Added with paddw to the V15 product as the 'magic' rounding
 * correction described at the paddw site in IDCT_mmx. */
26 .type x0005000200010001, @object
27 .size x0005000200010001, 8
29 .long 0x00010001, 0x00050002
/* x0040000000000000: 8-byte constant object.  Its label and .long data
 * lines are not visible in this fragment, and no visible instruction
 * references it -- presumably the +0.5 DC bias mentioned in the file
 * header for I-macroblock iDCTs; verify in the full source. */
31 .type x0040000000000000, @object
32 .size x0040000000000000, 8
/* x5a825a825a825a82: four copies of 0x5a82 = 23170, the packed-word
 * multiplier used throughout IDCT_mmx via pmulhw (see the
 * "23170 ->V18" etc. comments). */
36 .type x5a825a825a825a82, @object
37 .size x5a825a825a825a82, 8
39 .long 0x5a825a82, 0x5a825a82
/* x539f539f539f539f: four copies of 0x539f = 21407, packed-word
 * pmulhw multiplier (see "21407-> V151" in IDCT_mmx). */
41 .type x539f539f539f539f, @object
42 .size x539f539f539f539f, 8
44 .long 0x539f539f, 0x539f539f
/* x4546454645464546: four copies of 0x4546 = 17734, packed-word
 * pmulhw multiplier (see "17734-> V153" in IDCT_mmx). */
46 .type x4546454645464546, @object
47 .size x4546454645464546, 8
49 .long 0x45464546, 0x45464546
/* x61f861f861f861f8: four copies of 0x61f8 = 25080, packed-word
 * pmulhw multiplier (see "25080-> V154" in IDCT_mmx). */
51 .type x61f861f861f861f8, @object
52 .size x61f861f861f861f8, 8
54 .long 0x61f861f8, 0x61f861f8
55 /* Static variables */
/*-----------------------------------------------------------------------
 * IDCT_mmx -- 8x8 inverse DCT on 16-bit coefficients, MMX, in place.
 * IA-32 cdecl: 8(%ebp) = pointer to the 64 x int16 source/destination
 * matrix (%esi below).  %ecx is used throughout as the preSC prescale
 * table base -- NOTE(review): the %ecx setup, the function label, and
 * most of the prologue are not visible in this fragment; confirm them
 * in the full source.  Stack temps (%esp)..24(%esp) hold out7/out5/
 * out3/out1 until the final write-back; clobbers mm0-mm7.
 *---------------------------------------------------------------------*/
68 .type IDCT_mmx, @function
78 pushl $0 /* allocate the temp variables */
87 movl 8(%ebp), %esi /* source matrix */
89 /* column 0: even part
90 * use V4, V12, V0, V8 to produce V22..V25
/* Each V<n> below is a prescaled input row: pmulhw computes the high 16
 * bits of (preSC row * input row), i.e. (a*b)>>16 per 16-bit lane.
 * NOTE(review): the preSC loads into %mm1 and %mm3 (original lines 95
 * and 97) are not visible in this fragment. */
92 movq 8*12(%ecx), %mm0 /* maybe the first mul can be done together */
93 /* with the dequantization in iHuff module */
94 pmulhw 8*12(%esi), %mm0 /* V12 */
96 pmulhw 8*4(%esi), %mm1 /* V4 */
98 psraw $1, %mm0 /* t64=t66 */
99 pmulhw (%esi), %mm3 /* V0 */
100 movq 8*8(%ecx), %mm5 /* duplicate V4 */
101 movq %mm1, %mm2 /* added 11/1/96 */
102 pmulhw 8*8(%esi),%mm5 /* V8 */
103 psubsw %mm0, %mm1 /* V16 */
104 pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V18 */
105 paddsw %mm0, %mm2 /* V17 */
106 movq %mm2, %mm0 /* duplicate V17 */
107 psraw $1, %mm2 /* t75=t82 */
108 psraw $2, %mm0 /* t72 */
109 movq %mm3, %mm4 /* duplicate V0 */
110 paddsw %mm5, %mm3 /* V19 */
111 psubsw %mm5, %mm4 /* V20 ;mm5 free */
112 /* moved from the block below */
113 movq 8*10(%ecx), %mm7
114 psraw $1, %mm3 /* t74=t81 */
115 movq %mm3, %mm6 /* duplicate t74=t81 */
116 psraw $2, %mm4 /* t77=t79 */
117 psubsw %mm0, %mm1 /* V21 ; mm0 free */
118 paddsw %mm2, %mm3 /* V22 */
119 movq %mm1, %mm5 /* duplicate V21 */
120 paddsw %mm4, %mm1 /* V23 */
/* Even-part results V22..V24 are parked back in the matrix rows 4, 12
 * and 0; V25 stays live in mm6 across the odd part (see note below). */
121 movq %mm3, 8*4(%esi) /* V22 */
122 psubsw %mm5, %mm4 /* V24; mm5 free */
123 movq %mm1, 8*12(%esi) /* V23 */
124 psubsw %mm2, %mm6 /* V25; mm2 free */
125 movq %mm4, (%esi) /* V24 */
126 /* keep mm6 alive all along the next block */
127 /* movq %mm6, 8*8(%esi) V25 */
128 /* column 0: odd part
129 * use V2, V6, V10, V14 to produce V31, V39, V40, V41
/* NOTE(review): the preSC loads into %mm0 and %mm5 (original lines
 * 132/134/136) are not visible in this fragment; the %mm7 preSC load
 * was hoisted into the previous block (original line 113). */
131 /* moved above: movq 8*10(%ecx), %mm7 */
133 pmulhw 8*10(%esi), %mm7 /* V10 */
135 pmulhw 8*6(%esi), %mm0 /* V6 */
137 movq %mm7, %mm3 /* duplicate V10 */
138 pmulhw 8*2(%esi), %mm5 /* V2 */
139 movq 8*14(%ecx), %mm4
140 psubsw %mm0, %mm7 /* V26 */
141 pmulhw 8*14(%esi), %mm4 /* V14 */
142 paddsw %mm0, %mm3 /* V29 ; free mm0 */
143 movq %mm7, %mm1 /* duplicate V26 */
144 psraw $1, %mm3 /* t91=t94 */
145 pmulhw x539f539f539f539f,%mm7 /* V33 */
146 psraw $1, %mm1 /* t96 */
147 movq %mm5, %mm0 /* duplicate V2 */
148 psraw $2, %mm4 /* t85=t87 */
149 paddsw %mm4,%mm5 /* V27 */
150 psubsw %mm4, %mm0 /* V28 ; free mm4 */
151 movq %mm0, %mm2 /* duplicate V28 */
152 psraw $1, %mm5 /* t90=t93 */
153 pmulhw x4546454645464546,%mm0 /* V35 */
154 psraw $1, %mm2 /* t97 */
155 movq %mm5, %mm4 /* duplicate t90=t93 */
156 psubsw %mm2, %mm1 /* V32 ; free mm2 */
157 pmulhw x61f861f861f861f8,%mm1 /* V36 */
158 psllw $1, %mm7 /* t107 */
159 paddsw %mm3, %mm5 /* V31 */
160 psubsw %mm3, %mm4 /* V30 ; free mm3 */
161 pmulhw x5a825a825a825a82,%mm4 /* V34 */
163 psubsw %mm1, %mm0 /* V38 */
164 psubsw %mm7, %mm1 /* V37 ; free mm7 */
165 psllw $1, %mm1 /* t114 */
166 /* move from the next block */
167 movq %mm6, %mm3 /* duplicate V25 */
168 /* move from the next block */
169 movq 8*4(%esi), %mm7 /* V22 */
170 psllw $1, %mm0 /* t110 */
171 psubsw %mm5, %mm0 /* V39 (mm5 needed for next block) */
172 psllw $2, %mm4 /* t112 */
173 /* moved from the next block */
174 movq 8*12(%esi), %mm2 /* V23 */
175 psubsw %mm0, %mm4 /* V40 */
176 paddsw %mm4, %mm1 /* V41; free mm0 */
177 /* moved from the next block */
178 psllw $1, %mm2 /* t117=t125 */
179 /* column 0: output butterfly */
/* Combine even (V22..V25) and odd (V31/V39/V40/V41) halves into the
 * intermediate column results tm0..tm14 (even row slots of the
 * matrix).  The commented lines below were hoisted into the previous
 * block, as the "moved from the next block" notes there record. */
181 * movq %mm6, %mm3 duplicate V25
182 * movq 8*4(%esi), %mm7 V22
183 * movq 8*12(%esi), %mm2 V23
184 * psllw $1, %mm2 t117=t125
186 psubsw %mm1, %mm6 /* tm6 */
187 paddsw %mm1, %mm3 /* tm8; free mm1 */
188 movq %mm7, %mm1 /* duplicate V22 */
189 paddsw %mm5, %mm7 /* tm0 */
190 movq %mm3, 8*8(%esi) /* tm8; free mm3 */
191 psubsw %mm5, %mm1 /* tm14; free mm5 */
192 movq %mm6, 8*6(%esi) /* tm6; free mm6 */
193 movq %mm2, %mm3 /* duplicate t117=t125 */
194 movq (%esi), %mm6 /* V24 */
195 paddsw %mm0, %mm2 /* tm2 */
196 movq %mm7, (%esi) /* tm0; free mm7 */
197 psubsw %mm0, %mm3 /* tm12; free mm0 */
198 movq %mm1, 8*14(%esi) /* tm14; free mm1 */
199 psllw $1, %mm6 /* t119=t123 */
200 movq %mm2, 8*2(%esi) /* tm2; free mm2 */
201 movq %mm6, %mm0 /* duplicate t119=t123 */
202 movq %mm3, 8*12(%esi) /* tm12; free mm3 */
203 paddsw %mm4, %mm6 /* tm4 */
204 /* moved from next block */
206 psubsw %mm4, %mm0 /* tm10; free mm4 */
207 /* moved from next block */
/* NOTE(review): the preSC load "movq 8*5(%ecx), %mm1" that feeds this
 * pmulhw (original line 205) is not visible in this fragment. */
208 pmulhw 8*5(%esi), %mm1 /* V5 */
209 movq %mm6, 8*4(%esi) /* tm4; free mm6 */
210 movq %mm0, 8*10(%esi) /* tm10; free mm0 */
211 /* column 1: even part
212 * use V5, V13, V1, V9 to produce V56..V59
/* Same structure as the column-0 even part, operating on the odd input
 * rows (1, 5, 9, 13).  NOTE(review): the preSC loads into %mm3 and
 * %mm5 (original lines 222/224) are not visible in this fragment. */
214 /* moved to prev block:
215 * movq 8*5(%ecx), %mm1
216 * pmulhw 8*5(%esi), %mm1 V5
218 movq 8*13(%ecx), %mm7
219 psllw $1, %mm1 /* t128=t130 */
220 pmulhw 8*13(%esi), %mm7 /* V13 */
221 movq %mm1, %mm2 /* duplicate t128=t130 */
223 pmulhw 8(%esi), %mm3 /* V1 */
225 psubsw %mm7, %mm1 /* V50 */
226 pmulhw 8*9(%esi), %mm5 /* V9 */
227 paddsw %mm7, %mm2 /* V51 */
228 pmulhw x5a825a825a825a82, %mm1 /* 23170 ->V52 */
229 movq %mm2, %mm6 /* duplicate V51 */
230 psraw $1, %mm2 /* t138=t144 */
231 movq %mm3, %mm4 /* duplicate V1 */
232 psraw $2, %mm6 /* t136 */
233 paddsw %mm5, %mm3 /* V53 */
234 psubsw %mm5, %mm4 /* V54 ;mm5 free */
235 movq %mm3, %mm7 /* duplicate V53 */
236 /* moved from next block */
237 movq 8*11(%ecx), %mm0
238 psraw $1, %mm4 /* t140=t142 */
239 psubsw %mm6, %mm1 /* V55 ; mm6 free */
240 paddsw %mm2, %mm3 /* V56 */
241 movq %mm4, %mm5 /* duplicate t140=t142 */
242 paddsw %mm1, %mm4 /* V57 */
243 movq %mm3, 8*5(%esi) /* V56 */
244 psubsw %mm1, %mm5 /* V58; mm1 free */
245 movq %mm4, 8*13(%esi) /* V57 */
246 psubsw %mm2, %mm7 /* V59; mm2 free */
247 movq %mm5, 8*9(%esi) /* V58 */
248 /* keep mm7 alive all along the next block
249 * movq %mm7, 8(%esi) V59
/* column 1: odd part -- uses V3, V7, V11, V15 to build V71..V75.
 * NOTE(review): the section-header comment and the preSC loads into
 * %mm6 and %mm5 (original lines ~250/252/254/259) are not visible in
 * this fragment; the %mm0 preSC load was hoisted into the previous
 * block (original line 237). */
251 * movq 8*11(%ecx), %mm0
253 pmulhw 8*11(%esi), %mm0 /* V11 */
255 pmulhw 8*7(%esi), %mm6 /* V7 */
256 movq 8*15(%ecx), %mm4
257 movq %mm0, %mm3 /* duplicate V11 */
258 pmulhw 8*15(%esi), %mm4 /* V15 */
260 psllw $1, %mm6 /* t146=t152 */
261 pmulhw 8*3(%esi), %mm5 /* V3 */
262 paddsw %mm6, %mm0 /* V63 */
263 /* note that V15 computation has a correction step:
264 * this is a 'magic' constant that rebiases the results to be closer to the
265 * expected result. this magic constant can be refined to reduce the error
266 * even more by doing the correction step in a later stage when the number
267 * is actually multiplied by 16
269 paddw x0005000200010001, %mm4
270 psubsw %mm6, %mm3 /* V60 ; free mm6 */
271 psraw $1, %mm0 /* t154=t156 */
272 movq %mm3, %mm1 /* duplicate V60 */
273 pmulhw x539f539f539f539f, %mm1 /* V67 */
274 movq %mm5, %mm6 /* duplicate V3 */
275 psraw $2, %mm4 /* t148=t150 */
276 paddsw %mm4, %mm5 /* V61 */
277 psubsw %mm4, %mm6 /* V62 ; free mm4 */
278 movq %mm5, %mm4 /* duplicate V61 */
279 psllw $1, %mm1 /* t169 */
280 paddsw %mm0, %mm5 /* V65 -> result */
281 psubsw %mm0, %mm4 /* V64 ; free mm0 */
282 pmulhw x5a825a825a825a82, %mm4 /* V68 */
283 psraw $1, %mm3 /* t158 */
284 psubsw %mm6, %mm3 /* V66 */
285 movq %mm5, %mm2 /* duplicate V65 */
286 pmulhw x61f861f861f861f8, %mm3 /* V70 */
287 psllw $1, %mm6 /* t165 */
288 pmulhw x4546454645464546, %mm6 /* V69 */
289 psraw $1, %mm2 /* t172 */
290 /* moved from next block */
291 movq 8*5(%esi), %mm0 /* V56 */
292 psllw $1, %mm4 /* t174 */
293 /* moved from next block */
294 psraw $1, %mm0 /* t177=t188 */
296 psubsw %mm3, %mm6 /* V72 */
297 psubsw %mm1, %mm3 /* V71 ; free mm1 */
298 psubsw %mm2, %mm6 /* V73 ; free mm2 */
299 /* moved from next block */
300 psraw $1, %mm5 /* t178=t189 */
301 psubsw %mm6, %mm4 /* V74 */
302 /* moved from next block */
303 movq %mm0, %mm1 /* duplicate t177=t188 */
304 paddsw %mm4, %mm3 /* V75 */
305 /* moved from next block */
306 paddsw %mm5, %mm0 /* tm1 */
/* column 1: output butterfly -- combines V56..V59 with V71..V75 into
 * tm1..tm15 (odd row slots).  tm15 stays live in mm1 and tm9/tm11/tm13
 * in mm5/mm6/mm3 into the M4 transpose below instead of being stored
 * (see the "save the store" note).  The commented lines were hoisted
 * into the previous block. */
316 * free mm0, mm1 & mm2
318 * movq 8*5(%esi), %mm0 V56
319 * psllw $1, %mm0 t177=t188 ! new !!
320 * psllw $1, %mm5 t178=t189 ! new !!
321 * movq %mm0, %mm1 duplicate t177=t188
322 * paddsw %mm5, %mm0 tm1
324 movq 8*13(%esi), %mm2 /* V57 */
325 psubsw %mm5, %mm1 /* tm15; free mm5 */
326 movq %mm0, 8(%esi) /* tm1; free mm0 */
327 psraw $1, %mm7 /* t182=t184 ! new !! */
328 /* save the store as used directly in the transpose
329 * movq %mm1, 120(%esi) tm15; free mm1
331 movq %mm7, %mm5 /* duplicate t182=t184 */
332 psubsw %mm3, %mm7 /* tm7 */
333 paddsw %mm3, %mm5 /* tm9; free mm3 */
334 movq 8*9(%esi), %mm0 /* V58 */
335 movq %mm2, %mm3 /* duplicate V57 */
336 movq %mm7, 8*7(%esi) /* tm7; free mm7 */
337 psubsw %mm6, %mm3 /* tm13 */
338 paddsw %mm6, %mm2 /* tm3 ; free mm6 */
339 /* moved up from the transpose */
341 /* moved up from the transpose */
343 movq %mm0, %mm6 /* duplicate V58 */
344 movq %mm2, 8*3(%esi) /* tm3; free mm2 */
345 paddsw %mm4, %mm0 /* tm5 */
346 psubsw %mm4, %mm6 /* tm11; free mm4 */
347 /* moved up from the transpose */
349 movq %mm0, 8*5(%esi) /* tm5; free mm0 */
350 /* moved up from the transpose */
352 /* transpose - M4 part
353 * --------- ---------
354 * | M1 | M2 | | M1'| M3'|
355 * --------- --> ---------
356 * | M3 | M4 | | M2'| M4'|
357 * --------- ---------
358 * Two alternatives: use full mmword approach so the following code can be
359 * scheduled before the transpose is done without stores, or use the faster
360 * half mmword stores (when possible)
/* Transposes the lower-right 4x4 quadrant using half-mmword (movd)
 * stores for the low dwords and punpckhdq to assemble the high halves.
 * NOTE(review): the punpcklwd/punpckhwd word-interleave steps
 * (original lines 361/363/365 etc.) are not visible in this fragment,
 * so mm2/mm3/mm5/mm7 arrive here already word-interleaved. */
362 movd %mm3, 8*9+4(%esi) /* MS part of tmt9 */
364 movd %mm7, 8*13+4(%esi) /* MS part of tmt13 */
366 movd %mm5, 8*9(%esi) /* LS part of tmt9 */
367 punpckhdq %mm3, %mm5 /* free mm3 */
368 movd %mm2, 8*13(%esi) /* LS part of tmt13 */
369 punpckhdq %mm7, %mm2 /* free mm7 */
370 /* moved up from the M3 transpose */
372 /* moved up from the M3 transpose */
373 movq 8*10(%esi), %mm1
374 /* moved up from the M3 transpose */
376 /* shuffle the rest of the data, and write it with 2 mmword writes */
377 movq %mm5, 8*11(%esi) /* tmt11 */
378 /* moved up from the M3 transpose */
380 movq %mm2, 8*15(%esi) /* tmt15 */
381 /* moved up from the M3 transpose */
383 /* transpose - M3 part
384 * moved up to previous code section
385 * movq 8*8(%esi), %mm0
386 * movq 8*10(%esi), %mm1
388 * punpcklwd %mm1, %mm0
389 * punpckhwd %mm1, %mm3
/* Transposes the lower-left 4x4 quadrant into the upper-right (M3 ->
 * M3' per the diagram above).  NOTE(review): several interleave steps
 * (original lines 387/390/393/395-398/403) are not visible in this
 * fragment; mm0/mm2/mm3/mm7 arrive partially interleaved. */
391 movq 8*12(%esi), %mm6
392 movq 8*14(%esi), %mm4
394 /* shuffle the data and write the lower parts of the transposed in 4 dwords */
399 punpckhwd %mm4, %mm2 /* free mm4 */
400 punpckldq %mm6, %mm0 /* free mm6 */
401 /* moved from next block */
402 movq 8*13(%esi), %mm4 /* tmt13 */
404 punpckhdq %mm2, %mm7 /* free mm2 */
405 /* moved from next block */
406 movq %mm3, %mm5 /* duplicate tmt5 */
407 /* column 1: even part (after transpose)
409 * movq %mm3, %mm5 duplicate tmt5
410 * movq 8*13(%esi), %mm4 tmt13
/* Row pass, even part: tmt1/tmt5/tmt9/tmt13 -> V140..V143.
 * 23170 ~= cos(pi/4)*32768; the psllw $2 recovers the scale lost to
 * pmulhw's >>16 (the comments' tNNN names track the paper's temps). */
412 psubsw %mm4, %mm3 /* V134 */
413 pmulhw x5a825a825a825a82, %mm3 /* 23170 ->V136 */
414 movq 8*9(%esi), %mm6 /* tmt9 */
415 paddsw %mm4, %mm5 /* V135 ; mm4 free */
416 movq %mm0, %mm4 /* duplicate tmt1 */
417 paddsw %mm6, %mm0 /* V137 */
418 psubsw %mm6, %mm4 /* V138 ; mm6 free */
419 psllw $2, %mm3 /* t290 */
420 psubsw %mm5, %mm3 /* V139 */
421 movq %mm0, %mm6 /* duplicate V137 */
422 paddsw %mm5, %mm0 /* V140 */
423 movq %mm4, %mm2 /* duplicate V138 */
424 paddsw %mm3, %mm2 /* V141 */
425 psubsw %mm3, %mm4 /* V142 ; mm3 free */
426 movq %mm0, 8*9(%esi) /* V140 */
427 psubsw %mm5, %mm6 /* V143 ; mm5 free */
428 /* moved from next block */
429 movq 8*11(%esi), %mm0 /* tmt11 */
430 movq %mm2, 8*13(%esi) /* V141 */
431 /* moved from next block */
432 movq %mm0, %mm2 /* duplicate tmt11 */
433 /* column 1: odd part (after transpose) */
434 /* moved up to the prev block
435 * movq 8*11(%esi), %mm0 tmt11
436 * movq %mm0, %mm2 duplicate tmt11
/* Row pass, odd part: tmt3/tmt7(mm7, live from the M3 transpose)/
 * tmt11/tmt15 -> V157..V159 plus V149 kept for the butterfly. */
438 movq 8*15(%esi), %mm5 /* tmt15 */
439 psubsw %mm7, %mm0 /* V144 */
440 movq %mm0, %mm3 /* duplicate V144 */
441 paddsw %mm7, %mm2 /* V147 ; free mm7 */
442 pmulhw x539f539f539f539f, %mm0 /* 21407-> V151 */
443 movq %mm1, %mm7 /* duplicate tmt3 */
444 paddsw %mm5, %mm7 /* V145 */
445 psubsw %mm5, %mm1 /* V146 ; free mm5 */
446 psubsw %mm1, %mm3 /* V150 */
447 movq %mm7, %mm5 /* duplicate V145 */
448 pmulhw x4546454645464546, %mm1 /* 17734-> V153 */
449 psubsw %mm2, %mm5 /* V148 */
450 pmulhw x61f861f861f861f8, %mm3 /* 25080-> V154 */
451 psllw $2, %mm0 /* t311 */
452 pmulhw x5a825a825a825a82, %mm5 /* 23170-> V152 */
453 paddsw %mm2, %mm7 /* V149 ; free mm2 */
454 psllw $1, %mm1 /* t313 */
455 nop /* without the nop - freeze here for one clock */
456 movq %mm3, %mm2 /* duplicate V154 */
457 psubsw %mm0, %mm3 /* V155 ; free mm0 */
458 psubsw %mm2, %mm1 /* V156 ; free mm2 */
459 /* moved from the next block */
460 movq %mm6, %mm2 /* duplicate V143 */
461 /* moved from the next block */
462 movq 8*13(%esi), %mm0 /* V141 */
463 psllw $1, %mm1 /* t315 */
464 psubsw %mm7, %mm1 /* V157 (keep V149) */
465 psllw $2, %mm5 /* t317 */
466 psubsw %mm1, %mm5 /* V158 */
467 psllw $1, %mm3 /* t319 */
468 paddsw %mm5, %mm3 /* V159 */
469 /* column 1: output butterfly (after transform)
470 * moved to the prev block
471 * movq %mm6, %mm2 duplicate V143
472 * movq 8*13(%esi), %mm0 V141
/* Final values for the odd output rows: out9/11/13/15 go straight to
 * the matrix; out1/3/5/7 go to the stack temps at 24/16/8/0(%esp) and
 * are copied back into rows 1/3/5/7 at the very end. */
474 psubsw %mm3, %mm2 /* V163 */
475 paddsw %mm3, %mm6 /* V164 ; free mm3 */
476 movq %mm4, %mm3 /* duplicate V142 */
477 psubsw %mm5, %mm4 /* V165 ; free mm5 */
478 movq %mm2, (%esp) /* out7 */
481 paddsw %mm5, %mm3 /* V162 */
482 movq 8*9(%esi), %mm2 /* V140 */
483 movq %mm0, %mm5 /* duplicate V141 */
484 /* in order not to percolate this line up,
485 * we read 72(%esi) very near to this location
487 movq %mm6, 8*9(%esi) /* out9 */
488 paddsw %mm1, %mm0 /* V161 */
489 movq %mm3, 8(%esp) /* out5 */
490 psubsw %mm1, %mm5 /* V166 ; free mm1 */
491 movq %mm4, 8*11(%esi) /* out11 */
493 movq %mm0, 16(%esp) /* out3 */
494 movq %mm2, %mm4 /* duplicate V140 */
495 movq %mm5, 8*13(%esi) /* out13 */
496 paddsw %mm7, %mm2 /* V160 */
497 /* moved from the next block */
499 psubsw %mm7, %mm4 /* V167 ; free mm7 */
500 /* moved from the next block */
503 movq %mm2, 24(%esp) /* out1 */
504 /* moved from the next block */
506 movq %mm4, 8*15(%esi) /* out15 */
507 /* moved from the next block */
509 /* transpose - M2 parts
510 * moved up to the prev block
512 * movq 8*3(%esi), %mm7
514 * punpcklwd %mm7, %mm0
/* Transposes the upper-right quadrant into the lower-left (M2 -> M2')
 * and the upper-left in place (M1 -> M1').  NOTE(review): several
 * loads/interleaves (original lines 511/513/515-519/522/524 and
 * 530-535/537-541) are not visible in this fragment; mm0..mm7 arrive
 * partially interleaved from the hoisted instructions. */
520 /* shuffle the data and write the lower parts of the transposed in 4 dwords */
521 movd %mm0, 8*8(%esi) /* LS part of tmt8 */
523 movd %mm1, 8*12(%esi) /* LS part of tmt12 */
525 movd %mm5, 8*8+4(%esi) /* MS part of tmt8 */
526 punpckhdq %mm5, %mm0 /* tmt10 */
527 movd %mm3, 8*12+4(%esi) /* MS part of tmt12 */
528 punpckhdq %mm3, %mm1 /* tmt14 */
529 /* transpose - M1 parts */
536 punpckhwd %mm2, %mm6 /* free mm2 */
539 punpckhwd %mm4, %mm3 /* free mm4 */
542 punpckldq %mm5, %mm7 /* tmt0 */
543 punpckhdq %mm5, %mm2 /* tmt2 ; free mm5 */
544 /* shuffle the rest of the data, and write it with 2 mmword writes */
545 punpckldq %mm3, %mm6 /* tmt4 */
546 /* moved from next block */
547 movq %mm2, %mm5 /* duplicate tmt2 */
548 punpckhdq %mm3, %mm4 /* tmt6 ; free mm3 */
549 /* moved from next block */
550 movq %mm0, %mm3 /* duplicate tmt10 */
551 /* column 0: odd part (after transpose)
552 *moved up to prev block
553 * movq %mm0, %mm3 duplicate tmt10
554 * movq %mm2, %mm5 duplicate tmt2
/* Row pass, odd part for the even output rows: tmt2/tmt6/tmt10/tmt14
 * -> V121..V125.  V115 and V123 are parked in rows 0 and 2 and
 * reloaded for the final butterfly. */
556 psubsw %mm4, %mm0 /* V110 */
557 paddsw %mm4, %mm3 /* V113 ; free mm4 */
558 movq %mm0, %mm4 /* duplicate V110 */
559 paddsw %mm1, %mm2 /* V111 */
560 pmulhw x539f539f539f539f, %mm0 /* 21407-> V117 */
561 psubsw %mm1, %mm5 /* V112 ; free mm1 */
562 psubsw %mm5, %mm4 /* V116 */
563 movq %mm2, %mm1 /* duplicate V111 */
564 pmulhw x4546454645464546, %mm5 /* 17734-> V119 */
565 psubsw %mm3, %mm2 /* V114 */
566 pmulhw x61f861f861f861f8, %mm4 /* 25080-> V120 */
567 paddsw %mm3, %mm1 /* V115 ; free mm3 */
568 pmulhw x5a825a825a825a82, %mm2 /* 23170-> V118 */
569 psllw $2, %mm0 /* t266 */
570 movq %mm1, (%esi) /* save V115 */
571 psllw $1, %mm5 /* t268 */
572 psubsw %mm4, %mm5 /* V122 */
573 psubsw %mm0, %mm4 /* V121 ; free mm0 */
574 psllw $1, %mm5 /* t270 */
575 psubsw %mm1, %mm5 /* V123 ; free mm1 */
576 psllw $2, %mm2 /* t272 */
577 psubsw %mm5, %mm2 /* V124 (keep V123) */
578 psllw $1, %mm4 /* t274 */
579 movq %mm5, 8*2(%esi) /* save V123 ; free mm5 */
580 paddsw %mm2, %mm4 /* V125 (keep V124) */
581 /* column 0: even part (after transpose) */
/* Row pass, even part: tmt0(mm7)/tmt4(mm6)/tmt8/tmt12 -> V106..V109,
 * mirroring the column-1 even part above. */
582 movq 8*12(%esi), %mm0 /* tmt12 */
583 movq %mm6, %mm3 /* duplicate tmt4 */
584 psubsw %mm0, %mm6 /* V100 */
585 paddsw %mm0, %mm3 /* V101 ; free mm0 */
586 pmulhw x5a825a825a825a82, %mm6 /* 23170 ->V102 */
587 movq %mm7, %mm5 /* duplicate tmt0 */
588 movq 8*8(%esi), %mm1 /* tmt8 */
589 paddsw %mm1, %mm7 /* V103 */
590 psubsw %mm1, %mm5 /* V104 ; free mm1 */
591 movq %mm7, %mm0 /* duplicate V103 */
592 psllw $2, %mm6 /* t245 */
593 paddsw %mm3, %mm7 /* V106 */
594 movq %mm5, %mm1 /* duplicate V104 */
595 psubsw %mm3, %mm6 /* V105 */
596 psubsw %mm3, %mm0 /* V109; free mm3 */
597 paddsw %mm6, %mm5 /* V107 */
598 psubsw %mm6, %mm1 /* V108 ; free mm6 */
599 /* column 0: output butterfly (after transform) */
/* Final values for the even output rows out0..out14, then the stacked
 * odd results (out1/3/5/7) are copied from (%esp) back into rows
 * 1/3/5/7.  NOTE(review): the loads of mm0/mm6/mm4/mm1 from the stack
 * temps (original lines ~620-649) are not visible in this fragment,
 * nor are the emms and remaining epilogue lines between the two popl
 * instructions; confirm them in the full source. */
600 movq %mm1, %mm3 /* duplicate V108 */
601 paddsw %mm2, %mm1 /* out4 */
603 psubsw %mm2, %mm3 /* out10 ; free mm2 */
605 movq %mm0, %mm6 /* duplicate V109 */
606 movq %mm1, 8*4(%esi) /* out4 ; free mm1 */
607 psubsw %mm4, %mm0 /* out6 */
608 movq %mm3, 8*10(%esi) /* out10 ; free mm3 */
610 paddsw %mm4, %mm6 /* out8 ; free mm4 */
611 movq %mm7, %mm1 /* duplicate V106 */
612 movq %mm0, 8*6(%esi) /* out6 ; free mm0 */
614 movq (%esi), %mm4 /* V115 */
615 movq %mm6, 8*8(%esi) /* out8 ; free mm6 */
616 movq %mm5, %mm2 /* duplicate V107 */
617 movq 8*2(%esi), %mm3 /* V123 */
618 paddsw %mm4, %mm7 /* out0 */
619 /* moved up from next block */
622 /* moved up from next block */
624 psubsw %mm4, %mm1 /* out14 ; free mm4 */
625 paddsw %mm3, %mm5 /* out2 */
627 movq %mm7, (%esi) /* out0 ; free mm7 */
629 movq %mm1, 8*14(%esi) /* out14 ; free mm1 */
630 psubsw %mm3, %mm2 /* out12 ; free mm3 */
631 movq %mm5, 8*2(%esi) /* out2 ; free mm5 */
633 /* moved up to the prev block */
635 /* moved up to the prev block */
637 movq %mm2, 8*12(%esi) /* out12 ; free mm2 */
638 /* moved up to the prev block */
640 /* move back the data to its correct place
641 * moved up to the prev block
642 * movq 16(%esp), %mm0
650 movq %mm0, 8*3(%esi) /* out3 */
652 movq %mm6, 8*5(%esi) /* out5 */
653 movq %mm4, 8*7(%esi) /* out7 */
654 movq %mm1, 8(%esi) /* out1 */
656 popl %edi /* Pop off the temp variables */
665 popl %edi /* Pop off the old variables */
675 .size IDCT_mmx,.Lfe1-IDCT_mmx