2 ; MMX32 iDCT algorithm (IEEE-1180 compliant) :: idct_mmx32()
6 ; v0.16B33 initial release
8 ; This was one of the harder pieces of work to code.
9 ; Intel's app-note focuses on the numerical issues of the algorithm, but
10 ; assumes the programmer is familiar with IDCT mathematics, leaving the
11 ; form of the complete function up to the programmer's imagination.
15 ; I played around with the code for quite a few hours. I came up
16 ; with *A* working IDCT algorithm, however I'm not sure whether my routine
17 ; is "the correct one." But rest assured, my code passes all six IEEE
18 ; accuracy tests with plenty of margin.
20 ; My IDCT algorithm consists of 4 steps:
22 ; 1) IDCT-row transformation (using the IDCT-row function) on all 8 rows
23 ; This yields an intermediate 8x8 matrix.
25 ; 2) intermediate matrix transpose (mandatory)
27 ; 3) IDCT-row transformation (2nd time) on all 8 rows of the intermediate
28 ; matrix. The output is the final-result, in transposed form.
30 ; 4) post-transformation matrix transpose
31 ; (not necessary if the input-data is already transposed, this could
32 ; be done during the MPEG "zig-zag" scan, but since my algorithm
33 ; requires at least one transpose operation, why not re-use the
36 ; Although the (1st) and (3rd) steps use the SAME row-transform operation,
37 ; the (3rd) step uses different shift&round constants (explained later.)
39 ; Also note that the intermediate transpose (2) would not be necessary
40 ; if the subsequent operation were an iDCT-column transformation. Since
41 ; we only have the iDCT-row transform, we transpose the intermediate
42 ; matrix and use the iDCT-row transform a 2nd time.
44 ; I had to change some constants/variables for my method to work :
46 ; As given by Intel, the #defines for SHIFT_INV_COL and RND_INV_COL are
47 ; wrong. Not surprising since I'm not using a true column-transform
48 ; operation, but the row-transform operation (as mentioned earlier.)
49 ; round_inv_col[], which is given as "4 short" values, should have the
50 ; same dimensions as round_inv_row[]. The corrected variables are
53 ; Intel's code defines a different table for each row operation.
54 ; The tables given are 0/4, 1/7, 2/6, and 5/3. My code only uses row#0.
55 ; Using the other rows messes up the overall transform.
57 ; IMPLEMENTATION DETAILs
58 ; ----------------------
60 ; I divided the algorithm's work into two subroutines,
61 ; 1) idct_mmx32_rows() - transforms 8 rows, then transpose
62 ; 2) idct_mmx32_cols() - transforms 8 rows, then transpose
63 ; yields final result ("drop-in" direct replacement for INT32 IDCT)
65 ; The 2nd function is a clone of the 1st, with changes made only to the
66 ; shift&rounding instructions.
68 ; In the 1st function (rows), the shift & round instructions use
69 ; SHIFT_INV_ROW & round_inv_row[] (renamed to r_inv_row[])
71 ; In the 2nd function (cols), the shift & round instructions use
72 ; SHIFT_INV_COL & round_inv_col[] (renamed to r_inv_col[])
74 ; Each function contains an integrated transpose-operator, which comes
75 ; AFTER the primary transformation operation. In the future, I'll optimize
76 ; the code to do more of the transpose-work "in-place". Right now, I've
77 ; left the code as two subroutines and a main calling function, so other
78 ; people can read the code more easily.
80 ; liaor@umcc.ais.org http://members.tripod.com/~liaor
84 ;;; A.Stevens Jul 2000 easy-peasy quick port to nasm
85 ;;; Isn't open source a sensible idea...
88 ;=============================================================================
90 ; AP-922 http://developer.intel.com/vtune/cbts/strmsimd
91 ; These examples contain code fragments for first stage iDCT 8x8
92 ; (for rows) and first stage DCT 8x8 (for columns)
94 ;============================================================================
96 %define INP eax ; INP = eax: pointer to the 8x8 block of shorts (short *blk)
97 %define OUT ecx ; OUT = ecx: pointer to output (temporary store space qwTemp[])
98 %define TABLE ebx ; TABLE = ebx: pointer to idct_tab_01234567[]
99 %define round_inv_row edx ; edx: pointer to the rounder of the first (row) pass, loaded from idct_r_inv_row
100 %define round_inv_col edx ; same register as round_inv_row; loaded from idct_r_inv_col for the second pass
103 %define ROW_STRIDE 16 ; bytes per matrix row (8 shorts); used by the 8x8 matrix transposer
104 %define BITS_INV_ACC 4 ; 4 or 5 for IEEE
105 %define SHIFT_INV_ROW (16 - BITS_INV_ACC) ; = 12 when BITS_INV_ACC is 4; psrad count of the first (row) pass
106 %define SHIFT_INV_COL (1 + BITS_INV_ACC +14 ) ; = 19 when BITS_INV_ACC is 4 (changed from Intel's value); psrad count of the second pass
109 ;; Variables and tables defined in C for convenience
111 extern idct_r_inv_row ; 2 DWORDs: rounder added (paddd) before the SHIFT_INV_ROW shift
112 extern idct_r_inv_col ; 2 DWORDs: rounder added (paddd) before the SHIFT_INV_COL shift
113 extern idct_r_inv_corr ; 2 DWORDs; not referenced by the code visible in this listing
114 extern idct_tab_01234567 ; Concatenated table of coefficients (read at offsets 0..56 from TABLE)
117 ;; private variables and functions
122 ; qwTemp: resw 64 ; (disabled) temporary storage space, 8x8 of shorts
127 ;; static void idct_mmx( short *blk ) -- blk is read from [ebp+8]; the 8x8 block of shorts is overwritten with the result
131 push ebp ; save frame pointer
140 ;; First pass: transform all 8 rows of the 8x8 iDCT block
143 ; this part of the routine performs two operations
144 ; 1) iDCT row transform
145 ; for( i = 0; i < 8; ++ i)
146 ; DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
148 ; 2) transpose the matrix (which was stored in qwTemp[])
149 ; qwTemp[] -> [8x8 matrix transpose] -> blk[]
151 mov INP, [ebp+8] ; INP = blk
152 mov edi, 0x00; x = 0 (row counter)
153 lea TABLE,[idct_tab_01234567]; TABLE -> coefficient table of row 0 (TABLE is never advanced below)
158 lea round_inv_row, [idct_r_inv_row] ; edx -> rounder for this pass
161 ; for ( x = 0; x < 8; ++x ) ; transform one row per iteration
; NOTE(review): the initialisation of OUT and the "lpa" loop label are not visible
; in this listing -- confirm that both precede the row code below.
; Lane lists in the comments are written high word/dword first (e.g. x3 x2 x1 x0).
164 movq mm0, [INP] ; mm0 = x3 x2 x1 x0 (first 4 shorts of the row)
166 movq mm1, [INP+8] ; mm1 = x7 x6 x5 x4 (last 4 shorts of the row)
167 movq mm2, mm0 ; mm2 = x3 x2 x1 x0 (copy)
169 movq mm3, [TABLE] ; mm3 = w06 w04 w02 w00
170 punpcklwd mm0, mm1 ; mm0 = x5 x1 x4 x0
173 movq mm5, mm0 ; mm5 = x5 x1 x4 x0 (copy)
174 punpckldq mm0, mm0 ; mm0 = x4 x0 x4 x0
176 movq mm4, [TABLE+8] ; mm4 = w07 w05 w03 w01
177 punpckhwd mm2, mm1 ; mm2 = x7 x3 x6 x2
179 pmaddwd mm3, mm0 ; mm3 = x4*w06+x0*w04 x4*w02+x0*w00
180 movq mm6, mm2 ; mm6 = x7 x3 x6 x2 (copy)
182 movq mm1, [TABLE+32] ; mm1 = w22 w20 w18 w16
183 punpckldq mm2, mm2 ; mm2 = x6 x2 x6 x2
185 pmaddwd mm4, mm2 ; mm4 = x6*w07+x2*w05 x6*w03+x2*w01
186 punpckhdq mm5, mm5 ; mm5 = x5 x1 x5 x1
188 pmaddwd mm0, [TABLE+16] ; mm0 = x4*w14+x0*w12 x4*w10+x0*w08
189 punpckhdq mm6, mm6 ; mm6 = x7 x3 x7 x3
191 movq mm7, [TABLE+40] ; mm7 = w23 w21 w19 w17
192 pmaddwd mm1, mm5 ; mm1 = x5*w22+x1*w20 x5*w18+x1*w16
194 paddd mm3, [round_inv_row]; + rounder (applied before the final right shift)
195 pmaddwd mm7, mm6 ; mm7 = x7*w23+x3*w21 x7*w19+x3*w17
197 pmaddwd mm2, [TABLE+24] ; mm2 = x6*w15+x2*w13 x6*w11+x2*w09
198 paddd mm3, mm4 ; mm3 = a1=sum(even1) a0=sum(even0)
200 pmaddwd mm5, [TABLE+48] ; mm5 = x5*w30+x1*w28 x5*w26+x1*w24
201 movq mm4, mm3 ; mm4 = a1 a0 (copy)
203 pmaddwd mm6, [TABLE+56] ; mm6 = x7*w31+x3*w29 x7*w27+x3*w25
204 paddd mm1, mm7 ; mm1 = b1=sum(odd1) b0=sum(odd0)
206 paddd mm0, [round_inv_row]; + rounder (applied before the final right shift)
207 psubd mm3, mm1 ; mm3 = a1-b1 a0-b0
209 psrad mm3, SHIFT_INV_ROW ; mm3 = y6 y7 = (a1-b1 a0-b0) >> SHIFT_INV_ROW
210 paddd mm1, mm4 ; mm1 = a1+b1 a0+b0
212 paddd mm0, mm2 ; mm0 = a3=sum(even3) a2=sum(even2)
213 psrad mm1, SHIFT_INV_ROW ; mm1 = y1 y0 = (a1+b1 a0+b0) >> SHIFT_INV_ROW
215 paddd mm5, mm6 ; mm5 = b3=sum(odd3) b2=sum(odd2)
216 movq mm4, mm0 ; mm4 = a3 a2 (copy)
218 paddd mm0, mm5 ; mm0 = a3+b3 a2+b2
219 psubd mm4, mm5 ; mm4 = a3-b3 a2-b2
221 add INP, 16; advance INP to the next input row (16 bytes = 8 shorts)
222 psrad mm4, SHIFT_INV_ROW ; mm4 = y4 y5 = (a3-b3 a2-b2) >> SHIFT_INV_ROW
224 ; add TABLE, 0; ; (disabled) per-row table advance; TABLE stays on the row-0 table
225 psrad mm0, SHIFT_INV_ROW ; mm0 = y3 y2 = (a3+b3 a2+b2) >> SHIFT_INV_ROW
227 ; movq mm2, [INP] ; ; row+1; 0; x3 x2 x1 x0
228 packssdw mm4, mm3 ; mm4 = y6 y7 y4 y5 (dwords packed to shorts with signed saturation)
230 packssdw mm1, mm0 ; mm1 = y3 y2 y1 y0 (dwords packed to shorts with signed saturation)
231 movq mm7, mm4 ; mm7 = y6 y7 y4 y5 (copy)
233 ; movq mm0, mm2 ; ; row+1; 2 ; x3 x2 x1 x0
234 psrld mm4, 16 ; mm4 = 0 y6 0 y4
236 movq [OUT], mm1 ; save y3 y2 y1 y0
237 pslld mm7, 16 ; mm7 = y7 0 y5 0
239 ; movq mm1, [INP+8] ; ; row+1; 1 ; x7 x6 x5 x4
240 por mm7, mm4 ; mm7 = y7 y6 y5 y4 (word pairs swapped into natural order)
242 movq mm3, [TABLE] ; mm3 = w06 w04 w02 w00 -- presumably a preload kept from a pipelined variant; TODO confirm
243 ; punpcklwd mm0, mm1 ; ; row+1; x5 x1 x4 x0
245 ; begin processing row 1
246 movq [OUT+8], mm7 ; save y7 y6 y5 y4
249 add OUT, 16; advance OUT to the next output row (16 bytes)
251 jl near lpa; end for ( x = 0; x < 8; ++x ) -- NOTE(review): the x increment and the compare that should set the flags for this jl are not visible in this listing; confirm
253 ; done with the iDCT row-transformation
255 ; now we have to transpose the intermediate 8x8 matrix
256 ; 8x8 (OUT) -> 8x8't' (INP, reloaded with blk below)
; NOTE(review): the reads below index the matrix from OUT + 0, but the row loop advanced
; OUT by 16 per row; a reset of OUT is not visible in this listing -- confirm.
257 ; the transposition is implemented as 4 sub-operations, one per 4x4 quad of shorts.
258 ; 1) transpose upper-left quad
259 ; 2) transpose lower-right quad
260 ; 3) transpose lower-left quad
261 ; 4) transpose upper-right quad
264 ; mm0 = 1st row [ A B C D ] row1
265 ; mm1 = 2nd row [ E F G H ] 2
266 ; mm2 = 3rd row [ I J K L ] 3
267 ; mm3 = 4th row [ M N O P ] 4
; In the comments below, 0..15 number the shorts of the current quad in row-major
; order (row1 = 0..3, row2 = 4..7, ...); lists are written lowest word first.
; The first loads of each quad are interleaved with the stores of the previous one.
269 ; 1) transpose upper-left quad (source rows 0-3, left half -> destination rows 0-3, left half)
273 movq mm0, [OUT + ROW_STRIDE * 0 ] ; mm0 = row1 [ 0 1 2 3]
275 movq mm1, [OUT + ROW_STRIDE * 1 ] ; mm1 = row2 [ 4 5 6 7]
276 movq mm4, mm0; mm4 = copy of row1 [ 0 1 2 3]
278 movq mm2, [OUT + ROW_STRIDE * 2 ] ; mm2 = row3 [ 8 9 10 11]
279 punpcklwd mm0, mm1; mm0 = [ 0 4 1 5]
281 movq mm3, [OUT + ROW_STRIDE * 3] ; mm3 = row4 [12 13 14 15]
282 punpckhwd mm4, mm1; mm4 = [ 2 6 3 7]
285 punpcklwd mm2, mm3; mm2 = [ 8 12 9 13]
; NOTE(review): in every quad the next instruction needs mm6 = copy of row3 (mm2 before
; the punpcklwd); that copy is not visible in this listing -- confirm.
287 punpckhwd mm6, mm3; mm6 = [10 14 11 15]
288 movq mm1, mm0; mm1 = [ 0 4 1 5]
290 mov INP, [ebp+8]; reload INP = blk (the row loop advanced INP); destination of the transpose
291 punpckldq mm0, mm2; final result mm0 = row1 [0 4 8 12]
293 movq mm3, mm4; mm3 = [ 2 6 3 7]
294 punpckhdq mm1, mm2; final result mm1 = row2 [1 5 9 13]
296 movq [ INP + ROW_STRIDE * 0 ], mm0; store destination row 0, left half
297 punpckldq mm4, mm6; final result mm4 = row3 [2 6 10 14]
299 ; begin reading next quadrant (lower-right)
300 movq mm0, [OUT + ROW_STRIDE*4 + 8]; mm0 = row1 of the lower-right quad
301 punpckhdq mm3, mm6; final result mm3 = row4 [3 7 11 15]
303 movq [ INP +ROW_STRIDE * 2], mm4; store destination row 2, left half
304 movq mm4, mm0; mm4 = copy of row1 [A B C D] (lower-right quad)
306 movq [ INP +ROW_STRIDE * 1], mm1; store destination row 1, left half
308 movq mm1, [OUT + ROW_STRIDE*5 + 8] ; mm1 = row2 of the lower-right quad
310 movq [ INP +ROW_STRIDE * 3], mm3; store destination row 3, left half
311 punpcklwd mm0, mm1; mm0 = [ 0 4 1 5]
313 ; 2) transpose lower-right quadrant (source rows 4-7, right half -> destination rows 4-7, right half)
315 ; movq mm0, [OUT + ROW_STRIDE*4 + 8]
317 ; movq mm1, [OUT + ROW_STRIDE*5 + 8]
318 ; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
320 movq mm2, [OUT + ROW_STRIDE*6 + 8] ; mm2 = row3
321 ; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
322 punpckhwd mm4, mm1; mm4 = [ 2 6 3 7]
324 movq mm3, [OUT + ROW_STRIDE*7 + 8] ; mm3 = row4
327 punpcklwd mm2, mm3; mm2 = [ 8 12 9 13]
328 movq mm1, mm0; mm1 = [ 0 4 1 5]
330 punpckhwd mm6, mm3; mm6 = [10 14 11 15]
331 movq mm3, mm4; mm3 = [ 2 6 3 7]
333 punpckldq mm0, mm2; final result mm0 = row1 [0 4 8 12]
335 punpckhdq mm1, mm2; final result mm1 = row2 [1 5 9 13]
338 movq [ INP + ROW_STRIDE*4 + 8], mm0; store destination row 4, right half
339 punpckldq mm4, mm6; final result mm4 = row3 [2 6 10 14]
341 movq mm0, [OUT + ROW_STRIDE * 4 ] ; begin reading next quadrant (lower-left): mm0 = its row1
342 punpckhdq mm3, mm6; final result mm3 = row4 [3 7 11 15]
343 movq [ INP +ROW_STRIDE*6 + 8], mm4; store destination row 6, right half
344 movq mm4, mm0; mm4 = copy of row1 [A B C D] (lower-left quad)
345 movq [ INP +ROW_STRIDE*5 + 8], mm1; store destination row 5, right half
347 movq mm1, [OUT + ROW_STRIDE * 5 ] ; mm1 = row2 of the lower-left quad
350 movq [ INP +ROW_STRIDE*7 + 8], mm3; store destination row 7, right half
351 punpcklwd mm0, mm1; mm0 = [ 0 4 1 5]
353 ; 3) transpose lower-left (source rows 4-7, left half -> destination rows 0-3, right half)
354 ; movq mm0, [OUT + ROW_STRIDE * 4 ]
356 ; movq mm1, [OUT + ROW_STRIDE * 5 ]
357 ; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
359 movq mm2, [OUT + ROW_STRIDE * 6 ] ; mm2 = row3
360 ; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
361 punpckhwd mm4, mm1; mm4 = [ 2 6 3 7]
363 movq mm3, [OUT + ROW_STRIDE * 7 ] ; mm3 = row4
366 punpcklwd mm2, mm3; mm2 = [ 8 12 9 13]
367 movq mm1, mm0; mm1 = [ 0 4 1 5]
369 punpckhwd mm6, mm3; mm6 = [10 14 11 15]
370 movq mm3, mm4; mm3 = [ 2 6 3 7]
372 punpckldq mm0, mm2; final result mm0 = row1 [0 4 8 12]
374 punpckhdq mm1, mm2; final result mm1 = row2 [1 5 9 13]
377 movq [ INP + ROW_STRIDE * 0 + 8 ], mm0; store destination row 0, right half
378 punpckldq mm4, mm6; final result mm4 = row3 [2 6 10 14]
380 ; begin reading next quadrant (upper-right)
381 movq mm0, [OUT + ROW_STRIDE*0 + 8]; mm0 = row1 of the upper-right quad
382 punpckhdq mm3, mm6; final result mm3 = row4 [3 7 11 15]
384 movq [ INP +ROW_STRIDE * 2 + 8], mm4; store destination row 2, right half
385 movq mm4, mm0; mm4 = copy of row1 [A B C D] (upper-right quad)
387 movq [ INP +ROW_STRIDE * 1 + 8 ], mm1; store destination row 1, right half
388 movq mm1, [OUT + ROW_STRIDE*1 + 8] ; mm1 = row2 of the upper-right quad
390 movq [ INP +ROW_STRIDE * 3 + 8], mm3; store destination row 3, right half
391 punpcklwd mm0, mm1; mm0 = [ 0 4 1 5]
394 ; 4) transpose upper-right quadrant (source rows 0-3, right half -> destination rows 4-7, left half)
396 ; movq mm0, [OUT + ROW_STRIDE*0 + 8]
398 ; movq mm1, [OUT + ROW_STRIDE*1 + 8]
399 ; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
401 movq mm2, [OUT + ROW_STRIDE*2 + 8] ; mm2 = row3
402 ; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
403 punpckhwd mm4, mm1; mm4 = [ 2 6 3 7]
405 movq mm3, [OUT + ROW_STRIDE*3 + 8] ; mm3 = row4
408 punpcklwd mm2, mm3; mm2 = [ 8 12 9 13]
409 movq mm1, mm0; mm1 = [ 0 4 1 5]
411 punpckhwd mm6, mm3; mm6 = [10 14 11 15]
412 movq mm3, mm4; mm3 = [ 2 6 3 7]
414 punpckldq mm0, mm2; final result mm0 = row1 [0 4 8 12]
416 punpckhdq mm1, mm2; final result mm1 = row2 [1 5 9 13]
419 movq [ INP + ROW_STRIDE*4 ], mm0; store destination row 4, left half
420 punpckldq mm4, mm6; final result mm4 = row3 [2 6 10 14]
422 movq [ INP +ROW_STRIDE*5 ], mm1; store destination row 5, left half
423 punpckhdq mm3, mm6; final result mm3 = row4 [3 7 11 15]
425 movq [ INP +ROW_STRIDE*6 ], mm4; store destination row 6, left half
428 movq [ INP +ROW_STRIDE*7 ], mm3; store destination row 7, left half
430 ; Second pass. Conceptually this is the column transform.
431 ; Actually, the (transposed) matrix is transformed
432 ; row by row. This code is identical to the first pass (idct_mmx32_rows()),
433 ; except for the SHIFT amount and ROUND_INV amount.
435 ; this part of the routine performs two operations
436 ; 1) iDCT row transform
437 ; for( i = 0; i < 8; ++ i)
438 ; DCT_8_INV_ROW_1( blk[i*8], qwTemp[i] );
440 ; 2) transpose the matrix (which was stored in qwTemp[])
441 ; qwTemp[] -> [8x8 matrix transpose] -> blk[]
444 mov INP, [ebp+8]; INP = blk, row 0
445 mov edi, 0x00; x = 0 (row counter, compared against 8 at the end of the loop)
447 lea TABLE, [idct_tab_01234567]; TABLE -> coefficient table of row 0
450 ; mov OUT, INP; ; (disabled) algorithm writes data in-place -> row 0
452 lea round_inv_col, [idct_r_inv_col] ; edx -> rounder for this pass
453 jmp acc_idct_colloop1 ; enter the loop (the label itself is not visible in this listing)
455 ; for ( x = 0; x < 8; ++x ) ; transform one row per iteration
; Lane lists in the comments are written high word/dword first (e.g. x3 x2 x1 x0).
459 movq mm0, [INP] ; mm0 = x3 x2 x1 x0 (first 4 shorts of the row)
461 movq mm1, [INP+8] ; mm1 = x7 x6 x5 x4 (last 4 shorts of the row)
462 movq mm2, mm0 ; mm2 = x3 x2 x1 x0 (copy)
464 movq mm3, [TABLE] ; mm3 = w06 w04 w02 w00
465 punpcklwd mm0, mm1 ; mm0 = x5 x1 x4 x0
468 movq mm5, mm0 ; mm5 = x5 x1 x4 x0 (copy)
469 punpckldq mm0, mm0 ; mm0 = x4 x0 x4 x0
471 movq mm4, [TABLE+8] ; mm4 = w07 w05 w03 w01
472 punpckhwd mm2, mm1 ; mm2 = x7 x3 x6 x2
474 pmaddwd mm3, mm0 ; mm3 = x4*w06+x0*w04 x4*w02+x0*w00
475 movq mm6, mm2 ; mm6 = x7 x3 x6 x2 (copy)
477 movq mm1, [TABLE+32] ; mm1 = w22 w20 w18 w16
478 punpckldq mm2, mm2 ; mm2 = x6 x2 x6 x2
480 pmaddwd mm4, mm2 ; mm4 = x6*w07+x2*w05 x6*w03+x2*w01
481 punpckhdq mm5, mm5 ; mm5 = x5 x1 x5 x1
483 pmaddwd mm0, [TABLE+16] ; mm0 = x4*w14+x0*w12 x4*w10+x0*w08
484 punpckhdq mm6, mm6 ; mm6 = x7 x3 x7 x3
486 movq mm7, [TABLE+40] ; mm7 = w23 w21 w19 w17
487 pmaddwd mm1, mm5 ; mm1 = x5*w22+x1*w20 x5*w18+x1*w16
489 paddd mm3, [round_inv_col] ; + rounder (applied before the final right shift)
490 pmaddwd mm7, mm6 ; mm7 = x7*w23+x3*w21 x7*w19+x3*w17
492 pmaddwd mm2, [TABLE+24] ; mm2 = x6*w15+x2*w13 x6*w11+x2*w09
493 paddd mm3, mm4 ; mm3 = a1=sum(even1) a0=sum(even0)
495 pmaddwd mm5, [TABLE+48] ; mm5 = x5*w30+x1*w28 x5*w26+x1*w24
496 movq mm4, mm3 ; mm4 = a1 a0 (copy)
498 pmaddwd mm6, [TABLE+56] ; mm6 = x7*w31+x3*w29 x7*w27+x3*w25
499 paddd mm1, mm7 ; mm1 = b1=sum(odd1) b0=sum(odd0)
501 paddd mm0, [round_inv_col] ; + rounder (applied before the final right shift)
502 psubd mm3, mm1 ; mm3 = a1-b1 a0-b0
504 psrad mm3, SHIFT_INV_COL; mm3 = y6 y7 = (a1-b1 a0-b0) >> SHIFT_INV_COL
505 paddd mm1, mm4 ; mm1 = a1+b1 a0+b0
507 paddd mm0, mm2 ; mm0 = a3=sum(even3) a2=sum(even2)
508 psrad mm1, SHIFT_INV_COL; mm1 = y1 y0 = (a1+b1 a0+b0) >> SHIFT_INV_COL
510 paddd mm5, mm6 ; mm5 = b3=sum(odd3) b2=sum(odd2)
511 movq mm4, mm0 ; mm4 = a3 a2 (copy)
513 paddd mm0, mm5 ; mm0 = a3+b3 a2+b2
514 psubd mm4, mm5 ; mm4 = a3-b3 a2-b2
516 add INP, 16; advance INP to the next input row (16 bytes = 8 shorts)
517 psrad mm4, SHIFT_INV_COL; mm4 = y4 y5 = (a3-b3 a2-b2) >> SHIFT_INV_COL
519 add TABLE, 0; adds 0, so TABLE stays on the row-0 table for every row (the original note says a per-row table would add 64)
520 psrad mm0, SHIFT_INV_COL; mm0 = y3 y2 = (a3+b3 a2+b2) >> SHIFT_INV_COL
522 ; movq mm2, [INP] ; ; row+1; 0; x3 x2 x1 x0
523 packssdw mm4, mm3 ; mm4 = y6 y7 y4 y5 (dwords packed to shorts with signed saturation)
525 packssdw mm1, mm0 ; mm1 = y3 y2 y1 y0 (dwords packed to shorts with signed saturation)
526 movq mm7, mm4 ; mm7 = y6 y7 y4 y5 (copy)
528 ; movq mm0, mm2 ; ; row+1; 2 ; x3 x2 x1 x0
529 ; por mm1, dct_one_corr ; ; (disabled) correction y2 +0.5
530 psrld mm4, 16 ; mm4 = 0 y6 0 y4
532 movq [OUT], mm1 ; save y3 y2 y1 y0
533 pslld mm7, 16 ; mm7 = y7 0 y5 0
535 ; movq mm1, [INP+8] ; ; row+1; 1 ; x7 x6 x5 x4
536 ; por mm7, dct_one_corr ; ; (disabled) correction y2 +0.5
537 por mm7, mm4 ; mm7 = y7 y6 y5 y4 (word pairs swapped into natural order)
539 ; movq mm3, [TABLE] ; ; 3 ; w06 w04 w02 w00
540 ; punpcklwd mm0, mm1 ; ; row+1; x5 x1 x4 x0
542 ; begin processing row 1
543 movq [OUT+8], mm7 ; save y7 y6 y5 y4
; NOTE(review): the increment of x (edi) and the advance of OUT to the next output row
; are not visible in this listing -- confirm that both precede the compare below.
547 cmp edi, 0x08; compare x with 8
549 jl near acc_idct_colloop1; end for ( x = 0; x < 8; ++x ): loop while x < 8
551 ; done with the iDCT column-transformation
553 ; now we have to transpose the output 8x8 matrix
554 ; 8x8 (OUT) -> 8x8't' (INP, reloaded with blk below)
; NOTE(review): the reads below index the matrix from OUT + 0; a reset of OUT to the
; start of the buffer is not visible in this listing -- confirm.
556 ; the transposition is implemented as 4 sub-operations, one per 4x4 quad of shorts.
557 ; 1) transpose upper-left quad
558 ; 2) transpose lower-right quad
559 ; 3) transpose lower-left quad
560 ; 4) transpose upper-right quad
564 ; mm0 = 1st row [ A B C D ] row1
565 ; mm1 = 2nd row [ E F G H ] 2
566 ; mm2 = 3rd row [ I J K L ] 3
567 ; mm3 = 4th row [ M N O P ] 4
; In the comments below, 0..15 number the shorts of the current quad in row-major
; order (row1 = 0..3, row2 = 4..7, ...); lists are written lowest word first.
; The first loads of each quad are interleaved with the stores of the previous one.
569 ; 1) transpose upper-left quad (source rows 0-3, left half -> destination rows 0-3, left half)
573 movq mm0, [OUT + ROW_STRIDE * 0 ] ; mm0 = row1 [ 0 1 2 3]
575 movq mm1, [OUT + ROW_STRIDE * 1 ] ; mm1 = row2 [ 4 5 6 7]
576 movq mm4, mm0; mm4 = copy of row1 [ 0 1 2 3]
578 movq mm2, [OUT + ROW_STRIDE * 2 ] ; mm2 = row3 [ 8 9 10 11]
579 punpcklwd mm0, mm1; mm0 = [ 0 4 1 5]
581 movq mm3, [OUT + ROW_STRIDE * 3] ; mm3 = row4 [12 13 14 15]
582 punpckhwd mm4, mm1 ; mm4 = [ 2 6 3 7]
585 punpcklwd mm2, mm3 ; mm2 = [ 8 12 9 13]
; NOTE(review): in every quad the next instruction needs mm6 = copy of row3 (mm2 before
; the punpcklwd); that copy is not visible in this listing -- confirm.
587 punpckhwd mm6, mm3 ; mm6 = [10 14 11 15]
588 movq mm1, mm0 ; mm1 = [ 0 4 1 5]
590 mov INP, [ebp+8] ; reload INP = blk (the loop advanced INP); destination of the transpose
591 punpckldq mm0, mm2 ; final result mm0 = row1 [0 4 8 12]
593 movq mm3, mm4; mm3 = [ 2 6 3 7]
594 punpckhdq mm1, mm2; final result mm1 = row2 [1 5 9 13]
596 movq [ INP + ROW_STRIDE * 0 ], mm0; store destination row 0, left half
597 punpckldq mm4, mm6; final result mm4 = row3 [2 6 10 14]
599 ; begin reading next quadrant (lower-right)
600 movq mm0, [OUT + ROW_STRIDE*4 + 8]; mm0 = row1 of the lower-right quad
601 punpckhdq mm3, mm6; final result mm3 = row4 [3 7 11 15]
603 movq [ INP +ROW_STRIDE * 2], mm4; store destination row 2, left half
604 movq mm4, mm0; mm4 = copy of row1 [A B C D] (lower-right quad)
606 movq [ INP +ROW_STRIDE * 1], mm1; store destination row 1, left half
608 movq mm1, [OUT + ROW_STRIDE*5 + 8] ; mm1 = row2 of the lower-right quad
610 movq [ INP +ROW_STRIDE * 3], mm3; store destination row 3, left half
611 punpcklwd mm0, mm1; mm0 = [ 0 4 1 5]
613 ; 2) transpose lower-right quadrant (source rows 4-7, right half -> destination rows 4-7, right half)
615 ; movq mm0, [OUT + ROW_STRIDE*4 + 8]
617 ; movq mm1, [OUT + ROW_STRIDE*5 + 8]
618 ; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
620 movq mm2, [OUT + ROW_STRIDE*6 + 8] ; mm2 = row3
621 ; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
622 punpckhwd mm4, mm1; mm4 = [ 2 6 3 7]
624 movq mm3, [OUT + ROW_STRIDE*7 + 8] ; mm3 = row4
627 punpcklwd mm2, mm3; mm2 = [ 8 12 9 13]
628 movq mm1, mm0; mm1 = [ 0 4 1 5]
630 punpckhwd mm6, mm3; mm6 = [10 14 11 15]
631 movq mm3, mm4; mm3 = [ 2 6 3 7]
633 punpckldq mm0, mm2; final result mm0 = row1 [0 4 8 12]
635 punpckhdq mm1, mm2; final result mm1 = row2 [1 5 9 13]
638 movq [ INP + ROW_STRIDE*4 + 8], mm0; store destination row 4, right half
639 punpckldq mm4, mm6; final result mm4 = row3 [2 6 10 14]
641 movq mm0, [OUT + ROW_STRIDE * 4 ] ; begin reading next quadrant (lower-left): mm0 = its row1
642 punpckhdq mm3, mm6; final result mm3 = row4 [3 7 11 15]
643 movq [ INP +ROW_STRIDE*6 + 8], mm4; store destination row 6, right half
644 movq mm4, mm0; mm4 = copy of row1 [A B C D] (lower-left quad)
646 movq [ INP +ROW_STRIDE*5 + 8], mm1; store destination row 5, right half
648 movq mm1, [OUT + ROW_STRIDE * 5 ] ; mm1 = row2 of the lower-left quad
651 movq [ INP +ROW_STRIDE*7 + 8], mm3; store destination row 7, right half
652 punpcklwd mm0, mm1; mm0 = [ 0 4 1 5]
654 ; 3) transpose lower-left (source rows 4-7, left half -> destination rows 0-3, right half)
655 ; movq mm0, [OUT + ROW_STRIDE * 4 ]
657 ; movq mm1, [OUT + ROW_STRIDE * 5 ]
658 ; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
660 movq mm2, [OUT + ROW_STRIDE * 6 ] ; mm2 = row3
661 ; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
662 punpckhwd mm4, mm1; mm4 = [ 2 6 3 7]
664 movq mm3, [OUT + ROW_STRIDE * 7 ] ; mm3 = row4
667 punpcklwd mm2, mm3; mm2 = [ 8 12 9 13]
668 movq mm1, mm0; mm1 = [ 0 4 1 5]
670 punpckhwd mm6, mm3; mm6 = [10 14 11 15]
671 movq mm3, mm4; mm3 = [ 2 6 3 7]
673 punpckldq mm0, mm2; final result mm0 = row1 [0 4 8 12]
675 punpckhdq mm1, mm2; final result mm1 = row2 [1 5 9 13]
678 movq [ INP + ROW_STRIDE * 0 + 8 ], mm0; store destination row 0, right half
679 punpckldq mm4, mm6; final result mm4 = row3 [2 6 10 14]
681 ; begin reading next quadrant (upper-right)
682 movq mm0, [OUT + ROW_STRIDE*0 + 8]; mm0 = row1 of the upper-right quad
683 punpckhdq mm3, mm6; final result mm3 = row4 [3 7 11 15]
685 movq [ INP +ROW_STRIDE * 2 + 8], mm4; store destination row 2, right half
686 movq mm4, mm0; mm4 = copy of row1 [A B C D] (upper-right quad)
688 movq [ INP +ROW_STRIDE * 1 + 8 ], mm1; store destination row 1, right half
689 movq mm1, [OUT + ROW_STRIDE*1 + 8] ; mm1 = row2 of the upper-right quad
691 movq [ INP +ROW_STRIDE * 3 + 8], mm3; store destination row 3, right half
692 punpcklwd mm0, mm1; mm0 = [ 0 4 1 5]
695 ; 4) transpose upper-right quadrant (source rows 0-3, right half -> destination rows 4-7, left half)
697 ; movq mm0, [OUT + ROW_STRIDE*0 + 8]
699 ; movq mm1, [OUT + ROW_STRIDE*1 + 8]
700 ; movq mm4, mm0; ; mm4 = copy of row1[A B C D]
702 movq mm2, [OUT + ROW_STRIDE*2 + 8] ; mm2 = row3
703 ; punpcklwd mm0, mm1; ; mm0 = [ 0 4 1 5]
704 punpckhwd mm4, mm1; mm4 = [ 2 6 3 7]
706 movq mm3, [OUT + ROW_STRIDE*3 + 8] ; mm3 = row4
709 punpcklwd mm2, mm3; mm2 = [ 8 12 9 13]
710 movq mm1, mm0; mm1 = [ 0 4 1 5]
712 punpckhwd mm6, mm3; mm6 = [10 14 11 15]
713 movq mm3, mm4; mm3 = [ 2 6 3 7]
715 punpckldq mm0, mm2; final result mm0 = row1 [0 4 8 12]
717 punpckhdq mm1, mm2; final result mm1 = row2 [1 5 9 13]
720 movq [ INP + ROW_STRIDE*4 ], mm0; store destination row 4, left half
721 punpckldq mm4, mm6; final result mm4 = row3 [2 6 10 14]
723 movq [ INP +ROW_STRIDE*5 ], mm1; store destination row 5, left half
724 punpckhdq mm3, mm6; final result mm3 = row4 [3 7 11 15]
726 movq [ INP +ROW_STRIDE*6 ], mm4; store destination row 6, left half
729 movq [ INP +ROW_STRIDE*7 ], mm3; store destination row 7, left half
736 pop ebp ; restore frame pointer