cinelerra-5.0/libzmpeg3/a52dec-0.7.3/liba52/imdct.c

   1 /*
   2  * imdct.c
   3  * Copyright (C) 2000-2002 Michel Lespinasse <walken@zoy.org>
   4  * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
   5  *
   6  * The ifft algorithms in this file have been largely inspired by Dan
   7  * Bernstein's work, djbfft, available at http://cr.yp.to/djbfft.html
   8  *
   9  * This file is part of a52dec, a free ATSC A-52 stream decoder.
  10  * See http://liba52.sourceforge.net/ for updates.
  11  *
  12  * a52dec is free software; you can redistribute it and/or modify
  13  * it under the terms of the GNU General Public License as published by
  14  * the Free Software Foundation; either version 2 of the License, or
  15  * (at your option) any later version.
  16  *
  17  * a52dec is distributed in the hope that it will be useful,
  18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20  * GNU General Public License for more details.
  21  *
  22  * You should have received a copy of the GNU General Public License
  23  * along with this program; if not, write to the Free Software
  24  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  25  */
  26
  27 #include "config.h"
  28
  29 #include <math.h>
  30 #include <stdio.h>
  31 #ifdef LIBA52_DJBFFT
  32 #include <fftc4.h>
  33 #endif
  34 #ifndef M_PI
  35 #define M_PI 3.1415926535897932384626433832795029
  36 #endif
  37 #include <inttypes.h>
  38
  39 #include "a52.h"
  40 #include "a52_internal.h"
  41 #include "mm_accel.h"
  42
  43 typedef struct complex_s {
  44     sample_t real;
  45     sample_t imag;
  46 } complex_t;
  47
  48 static complex_t buf[128];
  49
  50 static uint8_t fftorder[] = {
  51       0,128, 64,192, 32,160,224, 96, 16,144, 80,208,240,112, 48,176,
  52       8,136, 72,200, 40,168,232,104,248,120, 56,184, 24,152,216, 88,
  53       4,132, 68,196, 36,164,228,100, 20,148, 84,212,244,116, 52,180,
  54     252,124, 60,188, 28,156,220, 92, 12,140, 76,204,236,108, 44,172,
  55       2,130, 66,194, 34,162,226, 98, 18,146, 82,210,242,114, 50,178,
  56      10,138, 74,202, 42,170,234,106,250,122, 58,186, 26,154,218, 90,
  57     254,126, 62,190, 30,158,222, 94, 14,142, 78,206,238,110, 46,174,
  58       6,134, 70,198, 38,166,230,102,246,118, 54,182, 22,150,214, 86
  59 };
  60
  61 /* Root values for IFFT */
  62 static sample_t roots16[3];
  63 static sample_t roots32[7];
  64 static sample_t roots64[15];
  65 static sample_t roots128[31];
  66
  67 /* Twiddle factors for IMDCT */
  68 static complex_t pre1[128];
  69 static complex_t post1[64];
  70 static complex_t pre2[64];
  71 static complex_t post2[32];
  72
  73 /* Windowing function for Modified DCT - Thank you acroread */
  74 static const sample_t a52_imdct_window[] = {
  75     0.00014, 0.00024, 0.00037, 0.00051, 0.00067, 0.00086, 0.00107, 0.00130,
  76     0.00157, 0.00187, 0.00220, 0.00256, 0.00297, 0.00341, 0.00390, 0.00443,
  77     0.00501, 0.00564, 0.00632, 0.00706, 0.00785, 0.00871, 0.00962, 0.01061,
  78     0.01166, 0.01279, 0.01399, 0.01526, 0.01662, 0.01806, 0.01959, 0.02121,
  79     0.02292, 0.02472, 0.02662, 0.02863, 0.03073, 0.03294, 0.03527, 0.03770,
  80     0.04025, 0.04292, 0.04571, 0.04862, 0.05165, 0.05481, 0.05810, 0.06153,
  81     0.06508, 0.06878, 0.07261, 0.07658, 0.08069, 0.08495, 0.08935, 0.09389,
  82     0.09859, 0.10343, 0.10842, 0.11356, 0.11885, 0.12429, 0.12988, 0.13563,
  83     0.14152, 0.14757, 0.15376, 0.16011, 0.16661, 0.17325, 0.18005, 0.18699,
  84     0.19407, 0.20130, 0.20867, 0.21618, 0.22382, 0.23161, 0.23952, 0.24757,
  85     0.25574, 0.26404, 0.27246, 0.28100, 0.28965, 0.29841, 0.30729, 0.31626,
  86     0.32533, 0.33450, 0.34376, 0.35311, 0.36253, 0.37204, 0.38161, 0.39126,
  87     0.40096, 0.41072, 0.42054, 0.43040, 0.44030, 0.45023, 0.46020, 0.47019,
  88     0.48020, 0.49022, 0.50025, 0.51028, 0.52031, 0.53033, 0.54033, 0.55031,
  89     0.56026, 0.57019, 0.58007, 0.58991, 0.59970, 0.60944, 0.61912, 0.62873,
  90     0.63827, 0.64774, 0.65713, 0.66643, 0.67564, 0.68476, 0.69377, 0.70269,
  91     0.71150, 0.72019, 0.72877, 0.73723, 0.74557, 0.75378, 0.76186, 0.76981,
  92     0.77762, 0.78530, 0.79283, 0.80022, 0.80747, 0.81457, 0.82151, 0.82831,
  93     0.83496, 0.84145, 0.84779, 0.85398, 0.86001, 0.86588, 0.87160, 0.87716,
  94     0.88257, 0.88782, 0.89291, 0.89785, 0.90264, 0.90728, 0.91176, 0.91610,
  95     0.92028, 0.92432, 0.92822, 0.93197, 0.93558, 0.93906, 0.94240, 0.94560,
  96     0.94867, 0.95162, 0.95444, 0.95713, 0.95971, 0.96217, 0.96451, 0.96674,
  97     0.96887, 0.97089, 0.97281, 0.97463, 0.97635, 0.97799, 0.97953, 0.98099,
  98     0.98236, 0.98366, 0.98488, 0.98602, 0.98710, 0.98811, 0.98905, 0.98994,
  99     0.99076, 0.99153, 0.99225, 0.99291, 0.99353, 0.99411, 0.99464, 0.99513,
 100     0.99558, 0.99600, 0.99639, 0.99674, 0.99706, 0.99736, 0.99763, 0.99788,
 101     0.99811, 0.99831, 0.99850, 0.99867, 0.99882, 0.99895, 0.99908, 0.99919,
 102     0.99929, 0.99938, 0.99946, 0.99953, 0.99959, 0.99965, 0.99969, 0.99974,
 103     0.99978, 0.99981, 0.99984, 0.99986, 0.99988, 0.99990, 0.99992, 0.99993,
 104     0.99994, 0.99995, 0.99996, 0.99997, 0.99998, 0.99998, 0.99998, 0.99999,
 105     0.99999, 0.99999, 0.99999, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000,
 106     1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000, 1.00000
 107 };
 108
 109 static void (* ifft128) (complex_t * buf);
 110 static void (* ifft64) (complex_t * buf);
 111
 112 static inline void ifft2 (complex_t * buf)
 113 {
 114     double r, i;
 115
 116     r = buf[0].real;
 117     i = buf[0].imag;
 118     buf[0].real += buf[1].real;
 119     buf[0].imag += buf[1].imag;
 120     buf[1].real = r - buf[1].real;
 121     buf[1].imag = i - buf[1].imag;
 122 }
 123
 124 static inline void ifft4 (complex_t * buf)
 125 {
 126     double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 127
 128     tmp1 = buf[0].real + buf[1].real;
 129     tmp2 = buf[3].real + buf[2].real;
 130     tmp3 = buf[0].imag + buf[1].imag;
 131     tmp4 = buf[2].imag + buf[3].imag;
 132     tmp5 = buf[0].real - buf[1].real;
 133     tmp6 = buf[0].imag - buf[1].imag;
 134     tmp7 = buf[2].imag - buf[3].imag;
 135     tmp8 = buf[3].real - buf[2].real;
 136
 137     buf[0].real = tmp1 + tmp2;
 138     buf[0].imag = tmp3 + tmp4;
 139     buf[2].real = tmp1 - tmp2;
 140     buf[2].imag = tmp3 - tmp4;
 141     buf[1].real = tmp5 + tmp7;
 142     buf[1].imag = tmp6 + tmp8;
 143     buf[3].real = tmp5 - tmp7;
 144     buf[3].imag = tmp6 - tmp8;
 145 }
 146
 147 /* the basic split-radix ifft butterfly */
 148
 149 #define BUTTERFLY(a0,a1,a2,a3,wr,wi) do {       \
 150     tmp5 = a2.real * wr + a2.imag * wi;         \
 151     tmp6 = a2.imag * wr - a2.real * wi;         \
 152     tmp7 = a3.real * wr - a3.imag * wi;         \
 153     tmp8 = a3.imag * wr + a3.real * wi;         \
 154     tmp1 = tmp5 + tmp7;                         \
 155     tmp2 = tmp6 + tmp8;                         \
 156     tmp3 = tmp6 - tmp8;                         \
 157     tmp4 = tmp7 - tmp5;                         \
 158     a2.real = a0.real - tmp1;                   \
 159     a2.imag = a0.imag - tmp2;                   \
 160     a3.real = a1.real - tmp3;                   \
 161     a3.imag = a1.imag - tmp4;                   \
 162     a0.real += tmp1;                            \
 163     a0.imag += tmp2;                            \
 164     a1.real += tmp3;                            \
 165     a1.imag += tmp4;                            \
 166 } while (0)
 167
 168 /* split-radix ifft butterfly, specialized for wr=1 wi=0 */
 169
 170 #define BUTTERFLY_ZERO(a0,a1,a2,a3) do {        \
 171     tmp1 = a2.real + a3.real;                   \
 172     tmp2 = a2.imag + a3.imag;                   \
 173     tmp3 = a2.imag - a3.imag;                   \
 174     tmp4 = a3.real - a2.real;                   \
 175     a2.real = a0.real - tmp1;                   \
 176     a2.imag = a0.imag - tmp2;                   \
 177     a3.real = a1.real - tmp3;                   \
 178     a3.imag = a1.imag - tmp4;                   \
 179     a0.real += tmp1;                            \
 180     a0.imag += tmp2;                            \
 181     a1.real += tmp3;                            \
 182     a1.imag += tmp4;                            \
 183 } while (0)
 184
 185 /* split-radix ifft butterfly, specialized for wr=wi */
 186
 187 #define BUTTERFLY_HALF(a0,a1,a2,a3,w) do {      \
 188     tmp5 = (a2.real + a2.imag) * w;             \
 189     tmp6 = (a2.imag - a2.real) * w;             \
 190     tmp7 = (a3.real - a3.imag) * w;             \
 191     tmp8 = (a3.imag + a3.real) * w;             \
 192     tmp1 = tmp5 + tmp7;                         \
 193     tmp2 = tmp6 + tmp8;                         \
 194     tmp3 = tmp6 - tmp8;                         \
 195     tmp4 = tmp7 - tmp5;                         \
 196     a2.real = a0.real - tmp1;                   \
 197     a2.imag = a0.imag - tmp2;                   \
 198     a3.real = a1.real - tmp3;                   \
 199     a3.imag = a1.imag - tmp4;                   \
 200     a0.real += tmp1;                            \
 201     a0.imag += tmp2;                            \
 202     a1.real += tmp3;                            \
 203     a1.imag += tmp4;                            \
 204 } while (0)
 205
 206 static inline void ifft8 (complex_t * buf)
 207 {
 208     double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 209
 210     ifft4 (buf);
 211     ifft2 (buf + 4);
 212     ifft2 (buf + 6);
 213     BUTTERFLY_ZERO (buf[0], buf[2], buf[4], buf[6]);
 214     BUTTERFLY_HALF (buf[1], buf[3], buf[5], buf[7], roots16[1]);
 215 }
 216
 217 static void ifft_pass (complex_t * buf, sample_t * weight, int n)
 218 {
 219     complex_t * buf1;
 220     complex_t * buf2;
 221     complex_t * buf3;
 222     double tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8;
 223     int i;
 224
 225     buf++;
 226     buf1 = buf + n;
 227     buf2 = buf + 2 * n;
 228     buf3 = buf + 3 * n;
 229
 230     BUTTERFLY_ZERO (buf[-1], buf1[-1], buf2[-1], buf3[-1]);
 231
 232     i = n - 1;
 233
 234     do {
 235         BUTTERFLY (buf[0], buf1[0], buf2[0], buf3[0], weight[n], weight[2*i]);
 236         buf++;
 237         buf1++;
 238         buf2++;
 239         buf3++;
 240         weight++;
 241     } while (--i);
 242 }
 243
 244 static void ifft16 (complex_t * buf)
 245 {
 246     ifft8 (buf);
 247     ifft4 (buf + 8);
 248     ifft4 (buf + 12);
 249     ifft_pass (buf, roots16 - 4, 4);
 250 }
 251
 252 static void ifft32 (complex_t * buf)
 253 {
 254     ifft16 (buf);
 255     ifft8 (buf + 16);
 256     ifft8 (buf + 24);
 257     ifft_pass (buf, roots32 - 8, 8);
 258 }
 259
 260 static void ifft64_c (complex_t * buf)
 261 {
 262     ifft32 (buf);
 263     ifft16 (buf + 32);
 264     ifft16 (buf + 48);
 265     ifft_pass (buf, roots64 - 16, 16);
 266 }
 267
 268 static void ifft128_c (complex_t * buf)
 269 {
 270     ifft32 (buf);
 271     ifft16 (buf + 32);
 272     ifft16 (buf + 48);
 273     ifft_pass (buf, roots64 - 16, 16);
 274
 275     ifft32 (buf + 64);
 276     ifft32 (buf + 96);
 277     ifft_pass (buf, roots128 - 32, 32);
 278 }
 279
 280 void a52_imdct_512 (sample_t * data, sample_t * delay, sample_t bias)
 281 {
 282     int i, k;
 283     sample_t t_r, t_i, a_r, a_i, b_r, b_i, w_1, w_2;
 284     const sample_t * window = a52_imdct_window;
 285
 286     for (i = 0; i < 128; i++) {
 287         k = fftorder[i];
 288         t_r = pre1[i].real;
 289         t_i = pre1[i].imag;
 290
 291         buf[i].real = t_i * data[255-k] + t_r * data[k];
 292         buf[i].imag = t_r * data[255-k] - t_i * data[k];
 293     }
 294
 295     ifft128 (buf);
 296
 297     /* Post IFFT complex multiply plus IFFT complex conjugate*/
 298     /* Window and convert to real valued signal */
 299     for (i = 0; i < 64; i++) {
 300         /* y[n] = z[n] * (xcos1[n] + j * xsin1[n]) ; */
 301         t_r = post1[i].real;
 302         t_i = post1[i].imag;
 303
 304         a_r = t_r * buf[i].real     + t_i * buf[i].imag;
 305         a_i = t_i * buf[i].real     - t_r * buf[i].imag;
 306         b_r = t_i * buf[127-i].real + t_r * buf[127-i].imag;
 307         b_i = t_r * buf[127-i].real - t_i * buf[127-i].imag;
 308
 309         w_1 = window[2*i];
 310         w_2 = window[255-2*i];
 311         data[2*i]     = delay[2*i] * w_2 - a_r * w_1 + bias;
 312         data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
 313         delay[2*i] = a_i;
 314
 315         w_1 = window[2*i+1];
 316         w_2 = window[254-2*i];
 317         data[2*i+1]   = delay[2*i+1] * w_2 + b_r * w_1 + bias;
 318         data[254-2*i] = delay[2*i+1] * w_1 - b_r * w_2 + bias;
 319         delay[2*i+1] = b_i;
 320     }
 321 }
 322
 323 void a52_imdct_256(sample_t data[],sample_t delay[],sample_t bias)
 324 {
 325     int i, k;
 326     sample_t t_r, t_i, a_r, a_i, b_r, b_i, c_r, c_i, d_r, d_i, w_1, w_2;
 327     complex_t * buf1, * buf2;
 328     const sample_t * window = a52_imdct_window;
 329
 330     buf1 = &buf[0];
 331     buf2 = &buf[64];
 332
 333     /* Pre IFFT complex multiply plus IFFT cmplx conjugate */
 334     for (i = 0; i < 64; i++) {
 335         k = fftorder[i];
 336         t_r = pre2[i].real;
 337         t_i = pre2[i].imag;
 338
 339         buf1[i].real = t_i * data[254-k] + t_r * data[k];
 340         buf1[i].imag = t_r * data[254-k] - t_i * data[k];
 341
 342         buf2[i].real = t_i * data[255-k] + t_r * data[k+1];
 343         buf2[i].imag = t_r * data[255-k] - t_i * data[k+1];
 344     }
 345
 346     ifft64 (buf1);
 347     ifft64 (buf2);
 348
 349     /* Post IFFT complex multiply */
 350     /* Window and convert to real valued signal */
 351     for (i = 0; i < 32; i++) {
 352         /* y1[n] = z1[n] * (xcos2[n] + j * xs in2[n]) ; */
 353         t_r = post2[i].real;
 354         t_i = post2[i].imag;
 355
 356         a_r = t_r * buf1[i].real    + t_i * buf1[i].imag;
 357         a_i = t_i * buf1[i].real    - t_r * buf1[i].imag;
 358         b_r = t_i * buf1[63-i].real + t_r * buf1[63-i].imag;
 359         b_i = t_r * buf1[63-i].real - t_i * buf1[63-i].imag;
 360
 361         c_r = t_r * buf2[i].real    + t_i * buf2[i].imag;
 362         c_i = t_i * buf2[i].real    - t_r * buf2[i].imag;
 363         d_r = t_i * buf2[63-i].real + t_r * buf2[63-i].imag;
 364         d_i = t_r * buf2[63-i].real - t_i * buf2[63-i].imag;
 365
 366         w_1 = window[2*i];
 367         w_2 = window[255-2*i];
 368         data[2*i]     = delay[2*i] * w_2 - a_r * w_1 + bias;
 369         data[255-2*i] = delay[2*i] * w_1 + a_r * w_2 + bias;
 370         delay[2*i] = c_i;
 371
 372         w_1 = window[128+2*i];
 373         w_2 = window[127-2*i];
 374         data[128+2*i] = delay[127-2*i] * w_2 + a_i * w_1 + bias;
 375         data[127-2*i] = delay[127-2*i] * w_1 - a_i * w_2 + bias;
 376         delay[127-2*i] = c_r;
 377
 378         w_1 = window[2*i+1];
 379         w_2 = window[254-2*i];
 380         data[2*i+1]   = delay[2*i+1] * w_2 - b_i * w_1 + bias;
 381         data[254-2*i] = delay[2*i+1] * w_1 + b_i * w_2 + bias;
 382         delay[2*i+1] = d_r;
 383
 384         w_1 = window[129+2*i];
 385         w_2 = window[126-2*i];
 386         data[129+2*i] = delay[126-2*i] * w_2 + b_r * w_1 + bias;
 387         data[126-2*i] = delay[126-2*i] * w_1 - b_r * w_2 + bias;
 388         delay[126-2*i] = d_i;
 389     }
 390 }
 391
 392 void a52_imdct_init (uint32_t mm_accel)
 393 {
 394     int i, k;
 395
 396     for (i = 0; i < 3; i++)
 397         roots16[i] = cos ((M_PI / 8) * (i + 1));
 398
 399     for (i = 0; i < 7; i++)
 400         roots32[i] = cos ((M_PI / 16) * (i + 1));
 401
 402     for (i = 0; i < 15; i++)
 403         roots64[i] = cos ((M_PI / 32) * (i + 1));
 404
 405     for (i = 0; i < 31; i++)
 406         roots128[i] = cos ((M_PI / 64) * (i + 1));
 407
 408     for (i = 0; i < 64; i++) {
 409         k = fftorder[i] / 2 + 64;
 410         pre1[i].real = cos ((M_PI / 256) * (k - 0.25));
 411         pre1[i].imag = sin ((M_PI / 256) * (k - 0.25));
 412     }
 413
 414     for (i = 64; i < 128; i++) {
 415         k = fftorder[i] / 2 + 64;
 416         pre1[i].real = -cos ((M_PI / 256) * (k - 0.25));
 417         pre1[i].imag = -sin ((M_PI / 256) * (k - 0.25));
 418     }
 419
 420     for (i = 0; i < 64; i++) {
 421         post1[i].real = cos ((M_PI / 256) * (i + 0.5));
 422         post1[i].imag = sin ((M_PI / 256) * (i + 0.5));
 423     }
 424
 425     for (i = 0; i < 64; i++) {
 426         k = fftorder[i] / 4;
 427         pre2[i].real = cos ((M_PI / 128) * (k - 0.25));
 428         pre2[i].imag = sin ((M_PI / 128) * (k - 0.25));
 429     }
 430
 431     for (i = 0; i < 32; i++) {
 432         post2[i].real = cos ((M_PI / 128) * (i + 0.5));
 433         post2[i].imag = sin ((M_PI / 128) * (i + 0.5));
 434     }
 435
 436 #ifdef LIBA52_DJBFFT
 437     if (mm_accel & MM_ACCEL_DJBFFT) {
 438         fprintf (stderr, "Using djbfft for IMDCT transform\n");
 439         ifft128 = (void (*) (complex_t *)) fftc4_un128;
 440         ifft64 = (void (*) (complex_t *)) fftc4_un64;
 441     } else
 442 #endif
 443     {
 444 //      fprintf (stderr, "No accelerated IMDCT transform found\n");
 445         ifft128 = ifft128_c;
 446         ifft64 = ifft64_c;
 447     }
 448 }