version update
[goodguy/cinelerra.git] / cinelerra-5.1 / mpeg2enc / transfrm.c
1 /* transfrm.c,  forward / inverse transformation                            */
2
3 /* Copyright (C) 1996, MPEG Software Simulation Group. All Rights Reserved. */
4
5 /*
6  * Disclaimer of Warranty
7  *
8  * These software programs are available to the user without any license fee or
9  * royalty on an "as is" basis.  The MPEG Software Simulation Group disclaims
10  * any and all warranties, whether express, implied, or statuary, including any
11  * implied warranties or merchantability or of fitness for a particular
12  * purpose.  In no event shall the copyright-holder be liable for any
13  * incidental, punitive, or consequential damages of any kind whatsoever
14  * arising from the use of these programs.
15  *
16  * This disclaimer of warranty extends to the user of these programs and user's
17  * customers, employees, agents, transferees, successors, and assigns.
18  *
19  * The MPEG Software Simulation Group does not represent or warrant that the
20  * programs furnished hereunder are free of infringement of any third-party
21  * patents.
22  *
23  * Commercial implementations of MPEG-1 and MPEG-2 video, including shareware,
24  * are subject to royalty fees to patent holders.  Many of these patents are
25  * general enough such that they are unavoidable regardless of implementation
26  * design.
27  *
28  */
29
30 #include "config.h"
31 #include "global.h"
32 #include <stdio.h>
33 #include <stdlib.h>
34 #include <math.h>
35 #include "cpu_accel.h"
36
#ifdef X86_CPU
/* MMX builds of the forward / inverse DCT (declared here, not defined in this file) */
extern void fdct_mmx(int16_t *blk);
extern void idct_mmx(int16_t *blk, unsigned char *temp);

/* MMX builds of the prediction add / subtract helpers */
extern void add_pred_mmx(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);
extern void sub_pred_mmx(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);
#endif

/* Portable forward / inverse DCT (declared here, not defined in this file) */
extern void fdct(int16_t *blk);
extern void idct(int16_t *blk, unsigned char *temp);
49
50
51
/* Prototypes of the portable prediction helpers defined later in this file */
static void add_pred(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);
static void sub_pred(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk);

/*
 * Signatures of the routines that can be plugged in: forward DCT,
 * inverse DCT, and the prediction add / subtract helpers.
 */
typedef void (*transfrm_fdct_fn)(int16_t *blk);
typedef void (*transfrm_idct_fn)(int16_t *blk, unsigned char *temp);
typedef void (*transfrm_pred_fn)(uint8_t *pred, uint8_t *cur,
	int lx, int16_t *blk);

/*
 * Currently selected implementations.  init_transform_hv() assigns all
 * four; the engine loops call the transform and prediction code only
 * through these pointers.
 */
static transfrm_fdct_fn pfdct;
static transfrm_idct_fn pidct;
static transfrm_pred_fn padd_pred;
static transfrm_pred_fn psub_pred;
69
70 /*
71   Initialise DCT transformation routines
72   Currently just activates MMX routines if available
73  */
74
75
76 void init_transform_hv()
77 {
78 #ifdef X86_CPU
79         int flags;
80         flags = cpu_accel();
81
82         if( (flags & ACCEL_X86_MMX) ) /* MMX CPU */
83         {
84                 if(verbose) fprintf( stderr, "SETTING MMX for TRANSFORM!\n");
85                 pfdct = fdct_mmx;
86                 pidct = idct_mmx;
87                 padd_pred = add_pred_mmx;
88                 psub_pred = sub_pred_mmx;
89         }
90         else
91 #endif
92         {
93                 pfdct = fdct;
94                 pidct = idct;
95                 padd_pred = add_pred;
96                 psub_pred = sub_pred;
97
98         }
99 }
100
101 /* add prediction and prediction error, saturate to 0...255 */
102 static void add_pred(unsigned char *pred,
103         unsigned char *cur,
104         int lx,
105         short *blk)
106 {
107         int j;
108
109         for (j=0; j<8; j++)
110         {
111 /*
112  *      for (i=0; i<8; i++)
113  *        cur[i] = clp[blk[i] + pred[i]];
114  */
115         cur[0] = clp[blk[0] + pred[0]];
116         cur[1] = clp[blk[1] + pred[1]];
117         cur[2] = clp[blk[2] + pred[2]];
118         cur[3] = clp[blk[3] + pred[3]];
119         cur[4] = clp[blk[4] + pred[4]];
120         cur[5] = clp[blk[5] + pred[5]];
121         cur[6] = clp[blk[6] + pred[6]];
122         cur[7] = clp[blk[7] + pred[7]];
123  
124         blk += 8;
125         cur += lx;
126         pred += lx;
127         }
128 }
129
/*
 * Subtract prediction from block data for one 8x8 block.
 *
 * pred  first sample of the prediction block
 * cur   first sample of the current-picture block
 * lx    step, in samples, from one row of pred/cur to the next
 * blk   receives the 64 differences cur - pred, 8 per row, rows stored
 *       back to back
 *
 * The parameter types are spelled exactly as in the prototype and in the
 * psub_pred pointer (uint8_t / int16_t) so the definition cannot drift
 * from its declaration on a platform where those are not unsigned char
 * and short.
 */
static void sub_pred(uint8_t *pred, uint8_t *cur, int lx, int16_t *blk)
{
	int row, col;

	for(row = 0; row < 8; row++)
	{
		for(col = 0; col < 8; col++)
			blk[col] = cur[col] - pred[col];

		blk += 8;
		cur += lx;
		pred += lx;
	}
}
158
159 void transform_engine_loop(transform_engine_t *engine)
160 {
161         while(!engine->done)
162         {
163                 pthread_mutex_lock(&(engine->input_lock));
164                 
165                 if(!engine->done)
166                 {
167                         pict_data_s *picture = engine->picture;
168                         uint8_t **pred = engine->pred;
169                         uint8_t **cur = engine->cur;
170                         mbinfo_s *mbi = picture->mbinfo;
171                         int16_t (*blocks)[64] = picture->blocks;
172                         int i, j, i1, j1, k, n, cc, offs, lx;
173
174                         k = (engine->start_row / 16) * (width / 16);
175
176                         for(j = engine->start_row; j < engine->end_row; j += 16)
177                         for(i = 0; i < width; i += 16)
178                         {
179                                         mbi[k].dctblocks = &blocks[k * block_count];
180
181                                 for(n = 0; n < block_count; n++)
182                                 {
183 /* color component index */
184                                         cc = (n < 4) ? 0 : (n & 1) + 1; 
185                                         if(cc == 0)
186                                         {
187 /* A.Stevens Jul 2000 Record dct blocks associated with macroblock */
188 /* We'll use this for quantisation calculations                    */
189 /* luminance */
190                                                         if ((picture->pict_struct == FRAME_PICTURE) && mbi[k].dct_type)
191                                                         {
192 /* field DCT */
193                                                                 offs = i + ((n & 1) << 3) + width * (j + ((n & 2) >> 1));
194                                                                 lx = width << 1;
195                                                         }
196                                                         else
197                                                         {
198 /* frame DCT */
199                                                                 offs = i + ((n & 1) << 3) + width2 * (j + ((n & 2) << 2));
200                                                                 lx = width2;
201                                                         }
202
203                                                         if (picture->pict_struct == BOTTOM_FIELD)
204                                                                 offs += width;
205                                         }
206                                         else
207                                         {
208 /* chrominance */
209 /* scale coordinates */
210                                                 i1 = (chroma_format == CHROMA444) ? i : i >> 1;
211                                                 j1 = (chroma_format != CHROMA420) ? j : j >> 1;
212
213                                                 if ((picture->pict_struct==FRAME_PICTURE) && mbi[k].dct_type
214                                                 && (chroma_format!=CHROMA420))
215                                                 {
216 /* field DCT */
217                                                 offs = i1 + (n&8) + chrom_width*(j1+((n&2)>>1));
218                                                 lx = chrom_width<<1;
219                                                 }
220                                                 else
221                                                 {
222 /* frame DCT */
223                                                 offs = i1 + (n&8) + chrom_width2*(j1+((n&2)<<2));
224                                                 lx = chrom_width2;
225                                                 }
226
227                                                 if(picture->pict_struct==BOTTOM_FIELD)
228                                                 offs += chrom_width;
229                                         }
230
231                                                 (*psub_pred)(pred[cc]+offs,cur[cc]+offs,lx,
232                                                                          blocks[k*block_count+n]);
233                                                 (*pfdct)(blocks[k*block_count+n]);
234                                 }
235
236                                 k++;
237                         }
238                 }
239                 pthread_mutex_unlock(&(engine->output_lock));
240         }
241 }
242
243 /* subtract prediction and transform prediction error */
244 void transform(pict_data_s *picture,
245         uint8_t *pred[], uint8_t *cur[])
246 {
247         int i;
248 /* Start loop */
249         for(i = 0; i < processors; i++)
250         {
251                 transform_engines[i].picture = picture;
252                 transform_engines[i].pred = pred;
253                 transform_engines[i].cur = cur;
254                 pthread_mutex_unlock(&(transform_engines[i].input_lock));
255         }
256
257 /* Wait for completion */
258         for(i = 0; i < processors; i++)
259         {
260                 pthread_mutex_lock(&(transform_engines[i].output_lock));
261         }
262 }
263
264
265
266 void start_transform_engines()
267 {
268         int i;
269         int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
270         int current_row = 0;
271         pthread_attr_t  attr;
272         pthread_mutexattr_t mutex_attr;
273
274         pthread_mutexattr_init(&mutex_attr);
275         pthread_attr_init(&attr);
276         transform_engines = calloc(1, sizeof(transform_engine_t) * processors);
277         for(i = 0; i < processors; i++)
278         {
279                 transform_engines[i].start_row = current_row * 16;
280                 current_row += rows_per_processor;
281                 if(current_row > height2 / 16) current_row = height2 / 16;
282                 transform_engines[i].end_row = current_row * 16;
283                 pthread_mutex_init(&(transform_engines[i].input_lock), &mutex_attr);
284                 pthread_mutex_lock(&(transform_engines[i].input_lock));
285                 pthread_mutex_init(&(transform_engines[i].output_lock), &mutex_attr);
286                 pthread_mutex_lock(&(transform_engines[i].output_lock));
287                 transform_engines[i].done = 0;
288                 pthread_create(&(transform_engines[i].tid), 
289                         &attr, 
290                         (void*)transform_engine_loop, 
291                         &transform_engines[i]);
292         }
293 }
294
295 void stop_transform_engines()
296 {
297         int i;
298         for(i = 0; i < processors; i++)
299         {
300                 transform_engines[i].done = 1;
301                 pthread_mutex_unlock(&(transform_engines[i].input_lock));
302                 pthread_join(transform_engines[i].tid, 0);
303                 pthread_mutex_destroy(&(transform_engines[i].input_lock));
304                 pthread_mutex_destroy(&(transform_engines[i].output_lock));
305         }
306         free(transform_engines);
307 }
308
309
310
311
312
313
314
315
316
317 /* inverse transform prediction error and add prediction */
318 void itransform_engine_loop(transform_engine_t *engine)
319 {
320         while(!engine->done)
321         {
322                 pthread_mutex_lock(&(engine->input_lock));
323
324                 if(!engine->done)
325                 {
326                         pict_data_s *picture = engine->picture;
327                         uint8_t **pred = engine->pred;
328                         uint8_t **cur = engine->cur;
329                         int i, j, i1, j1, k, n, cc, offs, lx;
330                 mbinfo_s *mbi = picture->mbinfo;
331 /* Its the quantised / inverse quantised blocks were interested in
332    for inverse transformation */
333                         int16_t (*blocks)[64] = picture->qblocks;
334
335                         k = (engine->start_row / 16) * (width / 16);
336
337                         for(j = engine->start_row; j < engine->end_row; j += 16)
338                                 for(i = 0; i < width; i += 16)
339                                 {
340                                         for(n = 0; n < block_count; n++)
341                                         {
342                                         cc = (n < 4) ? 0 : (n & 1) + 1; /* color component index */
343
344                                         if(cc == 0)
345                                         {
346 /* luminance */
347                                                 if((picture->pict_struct == FRAME_PICTURE) && mbi[k].dct_type)
348                                                 {
349 /* field DCT */
350                                                         offs = i + ((n & 1) << 3) + width * (j + ((n & 2) >> 1));
351                                                         lx = width<<1;
352                                                 }
353                                                 else
354                                                 {
355 /* frame DCT */
356                                                         offs = i + ((n & 1) << 3) + width2 * (j + ((n & 2) << 2));
357                                                         lx = width2;
358                                                 }
359
360                                                 if(picture->pict_struct == BOTTOM_FIELD)
361                                                 offs += width;
362                                         }
363                                         else
364                                         {
365 /* chrominance */
366
367 /* scale coordinates */
368                                                 i1 = (chroma_format==CHROMA444) ? i : i>>1;
369                                                 j1 = (chroma_format!=CHROMA420) ? j : j>>1;
370
371                                                 if((picture->pict_struct == FRAME_PICTURE) && mbi[k].dct_type
372                                                         && (chroma_format != CHROMA420))
373                                                 {
374 /* field DCT */
375                                                         offs = i1 + (n & 8) + chrom_width * (j1 + ((n & 2) >> 1));
376                                                         lx = chrom_width << 1;
377                                                 }
378                                                 else
379                                                 {
380 /* frame DCT */
381                                                         offs = i1 + (n&8) + chrom_width2 * (j1 + ((n & 2) << 2));
382                                                         lx = chrom_width2;
383                                                 }
384
385                                                 if(picture->pict_struct == BOTTOM_FIELD)
386                                                         offs += chrom_width;
387                                     }
388
389 //pthread_mutex_lock(&test_lock);
390                                                 (*pidct)(blocks[k*block_count+n], engine->temp);
391                                                 (*padd_pred)(pred[cc]+offs,cur[cc]+offs,lx,blocks[k*block_count+n]);
392 //pthread_mutex_unlock(&test_lock);
393                                         }
394
395                                         k++;
396                                 }
397                 }
398                 pthread_mutex_unlock(&(engine->output_lock));
399         }
400 }
401
402 void itransform(pict_data_s *picture,
403         uint8_t *pred[], uint8_t *cur[])
404 {
405         int i;
406 /* Start loop */
407         for(i = 0; i < processors; i++)
408         {
409                 itransform_engines[i].picture = picture;
410                 itransform_engines[i].cur = cur;
411                 itransform_engines[i].pred = pred;
412                 pthread_mutex_unlock(&(itransform_engines[i].input_lock));
413         }
414
415 /* Wait for completion */
416         for(i = 0; i < processors; i++)
417         {
418                 pthread_mutex_lock(&(itransform_engines[i].output_lock));
419         }
420 }
421
422 void start_itransform_engines()
423 {
424         int i;
425         int rows_per_processor = (int)((float)height2 / 16 / processors + 0.5);
426         int current_row = 0;
427         pthread_attr_t  attr;
428         pthread_mutexattr_t mutex_attr;
429
430         pthread_mutexattr_init(&mutex_attr);
431         pthread_attr_init(&attr);
432         itransform_engines = calloc(1, sizeof(transform_engine_t) * processors);
433         for(i = 0; i < processors; i++)
434         {
435                 itransform_engines[i].start_row = current_row * 16;
436                 current_row += rows_per_processor;
437                 if(current_row > height2 / 16) current_row = height2 / 16;
438                 itransform_engines[i].end_row = current_row * 16;
439                 pthread_mutex_init(&(itransform_engines[i].input_lock), &mutex_attr);
440                 pthread_mutex_lock(&(itransform_engines[i].input_lock));
441                 pthread_mutex_init(&(itransform_engines[i].output_lock), &mutex_attr);
442                 pthread_mutex_lock(&(itransform_engines[i].output_lock));
443                 itransform_engines[i].done = 0;
444                 pthread_create(&(itransform_engines[i].tid), 
445                         &attr, 
446                         (void*)itransform_engine_loop, 
447                         &itransform_engines[i]);
448         }
449 }
450
451 void stop_itransform_engines()
452 {
453         int i;
454         for(i = 0; i < processors; i++)
455         {
456                 itransform_engines[i].done = 1;
457                 pthread_mutex_unlock(&(itransform_engines[i].input_lock));
458                 pthread_join(itransform_engines[i].tid, 0);
459                 pthread_mutex_destroy(&(itransform_engines[i].input_lock));
460                 pthread_mutex_destroy(&(itransform_engines[i].output_lock));
461         }
462         free(itransform_engines);
463 }
464
465
466
467
468 /*
469  * select between frame and field DCT
470  *
471  * preliminary version: based on inter-field correlation
472  */
473
474 void dct_type_estimation(
475         pict_data_s *picture,
476         uint8_t *pred, uint8_t *cur
477         )
478 {
479
480         struct mbinfo *mbi = picture->mbinfo;
481
482         int16_t blk0[128], blk1[128];
483         int i, j, i0, j0, k, offs, s0, s1, sq0, sq1, s01;
484         double d, r;
485
486         k = 0;
487
488         for (j0=0; j0<height2; j0+=16)
489                 for (i0=0; i0<width; i0+=16)
490                 {
491                         if (picture->frame_pred_dct || picture->pict_struct!=FRAME_PICTURE)
492                                 mbi[k].dct_type = 0;
493                         else
494                         {
495                                 /* interlaced frame picture */
496                                 /*
497                                  * calculate prediction error (cur-pred) for top (blk0)
498                                  * and bottom field (blk1)
499                                  */
500                                 for (j=0; j<8; j++)
501                                 {
502                                         offs = width*((j<<1)+j0) + i0;
503                                         for (i=0; i<16; i++)
504                                         {
505                                                 blk0[16*j+i] = cur[offs] - pred[offs];
506                                                 blk1[16*j+i] = cur[offs+width] - pred[offs+width];
507                                                 offs++;
508                                         }
509                                 }
510                                 /* correlate fields */
511                                 s0=s1=sq0=sq1=s01=0;
512
513                                 for (i=0; i<128; i++)
514                                 {
515                                         s0+= blk0[i];
516                                         sq0+= blk0[i]*blk0[i];
517                                         s1+= blk1[i];
518                                         sq1+= blk1[i]*blk1[i];
519                                         s01+= blk0[i]*blk1[i];
520                                 }
521
522                                 d = (sq0-(s0*s0)/128.0)*(sq1-(s1*s1)/128.0);
523
524                                 if (d>0.0)
525                                 {
526                                         r = (s01-(s0*s1)/128.0)/sqrt(d);
527                                         if (r>0.5)
528                                                 mbi[k].dct_type = 0; /* frame DCT */
529                                         else
530                                                 mbi[k].dct_type = 1; /* field DCT */
531                                 }
532                                 else
533                                         mbi[k].dct_type = 1; /* field DCT */
534                         }
535                         k++;
536                 }
537 }