[FFmpeg-cvslog] AAC encoder: tweak rate-distortion logic

Claudio Freire Tue, 22 Sep 2015 22:33:08 -0700

ffmpeg | branch: master | Claudio Freire <klaussfre...@gmail.com> | Wed Sep 23 
02:13:56 2015 -0300| [7ec74ae4aaf50507c5da3dfbb7336e15f848b99b] | committer: 
Claudio Freire


AAC encoder: tweak rate-distortion logic

This patch modifies the encode frame function to
retry encoding the frame when the resulting bit count
is too far off target, but only adjusting lambda
in small, incremental step. It also makes the logic
more conservative - otherwise it will contend with
bit reservoir-related variations in bit allocation,
and result in artifacts when frame have to be truncated
(usually at high bit rates transitioning from low
complexity to high complexity).

> http://git.videolan.org/gitweb.cgi/ffmpeg.git/?a=commit;h=7ec74ae4aaf50507c5da3dfbb7336e15f848b99b
---

 libavcodec/aacenc.c   |   87 ++++++++++++++++++++++++++++++++++++-------------
 libavcodec/aacenc.h   |    4 ++-
 libavcodec/aacpsy.c   |    3 ++
 libavcodec/psymodel.h |    1 +
 tests/fate/aac.mak    |   10 +++---
 5 files changed, 77 insertions(+), 28 deletions(-)

diff --git a/libavcodec/aacenc.c b/libavcodec/aacenc.c
index bb90048..1b95ebd 100644
--- a/libavcodec/aacenc.c
+++ b/libavcodec/aacenc.c
@@ -489,7 +489,7 @@ static int aac_encode_frame(AVCodecContext *avctx, AVPacket 
*avpkt,
     float **samples = s->planar_samples, *samples2, *la, *overlap;
     ChannelElement *cpe;
     SingleChannelElement *sce;
-    int i, ch, w, chans, tag, start_ch, ret;
+    int i, its, ch, w, chans, tag, start_ch, ret, frame_bits;
     int ms_mode = 0, is_mode = 0, tns_mode = 0, pred_mode = 0;
     int chan_el_counter[4];
     FFPsyWindowInfo windows[AAC_MAX_CHANNELS];
@@ -581,14 +581,16 @@ static int aac_encode_frame(AVCodecContext *avctx, 
AVPacket *avpkt,
     }
     if ((ret = ff_alloc_packet2(avctx, avpkt, 8192 * s->channels, 0)) < 0)
         return ret;
+    frame_bits = its = 0;
     do {
-        int frame_bits;
+        int target_bits, too_many_bits, too_few_bits;
 
         init_put_bits(&s->pb, avpkt->data, avpkt->size);
 
         if ((avctx->frame_number & 0xFF)==1 && !(avctx->flags & 
AV_CODEC_FLAG_BITEXACT))
             put_bitstream_info(s, LIBAVCODEC_IDENT);
         start_ch = 0;
+        target_bits = 0;
         memset(chan_el_counter, 0, sizeof(chan_el_counter));
         for (i = 0; i < s->chan_map[0]; i++) {
             FFPsyWindowInfo* wi = windows + start_ch;
@@ -611,7 +613,15 @@ static int aac_encode_frame(AVCodecContext *avctx, 
AVPacket *avpkt,
                     if (sce->band_type[w] > RESERVED_BT)
                         sce->band_type[w] = 0;
             }
+            s->psy.bitres.alloc = -1;
+            s->psy.bitres.bits = avctx->frame_bits / s->channels;
             s->psy.model->analyze(&s->psy, start_ch, coeffs, wi);
+            if (s->psy.bitres.alloc > 0) {
+                /* Lambda unused here on purpose, we need to take psy's 
unscaled allocation */
+                target_bits += s->psy.bitres.alloc;
+                s->psy.bitres.alloc /= chans;
+            }
+            s->cur_type = tag;
             for (ch = 0; ch < chans; ch++) {
                 s->cur_channel = start_ch + ch;
                 s->coder->search_for_quantizers(avctx, s, &cpe->ch[ch], 
s->lambda);
@@ -692,36 +702,69 @@ static int aac_encode_frame(AVCodecContext *avctx, 
AVPacket *avpkt,
             start_ch += chans;
         }
 
-        frame_bits = put_bits_count(&s->pb);
-        if (frame_bits <= 6144 * s->channels - 3) {
-            s->psy.bitres.bits = frame_bits / s->channels;
+        if (avctx->flags & CODEC_FLAG_QSCALE) {
+            /* When using a constant Q-scale, don't mess with lambda */
             break;
         }
-        if (is_mode || ms_mode || tns_mode || pred_mode) {
-            for (i = 0; i < s->chan_map[0]; i++) {
-                // Must restore coeffs
-                chans = tag == TYPE_CPE ? 2 : 1;
-                cpe = &s->cpe[i];
-                for (ch = 0; ch < chans; ch++)
-                    memcpy(cpe->ch[ch].coeffs, cpe->ch[ch].pcoeffs, 
sizeof(cpe->ch[ch].coeffs));
-            }
-        }
 
-        s->lambda *= avctx->bit_rate * 1024.0f / avctx->sample_rate / 
frame_bits;
+        /* rate control stuff
+         * target either the nominal bitrate, or what psy's bit reservoir says 
to target
+         * whichever is greatest
+         */
+
+        frame_bits = put_bits_count(&s->pb);
+        target_bits = FFMAX(target_bits, avctx->bit_rate * 1024 / 
avctx->sample_rate);
+        target_bits = FFMIN(target_bits, 6144 * s->channels - 3);
+
+        /* When using ABR, be strict (but only for increasing) */
+        too_many_bits = target_bits + target_bits/2;
+        too_few_bits = target_bits - target_bits/8;
+
+        if (   its == 0 /* for steady-state Q-scale tracking */
+            || (its < 5 && (frame_bits < too_few_bits || frame_bits > 
too_many_bits))
+            || frame_bits >= 6144 * s->channels - 3  )
+        {
+            float ratio = ((float)target_bits) / frame_bits;
+
+            if (frame_bits >= too_few_bits && frame_bits <= too_many_bits) {
+                /*
+                 * This path is for steady-state Q-scale tracking
+                 * When frame bits fall within the stable range, we still need 
to adjust
+                 * lambda to maintain it like so in a stable fashion (large 
jumps in lambda
+                 * create artifacts and should be avoided), but slowly
+                 */
+                ratio = sqrtf(sqrtf(ratio));
+                ratio = av_clipf(ratio, 0.9f, 1.1f);
+            } else {
+                /* Not so fast though */
+                ratio = sqrtf(ratio);
+            }
+            s->lambda = FFMIN(s->lambda * ratio, 65536.f);
 
+            /* Keep iterating if we must reduce and lambda is in the sky */
+            if (s->lambda < 300.f || ratio > 0.9f) {
+                break;
+            } else {
+                if (is_mode || ms_mode || tns_mode || pred_mode) {
+                    for (i = 0; i < s->chan_map[0]; i++) {
+                        // Must restore coeffs
+                        chans = tag == TYPE_CPE ? 2 : 1;
+                        cpe = &s->cpe[i];
+                        for (ch = 0; ch < chans; ch++)
+                            memcpy(cpe->ch[ch].coeffs, cpe->ch[ch].pcoeffs, 
sizeof(cpe->ch[ch].coeffs));
+                    }
+                }
+                its++;
+            }
+        } else {
+            break;
+        }
     } while (1);
 
     put_bits(&s->pb, 3, TYPE_END);
     flush_put_bits(&s->pb);
     avctx->frame_bits = put_bits_count(&s->pb);
 
-    // rate control stuff
-    if (!(avctx->flags & AV_CODEC_FLAG_QSCALE)) {
-        float ratio = avctx->bit_rate * 1024.0f / avctx->sample_rate / 
avctx->frame_bits;
-        s->lambda *= ratio;
-        s->lambda = FFMIN(s->lambda, 65536.f);
-    }
-
     if (!frame)
         s->last_frame++;
 
diff --git a/libavcodec/aacenc.h b/libavcodec/aacenc.h
index 54951f9..7e7609b 100644
--- a/libavcodec/aacenc.h
+++ b/libavcodec/aacenc.h
@@ -96,10 +96,12 @@ typedef struct AACEncContext {
     FFPsyContext psy;
     struct FFPsyPreprocessContext* psypp;
     AACCoefficientsEncoder *coder;
-    int cur_channel;
+    int cur_channel;                             ///< current channel for 
coder context
     int last_frame;
     int random_state;
     float lambda;
+    enum RawDataBlockType cur_type;              ///< channel group type 
cur_channel belongs to
+
     AudioFrameQueue afq;
     DECLARE_ALIGNED(16, int,   qcoefs)[96];      ///< quantized coefficients
     DECLARE_ALIGNED(32, float, scoefs)[1024];    ///< scaled coefficients
diff --git a/libavcodec/aacpsy.c b/libavcodec/aacpsy.c
index 82b670d..af235c7 100644
--- a/libavcodec/aacpsy.c
+++ b/libavcodec/aacpsy.c
@@ -87,6 +87,7 @@ enum {
 };
 
 #define PSY_3GPP_BITS_TO_PE(bits) ((bits) * 1.18f)
+#define PSY_3GPP_PE_TO_BITS(bits) ((bits) / 1.18f)
 
 /* LAME psy model constants */
 #define PSY_LAME_FIR_LEN 21         ///< LAME psy model FIR order
@@ -687,6 +688,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int 
channel,
         desired_pe *= av_clipf(pctx->pe.previous / 
PSY_3GPP_BITS_TO_PE(ctx->bitres.bits),
                                0.85f, 1.15f);
     pctx->pe.previous = PSY_3GPP_BITS_TO_PE(desired_bits);
+    ctx->bitres.alloc = desired_bits;
 
     if (desired_pe < pe) {
         /* 5.6.1.3.4 "First Estimation of the reduction value" */
@@ -788,6 +790,7 @@ static void psy_3gpp_analyze_channel(FFPsyContext *ctx, int 
channel,
             psy_band->threshold = band->thr;
             psy_band->energy    = band->energy;
             psy_band->spread    = band->active_lines * 2.0f / band_sizes[g];
+            psy_band->bits      = PSY_3GPP_PE_TO_BITS(band->pe);
         }
     }
 
diff --git a/libavcodec/psymodel.h b/libavcodec/psymodel.h
index e9be1f6..a04cc4d 100644
--- a/libavcodec/psymodel.h
+++ b/libavcodec/psymodel.h
@@ -88,6 +88,7 @@ typedef struct FFPsyContext {
     struct {
         int size;                     ///< size of the bitresevoir in bits
         int bits;                     ///< number of bits used in the 
bitresevoir
+        int alloc;                    ///< number of bits allocated by the 
psy, or -1 if no allocation was done
     } bitres;
 
     void* model_priv_data;            ///< psychoacoustic model implementation 
private data
diff --git a/tests/fate/aac.mak b/tests/fate/aac.mak
index f30d4db..8e9c915 100644
--- a/tests/fate/aac.mak
+++ b/tests/fate/aac.mak
@@ -146,7 +146,7 @@ fate-aac-aref-encode: CMD = enc_dec_pcm adts wav s16le 
$(REF) -strict -2 -c:a aa
 fate-aac-aref-encode: CMP = stddev
 fate-aac-aref-encode: REF = ./tests/data/asynth-44100-2.wav
 fate-aac-aref-encode: CMP_SHIFT = -4096
-fate-aac-aref-encode: CMP_TARGET = 594
+fate-aac-aref-encode: CMP_TARGET = 584
 fate-aac-aref-encode: SIZE_TOLERANCE = 2464
 fate-aac-aref-encode: FUZZ = 6
 
@@ -172,7 +172,7 @@ fate-aac-pns-encode: CMD = enc_dec_pcm adts wav s16le 
$(TARGET_SAMPLES)/audio-re
 fate-aac-pns-encode: CMP = stddev
 fate-aac-pns-encode: REF = 
$(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-pns-encode: CMP_SHIFT = -4096
-fate-aac-pns-encode: CMP_TARGET = 633.77
+fate-aac-pns-encode: CMP_TARGET = 623.77
 fate-aac-pns-encode: SIZE_TOLERANCE = 3560
 fate-aac-pns-encode: FUZZ = 1
 
@@ -181,7 +181,7 @@ fate-aac-tns-encode: CMD = enc_dec_pcm adts wav s16le 
$(TARGET_SAMPLES)/audio-re
 fate-aac-tns-encode: CMP = stddev
 fate-aac-tns-encode: REF = 
$(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-tns-encode: CMP_SHIFT = -4096
-fate-aac-tns-encode: CMP_TARGET = 650.37
+fate-aac-tns-encode: CMP_TARGET = 644.50
 fate-aac-tns-encode: FUZZ = 2.8
 fate-aac-tns-encode: SIZE_TOLERANCE = 3560
 
@@ -190,7 +190,7 @@ fate-aac-is-encode: CMD = enc_dec_pcm adts wav s16le 
$(TARGET_SAMPLES)/audio-ref
 fate-aac-is-encode: CMP = stddev
 fate-aac-is-encode: REF = 
$(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-is-encode: CMP_SHIFT = -4096
-fate-aac-is-encode: CMP_TARGET = 616.75
+fate-aac-is-encode: CMP_TARGET = 614.04
 fate-aac-is-encode: SIZE_TOLERANCE = 3560
 fate-aac-is-encode: FUZZ = 1
 
@@ -199,7 +199,7 @@ fate-aac-pred-encode: CMD = enc_dec_pcm adts wav s16le 
$(TARGET_SAMPLES)/audio-r
 fate-aac-pred-encode: CMP = stddev
 fate-aac-pred-encode: REF = 
$(SAMPLES)/audio-reference/luckynight_2ch_44kHz_s16.wav
 fate-aac-pred-encode: CMP_SHIFT = -4096
-fate-aac-pred-encode: CMP_TARGET = 652.60
+fate-aac-pred-encode: CMP_TARGET = 657
 fate-aac-pred-encode: FUZZ = 5
 fate-aac-pred-encode: SIZE_TOLERANCE = 3560
 

_______________________________________________
ffmpeg-cvslog mailing list
ffmpeg-cvslog@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-cvslog

[FFmpeg-cvslog] AAC encoder: tweak rate-distortion logic

Reply via email to