enc: fix CELT psy analysis and packet sizing

Daniil Cherednik via ffmpeg-cvslog Sat, 23 May 2026 03:39:05 -0700

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


commit 816d74e0bb61dfd6f4f68a3c19b143fa508fb22f
Author:     Daniil Cherednik <[email protected]>
AuthorDate: Sat May 2 00:20:07 2026 +0200
Commit:     Lynne <[email protected]>
CommitDate: Sat May 23 10:38:28 2026 +0000

    avcodec/opus/enc: fix CELT psy analysis and packet sizing
    
    The CELT psychoacoustic path was effectively broken: analysis could use
    the wrong queued audio and stale scratch samples, and raw band scores were
    folded into the frame bit budget, where they could overflow instead of
    only driving alloc_boost.
    
    On top of that, c3aea7628c changed avctx->frame_size from fixed
    120-sample steps to a configuration-derived value, while the CELT input
    and psy paths still treated queue entries as 120-sample steps. That could
    misalign psy analysis, read before a short overlap frame, stall silent
    flushes, poison rate control with zero-bit silent frames, and overrun the
    range coder on EOF or short tails.
    
    This commit fixes these cases by using avctx->frame_size for psy step
    accounting, aligning bufqueue analysis with actual audio, padding short
    overlaps, and avoiding invalid bit-budget updates for silent or EOF
    packets. This lets CELT produce valid packets again.
---
 libavcodec/opus/enc.c     | 15 ++++++--
 libavcodec/opus/enc_psy.c | 90 +++++++++++++++++++++++++++++++++++------------
 2 files changed, 81 insertions(+), 24 deletions(-)

diff --git a/libavcodec/opus/enc.c b/libavcodec/opus/enc.c
index 587e4ed69d..433cf066bc 100644
--- a/libavcodec/opus/enc.c
+++ b/libavcodec/opus/enc.c
@@ -130,9 +130,20 @@ static void celt_frame_setup_input(OpusEncContext *s, CeltFrame *f)
 
     for (int ch = 0; ch < f->channels; ch++) {
         CeltBlock *b = &f->block[ch];
-        const void *input = cur->extended_data[ch];
+        const char *input = cur->extended_data[ch];
         size_t bps = av_get_bytes_per_sample(cur->format);
-        memcpy(b->overlap, input, bps*cur->nb_samples);
+        /* The MDCT overlap is the trailing CELT_OVERLAP samples of the
+         * previous packet's last frame. Because the encoder advertises
+         * AV_CODEC_CAP_SMALL_LAST_FRAME, that frame can be shorter than
+         * CELT_OVERLAP; in that case, zero-pad the leading part of the
+         * overlap buffer and copy only what's available. */
+        int n = FFMIN(cur->nb_samples, CELT_OVERLAP);
+        if (n < CELT_OVERLAP) {
+            memset(b->overlap, 0, (CELT_OVERLAP - n) * bps);
+        }
+        memcpy((char *)b->overlap + (CELT_OVERLAP - n) * bps,
+               input + (cur->nb_samples - n) * bps,
+               n * bps);
     }
 
     av_frame_free(&cur);
diff --git a/libavcodec/opus/enc_psy.c b/libavcodec/opus/enc_psy.c
index 250cfb567a..ae9ae021d8 100644
--- a/libavcodec/opus/enc_psy.c
+++ b/libavcodec/opus/enc_psy.c
@@ -82,20 +82,34 @@ static float pvq_band_cost(CeltPVQ *pvq, CeltFrame *f, OpusRangeCoder *rc, int b
 static void step_collect_psy_metrics(OpusPsyContext *s, int index)
 {
     int silence = 0, ch, i, j;
+    /* The MDCT analysis covers 2 * OPUS_BLOCK_SIZE(bsize_analysis) samples.
+     * Each bufqueue entry holds avctx->frame_size samples (120 historically,
+     * 960 after c3aea7628c for default settings). steps_per_half is how many
+     * bufqueue entries fill one MDCT half-window.
+     *
+     * bufqueue[0] is reserved for the previous packet's overlap (initially the
+     * empty padding frame). The audio that step N analyzes starts at
+     * bufqueue[index + 1]; bufqueue[index] is its preceding overlap. */
+    const int half_samples   = OPUS_BLOCK_SIZE(s->bsize_analysis);
+    const int step_samples   = s->avctx->frame_size;
+    const int steps_per_half = half_samples / step_samples;
     OpusPsyStep *st = s->steps[index];
 
     st->index = index;
 
     for (ch = 0; ch < s->avctx->ch_layout.nb_channels; ch++) {
-        const int lap_size = (1 << s->bsize_analysis);
-        for (i = 1; i <= FFMIN(lap_size, index); i++) {
-            const int offset = i*120;
-            AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index - i);
+        memset(s->scratch, 0, sizeof(float) * (half_samples << 1));
+
+        for (i = 1; i <= FFMIN(steps_per_half, index + 1); i++) {
+            const int offset = (steps_per_half - i) * step_samples;
+            AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index + 1 - i);
             memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
         }
-        for (i = 0; i < lap_size; i++) {
-            const int offset = i*120 + lap_size;
-            AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index + i);
+        for (i = 0; i < steps_per_half; i++) {
+            if (index + 1 + i >= s->bufqueue->available)
+                break;
+            const int offset = (steps_per_half + i) * step_samples;
+            AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index + 1 + i);
             memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
         }
 
@@ -187,6 +201,13 @@ static void search_for_change_points(OpusPsyContext *s, float tgt_change,
 
 static int flush_silent_frames(OpusPsyContext *s)
 {
+    /* buffered_steps and silent_frames count psy steps, each of which is
+     * avctx->frame_size samples (120 historically, up to 960 after
+     * c3aea7628c). The CELT framesize fsize we pick must consume at least
+     * one psy step per emitted packet, otherwise postencode_update would
+     * compute steps_out = 0 and the psy state would stall while bufqueue
+     * and afq drain. */
+    const int step_samples = s->avctx->frame_size;
     int fsize, silent_frames;
 
     for (silent_frames = 0; silent_frames < s->buffered_steps; silent_frames++)
@@ -196,9 +217,15 @@ static int flush_silent_frames(OpusPsyContext *s)
         return 0;
 
     for (fsize = CELT_BLOCK_960; fsize > CELT_BLOCK_120; fsize--) {
-        if ((1 << fsize) > silent_frames)
+        const int packet_samples   = OPUS_BLOCK_SIZE(fsize);
+        const int steps_per_packet = packet_samples / step_samples;
+
+        if (steps_per_packet < 1 || silent_frames < steps_per_packet)
             continue;
-        s->p.frames = FFMIN(silent_frames / (1 << fsize), 48 >> fsize);
+        /* 48 * CELT_SHORT_BLOCKSIZE = 5760 samples = 120 ms; matches the
+         * historical (48 >> fsize) cap for frame_size = 120. */
+        s->p.frames = FFMIN(silent_frames / steps_per_packet,
+                            (48 * CELT_SHORT_BLOCKSIZE) / packet_samples);
         s->p.framesize = fsize;
         return 1;
     }
@@ -230,7 +257,10 @@ int ff_opus_psy_process(OpusPsyContext *s, OpusPacketInfo *p)
     float total_energy_change = 0.0f;
 
     if (s->buffered_steps < s->max_steps && !s->eof) {
-        const int awin = (1 << s->bsize_analysis);
+        /* awin counts how many bufqueue entries fill one MDCT half-window.
+         * With frame_size=120 this is 8 (matches the historical 1 << bsize_analysis);
+         * with frame_size=960 it is 1, so we collect every call. */
+        const int awin = OPUS_BLOCK_SIZE(s->bsize_analysis) / s->avctx->frame_size;
         if (++s->steps_to_process >= awin) {
             step_collect_psy_metrics(s, s->buffered_steps - awin + 1);
             s->steps_to_process = 0;
@@ -258,7 +288,8 @@ int ff_opus_psy_process(OpusPsyContext *s, OpusPacketInfo *p)
 void ff_opus_psy_celt_frame_init(OpusPsyContext *s, CeltFrame *f, int index)
 {
     int i, neighbouring_points = 0, start_offset = 0;
-    int radius = (1 << s->p.framesize), step_offset = radius*index;
+    int steps_per_frame = OPUS_BLOCK_SIZE(s->p.framesize) / s->avctx->frame_size;
+    int step_offset = steps_per_frame*index;
     int silence = 1;
 
     f->start_band = (s->p.mode == OPUS_MODE_HYBRID) ? 17 : 0;
@@ -266,8 +297,17 @@ void ff_opus_psy_celt_frame_init(OpusPsyContext *s, CeltFrame *f, int index)
     f->channels   = s->avctx->ch_layout.nb_channels;
     f->size       = s->p.framesize;
 
-    for (i = 0; i < (1 << f->size); i++)
-        silence &= s->steps[index*(1 << f->size) + i]->silence;
+    for (i = 0; i < steps_per_frame; i++)
+        silence &= s->steps[index * steps_per_frame + i]->silence;
+
+    /* If we reach EOF with fewer collected psy steps than this packet wants
+     * to encode, the slots beyond buffered_steps were zeroed by the previous
+     * postencode_update and contain no valid analysis data. Encoding garbage
+     * with the full rate budget can overrun the range coder buffer (rng_bytes
+     * exceeds the per-frame size passed to ff_opus_rc_enc_end), so force a
+     * silent packet instead. */
+    if (s->eof && step_offset >= s->buffered_steps)
+        silence = 1;
 
     f->silence = silence;
     if (f->silence) {
@@ -282,8 +322,8 @@ void ff_opus_psy_celt_frame_init(OpusPsyContext *s, CeltFrame *f, int index)
         }
     }
 
-    for (i = start_offset; i < FFMIN(radius, s->inflection_points_count - start_offset); i++) {
-        if (s->inflection_points[i] < (step_offset + radius)) {
+    for (i = start_offset; i < FFMIN(steps_per_frame, s->inflection_points_count - start_offset); i++) {
+        if (s->inflection_points[i] < (step_offset + steps_per_frame)) {
             neighbouring_points++;
         }
     }
@@ -316,6 +356,7 @@ static void celt_gauge_psy_weight(OpusPsyContext *s, OpusPsyStep **start,
 {
     int i, f, ch;
     int frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
+    int steps_per_frame = frame_size / s->avctx->frame_size;
     float rate, frame_bits = 0;
 
     /* Used for the global ROTATE flag */
@@ -329,7 +370,7 @@ static void celt_gauge_psy_weight(OpusPsyContext *s, OpusPsyStep **start,
     for (i = 0; i < CELT_MAX_BANDS; i++) {
         float weight = 0.0f;
         float tonal_contrib = 0.0f;
-        for (f = 0; f < (1 << s->p.framesize); f++) {
+        for (f = 0; f < steps_per_frame; f++) {
             weight = start[f]->stereo[i];
             for (ch = 0; ch < s->avctx->ch_layout.nb_channels; ch++) {
                 weight += start[f]->change_amp[ch][i] + start[f]->tone[ch][i] + start[f]->energy[ch][i];
@@ -349,7 +390,7 @@ static void celt_gauge_psy_weight(OpusPsyContext *s, OpusPsyStep **start,
 
     for (i = 0; i < CELT_MAX_BANDS; i++) {
         f_out->alloc_boost[i] = (int)((band_score[i]/max_score)*3.0f);
-        frame_bits += band_score[i]*8.0f;
+        //TODO: implements frame_bits adjustment.
     }
 
     tonal /= 1333136.0f;
@@ -425,6 +466,7 @@ static void celt_search_for_intensity(OpusPsyContext *s, CeltFrame *f)
 static int celt_search_for_tf(OpusPsyContext *s, OpusPsyStep **start, CeltFrame *f)
 {
     int i, j, k, cway, config[2][CELT_MAX_BANDS] = { { 0 } };
+    int steps_per_frame = OPUS_BLOCK_SIZE(f->size) / s->avctx->frame_size;
     float score[2] = { 0 };
 
     for (cway = 0; cway < 2; cway++) {
@@ -439,7 +481,7 @@ static int celt_search_for_tf(OpusPsyContext *s, OpusPsyStep **start, CeltFrame
         for (i = 0; i < CELT_MAX_BANDS; i++) {
             float iscore0 = 0.0f;
             float iscore1 = 0.0f;
-            for (j = 0; j < (1 << f->size); j++) {
+            for (j = 0; j < steps_per_frame; j++) {
                 for (k = 0; k < s->avctx->ch_layout.nb_channels; k++) {
                     iscore0 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[0];
                     iscore1 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[1];
@@ -459,7 +501,8 @@ static int celt_search_for_tf(OpusPsyContext *s, OpusPsyStep **start, CeltFrame
 int ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index)
 {
     int start_transient_flag = f->transient;
-    OpusPsyStep **start = &s->steps[index * (1 << s->p.framesize)];
+    int steps_per_frame = OPUS_BLOCK_SIZE(s->p.framesize) / s->avctx->frame_size;
+    OpusPsyStep **start = &s->steps[index * steps_per_frame];
 
     if (f->silence)
         return 0;
@@ -480,7 +523,7 @@ int ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index)
 void ff_opus_psy_postencode_update(OpusPsyContext *s, CeltFrame *f)
 {
     int i, frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
-    int steps_out = s->p.frames*(frame_size/120);
+    int steps_out = s->p.frames*(frame_size/s->avctx->frame_size);
     void *tmp[FF_BUFQUEUE_SIZE];
     float ideal_fbits;
 
@@ -502,7 +545,8 @@ void ff_opus_psy_postencode_update(OpusPsyContext *s, CeltFrame *f)
 
     for (i = 0; i < s->p.frames; i++) {
         s->avg_is_band += f[i].intensity_stereo;
-        s->lambda *= ideal_fbits / f[i].framebits;
+        if (f[i].framebits > 0)
+            s->lambda *= ideal_fbits / f[i].framebits;
     }
 
     s->avg_is_band /= (s->p.frames + 1);
@@ -522,7 +566,9 @@ av_cold int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx,
     s->options = options;
     s->avctx = avctx;
     s->bufqueue = bufqueue;
-    s->max_steps = ceilf(s->options->max_delay_ms/2.5f);
+    s->max_steps = ceilf(s->options->max_delay_ms * avctx->sample_rate /
+        (1000.0f * avctx->frame_size));
+
     s->bsize_analysis = CELT_BLOCK_960;
     s->avg_is_band = CELT_MAX_BANDS - 1;
     s->inflection_points_count = 0;

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] 01/02: avcodec/opus/enc: fix CELT psy analysis and packet sizing

Reply via email to