Hi, I've applied the suggested changes and updated the commit message to fix the patchwork warning. Let's continue the discussion in the new thread (https://ffmpeg.org/pipermail/ffmpeg-devel/2025-July/346850.html).
On Sat, Jul 19, 2025 at 2:16 AM Michael Niedermayer <mich...@niedermayer.cc> wrote:
>
> Hi Vittorio
>
> On Thu, Jul 17, 2025 at 10:51:57AM +0200, Vittorio Palmisano wrote:
> > It adds a new audio filter for running audio transcriptions with the
> > whisper model.
> > Documentation and examples are included into the patch.
> >
> > Signed-off-by: Vittorio Palmisano <vpalmis...@gmail.com>
> > ---
> >  configure                |   5 +
> >  doc/filters.texi         | 107 +++++++++
> >  libavfilter/Makefile     |   2 +
> >  libavfilter/af_whisper.c | 452 +++++++++++++++++++++++++++++++++++++++
> >  libavfilter/allfilters.c |   2 +
> >  5 files changed, 568 insertions(+)
> >  create mode 100644 libavfilter/af_whisper.c
> [...]
>
> > +static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
> > +{
> > +    AVFilterContext *ctx = (AVFilterContext *) user_data;
> > +    switch (level) {
> > +    case GGML_LOG_LEVEL_ERROR:
> > +        av_log(ctx, AV_LOG_ERROR, "%s", text);
> > +        break;
> > +    case GGML_LOG_LEVEL_WARN:
> > +        av_log(ctx, AV_LOG_WARNING, "%s", text);
> > +        break;
> > +    case GGML_LOG_LEVEL_INFO:
> > +    case GGML_LOG_LEVEL_DEBUG:
> > +        av_log(ctx, AV_LOG_DEBUG, "%s", text);
> > +        break;
> > +    }
> > +}
>
> you can factor the function calls out of the switch/case
>
> > +
> > +static int init(AVFilterContext *ctx)
> > +{
> > +    WhisperContext *wctx = ctx->priv;
> > +
> > +    static AVOnce init_static_once = AV_ONCE_INIT;
> > +    ff_thread_once(&init_static_once, ggml_backend_load_all);
> > +
> > +    whisper_log_set(cb_log, ctx);
> > +
> > +    // Init whisper context
> > +    if (!wctx->model_path) {
> > +        av_log(ctx, AV_LOG_ERROR, "No whisper model path specified. Use the 'model' option.\n");
> > +        return AVERROR(EINVAL);
> > +    }
> > +
> > +    struct whisper_context_params params = whisper_context_default_params();
> > +    params.use_gpu = wctx->use_gpu;
> > +    params.gpu_device = wctx->gpu_device;
> > +
> > +    wctx->ctx_wsp = whisper_init_from_file_with_params(wctx->model_path, params);
> > +    if (wctx->ctx_wsp == NULL) {
> > +        av_log(ctx, AV_LOG_ERROR, "Failed to initialize whisper context from model: %s\n", wctx->model_path);
> > +        return AVERROR(EIO);
> > +    }
> > +
> > +    // Init buffer
>
> > +    wctx->audio_buffer_queue_size = WHISPER_SAMPLE_RATE * wctx->queue / 1000000;
>
> The multiplication can overflow; also the 32bit output could overflow.
> Best is probably to limit queue to a more reasonable value than INT64_MAX.
>
> > +    wctx->audio_buffer = av_malloc(wctx->audio_buffer_queue_size * sizeof(*wctx->audio_buffer));
>
> av_calloc() or av_malloc_array()
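Taking the three points above in order. For the log callback, the mapping can indeed be factored so there is a single av_log() call; something along these lines (a sketch against the code as posted):

static void cb_log(enum ggml_log_level level, const char *text, void *user_data)
{
    AVFilterContext *ctx = (AVFilterContext *) user_data;
    int av_level;

    /* map the ggml level to an av_log level, then log once */
    switch (level) {
    case GGML_LOG_LEVEL_ERROR:
        av_level = AV_LOG_ERROR;
        break;
    case GGML_LOG_LEVEL_WARN:
        av_level = AV_LOG_WARNING;
        break;
    default: /* GGML_LOG_LEVEL_INFO, GGML_LOG_LEVEL_DEBUG, anything else */
        av_level = AV_LOG_DEBUG;
        break;
    }
    av_log(ctx, av_level, "%s", text);
}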
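For the queue size, clamping the option range and letting av_rescale() do the arithmetic in 64 bits should cover both overflows. The option entry below is hypothetical (the default and limits are placeholders, not a proposal, and OFFSET/FLAGS are the usual filter option macros), just to show the shape:

/* e.g. allow 100 ms .. 20 s instead of up to INT64_MAX (placeholder values) */
{ "queue", "audio buffer queue size", OFFSET(queue), AV_OPT_TYPE_DURATION,
  { .i64 = 3000000 }, 100000, 20000000, FLAGS },

/* av_rescale() multiplies/divides with a 64-bit intermediate, and the
 * clamped option range guarantees the result fits in an int: */
wctx->audio_buffer_queue_size = av_rescale(wctx->queue, WHISPER_SAMPLE_RATE, 1000000);

(av_rescale() rounds to nearest rather than truncating like the original division, which should be harmless for a buffer size.)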
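And for the allocation, av_malloc_array() also catches an overflow in the byte-size computation (sketch; the NULL check is added here in case there isn't one already further down):

wctx->audio_buffer = av_malloc_array(wctx->audio_buffer_queue_size,
                                     sizeof(*wctx->audio_buffer));
if (!wctx->audio_buffer)
    return AVERROR(ENOMEM);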
>
> [...]
> > +static void run_transcription(AVFilterContext *ctx, AVDictionary **metadata, int end_pos)
> > +{
> > +    WhisperContext *wctx = ctx->priv;
> > +    end_pos = FFMAX(0, FFMIN(end_pos, wctx->audio_buffer_fill_size));
> > +
> > +    if (!wctx->ctx_wsp || end_pos == 0)
> > +        return;
> > +
> > +    float duration = (float) end_pos / WHISPER_SAMPLE_RATE;
> [...]
>
> > +    wctx->timestamp += duration * 1000;
>
> floats are not precise and the accumulated rounding errors will
> add up and lead to synchronization issues between the subtitles
> and audio or video over a long enough timespan
>
> Also for reproducibility this should use integers
>
> what you could do, is to use:
> wctx->timestamp += end_pos;
>
> and then replace every use of wctx->timestamp by
> wctx->timestamp / WHISPER_SAMPLE_RATE
>
> or wctx->timestamp / (double)WHISPER_SAMPLE_RATE if the context demands a
> double, for example
>
> that way the code is exact and no errors accumulate
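Agreed, that is cleaner. So wctx->timestamp becomes a plain sample counter, converted only where a time value is actually needed; roughly like this (ts_ms and ts_sec are illustrative names, not from the patch):

/* exact: accumulate whole samples, no rounded milliseconds */
wctx->timestamp += end_pos;

/* at a use site that wants milliseconds, still exact in integers: */
int64_t ts_ms = av_rescale(wctx->timestamp, 1000, WHISPER_SAMPLE_RATE);

/* or, where the context demands a double: */
double ts_sec = wctx->timestamp / (double)WHISPER_SAMPLE_RATE;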
>
> > +
> > +    if (metadata && segments_text) {
> > +        av_dict_set(metadata, "lavfi.whisper.text", segments_text, 0);
> > +        char *duration_text = av_asprintf("%f", duration);
> > +        av_dict_set(metadata, "lavfi.whisper.duration", duration_text, AV_DICT_DONT_STRDUP_VAL);
> > +    }
> > +    av_freep(&segments_text);
> > +
> > +    memcpy(wctx->audio_buffer, wctx->audio_buffer + end_pos, end_pos * sizeof(*wctx->audio_buffer));
> > +    wctx->audio_buffer_fill_size -= end_pos;
> > +    wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
> > +}
> > +
>
> > +static int filter_frame(AVFilterLink *inlink, AVFrame *frame)
> > +{
> > +    AVFilterContext *ctx = inlink->dst;
> > +    WhisperContext *wctx = ctx->priv;
> > +    AVFilterLink *outlink = ctx->outputs[0];
> > +    AVDictionary **metadata = &frame->metadata;
> > +
> > +    const int samples = frame->nb_samples;
> > +    const float *input_data = (const float *) frame->data[0];
> > +
> > +    if (wctx->audio_buffer_fill_size + samples > wctx->audio_buffer_queue_size) {
> > +        run_transcription(ctx, metadata, wctx->audio_buffer_fill_size);
> > +    }
> > +
> > +    memcpy(wctx->audio_buffer + wctx->audio_buffer_fill_size, input_data, samples * sizeof(*wctx->audio_buffer));
> > +    wctx->audio_buffer_fill_size += samples;
> > +
> > +    if (wctx->ctx_vad
> > +        && (wctx->audio_buffer_fill_size - wctx->audio_buffer_vad_size) >=
> > +        WHISPER_SAMPLE_RATE * (wctx->vad_min_speech_duration + wctx->vad_min_silence_duration) / 1000000) {
> > +        struct whisper_vad_segments *segments =
> > +            whisper_vad_segments_from_samples(wctx->ctx_vad, wctx->vad_params,
> > +                                              wctx->audio_buffer, wctx->audio_buffer_fill_size);
> > +        wctx->audio_buffer_vad_size = wctx->audio_buffer_fill_size;
> > +
> > +        if (!segments) {
> > +            av_log(ctx, AV_LOG_ERROR, "failed to detect VAD\n");
> > +        } else {
> > +            int n_segments = whisper_vad_segments_n_segments(segments);
> > +
> > +            if (n_segments > 0) {
> > +                const float start_ms = whisper_vad_segments_get_segment_t0(segments, 0) * 10.0;
> > +                const float end_ms = whisper_vad_segments_get_segment_t1(segments, n_segments - 1) * 10.0;
> > +                int end_pos = (int) (end_ms * WHISPER_SAMPLE_RATE / 1000);
> > +
> > +                if (end_pos <= wctx->audio_buffer_fill_size - WHISPER_SAMPLE_RATE * wctx->vad_min_silence_duration / 1000000) {
> > +                    av_log(ctx, AV_LOG_INFO,
> > +                           "VAD detected %d segments, start: %.0f ms, end: %.0f ms (buffer: %d ms)\n",
> > +                           n_segments, start_ms, end_ms, 1000 * wctx->audio_buffer_fill_size / WHISPER_SAMPLE_RATE);
> > +                    run_transcription(ctx, metadata, end_pos);
> > +                }
> > +            }
> > +
> > +            whisper_vad_free_segments(segments);
> > +        }
> > +    } else if (wctx->audio_buffer_fill_size >= wctx->audio_buffer_queue_size)
> > +        run_transcription(ctx, metadata, wctx->audio_buffer_fill_size);
> > +
> > +    wctx->next_pts = frame->pts + av_rescale_q(frame->nb_samples,
> > +                                               (AVRational) { 1, inlink->sample_rate },
> > +                                               inlink->time_base);
>
> I think you should consistently use samples or frame->nb_samples; they are
> the same value, I think.
>
> thx
>
> [...]
>
> --
> Michael     GnuPG fingerprint: 9FF2128B147EF6730BADF133611EC787040B0FAB
>
> Never trust a computer, one day, it may think you are the virus. -- Compn

--
/Vittorio Palmisano/