[FFmpeg-cvslog] [ffmpeg] avfilter/af_whisper: Add max_len parameter (branch master)

WyattBlue via ffmpeg-cvslog Wed, 04 Feb 2026 13:26:33 -0800

This is an automated email from the git hooks/post-receive script.

Git pushed a commit to branch master
in repository ffmpeg.


The following commit(s) were added to refs/heads/master by this push:
     new e48eaa8c62 avfilter/af_whisper: Add max_len parameter
e48eaa8c62 is described below

commit e48eaa8c6211069454b054e0cad4f8270fb563cc
Author:     WyattBlue <[email protected]>
AuthorDate: Sun Dec 21 23:51:15 2025 -0500
Commit:     Marton Balint <[email protected]>
CommitDate: Wed Feb 4 21:26:02 2026 +0000

    avfilter/af_whisper: Add max_len parameter
---
 doc/filters.texi         |  6 ++++++
 libavfilter/af_whisper.c | 12 ++++++++++++
 2 files changed, 18 insertions(+)

diff --git a/doc/filters.texi b/doc/filters.texi
index bd9f881aa1..22d0fcf90d 100644
--- a/doc/filters.texi
+++ b/doc/filters.texi
@@ -7768,6 +7768,12 @@ The destination format string; it could be "text" (only the transcribed text
 will be sent to the destination), "srt" (subtitle format) or "json".
 Default value: @code{"text"}
 
+@item max_len
+Maximum segment length in characters. When set to a value greater than 0,
+transcription segments will be split to not exceed this length. This is useful
+for generating subtitles with shorter lines.
+Default value: @code{"0"}
+
 @item vad_model
 Path to the VAD model file. If set, the filter will load an additional voice
 activity detection module (https://github.com/snakers4/silero-vad) that will be
diff --git a/libavfilter/af_whisper.c b/libavfilter/af_whisper.c
index fcc7e415cc..299a8bca7a 100644
--- a/libavfilter/af_whisper.c
+++ b/libavfilter/af_whisper.c
@@ -52,6 +52,7 @@ typedef struct WhisperContext {
     int64_t queue;
     char *destination;
     char *format;
+    int max_len;
 
     struct whisper_context *ctx_wsp;
     struct whisper_vad_context *ctx_vad;
@@ -204,6 +205,8 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
     params.print_progress = 0;
     params.print_realtime = 0;
     params.print_timestamps = 0;
+    params.max_len = wctx->max_len;
+    params.token_timestamps = (wctx->max_len > 0);
 
     if (whisper_full(wctx->ctx_wsp, params, wctx->audio_buffer, samples) != 0) {
         av_log(ctx, AV_LOG_ERROR, "Failed to process audio with whisper.cpp\n");
@@ -224,6 +227,14 @@ static void run_transcription(AVFilterContext *ctx, AVFrame *frame, int samples)
             continue;
         }
 
+        // Skip segments that are parts of [BLANK_AUDIO] when max_len splits them
+        if (wctx->max_len > 0 && (strcmp(text_cleaned, "[") == 0 || strcmp(text_cleaned, "]") == 0 ||
+                                  strcmp(text_cleaned, "BLANK") == 0 || strcmp(text_cleaned, "_") == 0 ||
+                                  strcmp(text_cleaned, "AUDIO") == 0)) {
+            av_freep(&text_cleaned);
+            continue;
+        }
+
         const bool turn = whisper_full_get_segment_speaker_turn_next(wctx->ctx_wsp, i);
         const int64_t t0_ms = whisper_full_get_segment_t0(wctx->ctx_wsp, i) * 10;
         const int64_t t1_ms = whisper_full_get_segment_t1(wctx->ctx_wsp, i) * 10;
@@ -437,6 +448,7 @@ static const AVOption whisper_options[] = {
     { "gpu_device", "GPU device to use", OFFSET(gpu_device), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
     { "destination", "Output destination", OFFSET(destination), AV_OPT_TYPE_STRING, {.str = ""}, .flags = FLAGS },
     { "format", "Output format (text|srt|json)", OFFSET(format), AV_OPT_TYPE_STRING, {.str = "text"},.flags = FLAGS },
+    { "max_len", "Max segment length in characters", OFFSET(max_len), AV_OPT_TYPE_INT, {.i64 = 0}, 0, INT_MAX, .flags = FLAGS },
     { "vad_model", "Path to the VAD model file", OFFSET(vad_model_path), AV_OPT_TYPE_STRING,.flags = FLAGS },
     { "vad_threshold", "VAD threshold", OFFSET(vad_threshold), AV_OPT_TYPE_FLOAT, {.dbl = 0.5}, 0.0, 1.0, .flags = FLAGS },
     { "vad_min_speech_duration", "Minimum speech duration for VAD", OFFSET(vad_min_speech_duration), AV_OPT_TYPE_DURATION, {.i64 = 100000}, 20000, HOURS, .flags = FLAGS },

_______________________________________________
ffmpeg-cvslog mailing list -- [email protected]
To unsubscribe send an email to [email protected]

[FFmpeg-cvslog] [ffmpeg] avfilter/af_whisper: Add max_len parameter (branch master)

Reply via email to