Re: [FFmpeg-devel] [PATCH] avformat: Implement subtitle charenc guessing

wm4 Sat, 13 Dec 2014 07:12:06 -0800

On Fri, 12 Dec 2014 00:05:27 -0600
Rodger Combs <rodger.co...@gmail.com> wrote:


> This also moves general charenc conversion from avcodec to avformat;
> the version in avcodec is left, but renamed; I'm not sure if that's
> the optimal solution.
> 
> The documentation could probably use some improvements, and a few more
> options could be added to ENCA.
> 
> This very simply prefers libguess over ENCA, and ENCA over uchardet, but
> will fall back on a less-preferred guess if something decodes wrong, and will
> drop illegal sequences in iconv if all else fails.
> 
> It'd be possible to have ffmpeg.c present a UI if multiple guesses are
> returned, and other library consumers could do the same.
> ---
>  configure                   |  15 +++
>  libavcodec/options_table.h  |   2 +-
>  libavformat/aqtitledec.c    |   2 +
>  libavformat/assdec.c        |   2 +
>  libavformat/avformat.h      |  50 +++++++++
>  libavformat/jacosubdec.c    |   2 +
>  libavformat/microdvddec.c   |   2 +
>  libavformat/mpl2dec.c       |   2 +
>  libavformat/mpsubdec.c      |   2 +
>  libavformat/options_table.h |   7 ++
>  libavformat/pjsdec.c        |   2 +
>  libavformat/realtextdec.c   |   2 +
>  libavformat/samidec.c       |   2 +
>  libavformat/srtdec.c        |   2 +
>  libavformat/stldec.c        |   2 +
>  libavformat/subtitles.c     | 262 
> +++++++++++++++++++++++++++++++++++++++++++-
>  libavformat/subtitles.h     |   1 +
>  libavformat/subviewer1dec.c |   2 +
>  libavformat/subviewerdec.c  |   2 +
>  libavformat/utils.c         |   2 +
>  libavformat/vplayerdec.c    |   2 +
>  libavformat/webvttdec.c     |   2 +
>  22 files changed, 365 insertions(+), 4 deletions(-)
> 
> diff --git a/configure b/configure
> index e2e3619..a5a9f9b 100755
> --- a/configure
> +++ b/configure
> @@ -199,6 +199,9 @@ External library support:
>    --enable-gnutls          enable gnutls, needed for https support
>                             if openssl is not used [no]
>    --disable-iconv          disable iconv [autodetect]
> +  --disable-libguess       disable libguess [autodetect]
> +  --disable-uchardet       disable universalchardet [autodetect]
> +  --enable-enca            disable enca [no]
>    --enable-ladspa          enable LADSPA audio filtering [no]
>    --enable-libaacplus      enable AAC+ encoding via libaacplus [no]
>    --enable-libass          enable libass subtitles rendering,
> @@ -1342,6 +1345,9 @@ EXTERNAL_LIBRARY_LIST="
>      frei0r
>      gnutls
>      iconv
> +    libguess
> +    uchardet
> +    enca
>      ladspa
>      libaacplus
>      libass
> @@ -4358,6 +4364,7 @@ die_license_disabled gpl libxavs
>  die_license_disabled gpl libxvid
>  die_license_disabled gpl libzvbi
>  die_license_disabled gpl x11grab
> +die_license_disabled gpl enca
>  
>  die_license_disabled nonfree libaacplus
>  die_license_disabled nonfree libfaac
> @@ -5117,6 +5124,14 @@ enabled vdpau && enabled xlib &&
>  # Funny iconv installations are not unusual, so check it after all flags 
> have been set
>  disabled iconv || check_func_headers iconv.h iconv || check_lib2 iconv.h 
> iconv -liconv || disable iconv
>  
> +disabled iconv || disabled libguess || disable libguess && {
> +    check_pkg_config libguess libguess.h libguess_determine_encoding && 
> require_pkg_config libguess libguess.h libguess_determine_encoding && enable 
> libguess;
> +}
> +disabled iconv || disabled uchardet || disable uchardet && {
> +    check_pkg_config uchardet uchardet.h uchardet_new && require_pkg_config 
> uchardet uchardet.h uchardet_new && enable uchardet;
> +}
> +enabled enca && check_func_headers enca.h enca_analyse || check_lib2 enca.h 
> enca_analyse -lenca || die "ERROR: enca not found"
> +
>  enabled debug && add_cflags -g"$debuglevel" && add_asflags -g"$debuglevel"
>  
>  # add some useful compiler flags if supported
> diff --git a/libavcodec/options_table.h b/libavcodec/options_table.h
> index 1d5b078..93b3105 100644
> --- a/libavcodec/options_table.h
> +++ b/libavcodec/options_table.h
> @@ -472,7 +472,7 @@ static const AVOption avcodec_options[] = {
>  {"ka", "Karaoke",            0, AV_OPT_TYPE_CONST, {.i64 = 
> AV_AUDIO_SERVICE_TYPE_KARAOKE },           INT_MIN, INT_MAX, A|E, 
> "audio_service_type"},
>  {"request_sample_fmt", "sample format audio decoders should prefer", 
> OFFSET(request_sample_fmt), AV_OPT_TYPE_SAMPLE_FMT, 
> {.i64=AV_SAMPLE_FMT_NONE}, -1, INT_MAX, A|D, "request_sample_fmt"},
>  {"pkt_timebase", NULL, OFFSET(pkt_timebase), AV_OPT_TYPE_RATIONAL, {.dbl = 0 
> }, 0, INT_MAX, 0},
> -{"sub_charenc", "set input text subtitles character encoding", 
> OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, 
> S|D},
> +{"sub_charenc_lavc", "set input text subtitles character encoding", 
> OFFSET(sub_charenc), AV_OPT_TYPE_STRING, {.str = NULL}, CHAR_MIN, CHAR_MAX, 
> S|D},
>  {"sub_charenc_mode", "set input text subtitles character encoding mode", 
> OFFSET(sub_charenc_mode), AV_OPT_TYPE_FLAGS, {.i64 = 
> FF_SUB_CHARENC_MODE_AUTOMATIC}, -1, INT_MAX, S|D, "sub_charenc_mode"},
>  {"do_nothing",  NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 
> FF_SUB_CHARENC_MODE_DO_NOTHING},  INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
>  {"auto",        NULL, 0, AV_OPT_TYPE_CONST, {.i64 = 
> FF_SUB_CHARENC_MODE_AUTOMATIC},   INT_MIN, INT_MAX, S|D, "sub_charenc_mode"},
> diff --git a/libavformat/aqtitledec.c b/libavformat/aqtitledec.c
> index 9508766..65aa3e4 100644
> --- a/libavformat/aqtitledec.c
> +++ b/libavformat/aqtitledec.c
> @@ -55,6 +55,8 @@ static int aqt_read_header(AVFormatContext *s)
>      int64_t pos = 0, frame = AV_NOPTS_VALUE;
>      AVPacket *sub = NULL;
>  
> +    aqt->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, aqt->frame_rate.den, aqt->frame_rate.num);
> diff --git a/libavformat/assdec.c b/libavformat/assdec.c
> index c62e76f..958792b 100644
> --- a/libavformat/assdec.c
> +++ b/libavformat/assdec.c
> @@ -114,6 +114,8 @@ static int ass_read_header(AVFormatContext *s)
>      FFTextReader tr;
>      ff_text_init_avio(s, &tr, s->pb);
>  
> +    ass->q.avctx = s;
> +
>      st = avformat_new_stream(s, NULL);
>      if (!st)
>          return AVERROR(ENOMEM);
> diff --git a/libavformat/avformat.h b/libavformat/avformat.h
> index 2e54ed1..8c5fa7e 100644
> --- a/libavformat/avformat.h
> +++ b/libavformat/avformat.h
> @@ -1755,6 +1755,56 @@ typedef struct AVFormatContext {
>       * - demuxing: Set by user.
>       */
>      uint8_t *dump_separator;
> +
> +    /**
> +     * Character encoding of a subtitle file
> +     * - encoding: unused
> +     * - decoding: Set by user via AVOptions; may be changed after 
> initialization
> +     */
> +    char *sub_charenc;
> +
> +    /**
> +     * Array of guesses for the character encoding
> +     * - encoding: unused
> +     * - decoding: Set by demuxer
> +     */
> +    int nb_sub_charenc_guesses;
> +    char **sub_charenc_guesses;
> +
> +    /**
> +     * Language to pass to libguess for charenc detection.
> +     * - encoding: unused
> +     * - decoding: Set by user via AVOptions (NO direct access)
> +     */
> +    char *libguess_language;
> +
> +    /**
> +     * Language to pass to libenca for charenc detection.
> +     * - encoding: unused
> +     * - decoding: Set by user via AVOptions (NO direct access)
> +     */
> +    char *enca_language;
> +
> +    /**
> +     * Threshold parameter for libenca charenc detection.
> +     * - encoding: unused
> +     * - decoding: Set by user via AVOptions (NO direct access)
> +     */
> +    double enca_threshold;
> +
> +    /**
> +     * Whether or not to check for multibyte charsets in libenca.
> +     * - encoding: unused
> +     * - decoding: Set by user via AVOptions (NO direct access)
> +     */
> +    int enca_multibyte;
> +
> +    /**
> +     * Whether or not to let libenca return an ambiguous result.
> +     * - encoding: unused
> +     * - decoding: Set by user via AVOptions (NO direct access)
> +     */
> +    int enca_ambiguity;
>  } AVFormatContext;
>  
>  int av_format_get_probe_score(const AVFormatContext *s);
> diff --git a/libavformat/jacosubdec.c b/libavformat/jacosubdec.c
> index 1ca0055..fa332fa 100644
> --- a/libavformat/jacosubdec.c
> +++ b/libavformat/jacosubdec.c
> @@ -170,6 +170,8 @@ static int jacosub_read_header(AVFormatContext *s)
>      st->codec->codec_type = AVMEDIA_TYPE_SUBTITLE;
>      st->codec->codec_id   = AV_CODEC_ID_JACOSUB;
>  
> +    jacosub->q.avctx = s;
> +
>      jacosub->timeres = 30;
>  
>      av_bprint_init(&header, 1024+FF_INPUT_BUFFER_PADDING_SIZE, 4096);
> diff --git a/libavformat/microdvddec.c b/libavformat/microdvddec.c
> index ce3433c..5c3b48c 100644
> --- a/libavformat/microdvddec.c
> +++ b/libavformat/microdvddec.c
> @@ -85,6 +85,8 @@ static int microdvd_read_header(AVFormatContext *s)
>      char line_buf[MAX_LINESIZE];
>      int has_real_fps = 0;
>  
> +    microdvd->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>  
> diff --git a/libavformat/mpl2dec.c b/libavformat/mpl2dec.c
> index 260b7be..fa431c3 100644
> --- a/libavformat/mpl2dec.c
> +++ b/libavformat/mpl2dec.c
> @@ -77,6 +77,8 @@ static int mpl2_read_header(AVFormatContext *s)
>      AVStream *st = avformat_new_stream(s, NULL);
>      int res = 0;
>  
> +    mpl2->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 10);
> diff --git a/libavformat/mpsubdec.c b/libavformat/mpsubdec.c
> index eddc594..7bb08f9 100644
> --- a/libavformat/mpsubdec.c
> +++ b/libavformat/mpsubdec.c
> @@ -61,6 +61,8 @@ static int mpsub_read_header(AVFormatContext *s)
>      float multiplier = 100.0;
>      float current_pts = 0;
>  
> +    mpsub->q.avctx = s;
> +
>      av_bprint_init(&buf, 0, AV_BPRINT_SIZE_UNLIMITED);
>  
>      while (!avio_feof(s->pb)) {
> diff --git a/libavformat/options_table.h b/libavformat/options_table.h
> index 40f1e0a..741bfb2 100644
> --- a/libavformat/options_table.h
> +++ b/libavformat/options_table.h
> @@ -22,6 +22,7 @@
>  #define AVFORMAT_OPTIONS_TABLE_H
>  
>  #include <limits.h>
> +#include <float.h>  /* DBL_MAX */
>  
>  #include "libavutil/opt.h"
>  #include "avformat.h"
> @@ -99,6 +100,12 @@ static const AVOption avformat_options[] = {
>  {"dump_separator", "set information dump field separator", 
> OFFSET(dump_separator), AV_OPT_TYPE_STRING, {.str = ", "}, CHAR_MIN, 
> CHAR_MAX, D|E},
>  {"codec_whitelist", "List of decoders that are allowed to be used", 
> OFFSET(codec_whitelist), AV_OPT_TYPE_STRING, { .str = NULL },  CHAR_MIN, 
> CHAR_MAX, D },
>  {"format_whitelist", "List of demuxers that are allowed to be used", 
> OFFSET(format_whitelist), AV_OPT_TYPE_STRING, { .str = NULL },  CHAR_MIN, 
> CHAR_MAX, D },
> +{"sub_charenc", "subtitle character encoding", OFFSET(sub_charenc), 
> AV_OPT_TYPE_STRING, { .str = NULL }, CHAR_MIN, CHAR_MAX, D },
> +{"libguess_language", "Language parameter for libguess charenc detection", 
> OFFSET(libguess_language), AV_OPT_TYPE_STRING, { .str = NULL }, CHAR_MIN, 
> CHAR_MAX, D },
> +{"enca_language",     "Language parameter for enca charenc detection",     
> OFFSET(enca_language),     AV_OPT_TYPE_STRING, { .str = NULL }, CHAR_MIN, 
> CHAR_MAX, D },
> +{"enca_threshold",    "Threshold parameter for enca charenc detection",    
> OFFSET(enca_threshold),    AV_OPT_TYPE_DOUBLE, { .dbl = 1.38 }, 1.0,      
> DBL_MAX,  D },
> +{"enca_multibyte",    "Whether or not to allow enca to guess multibyte 
> charsets", OFFSET(enca_multibyte), AV_OPT_TYPE_INT, { .i64 = 1 },   0,        
> 1,        D },
> +{"enca_ambiguity",    "Whether or not to allow enca to return ambiguous 
> results", OFFSET(enca_ambiguity), AV_OPT_TYPE_INT, { .i64 = 1 },   0,        
> 1,        D },
>  {NULL},
>  };
>  
> diff --git a/libavformat/pjsdec.c b/libavformat/pjsdec.c
> index 5129b70..252e9d9 100644
> --- a/libavformat/pjsdec.c
> +++ b/libavformat/pjsdec.c
> @@ -67,6 +67,8 @@ static int pjs_read_header(AVFormatContext *s)
>      AVStream *st = avformat_new_stream(s, NULL);
>      int res = 0;
>  
> +    pjs->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 10);
> diff --git a/libavformat/realtextdec.c b/libavformat/realtextdec.c
> index fff85d6..d20f0c5 100644
> --- a/libavformat/realtextdec.c
> +++ b/libavformat/realtextdec.c
> @@ -67,6 +67,8 @@ static int realtext_read_header(AVFormatContext *s)
>      FFTextReader tr;
>      ff_text_init_avio(s, &tr, s->pb);
>  
> +    rt->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 100);
> diff --git a/libavformat/samidec.c b/libavformat/samidec.c
> index 948e1ed..968f506 100644
> --- a/libavformat/samidec.c
> +++ b/libavformat/samidec.c
> @@ -56,6 +56,8 @@ static int sami_read_header(AVFormatContext *s)
>      FFTextReader tr;
>      ff_text_init_avio(s, &tr, s->pb);
>  
> +    sami->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 1000);
> diff --git a/libavformat/srtdec.c b/libavformat/srtdec.c
> index b35e50f..3187490 100644
> --- a/libavformat/srtdec.c
> +++ b/libavformat/srtdec.c
> @@ -89,6 +89,8 @@ static int srt_read_header(AVFormatContext *s)
>      FFTextReader tr;
>      ff_text_init_avio(s, &tr, s->pb);
>  
> +    srt->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 1000);
> diff --git a/libavformat/stldec.c b/libavformat/stldec.c
> index b84c7e9..5d96737 100644
> --- a/libavformat/stldec.c
> +++ b/libavformat/stldec.c
> @@ -74,6 +74,8 @@ static int stl_read_header(AVFormatContext *s)
>      STLContext *stl = s->priv_data;
>      AVStream *st = avformat_new_stream(s, NULL);
>  
> +    stl->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 100);
> diff --git a/libavformat/subtitles.c b/libavformat/subtitles.c
> index 67624fc..e953080 100644
> --- a/libavformat/subtitles.c
> +++ b/libavformat/subtitles.c
> @@ -21,9 +21,23 @@
>  #include "avformat.h"
>  #include "subtitles.h"
>  #include "avio_internal.h"
> +#include "internal.h"
>  #include "libavutil/avassert.h"
>  #include "libavutil/avstring.h"
>  
> +#if CONFIG_ICONV
> +# include <iconv.h>
> +#endif
> +#if CONFIG_LIBGUESS
> +# include <libguess.h>
> +#endif
> +#if CONFIG_ENCA
> +# include <enca.h>
> +#endif
> +#if CONFIG_UCHARDET
> +# include <uchardet.h>
> +#endif
> +
>  void ff_text_init_avio(void *s, FFTextReader *r, AVIOContext *pb)
>  {
>      int i;
> @@ -166,26 +180,268 @@ static int cmp_pkt_sub_pos_ts(const void *a, const 
> void *b)
>      return s1->pos > s2->pos ? 1 : -1;
>  }
>  
> +/**
> + * Add a character encoding guess to an AVFormatContext's list
> + *
> + * @param avctx the context to add to
> + * @param enc   the encoding name to add
> + *
> + * A copy is added, so the original string should be free()d if necessary.
> + * If the same encoding name is already present, it isn't added again.
> + * If NULL or an empty string is passed, it's not added.
> + */
> +static void add_charenc(AVFormatContext *avctx, const char *enc)
> +{
> +    char *copy;
> +
> +    if (!enc || !enc[0])
> +        return;
> +
> +    for (unsigned i = 0; i < avctx->nb_sub_charenc_guesses; i++)
> +        if (!strcmp(avctx->sub_charenc_guesses[i], enc))
> +            return;
> +
> +    copy = av_strdup(enc);
> +    if (!copy)
> +        return;
> +
> +    dynarray_add(&avctx->sub_charenc_guesses, &avctx->nb_sub_charenc_guesses,
> +                 copy);
> +}
> +
> +/**
> + * Finish an FFDemuxSubtitlesQueue and prepare it for reading
> + *
> + * @param q the queue to finish
> + *
> + * This sorts packets by position and/or timestamp, adjusts durations for
> + * formats that don't set them, and (if enabled) builds a text buffer for the
> + * charenc detectors.
> + * If enabled, it then checks the buffer with each available charenc 
> detector,
> + * builds a list of guesses, and sets the AVFormatContext's encoding to its
> + * best candidate.
> + */
>  void ff_subtitles_queue_finalize(FFDemuxSubtitlesQueue *q)
>  {
>      int i;
> +    char *charenc_buf = NULL;
> +    int charenc_buf_size = 0, charenc_buf_len = 0;
> +    AVFormatContext *avctx = q->avctx;
> +    // Whether or not we're doing charenc detection here
> +    int detection = avctx && avctx->sub_charenc &&
> +                    !strcmp(avctx->sub_charenc, "auto");
>  
>      qsort(q->subs, q->nb_subs, sizeof(*q->subs),
>            q->sort == SUB_SORT_TS_POS ? cmp_pkt_sub_ts_pos
>                                       : cmp_pkt_sub_pos_ts);
> -    for (i = 0; i < q->nb_subs; i++)
> +    for (i = 0; i < q->nb_subs; i++) {
>          if (q->subs[i].duration == -1 && i < q->nb_subs - 1)
>              q->subs[i].duration = q->subs[i + 1].pts - q->subs[i].pts;
> +
> +        if (detection) {
> +            char *newbuf = av_fast_realloc(charenc_buf, &charenc_buf_size,
> +                                           charenc_buf_len + 
> q->subs[i].size);
> +            if (!newbuf)
> +                continue;
> +
> +            charenc_buf = newbuf;
> +
> +            memcpy(charenc_buf + charenc_buf_len, q->subs[i].data,
> +                   q->subs[i].size);
> +            charenc_buf_len += q->subs[i].size;
> +        }
> +    }
> +
> +    if (detection) {
> +#if CONFIG_LIBGUESS
> +        if (avctx->libguess_language) {
> +            const char *enc =
> +                libguess_determine_encoding(charenc_buf,
> +                                            charenc_buf_len,
> +                                            avctx->libguess_language);
> +            av_log(avctx, AV_LOG_INFO, "libguess selected: %s\n", enc);
> +            add_charenc(avctx, enc);
> +        }
> +#endif
> +#if CONFIG_ENCA
> +        if (avctx->enca_language) {
> +            EncaAnalyser an = enca_analyser_alloc(avctx->enca_language);
> +            if (an) {
> +                EncaEncoding enc;
> +                const char *str;
> +                enca_set_threshold(an, avctx->enca_threshold);
> +                enca_set_multibyte(an, avctx->enca_multibyte);
> +                enca_set_ambiguity(an, avctx->enca_ambiguity);
> +                enca_set_garbage_test(an, 1);
> +
> +                enc = enca_analyse_const(an, charenc_buf, charenc_buf_len);
> +
> +                str = enca_charset_name(enc.charset, ENCA_NAME_STYLE_ICONV);
> +                av_log(avctx, AV_LOG_INFO, "ENCA selected: %s\n", str);
> +                if (enca_charset_is_known(enc.charset))
> +                    add_charenc(avctx, str);
> +
> +                enca_analyser_free(an);
> +            } else {
> +                av_log(avctx, AV_LOG_ERROR, "ENCA allocation failed\n");
> +            }
> +        }
> +#endif
> +#if CONFIG_UCHARDET
> +        {
> +            uchardet_t det = uchardet_new();
> +            if (det) {
> +                const char *enc;
> +                uchardet_handle_data(det, charenc_buf, charenc_buf_len);
> +                uchardet_data_end(det);
> +                enc = uchardet_get_charset(det);
> +                av_log(avctx, AV_LOG_INFO, "uchardet selected: %s\n", enc);
> +                add_charenc(avctx, enc);
> +                uchardet_delete(det);
> +            }
> +        }
> +#endif
> +
> +        av_freep(&avctx->sub_charenc);
> +
> +        if (avctx->nb_sub_charenc_guesses)
> +            avctx->sub_charenc = av_strdup(avctx->sub_charenc_guesses[0]);
> +    }
> +}
> +
> +#define UTF8_MAX_BYTES 4 /* 5 and 6 bytes sequences should not be used */
> +/**
> + * Convert an AVPacket from one character encoding to another, using the
> + * selected encoding from an AVFormatContext and falling back on other 
> encoding
> + * guesses if necessary.
> + *
> + * @param avctx  the AVFormatContext whose character encodings we'll use
> + * @param outpkt the AVPacket to write to
> + * @param inpkt  the AVPacket to read from
> + *
> + * This first tries the AVFormatContext's sub_charenc, then falls back on its
> + * sub_charenc_guesses. If none decodes successfully, it tries sub_charenc
> + * again, but instructs iconv to keep chugging on illegal sequences.
> + * If the packet is successfully recoded with an encoding other than the
> + * sub_charenc, then sub_charenc is changed to the working encoding.
> + */
> +static int recode_subtitle(AVFormatContext *avctx,
> +                           AVPacket *outpkt, const AVPacket *inpkt)
> +{
> +#if CONFIG_ICONV
> +    iconv_t cd = (iconv_t)-1;
> +    int ret = 0;
> +    char *inb, *outb;
> +    size_t inl, outl;
> +    AVPacket tmp;
> +    int i;
> +#endif
> +
> +    // Set attributes on the output packet that aren't covered by
> +    // av_copy_packet, like the pts and duration.
> +    *outpkt = *inpkt;
> +
> +    if (av_copy_packet(outpkt, inpkt))
> +        return AVERROR(ENOMEM);
> +
> +    if (!avctx || !avctx->sub_charenc || inpkt->size == 0)
> +        return 0;
> +
> +#if CONFIG_ICONV
> +    inb = inpkt->data;
> +    inl = inpkt->size;
> +
> +    if (inl >= INT_MAX / UTF8_MAX_BYTES - FF_INPUT_BUFFER_PADDING_SIZE) {
> +        av_log(avctx, AV_LOG_ERROR, "Subtitles packet is too big for 
> recoding\n");
> +        ret = AVERROR(ENOMEM);
> +        goto end;
> +    }
> +
> +    // Allocate a dummy packet that holds new buffers
> +    ret = av_new_packet(&tmp, inl * UTF8_MAX_BYTES);
> +    if (ret < 0)
> +        goto end;
> +
> +    for (i = -1; i <= avctx->nb_sub_charenc_guesses; i++) {
> +        const char *encoding;
> +        // If this is our last attempt, skip illegal sequences.
> +        int discard_illegal = (i == avctx->nb_sub_charenc_guesses);
> +
> +        // Reset our buffers and sizes every time, as iconv might change 
> them.
> +        outpkt->buf  = tmp.buf;
> +        outpkt->data = tmp.data;
> +        outpkt->size = tmp.size;
> +        outb = outpkt->data;
> +        outl = outpkt->size;
> +
> +        // The encoding we're going to try. We use sub_charenc first, then 
> try
> +        // our array of guesses, then try sub_charenc again with illegal
> +        // sequences enabled.
> +        if (i == -1 || i == avctx->nb_sub_charenc_guesses)
> +            encoding = avctx->sub_charenc;
> +        else
> +            encoding = avctx->sub_charenc_guesses[i];
> +
> +        cd = iconv_open("UTF-8", encoding);
> +        if (cd == (iconv_t)-1) {
> +            av_log(avctx, AV_LOG_WARNING, "Invalid character encoding: %s\n",
> +                   encoding);
> +            ret = AVERROR(EINVAL);
> +            continue;
> +        }
> +
> +        iconvctl(cd, ICONV_SET_DISCARD_ILSEQ, &discard_illegal);
> +
> +        // Try to run a conversion.
> +        if (iconv(cd, &inb, &inl, &outb, &outl) != (size_t)-1 &&
> +            iconv(cd, NULL, NULL, &outb, &outl) != (size_t)-1 &&
> +            outl < outpkt->size && inl == 0) {
> +            // Success, save the new encoding and get out.
> +            if (discard_illegal) {
> +                av_log(avctx, AV_LOG_WARNING, "Needed to discard illegal "
> +                       "sequences while recoding subtitle event \"%s\" from 
> %s "
> +                       "to UTF-8\n", inpkt->data, avctx->sub_charenc);
> +            } else if (i >= 0) {
> +                av_log(avctx, AV_LOG_INFO, "Switching character encoding 
> from "
> +                       "from %s to %s\n", avctx->sub_charenc, encoding);
> +                av_freep(&avctx->sub_charenc);
> +                avctx->sub_charenc = av_strdup(encoding);
> +            }
> +
> +            // Remove and zero extra buffer space that iconv didn't end up 
> using
> +            outpkt->size -= outl;
> +            memset(outpkt->data + outpkt->size, 0, outl);
> +            iconv_close(cd);
> +            return 0;
> +        }
> +
> +        ret = FFMIN(AVERROR(errno), -1);
> +        iconv_close(cd);
> +    }
> +
> +    av_log(avctx, AV_LOG_ERROR, "Unable to recode subtitle event \"%s\" "
> +           "from %s to UTF-8\n", inpkt->data, avctx->sub_charenc);
> +    av_free_packet(&tmp);
> +
> +end:
> +    if (cd != (iconv_t)-1)
> +        iconv_close(cd);
> +    return ret;
> +#else
> +    av_log(avctx, AV_LOG_ERROR, "requesting subtitles recoding without 
> iconv");
> +    return AVERROR(EINVAL);
> +#endif
>  }
>  
>  int ff_subtitles_queue_read_packet(FFDemuxSubtitlesQueue *q, AVPacket *pkt)
>  {
>      AVPacket *sub = q->subs + q->current_sub_idx;
> +    int ret;
>  
>      if (q->current_sub_idx == q->nb_subs)
>          return AVERROR_EOF;
> -    if (av_copy_packet(pkt, sub) < 0) {
> -        return AVERROR(ENOMEM);
> +    if ((ret = recode_subtitle(q->avctx, pkt, sub)) < 0) {
> +        return ret;
>      }
>  
>      pkt->dts = pkt->pts;
> diff --git a/libavformat/subtitles.h b/libavformat/subtitles.h
> index eb719ea..69ced11 100644
> --- a/libavformat/subtitles.h
> +++ b/libavformat/subtitles.h
> @@ -100,6 +100,7 @@ int ff_text_peek_r8(FFTextReader *r);
>  void ff_text_read(FFTextReader *r, char *buf, size_t size);
>  
>  typedef struct {
> +    AVFormatContext *avctx; ///< AVFormat context; used for charenc 
> parameters
>      AVPacket *subs;         ///< array of subtitles packets
>      int nb_subs;            ///< number of subtitles packets
>      int allocated_size;     ///< allocated size for subs
> diff --git a/libavformat/subviewer1dec.c b/libavformat/subviewer1dec.c
> index 6b38533..35303ce 100644
> --- a/libavformat/subviewer1dec.c
> +++ b/libavformat/subviewer1dec.c
> @@ -47,6 +47,8 @@ static int subviewer1_read_header(AVFormatContext *s)
>      SubViewer1Context *subviewer1 = s->priv_data;
>      AVStream *st = avformat_new_stream(s, NULL);
>  
> +    subviewer1->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 1);
> diff --git a/libavformat/subviewerdec.c b/libavformat/subviewerdec.c
> index f1b0fdf..1197a0c 100644
> --- a/libavformat/subviewerdec.c
> +++ b/libavformat/subviewerdec.c
> @@ -76,6 +76,8 @@ static int subviewer_read_header(AVFormatContext *s)
>      int duration = -1;
>      AVPacket *sub = NULL;
>  
> +    subviewer->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 100);
> diff --git a/libavformat/utils.c b/libavformat/utils.c
> index 5a2a72d..052b2fc 100644
> --- a/libavformat/utils.c
> +++ b/libavformat/utils.c
> @@ -3597,6 +3597,8 @@ void avformat_free_context(AVFormatContext *s)
>      av_dict_free(&s->metadata);
>      av_freep(&s->streams);
>      av_freep(&s->internal);
> +    while (s->nb_sub_charenc_guesses--)
> +        av_freep(&s->sub_charenc_guesses[s->nb_sub_charenc_guesses]);
>      flush_packet_queue(s);
>      av_free(s);
>  }
> diff --git a/libavformat/vplayerdec.c b/libavformat/vplayerdec.c
> index 619ccfd..7cd3363 100644
> --- a/libavformat/vplayerdec.c
> +++ b/libavformat/vplayerdec.c
> @@ -59,6 +59,8 @@ static int vplayer_read_header(AVFormatContext *s)
>      VPlayerContext *vplayer = s->priv_data;
>      AVStream *st = avformat_new_stream(s, NULL);
>  
> +    vplayer->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 100);
> diff --git a/libavformat/webvttdec.c b/libavformat/webvttdec.c
> index e457e8f..4d82cca 100644
> --- a/libavformat/webvttdec.c
> +++ b/libavformat/webvttdec.c
> @@ -64,6 +64,8 @@ static int webvtt_read_header(AVFormatContext *s)
>      int res = 0;
>      AVStream *st = avformat_new_stream(s, NULL);
>  
> +    webvtt->q.avctx = s;
> +
>      if (!st)
>          return AVERROR(ENOMEM);
>      avpriv_set_pts_info(st, 64, 1, 1000);

I think this might be fine if
1. the charset detection code is moved somewhere else (lavu?), and
2. detection is an explicit function called by the API user (where you
   also could pass parameters like the charset, and receive information
   like the set of guessed codepages).

In general, the comments of others that this stuff shouldn't be
in lavc or lavu are probably correct.
_______________________________________________
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel

Re: [FFmpeg-devel] [PATCH] avformat: Implement subtitle charenc guessing

Reply via email to