[FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support
Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents to UTF-8 on the fly using FFTextReader, which acts as converting wrapper around AVIOContext. It also can work on a static buffer, needed for format probing. The FFTextReader wrapper now also takes care of skipping the UTF-8 BOM. --- libavformat/assdec.c| 19 +-- libavformat/subtitles.c | 63 + libavformat/subtitles.h | 51 +++ 3 files changed, 126 insertions(+), 7 deletions(-) diff --git a/libavformat/assdec.c b/libavformat/assdec.c index bb953c7..a5f792a 100644 --- a/libavformat/assdec.c +++ b/libavformat/assdec.c @@ -33,10 +33,13 @@ typedef struct ASSContext { static int ass_probe(AVProbeData *p) { -const char *header = "[Script Info]"; +char buf[13]; +FFTextReader tr; +ff_text_init_buf(&tr, p->buf, p->buf_size); -if (!memcmp(p->buf, header, strlen(header)) || -!memcmp(p->buf + 3, header, strlen(header))) +ff_text_read(&tr, buf, sizeof(buf)); + +if (!memcmp(buf, "[Script Info]", 13)) return AVPROBE_SCORE_MAX; return 0; @@ -66,13 +69,13 @@ static int read_ts(const uint8_t *p, int64_t *start, int *duration) return -1; } -static int64_t get_line(AVBPrint *buf, AVIOContext *pb) +static int64_t get_line(AVBPrint *buf, FFTextReader *tr) { -int64_t pos = avio_tell(pb); +int64_t pos = ff_text_pos(tr); av_bprint_clear(buf); for (;;) { -char c = avio_r8(pb); +char c = ff_text_r8(tr); if (!c) break; av_bprint_chars(buf, c, 1); @@ -88,6 +91,8 @@ static int ass_read_header(AVFormatContext *s) AVBPrint header, line; int header_remaining, res = 0; AVStream *st; +FFTextReader tr; +ff_text_init_avio(&tr, s->pb); st = avformat_new_stream(s, NULL); if (!st) @@ -102,7 +107,7 @@ static int ass_read_header(AVFormatContext *s) av_bprint_init(&line, 0, AV_BPRINT_SIZE_UNLIMITED); for (;;) { -int64_t pos = get_line(&line, s->pb); +int64_t pos = get_line(&line, &tr); if (!line.str[0]) // EOF break; diff --git a/libavformat/subtitles.c b/libavformat/subtitles.c index fce2bf1..47a52ec 100644 --- a/libavformat/subtitles.c +++ 
b/libavformat/subtitles.c @@ -20,9 +20,72 @@ #include "avformat.h" #include "subtitles.h" +#include "avio_internal.h" #include "libavutil/avassert.h" #include "libavutil/avstring.h" +void ff_text_init_avio(FFTextReader *r, AVIOContext *pb) +{ +int i; +r->pb = pb; +r->buf_pos = r->buf_len = 0; +r->type = 0; +for (i = 0; i < 2; i++) +r->buf[r->buf_len++] = avio_r8(r->pb); +if (strncmp("\xFF\xFE", r->buf, 2) == 0) { +r->type = 1; // UTF16LE +r->buf_pos += 2; +} else if (strncmp("\xFE\xFF", r->buf, 2) == 0) { +r->type = 2; // UTF16BE +r->buf_pos += 2; +} else { +r->buf[r->buf_len++] = avio_r8(r->pb); +if (strncmp("\xEF\xBB\xBF", r->buf, 3) == 0) { +// UTF8 +r->buf_pos += 3; +} +} +} + +void ff_text_init_buf(FFTextReader *r, void *buf, size_t size) +{ +memset(&r->buf_pb, 0, sizeof(r->buf_pb)); +ffio_init_context(&r->buf_pb, buf, size, 0, NULL, NULL, NULL, NULL); +ff_text_init_avio(r, &r->buf_pb); +} + +int64_t ff_text_pos(FFTextReader *r) +{ +return avio_tell(r->pb) - r->buf_len + r->buf_pos; +} + +int ff_text_r8(FFTextReader *r) +{ +uint32_t val; +uint8_t tmp; +if (r->buf_pos < r->buf_len) +return r->buf[r->buf_pos++]; +if (r->type == 1) { +GET_UTF16(val, avio_rl16(r->pb), return 0;) +} else if (r->type == 2) { +GET_UTF16(val, avio_rb16(r->pb), return 0;) +} else { +return avio_r8(r->pb); +} +if (!val) +return 0; +r->buf_pos = 0; +r->buf_len = 0; +PUT_UTF8(val, tmp, r->buf[r->buf_len++] = tmp;) +return r->buf[r->buf_pos++]; // buf_len is at least 1 +} + +void ff_text_read(FFTextReader *r, char *buf, size_t size) +{ +for ( ; size > 0; size--) +*buf++ = ff_text_r8(r); +} + AVPacket *ff_subtitles_queue_insert(FFDemuxSubtitlesQueue *q, const uint8_t *event, int len, int merge) { diff --git a/libavformat/subtitles.h b/libavformat/subtitles.h index b5a96ec..9549b32 100644 --- a/libavformat/subtitles.h +++ b/libavformat/subtitles.h @@ -31,6 +31,57 @@ enum sub_sort { }; typedef struct { +int type; +AVIOContext *pb; +unsigned char buf[8]; +int buf_pos, buf_len; +AVIOContext buf_pb; +} FFTextReader; + +/** + * Initialize the FFTextReader from 
the given AVIOContext. This function will + * read some bytes from pb, and test for UTF-8 or UTF-16 BOMs. Further accesses + * to FFTextReader will read more data from pb. + * + * The purpose of FFTextReader is to transparently convert read data to UTF-8 + * if the stream had a UTF-16 BOM. + * + * @param r object which will be initialized + * @param pb stream to read
Re: [FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support
On Tue, Sep 02, 2014 at 08:56:09PM +0200, wm4 wrote: Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents to UTF-8 on the fly using FFTextReader, which acts as converting wrapper around AVIOContext. It also can work on a static buffer, needed for format probing. The FFTextReader wrapper now also takes care of skipping the UTF-8 BOM. Haven't reviewed it in detail, but shouldn't it also detect anything with a 0 byte in the first 2 characters as UTF-16? Interpreting it as any other text format is unlikely to work anyway, and I think most subtitle formats will start with an ASCII character, giving near 100% reliability without any BOM. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support
On Tue, 2 Sep 2014 21:05:08 +0200 Reimar Döffinger reimar.doeffin...@gmx.de wrote: On Tue, Sep 02, 2014 at 08:56:09PM +0200, wm4 wrote: Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents to UTF-8 on the fly using FFTextReader, which acts as converting wrapper around AVIOContext. It also can work on a static buffer, needed for format probing. The FFTextReader wrapper now also takes care of skipping the UTF-8 BOM. Haven't reviewed it in detail, but shouldn't it also detect anything with a 0 byte in the first 2 characters as UTF-16? Interpreting it as any other text format is unlikely to work anyway, and I think most subtitle formats will start with an ASCII character, giving near 100% reliability without any BOM. Interesting idea, but on the other hand I haven't seen any UTF-16 subtitles without BOM. (My guess is that they're all produced on Windows...) ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support
On Tue, Sep 02, 2014 at 08:56:09PM +0200, wm4 wrote: Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents to UTF-8 on the fly using FFTextReader, which acts as converting wrapper around AVIOContext. It also can work on a static buffer, needed for format probing. The FFTextReader wrapper now also takes care of skipping the UTF-8 BOM. --- libavformat/assdec.c| 19 +-- libavformat/subtitles.c | 63 + libavformat/subtitles.h | 51 +++ 3 files changed, 126 insertions(+), 7 deletions(-) [...] +void ff_text_init_avio(FFTextReader *r, AVIOContext *pb) +{ +int i; +r->pb = pb; +r->buf_pos = r->buf_len = 0; +r->type = 0; +for (i = 0; i < 2; i++) +r->buf[r->buf_len++] = avio_r8(r->pb); +if (strncmp("\xFF\xFE", r->buf, 2) == 0) { +r->type = 1; // UTF16LE Would you mind using an enum for type? You won't need these comments anymore, and the rest of the code will be easier to read. [...] Apart from that, patch looks really good to me. I'll review the rest of the patchset in a moment. -- Clément B. pgpMsxPlCkd3f.pgp Description: PGP signature ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel
Re: [FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support
On Tue, 2 Sep 2014 23:18:02 +0200 Clément Bœsch u...@pkh.me wrote: On Tue, Sep 02, 2014 at 08:56:09PM +0200, wm4 wrote: Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents to UTF-8 on the fly using FFTextReader, which acts as converting wrapper around AVIOContext. It also can work on a static buffer, needed for format probing. The FFTextReader wrapper now also takes care of skipping the UTF-8 BOM. --- libavformat/assdec.c| 19 +-- libavformat/subtitles.c | 63 + libavformat/subtitles.h | 51 +++ 3 files changed, 126 insertions(+), 7 deletions(-) [...] +void ff_text_init_avio(FFTextReader *r, AVIOContext *pb) +{ +int i; +r->pb = pb; +r->buf_pos = r->buf_len = 0; +r->type = 0; +for (i = 0; i < 2; i++) +r->buf[r->buf_len++] = avio_r8(r->pb); +if (strncmp("\xFF\xFE", r->buf, 2) == 0) { +r->type = 1; // UTF16LE Would you mind using an enum for type? You won't need these comments anymore, and the rest of the code will be easier to read. Sure. I didn't do it because all magic numbers are used in one place, but ok. [...] Apart from that, patch looks really good to me. I'll review the rest of the patchset in a moment. ___ ffmpeg-devel mailing list ffmpeg-devel@ffmpeg.org http://ffmpeg.org/mailman/listinfo/ffmpeg-devel