[FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support

2014-09-02 Thread wm4
Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents
to UTF-8 on the fly using FFTextReader, which acts as converting wrapper
around AVIOContext. It also can work on a static buffer, needed for
format probing. The FFTextReader wrapper now also takes care of skipping
the UTF-8 BOM.
---
 libavformat/assdec.c| 19 +--
 libavformat/subtitles.c | 63 +
 libavformat/subtitles.h | 51 +++
 3 files changed, 126 insertions(+), 7 deletions(-)

diff --git a/libavformat/assdec.c b/libavformat/assdec.c
index bb953c7..a5f792a 100644
--- a/libavformat/assdec.c
+++ b/libavformat/assdec.c
@@ -33,10 +33,13 @@ typedef struct ASSContext {
 
 static int ass_probe(AVProbeData *p)
 {
-    const char *header = "[Script Info]";
+    char buf[13];
+    FFTextReader tr;
+    ff_text_init_buf(&tr, p->buf, p->buf_size);
 
-    if (!memcmp(p->buf, header, strlen(header)) ||
-        !memcmp(p->buf + 3, header, strlen(header)))
+    ff_text_read(&tr, buf, sizeof(buf));
+
+    if (!memcmp(buf, "[Script Info]", 13))
         return AVPROBE_SCORE_MAX;
 
     return 0;
@@ -66,13 +69,13 @@ static int read_ts(const uint8_t *p, int64_t *start, int *duration)
     return -1;
 }
 
-static int64_t get_line(AVBPrint *buf, AVIOContext *pb)
+static int64_t get_line(AVBPrint *buf, FFTextReader *tr)
 {
-    int64_t pos = avio_tell(pb);
+    int64_t pos = ff_text_pos(tr);
 
     av_bprint_clear(buf);
     for (;;) {
-        char c = avio_r8(pb);
+        char c = ff_text_r8(tr);
         if (!c)
             break;
         av_bprint_chars(buf, c, 1);
@@ -88,6 +91,8 @@ static int ass_read_header(AVFormatContext *s)
     AVBPrint header, line;
     int header_remaining, res = 0;
     AVStream *st;
+    FFTextReader tr;
+    ff_text_init_avio(&tr, s->pb);
 
     st = avformat_new_stream(s, NULL);
     if (!st)
@@ -102,7 +107,7 @@ static int ass_read_header(AVFormatContext *s)
     av_bprint_init(&line,   0, AV_BPRINT_SIZE_UNLIMITED);
 
     for (;;) {
-        int64_t pos = get_line(&line, s->pb);
+        int64_t pos = get_line(&line, &tr);
 
         if (!line.str[0]) // EOF
             break;
diff --git a/libavformat/subtitles.c b/libavformat/subtitles.c
index fce2bf1..47a52ec 100644
--- a/libavformat/subtitles.c
+++ b/libavformat/subtitles.c
@@ -20,9 +20,72 @@
 
 #include "avformat.h"
 #include "subtitles.h"
+#include "avio_internal.h"
 #include "libavutil/avassert.h"
 #include "libavutil/avstring.h"
 
+void ff_text_init_avio(FFTextReader *r, AVIOContext *pb)
+{
+    int i;
+    r->pb = pb;
+    r->buf_pos = r->buf_len = 0;
+    r->type = 0;
+    for (i = 0; i < 2; i++)
+        r->buf[r->buf_len++] = avio_r8(r->pb);
+    if (strncmp("\xFF\xFE", r->buf, 2) == 0) {
+        r->type = 1; // UTF16LE
+        r->buf_pos += 2;
+    } else if (strncmp("\xFE\xFF", r->buf, 2) == 0) {
+        r->type = 2; // UTF16BE
+        r->buf_pos += 2;
+    } else {
+        r->buf[r->buf_len++] = avio_r8(r->pb);
+        if (strncmp("\xEF\xBB\xBF", r->buf, 3) == 0) {
+            // UTF8
+            r->buf_pos += 3;
+        }
+    }
+}
+
+void ff_text_init_buf(FFTextReader *r, void *buf, size_t size)
+{
+    memset(&r->buf_pb, 0, sizeof(r->buf_pb));
+    ffio_init_context(&r->buf_pb, buf, size, 0, NULL, NULL, NULL, NULL);
+    ff_text_init_avio(r, &r->buf_pb);
+}
+
+int64_t ff_text_pos(FFTextReader *r)
+{
+    return avio_tell(r->pb) - r->buf_len + r->buf_pos;
+}
+
+int ff_text_r8(FFTextReader *r)
+{
+    uint32_t val;
+    uint8_t tmp;
+    if (r->buf_pos < r->buf_len)
+        return r->buf[r->buf_pos++];
+    if (r->type == 1) {
+        GET_UTF16(val, avio_rl16(r->pb), return 0;)
+    } else if (r->type == 2) {
+        GET_UTF16(val, avio_rb16(r->pb), return 0;)
+    } else {
+        return avio_r8(r->pb);
+    }
+    if (!val)
+        return 0;
+    r->buf_pos = 0;
+    r->buf_len = 0;
+    PUT_UTF8(val, tmp, r->buf[r->buf_len++] = tmp;)
+    return r->buf[r->buf_pos++]; // buf_len is at least 1
+}
+
+void ff_text_read(FFTextReader *r, char *buf, size_t size)
+{
+    for ( ; size > 0; size--)
+        *buf++ = ff_text_r8(r);
+}
+
 AVPacket *ff_subtitles_queue_insert(FFDemuxSubtitlesQueue *q,
 const uint8_t *event, int len, int merge)
 {
diff --git a/libavformat/subtitles.h b/libavformat/subtitles.h
index b5a96ec..9549b32 100644
--- a/libavformat/subtitles.h
+++ b/libavformat/subtitles.h
@@ -31,6 +31,57 @@ enum sub_sort {
 };
 
 typedef struct {
+int type;
+AVIOContext *pb;
+unsigned char buf[8];
+int buf_pos, buf_len;
+AVIOContext buf_pb;
+} FFTextReader;
+
+/**
+ * Initialize the FFTextReader from the given AVIOContext. This function will
+ * read some bytes from pb, and test for UTF-8 or UTF-16 BOMs. Further accesses
+ * to FFTextReader will read more data from pb.
+ *
+ * The purpose of FFTextReader is to transparently convert read data to UTF-8
+ * if the stream had a UTF-16 BOM.
+ *
+ * @param r object which will be initialized
+ * @param pb stream to read 

Re: [FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support

2014-09-02 Thread Reimar Döffinger
On Tue, Sep 02, 2014 at 08:56:09PM +0200, wm4 wrote:
> Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents
> to UTF-8 on the fly using FFTextReader, which acts as converting wrapper
> around AVIOContext. It also can work on a static buffer, needed for
> format probing. The FFTextReader wrapper now also takes care of skipping
> the UTF-8 BOM.

Haven't reviewed it in detail, but shouldn't it also detect anything
with a 0 byte in the first 2 characters as UTF-16?
Interpreting it as any other text format is unlikely to work anyway,
and I think most subtitle formats will start with an ASCII character,
giving near 100% reliability without any BOM.
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support

2014-09-02 Thread wm4
On Tue, 2 Sep 2014 21:05:08 +0200
Reimar Döffinger <reimar.doeffin...@gmx.de> wrote:

> On Tue, Sep 02, 2014 at 08:56:09PM +0200, wm4 wrote:
> > Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents
> > to UTF-8 on the fly using FFTextReader, which acts as converting wrapper
> > around AVIOContext. It also can work on a static buffer, needed for
> > format probing. The FFTextReader wrapper now also takes care of skipping
> > the UTF-8 BOM.
>
> Haven't reviewed it in detail, but shouldn't it also detect anything
> with a 0 byte in the first 2 characters as UTF-16?
> Interpreting it as any other text format is unlikely to work anyway,
> and I think most subtitle formats will start with an ASCII character,
> giving near 100% reliability without any BOM.

Interesting idea, but on the other hand I haven't seen any UTF-16
subtitles without BOM. (My guess is that they're all produced on
Windows...)
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support

2014-09-02 Thread Clément Bœsch
On Tue, Sep 02, 2014 at 08:56:09PM +0200, wm4 wrote:
> Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents
> to UTF-8 on the fly using FFTextReader, which acts as converting wrapper
> around AVIOContext. It also can work on a static buffer, needed for
> format probing. The FFTextReader wrapper now also takes care of skipping
> the UTF-8 BOM.
> ---
>  libavformat/assdec.c| 19 +--
>  libavformat/subtitles.c | 63
> +
>  libavformat/subtitles.h | 51 +++
>  3 files changed, 126 insertions(+), 7 deletions(-)
>
[...]
> +void ff_text_init_avio(FFTextReader *r, AVIOContext *pb)
> +{
> +    int i;
> +    r->pb = pb;
> +    r->buf_pos = r->buf_len = 0;
> +    r->type = 0;
> +    for (i = 0; i < 2; i++)
> +        r->buf[r->buf_len++] = avio_r8(r->pb);
> +    if (strncmp("\xFF\xFE", r->buf, 2) == 0) {

> +        r->type = 1; // UTF16LE

Would you mind using an enum for type? You won't need these comments
anymore, and the rest of the code will be easier to read.

[...]

Apart from that, patch looks really good to me. I'll review the rest of
the patchset in a moment.

-- 
Clément B.


pgpMsxPlCkd3f.pgp
Description: PGP signature
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel


Re: [FFmpeg-devel] [PATCH 1/4] avformat/assdec: UTF-16 support

2014-09-02 Thread wm4
On Tue, 2 Sep 2014 23:18:02 +0200
Clément Bœsch <u...@pkh.me> wrote:

> On Tue, Sep 02, 2014 at 08:56:09PM +0200, wm4 wrote:
> > Use the UTF-16 BOM to detect UTF-16 encoding. Convert the file contents
> > to UTF-8 on the fly using FFTextReader, which acts as converting wrapper
> > around AVIOContext. It also can work on a static buffer, needed for
> > format probing. The FFTextReader wrapper now also takes care of skipping
> > the UTF-8 BOM.
> > ---
> >  libavformat/assdec.c| 19 +--
> >  libavformat/subtitles.c | 63
> > +
> >  libavformat/subtitles.h | 51 +++
> >  3 files changed, 126 insertions(+), 7 deletions(-)
> >
 [...]
> > +void ff_text_init_avio(FFTextReader *r, AVIOContext *pb)
> > +{
> > +    int i;
> > +    r->pb = pb;
> > +    r->buf_pos = r->buf_len = 0;
> > +    r->type = 0;
> > +    for (i = 0; i < 2; i++)
> > +        r->buf[r->buf_len++] = avio_r8(r->pb);
> > +    if (strncmp("\xFF\xFE", r->buf, 2) == 0) {
>
> > +        r->type = 1; // UTF16LE
>
> Would you mind using an enum for type? You won't need these comments
> anymore, and the rest of the code will be easier to read.

Sure. I didn't do it because all magic numbers are used in one place,
but ok.

> [...]
>
> Apart from that, patch looks really good to me. I'll review the rest of
> the patchset in a moment.
>

___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
http://ffmpeg.org/mailman/listinfo/ffmpeg-devel