From ce7709c6145920a9eeea8618810894d3a7fe99ba Mon Sep 17 00:00:00 2001
From: Martin Vignali <martin.vignali@gmail.com>
Date: Tue, 26 Feb 2019 16:33:39 +0100
Subject: [PATCH] avcodec/proresenc_aw: replace PutBitContext by uint64_t
 bit_buf

Also remove the target buffer length check and
use a uint64_t as the intermediate bit_buf.

Speed improvement on x86_64:
test file 1 : 140 -> 154 fps
test file 2 : 55 -> 62 fps
---
 libavcodec/proresenc_anatoliy.c | 133 ++++++++++++++++++++++++++------
 1 file changed, 108 insertions(+), 25 deletions(-)

diff --git a/libavcodec/proresenc_anatoliy.c b/libavcodec/proresenc_anatoliy.c
index e287d176ad..8d614d7c1f 100644
--- a/libavcodec/proresenc_anatoliy.c
+++ b/libavcodec/proresenc_anatoliy.c
@@ -195,9 +195,48 @@ typedef struct {
     char *vendor;
 } ProresContext;
 
-static void encode_codeword(PutBitContext *pb, int val, int codebook)
+static inline void write_codeword_bits(uint64_t *bit_buf, int *bits_left, uint8_t *buf, int *buf_offset,
+                                       uint64_t val_to_write, int nb_bits_to_write)
+{ /* Append nb_bits_to_write bits of val_to_write to the accumulator *bit_buf; each time it fills up, store it at buf + *buf_offset */
+    int tmp_bits_left = *bits_left; /* free bits in *bit_buf on entry */
+    int remaining_bits; /* bits of the value that spill over into the next word */
+    uint64_t first_part; /* leading bits of the value that complete the current word */
+
+    if ((tmp_bits_left - nb_bits_to_write) >= 0){ /* the whole value fits in the free bits of bit_buf */
+        *bit_buf <<= nb_bits_to_write; /* NOTE(review): a shift by 64 would be undefined; presumably nb_bits_to_write < 64 - confirm */
+        *bit_buf |= val_to_write; /* assumes no bit is set above nb_bits_to_write - TODO confirm in callers */
+
+        if ((tmp_bits_left - nb_bits_to_write) == 0){/* bit_buf is now exactly full: store it and start an empty word */
+            AV_WB64(buf + *buf_offset, *bit_buf); /* NOTE(review): no bound check on buf is visible here */
+            *buf_offset += 8;
+            *bit_buf = 0;
+            *bits_left = 64;
+        } else {
+            *bits_left -= nb_bits_to_write;
+        }
+    } else {/* the value straddles two 64-bit words */
+        /* complete bit_buf with the leading bits of the value and store it */
+        remaining_bits = nb_bits_to_write - tmp_bits_left;
+        *bit_buf <<= tmp_bits_left;
+        first_part = val_to_write >> remaining_bits;/* drop the trailing bits that do not fit */
+        *bit_buf |= first_part;
+        AV_WB64(buf + *buf_offset, *bit_buf);
+        *buf_offset += 8;
+
+        /* start the next word with the trailing remaining_bits bits of the value */
+        tmp_bits_left = 64 - remaining_bits; /* free bits in the new word */
+        val_to_write &= (UINT64_MAX >> tmp_bits_left); /* keep only the low remaining_bits bits */
+        *bits_left = tmp_bits_left;
+        *bit_buf = val_to_write;
+    }
+}
+
+static inline void encode_codeword_template(uint64_t *bit_buf, int *bits_left, uint8_t *buf, int *buf_offset,
+                                            int val, int codebook, int is_level_codeword, int is_negative)
 {
     unsigned int rice_order, exp_order, switch_bits, first_exp, exp, zeros;
+    uint64_t val_to_write = 0; /* the whole codeword, right-aligned, emitted by one write_codeword_bits() call */
+    int nb_bits_to_write = 0; /* total codeword length in bits, leading zero bits included */
 
     /* number of bits to switch between rice and exp golomb */
     switch_bits = codebook & 3;
@@ -211,16 +250,51 @@ static void encode_codeword(PutBitContext *pb, int val, int codebook)
         val += (1 << exp_order);
         exp = av_log2(val);
         zeros = exp - exp_order + switch_bits + 1;
-        put_bits(pb, zeros, 0);
-        put_bits(pb, exp + 1, val);
+        val_to_write = val;
+        nb_bits_to_write = zeros + exp + 1;
+        /* codeword layout (MSB first):
+         'zeros' zero bits, implied by the codeword length
+         val in its exp + 1 significant bits
+         */
     } else if (rice_order) {
-        put_bits(pb, (val >> rice_order), 0);
-        put_bits(pb, 1, 1);
-        put_sbits(pb, rice_order, val);
+        val_to_write = av_mod_uintp2(val, rice_order);
+        val_to_write |= (1 << rice_order);
+        nb_bits_to_write = rice_order + 1 + (val >> rice_order);
+        /* codeword layout (MSB first):
+         (val >> rice_order) zero bits, implied by the codeword length
+         a single 1 bit
+         the low rice_order bits of val, i.e. av_mod_uintp2(val, rice_order)
+         */
     } else {
-        put_bits(pb, val, 0);
-        put_bits(pb, 1, 1);
+        val_to_write = 1;
+        nb_bits_to_write = val + 1;
+        /* codeword layout (MSB first):
+         val zero bits, implied by the codeword length
+         a single 1 bit
+         */
+    }
+
+    if (is_level_codeword) { /* level codewords carry one extra sign bit after the magnitude */
+        val_to_write <<= 1 ;/* make room for the sign bit, 0 by default */
+        if (is_negative) {
+            val_to_write |= 1;/* the sign is the last (least significant) bit: 1 when negative */
+        }
+        nb_bits_to_write += 1;
     }
+
+    write_codeword_bits(bit_buf, bits_left, buf, buf_offset, val_to_write, nb_bits_to_write);
+}
+
+/* Encode a DC value or an AC run length: plain codeword, no sign bit appended */
+static void encode_codeword(uint64_t *bit_buf, int *bits_left, uint8_t *buf, int *buf_offset,
+                                     int val, int codebook){
+    encode_codeword_template(bit_buf, bits_left, buf, buf_offset, val, codebook, 0, 0);
+}
+
+/* Encode an AC level: codeword followed by one sign bit (1 when is_negative is set) */
+static void encode_level_codeword(uint64_t *bit_buf, int *bits_left, uint8_t *buf, int *buf_offset,
+                                     int val, int codebook, int is_negative){
+    encode_codeword_template(bit_buf, bits_left, buf, buf_offset, val, codebook, 1, is_negative);
 }
 
 #define QSCALE(qmat,ind,val) ((val) / ((qmat)[ind]))
@@ -239,8 +313,8 @@ static av_always_inline int get_level(int val)
 
 static const uint8_t dc_codebook[7] = { 0x04, 0x28, 0x28, 0x4D, 0x4D, 0x70, 0x70};
 
-static void encode_dc_coeffs(PutBitContext *pb, int16_t *in,
-        int blocks_per_slice, int *qmat)
+static void encode_dc_coeffs(uint64_t *bit_buf, int *bits_left, uint8_t *buf, int *buf_offset,
+                             int16_t *in, int blocks_per_slice, int *qmat)
 {
     int prev_dc, code;
     int i, sign, idx;
@@ -248,7 +322,7 @@ static void encode_dc_coeffs(PutBitContext *pb, int16_t *in,
 
     prev_dc = QSCALE(qmat, 0, in[0] - 16384);
     code = TO_GOLOMB(prev_dc);
-    encode_codeword(pb, code, FIRST_DC_CB);
+    encode_codeword(bit_buf, bits_left, buf, buf_offset, code, FIRST_DC_CB);
 
     code = 5; sign = 0; idx = 64;
     for (i = 1; i < blocks_per_slice; i++, idx += 64) {
@@ -257,7 +331,7 @@ static void encode_dc_coeffs(PutBitContext *pb, int16_t *in,
         diff_sign = DIFF_SIGN(delta, sign);
         new_code  = TO_GOLOMB2(get_level(delta), diff_sign);
 
-        encode_codeword(pb, new_code, dc_codebook[FFMIN(code, 6)]);
+        encode_codeword(bit_buf, bits_left, buf, buf_offset, new_code, dc_codebook[FFMIN(code, 6)]);
 
         code      = new_code;
         sign      = delta >> 31;
@@ -270,8 +344,8 @@ static const uint8_t run_to_cb[16] = { 0x06, 0x06, 0x05, 0x05, 0x04, 0x29,
 static const uint8_t lev_to_cb[10] = { 0x04, 0x0A, 0x05, 0x06, 0x04, 0x28,
         0x28, 0x28, 0x28, 0x4C };
 
-static void encode_ac_coeffs(PutBitContext *pb,
-        int16_t *in, int blocks_per_slice, int *qmat, const uint8_t ff_prores_scan[64])
+static void encode_ac_coeffs(uint64_t *bit_buf, int *bits_left, uint8_t *buf, int *buf_offset,
+                             int16_t *in, int blocks_per_slice, int *qmat, const uint8_t ff_prores_scan[64])
 {
     int prev_run = 4;
     int prev_level = 2;
@@ -282,18 +356,16 @@ static void encode_ac_coeffs(PutBitContext *pb,
         for (j = 0; j < blocks_per_slice; j++) {
             int val = QSCALE(qmat, indp, in[(j << 6) + indp]);
             if (val) {
-                encode_codeword(pb, run, run_to_cb[FFMIN(prev_run, 15)]);
+                encode_codeword(bit_buf, bits_left, buf, buf_offset, run, run_to_cb[FFMIN(prev_run, 15)]);
 
                 prev_run   = run;
                 run        = 0;
                 level      = get_level(val);
                 code       = level - 1;
 
-                encode_codeword(pb, code, lev_to_cb[FFMIN(prev_level, 9)]);
+                encode_level_codeword(bit_buf, bits_left, buf, buf_offset, code, lev_to_cb[FFMIN(prev_level, 9)], IS_NEGATIVE(val));
 
                 prev_level = level;
-
-                put_bits(pb, 1, IS_NEGATIVE(val));
             } else {
                 ++run;
             }
@@ -360,16 +432,27 @@ static int encode_slice_plane(int16_t *blocks, int mb_count, uint8_t *buf, unsig
                               const uint8_t ff_prores_scan[64])
 {
     int blocks_per_slice;
-    PutBitContext pb;
+    uint64_t bit_buf = 0; /* pending bits, right-aligned */
+    int bits_left = 64; /* free bits in bit_buf; 64 means nothing is pending */
+    int buf_offset = 0; /* number of bytes stored in buf so far */
 
     blocks_per_slice = mb_count << (2 - sub_sample_chroma);
-    init_put_bits(&pb, buf, buf_size);
-
-    encode_dc_coeffs(&pb, blocks, blocks_per_slice, qmat);
-    encode_ac_coeffs(&pb, blocks, blocks_per_slice, qmat, ff_prores_scan);
 
-    flush_put_bits(&pb);
-    return put_bits_ptr(&pb) - pb.buf;
+    encode_dc_coeffs(&bit_buf, &bits_left, buf, &buf_offset,
+                     blocks, blocks_per_slice, qmat);
+    encode_ac_coeffs(&bit_buf, &bits_left, buf, &buf_offset,
+                     blocks, blocks_per_slice, qmat, ff_prores_scan);
+
+    /* flush bit_buf: store only the bytes that still hold pending bits, never a full 8-byte word */
+    if (bits_left < 64) {
+        bit_buf <<= bits_left; /* left-align the pending bits; the tail is zero padding */
+        while (bits_left < 64) {
+            buf[buf_offset++] = bit_buf >> 56; /* most significant byte first */
+            bit_buf <<= 8;
+            bits_left += 8;
+        }
+    }
+    return buf_offset; /* NOTE(review): buf_size is no longer consulted anywhere in this function */
 }
 
 static av_always_inline unsigned encode_slice_data(AVCodecContext *avctx,
-- 
2.17.2 (Apple Git-113)