Re: [Libva] [PATCH 2/4] Set the pipeline to use the new VP8 encoding shaders on BSW

Mark Thompson Tue, 10 Jan 2017 16:21:39 -0800

On 10/01/17 22:02, Sean V Kelley wrote:
> From: "Xiang, Haihao" <haihao.xi...@intel.com>
> 
> Currently only one temporal layer is supported
> 
> Signed-off-by: Xiang, Haihao <haihao.xi...@intel.com>
> Reviewed-by: Sean V Kelley <sea...@posteo.de>
> ---
>  src/Makefile.am        |    3 +
>  src/gen8_encoder_vp8.c |  140 +
>  src/gen8_mfc.c         |    8 +-
>  src/gen8_vme.c         |    5 +
>  src/i965_defines.h     |   10 +
>  src/i965_encoder.c     |    2 +
>  src/i965_encoder_vp8.c | 6697 
> ++++++++++++++++++++++++++++++++++++++++++++++++
>  src/i965_encoder_vp8.h | 2643 +++++++++++++++++++
>  8 files changed, 9507 insertions(+), 1 deletion(-)


I had a go with this on Kaby Lake.  In general, big win - looks like it can be 
under half the bitrate at comparable quality (though it was pretty terrible 
before...).

However, the rate control seems to do odd things when the bitrate is low 
relative to the frame size.  I can get GPU hangs and wildly varying output 
bitrate with it, though it seems ok at high bitrate.

I had a look around the rate control and found two minor issues in the RC 
configuration, though I don't think either of them is relevant to my problem 
(see below).  I can try to make a reproducer if this is not already a known 
issue.

Thanks,

- Mark


> ...
> +
> +static void
> +i965_encoder_vp8_get_misc_parameters(VADriverContextP ctx,
> +                                     struct encode_state *encode_state,
> +                                     struct intel_encoder_context 
> *encoder_context)
> +{
> +    struct i965_encoder_vp8_context *vp8_context = 
> encoder_context->vme_context;
> +
> +    if (vp8_context->internal_rate_mode == I965_BRC_CQP) {
> +        vp8_context->init_vbv_buffer_fullness_in_bit = 0;
> +        vp8_context->vbv_buffer_size_in_bit = 0;
> +        vp8_context->target_bit_rate = 0;
> +        vp8_context->max_bit_rate = 0;
> +        vp8_context->min_bit_rate = 0;
> +        vp8_context->brc_need_reset = 0;
> +    } else {
> +        vp8_context->gop_size = encoder_context->brc.gop_size;
> +
> +        if (encoder_context->brc.need_reset) {
> +            vp8_context->framerate = encoder_context->brc.framerate[0];
> +            vp8_context->vbv_buffer_size_in_bit = 
> encoder_context->brc.hrd_buffer_size;
> +            vp8_context->init_vbv_buffer_fullness_in_bit = 
> encoder_context->brc.hrd_initial_buffer_fullness;
> +            vp8_context->max_bit_rate = 
> encoder_context->brc.bits_per_second[0]; // currently only one layer is 
> supported
> +            vp8_context->brc_need_reset = (vp8_context->brc_initted && 
> encoder_context->brc.need_reset);
> +
> +            if (vp8_context->internal_rate_mode == I965_BRC_CBR) {
> +                vp8_context->min_bit_rate = vp8_context->max_bit_rate;
> +                vp8_context->target_bit_rate = vp8_context->max_bit_rate;
> +            } else {
> +                assert(vp8_context->internal_rate_mode == I965_BRC_VBR);
> +                vp8_context->min_bit_rate = vp8_context->max_bit_rate * (2 * 
> encoder_context->brc.target_percentage[0] - 100) / 100;

If target percentage is < 50 then (2 * 
encoder_context->brc.target_percentage[0] - 100) is negative.  Since it's 
unsigned, the subtraction wraps around and you end up with a garbage number 
in min_bit_rate.

> +                vp8_context->target_bit_rate = vp8_context->max_bit_rate * 
> encoder_context->brc.target_percentage[0] / 100;
> +            }
> +        }
> +    }
> +
> +    if (encoder_context->quality_level == ENCODER_LOW_QUALITY)
> +        vp8_context->hme_16x_supported = 0;
> +}
> +
> ...
> +
> +static void
> +i965_encoder_vp8_vme_brc_init_reset_set_curbe(VADriverContextP ctx,
> +                                              struct encode_state 
> *encode_state,
> +                                              struct intel_encoder_context 
> *encoder_context,
> +                                              struct i965_gpe_context 
> *gpe_context)
> +{
> +    struct i965_encoder_vp8_context *vp8_context = 
> encoder_context->vme_context;
> +    VAEncPictureParameterBufferVP8 *pic_param = 
> (VAEncPictureParameterBufferVP8 *)encode_state->pic_param_ext->buffer;
> +    struct vp8_brc_init_reset_curbe_data *pcmd = 
> i965_gpe_context_map_curbe(gpe_context);
> +    double input_bits_per_frame, bps_ratio;
> +
> +    memset(pcmd, 0, sizeof(*pcmd));
> +
> +    pcmd->dw0.profile_level_max_frame = vp8_context->frame_width * 
> vp8_context->frame_height;
> +    pcmd->dw1.init_buf_full_in_bits = 
> vp8_context->init_vbv_buffer_fullness_in_bit;
> +    pcmd->dw2.buf_size_in_bits = vp8_context->vbv_buffer_size_in_bit;
> +    pcmd->dw3.average_bitrate = ALIGN(vp8_context->target_bit_rate, 
> VP8_BRC_KBPS) / VP8_BRC_KBPS * VP8_BRC_KBPS;
> +    pcmd->dw4.max_bitrate = ALIGN(vp8_context->max_bit_rate, VP8_BRC_KBPS) / 
> VP8_BRC_KBPS * VP8_BRC_KBPS;

VP8_BRC_KBPS is 1000, which is not a power of two, so the ALIGN macro (which 
only works for power-of-two alignments) isn't doing anything sensible here.

> +    pcmd->dw6.frame_rate_m = vp8_context->framerate.num;
> +    pcmd->dw7.frame_rate_d = vp8_context->framerate.den;
> +    pcmd->dw8.brc_flag = 0;
> +    pcmd->dw8.gop_minus1 = vp8_context->gop_size - 1;
> +
> +    if (vp8_context->internal_rate_mode == I965_BRC_CBR) {
> +        pcmd->dw4.max_bitrate = pcmd->dw3.average_bitrate;
> +
> +        pcmd->dw8.brc_flag = pcmd->dw8.brc_flag | BRC_KERNEL_CBR;
> +    } else if (vp8_context->internal_rate_mode == I965_BRC_VBR) {
> +        if (pcmd->dw4.max_bitrate < pcmd->dw3.average_bitrate) {
> +            pcmd->dw4.max_bitrate = 2 * pcmd->dw3.average_bitrate;
> +        }
> +
> +        pcmd->dw8.brc_flag = pcmd->dw8.brc_flag | BRC_KERNEL_VBR;
> +    }
> +
> +    input_bits_per_frame =
> +        ((double)(pcmd->dw4.max_bitrate) * (double)(pcmd->dw7.frame_rate_d) /
> +         (double)(pcmd->dw6.frame_rate_m));
> +
> +    if (pcmd->dw2.buf_size_in_bits < (unsigned int)input_bits_per_frame * 4) 
> {
> +        pcmd->dw2.buf_size_in_bits = (unsigned int)input_bits_per_frame * 4;
> +    }
> +
> +    if (pcmd->dw1.init_buf_full_in_bits == 0) {
> +        pcmd->dw1.init_buf_full_in_bits = 7 * pcmd->dw2.buf_size_in_bits / 8;
> +    }
> +
> +    if (pcmd->dw1.init_buf_full_in_bits < (unsigned 
> int)(input_bits_per_frame * 2)) {
> +        pcmd->dw1.init_buf_full_in_bits = (unsigned 
> int)(input_bits_per_frame * 2);
> +    }
> +
> +    if (pcmd->dw1.init_buf_full_in_bits > pcmd->dw2.buf_size_in_bits) {
> +        pcmd->dw1.init_buf_full_in_bits = pcmd->dw2.buf_size_in_bits;
> +    }
> +
> +    bps_ratio = input_bits_per_frame / ((double)(pcmd->dw2.buf_size_in_bits) 
> / 30);
> +    bps_ratio = (bps_ratio < 0.1) ? 0.1 : (bps_ratio > 3.5) ? 3.5 : 
> bps_ratio;
> +
> +    pcmd->dw9.frame_width_in_bytes = vp8_context->frame_width;
> +    pcmd->dw10.frame_height_in_bytes = vp8_context->frame_height;
> +    pcmd->dw10.avbr_accuracy = 30;
> +    pcmd->dw11.avbr_convergence = 150;
> +    pcmd->dw11.min_qp = pic_param->clamp_qindex_low;
> +    pcmd->dw12.max_qp = pic_param->clamp_qindex_high;
> +    pcmd->dw12.level_qp = 60;
> +
> +    // DW13 default 100
> +    pcmd->dw13.max_section_pct = 100;
> +    pcmd->dw13.under_shoot_cbr_pct = 115;
> +
> +    // DW14 default 100
> +    pcmd->dw14.min_section_pct = 100;
> +    pcmd->dw14.vbr_bias_pct = 100;
> +    pcmd->dw15.instant_rate_threshold_0_for_p = 30;
> +    pcmd->dw15.instant_rate_threshold_1_for_p = 50;
> +    pcmd->dw15.instant_rate_threshold_2_for_p = 70;
> +    pcmd->dw15.instant_rate_threshold_3_for_p = 120;
> +
> +    pcmd->dw17.instant_rate_threshold_0_for_i = 30;
> +    pcmd->dw17.instant_rate_threshold_1_for_i = 50;
> +    pcmd->dw17.instant_rate_threshold_2_for_i = 90;
> +    pcmd->dw17.instant_rate_threshold_3_for_i = 115;
> +    pcmd->dw18.deviation_threshold_0_for_p = (unsigned int)(-50 * pow(0.9, 
> bps_ratio));
> +    pcmd->dw18.deviation_threshold_1_for_p = (unsigned int)(-50 * pow(0.66, 
> bps_ratio));
> +    pcmd->dw18.deviation_threshold_2_for_p = (unsigned int)(-50 * pow(0.46, 
> bps_ratio));
> +    pcmd->dw18.deviation_threshold_3_for_p = (unsigned int)(-50 * pow(0.3, 
> bps_ratio));
> +    pcmd->dw19.deviation_threshold_4_for_p = (unsigned int)(50 * pow(0.3, 
> bps_ratio));
> +    pcmd->dw19.deviation_threshold_5_for_p = (unsigned int)(50 * pow(0.46, 
> bps_ratio));
> +    pcmd->dw19.deviation_threshold_6_for_p = (unsigned int)(50 * pow(0.7, 
> bps_ratio));
> +    pcmd->dw19.deviation_threshold_7_for_p = (unsigned int)(50 * pow(0.9, 
> bps_ratio));
> +    pcmd->dw20.deviation_threshold_0_for_vbr = (unsigned int)(-50 * pow(0.9, 
> bps_ratio));
> +    pcmd->dw20.deviation_threshold_1_for_vbr = (unsigned int)(-50 * pow(0.7, 
> bps_ratio));
> +    pcmd->dw20.deviation_threshold_2_for_vbr = (unsigned int)(-50 * pow(0.5, 
> bps_ratio));
> +    pcmd->dw20.deviation_threshold_3_for_vbr = (unsigned int)(-50 * pow(0.3, 
> bps_ratio));
> +    pcmd->dw21.deviation_threshold_4_for_vbr = (unsigned int)(100 * pow(0.4, 
> bps_ratio));
> +    pcmd->dw21.deviation_threshold_5_for_vbr = (unsigned int)(100 * pow(0.5, 
> bps_ratio));
> +    pcmd->dw21.deviation_threshold_6_for_vbr = (unsigned int)(100 * 
> pow(0.75, bps_ratio));
> +    pcmd->dw21.deviation_threshold_7_for_vbr = (unsigned int)(100 * pow(0.9, 
> bps_ratio));
> +    pcmd->dw22.deviation_threshold_0_for_i = (unsigned int)(-50 * pow(0.8, 
> bps_ratio));
> +    pcmd->dw22.deviation_threshold_1_for_i = (unsigned int)(-50 * pow(0.6, 
> bps_ratio));
> +    pcmd->dw22.deviation_threshold_2_for_i = (unsigned int)(-50 * pow(0.34, 
> bps_ratio));
> +    pcmd->dw22.deviation_threshold_3_for_i = (unsigned int)(-50 * pow(0.2, 
> bps_ratio));
> +    pcmd->dw23.deviation_threshold_4_for_i = (unsigned int)(50 * pow(0.2, 
> bps_ratio));
> +    pcmd->dw23.deviation_threshold_5_for_i = (unsigned int)(50 * pow(0.4, 
> bps_ratio));
> +    pcmd->dw23.deviation_threshold_6_for_i = (unsigned int)(50 * pow(0.66, 
> bps_ratio));
> +    pcmd->dw23.deviation_threshold_7_for_i = (unsigned int)(50 * pow(0.9, 
> bps_ratio));
> +
> +    // Default: 1
> +    pcmd->dw24.num_t_levels = 1;
> +
> +    if (!vp8_context->brc_initted) {
> +        vp8_context->brc_init_current_target_buf_full_in_bits = 
> pcmd->dw1.init_buf_full_in_bits;
> +    }
> +
> +    vp8_context->brc_init_reset_buf_size_in_bits = 
> pcmd->dw2.buf_size_in_bits;
> +    vp8_context->brc_init_reset_input_bits_per_frame = input_bits_per_frame;
> +
> +    pcmd->dw26.history_buffer_bti = VP8_BTI_BRC_INIT_RESET_HISTORY;
> +    pcmd->dw27.distortion_buffer_bti = VP8_BTI_BRC_INIT_RESET_DISTORTION;
> +
> +    i965_gpe_context_unmap_curbe(gpe_context);
> +}
> +
> ...
_______________________________________________
Libva mailing list
Libva@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/libva

Re: [Libva] [PATCH 2/4] Set the pipeline to use the new VP8 encoding shaders on BSW

Reply via email to