Re: [FFmpeg-devel] [PATCH v6 14/14] vvcdec: add full vvc decoder

2023-12-10 Thread Nuo Mi
On Sat, Dec 9, 2023 at 1:13 PM Andreas Rheinhardt <
andreas.rheinha...@outlook.com> wrote:

> Nuo Mi:
> > Hi Andreas,
> > thank you for the review.
> > On Fri, Dec 8, 2023 at 8:17 PM Andreas Rheinhardt <
> > andreas.rheinha...@outlook.com> wrote:
> >
> >>
> >>> +
> >>> +static int min_pu_arrays_init(VVCFrameContext *fc, const int
> >> pic_size_in_min_pu)
> >>> +{
> >>> +if (fc->tab.pic_size_in_min_pu != pic_size_in_min_pu) {
> >>> +min_pu_arrays_free(fc);
> >>> +fc->tab.msf  = av_mallocz(pic_size_in_min_pu);
> >>> +fc->tab.iaf  = av_mallocz(pic_size_in_min_pu);
> >>> +fc->tab.mmi  = av_mallocz(pic_size_in_min_pu);
> >>> +fc->tab.mvf  = av_mallocz(pic_size_in_min_pu *
> >> sizeof(*fc->tab.mvf));
> >>
> >> Do these have to be separate allocations? If there were allocated
> >> jointly, one memset below would suffice.
> >>
> > They are separate flags; if we combine them, we can't use memset to set
> > flags for a block.
> >
>
> I disagree: You would still be able to use different pointers for
> different parts of the large allocated block, it is just that you also
> save some unnecessary allocations (and frees and errors checks for the
> allocations) and also gain the ability to memset them via one memset
> call in case one wants to set them to the same value.
>
Good idea. done

>
> >>
> >>> +
> >>> +static int init_slice_context(SliceContext *sc, VVCFrameContext *fc,
> >> const H2645NAL *nal, const CodedBitstreamUnit *unit)
> >>> +{
> >>> +const VVCSH *sh = &sc->sh;
> >>> +const H266RawSlice *slice   = (const H266RawSlice *)unit->content;
> >>
> >> Please no pointless casts. Also, why is there unnecessary whitespace in
> >> front of '='?
> >>
> > Fixed here and in several other places.
> > The whitespace aligns all the '=' signs in one column.
> >
>
> But there is nothing that needs that much whitespace.
>
> >>> +
> >>> +static av_cold int frame_context_init(VVCFrameContext *fc,
> >> AVCodecContext *avctx)
> >>> +{
> >>> +
> >>> +fc->avctx = av_memdup(avctx, sizeof(*avctx));
> >>
> >> When I read this, I presumed you are using multiple AVCodecContexts to
> >> store the ever changing state of the AVCodecContext fields similarly to
> >> update_context_from_thread() in pthread_frame.c. But it seems you don't.
> >> These contexts are only used as a) logcontexts (where the actual
> >> user-facing AVCodecContext should be used, so that the user can make
> >> sense of the logmessages!), b) in ff_thread_get_buffer() and c) in
> >> export_frame_params() where only some basic fields
> >> (dimension-related+pix_fmt) is set. Presumably c) is done for b).
> >>
> > I remember if i did not use a local AVCodecContext  it would trigger some
> > assert when resolution changed.
> >
>
> Can you be more specific about what assert has been triggered? And have
> you set the AVFrame fields directly before ff_thread_get_buffer()?
>
Hmm, this no longer happens.
Let us remove the memdup.

>
> >>
> >> But the user is allowed to change the provided callbacks in the master
> >> context at any time. E.g. the call to ff_thread_get_buffer() in
> >> vvc_refs.c currently uses the VVCFrameContext and therefore uses the
> >> get_buffer2 callback in place now (during av_memdup()). This is wrong.
> >>
> > This will not happen. av_memdup only happens in vvc_decode_init.
> > Nobody will call ff_thread_get_buffer at this time
> >
>
> You missed the point: If the user changes the get_buffer2 callback after
> init, the new callback will not be used at all.
>
fixed.

>
> >>
> >> I think you can just remove VVCFrameContext.avctx and use the
> >> user-facing AVCodecContext if you set the AVFrame properties that are
> >> normally derived from the AVCodecContext directly on the AVFrame before
> >> ff_thread_get_buffer().
> >
> > Could you explain more about how to create a user-facing  AVCodecContext?
> >
>
> You do not create a user-facing AVCodecContext, the user does (and calls
> avcodec_send_packet()/avcodec_receive_frame() with it).


> >>
> >>> +
> >>> +static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const
> >> H2645NAL *nal, const CodedBitstreamUnit *unit)
> >>> +{
> >>> +int  ret;
> >>> +
> >>> +s->temporal_id   = nal->temporal_id;
> >>> +
> >>> +switch (unit->type) {
> >>> +case VVC_VPS_NUT:
> >>> +case VVC_SPS_NUT:
> >>> +case VVC_PPS_NUT:
> >>> +/* vps, sps, pps cached by s->cbc */
> >>> +break;
> >>> +case VVC_TRAIL_NUT:
> >>> +case VVC_STSA_NUT:
> >>> +case VVC_RADL_NUT:
> >>> +case VVC_RASL_NUT:
> >>> +case VVC_IDR_W_RADL:
> >>> +case VVC_IDR_N_LP:
> >>> +case VVC_CRA_NUT:
> >>> +case VVC_GDR_NUT:
> >>> +ret = decode_slice(s, fc, nal, unit);
> >>> +if (ret < 0)
> >>> +goto fail;
> >>> +break;
> >>> +case VVC_PREFIX_APS_NUT:
> >>> +case VVC_SUFFIX_APS_NUT:
> >>> +ret = ff_vvc_decode_aps(&s->ps, unit);
> >>> +if (ret < 0)
> >>> +goto fail;
>

Re: [FFmpeg-devel] [PATCH v6 14/14] vvcdec: add full vvc decoder

2023-12-08 Thread Andreas Rheinhardt
Nuo Mi:
> Hi Andreas,
> thank you for the review.
> On Fri, Dec 8, 2023 at 8:17 PM Andreas Rheinhardt <
> andreas.rheinha...@outlook.com> wrote:
> 
>>
>>> +
>>> +static int min_pu_arrays_init(VVCFrameContext *fc, const int
>> pic_size_in_min_pu)
>>> +{
>>> +if (fc->tab.pic_size_in_min_pu != pic_size_in_min_pu) {
>>> +min_pu_arrays_free(fc);
>>> +fc->tab.msf  = av_mallocz(pic_size_in_min_pu);
>>> +fc->tab.iaf  = av_mallocz(pic_size_in_min_pu);
>>> +fc->tab.mmi  = av_mallocz(pic_size_in_min_pu);
>>> +fc->tab.mvf  = av_mallocz(pic_size_in_min_pu *
>> sizeof(*fc->tab.mvf));
>>
>> Do these have to be separate allocations? If there were allocated
>> jointly, one memset below would suffice.
>>
> They are separate flags; if we combine them, we can't use memset to set
> flags for a block.
> 

I disagree: You would still be able to use different pointers for
different parts of the large allocated block, it is just that you also
save some unnecessary allocations (and frees and errors checks for the
allocations) and also gain the ability to memset them via one memset
call in case one wants to set them to the same value.

>>
>>> +
>>> +static int init_slice_context(SliceContext *sc, VVCFrameContext *fc,
>> const H2645NAL *nal, const CodedBitstreamUnit *unit)
>>> +{
>>> +const VVCSH *sh = &sc->sh;
>>> +const H266RawSlice *slice   = (const H266RawSlice *)unit->content;
>>
>> Please no pointless casts. Also, why is there unnecessary whitespace in
>> front of '='?
>>
> Fixed here and in several other places.
> The whitespace aligns all the '=' signs in one column.
> 

But there is nothing that needs that much whitespace.

>>> +
>>> +static av_cold int frame_context_init(VVCFrameContext *fc,
>> AVCodecContext *avctx)
>>> +{
>>> +
>>> +fc->avctx = av_memdup(avctx, sizeof(*avctx));
>>
>> When I read this, I presumed you are using multiple AVCodecContexts to
>> store the ever changing state of the AVCodecContext fields similarly to
>> update_context_from_thread() in pthread_frame.c. But it seems you don't.
>> These contexts are only used as a) logcontexts (where the actual
>> user-facing AVCodecContext should be used, so that the user can make
>> sense of the logmessages!), b) in ff_thread_get_buffer() and c) in
>> export_frame_params() where only some basic fields
>> (dimension-related+pix_fmt) is set. Presumably c) is done for b).
>>
> I remember if i did not use a local AVCodecContext  it would trigger some
> assert when resolution changed.
> 

Can you be more specific about what assert has been triggered? And have
you set the AVFrame fields directly before ff_thread_get_buffer()?

>>
>> But the user is allowed to change the provided callbacks in the master
>> context at any time. E.g. the call to ff_thread_get_buffer() in
>> vvc_refs.c currently uses the VVCFrameContext and therefore uses the
>> get_buffer2 callback in place now (during av_memdup()). This is wrong.
>>
> This will not happen. av_memdup only happens in vvc_decode_init.
> Nobody will call ff_thread_get_buffer at this time
> 

You missed the point: If the user changes the get_buffer2 callback after
init, the new callback will not be used at all.

>>
>> I think you can just remove VVCFrameContext.avctx and use the
>> user-facing AVCodecContext if you set the AVFrame properties that are
>> normally derived from the AVCodecContext directly on the AVFrame before
>> ff_thread_get_buffer().
> 
> Could you explain more about how to create a user-facing  AVCodecContext?
> 

You do not create a user-facing AVCodecContext, the user does (and calls
avcodec_send_packet()/avcodec_receive_frame() with it).

>>
>>> +
>>> +static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const
>> H2645NAL *nal, const CodedBitstreamUnit *unit)
>>> +{
>>> +int  ret;
>>> +
>>> +s->temporal_id   = nal->temporal_id;
>>> +
>>> +switch (unit->type) {
>>> +case VVC_VPS_NUT:
>>> +case VVC_SPS_NUT:
>>> +case VVC_PPS_NUT:
>>> +/* vps, sps, pps cached by s->cbc */
>>> +break;
>>> +case VVC_TRAIL_NUT:
>>> +case VVC_STSA_NUT:
>>> +case VVC_RADL_NUT:
>>> +case VVC_RASL_NUT:
>>> +case VVC_IDR_W_RADL:
>>> +case VVC_IDR_N_LP:
>>> +case VVC_CRA_NUT:
>>> +case VVC_GDR_NUT:
>>> +ret = decode_slice(s, fc, nal, unit);
>>> +if (ret < 0)
>>> +goto fail;
>>> +break;
>>> +case VVC_PREFIX_APS_NUT:
>>> +case VVC_SUFFIX_APS_NUT:
>>> +ret = ff_vvc_decode_aps(&s->ps, unit);
>>> +if (ret < 0)
>>> +goto fail;
>>> +break;
>>> +default:
>>> +av_log(s->avctx, AV_LOG_INFO,
>>> +   "Skipping NAL unit %d\n", unit->type);
>>
>> This will probably be very noisy (and warn for every SEI). I don't think
>> it is even needed, as h2645_parse.c already contains debug log messages
>> to display the unit type.
>>
> It's copied from hevcdec. It means we did not handle the nal diffrent than
> h2645_parser.c mes

Re: [FFmpeg-devel] [PATCH v6 14/14] vvcdec: add full vvc decoder

2023-12-08 Thread Nuo Mi
Hi Andreas,
thank you for the review.
On Fri, Dec 8, 2023 at 8:17 PM Andreas Rheinhardt <
andreas.rheinha...@outlook.com> wrote:

>
> > +
> > +static int min_pu_arrays_init(VVCFrameContext *fc, const int
> pic_size_in_min_pu)
> > +{
> > +if (fc->tab.pic_size_in_min_pu != pic_size_in_min_pu) {
> > +min_pu_arrays_free(fc);
> > +fc->tab.msf  = av_mallocz(pic_size_in_min_pu);
> > +fc->tab.iaf  = av_mallocz(pic_size_in_min_pu);
> > +fc->tab.mmi  = av_mallocz(pic_size_in_min_pu);
> > +fc->tab.mvf  = av_mallocz(pic_size_in_min_pu *
> sizeof(*fc->tab.mvf));
>
> Do these have to be separate allocations? If there were allocated
> jointly, one memset below would suffice.
>
They are separate flags; if we combine them, we can't use memset to set
flags for a block.

>
> > +
> > +if (!fc->cu_pool) {
> > +fc->cu_pool = ff_refstruct_pool_alloc(sizeof(CodingUnit), 0);
> > +if (!fc->cu_pool)
> > +goto fail;
>
> The size of the objects contained in this pool don't depend on any
> bitstream parameters. You can therefore simply use a single pool (in
> VVCContext) that is allocated in vvc_decode_init() and freed in
> vvc_decode_free().
> The same goes for tu_pool below.
>
A global pool may cause performance issues with a large number of threads.
Moved it to frame_context_init.

>
>
>
> > +static int slices_realloc(VVCFrameContext *fc)
> > +{
> > +void *p;
> > +const int size = (fc->nb_slices_allocated + 1) * 3 / 2;
> > +
> > +if (fc->nb_slices < fc->nb_slices_allocated)
> > +return 0;
> > +
> > +p = av_realloc(fc->slices, size * sizeof(*fc->slices));
>
> av_realloc_array()
>
 done

>
> > +if (!p)
> > +return AVERROR(ENOMEM);
> > +
> > +fc->slices = p;
> > +for (int i = fc->nb_slices_allocated; i < size; i++) {
> > +fc->slices[i] = av_calloc(1, sizeof(*fc->slices[0]));
>
> av_mallocz().
>
done

>
> > +if (!fc->slices[i]) {
> > +for (int j = fc->nb_slices_allocated; j < i; j++)
> > +av_freep(&fc->slices[j]);
> > +return AVERROR(ENOMEM);
>
> Can't you simply set fc->nb_slices_allocated to i in order to avoid this
> loop?
>
done

> > +
> > +static int init_slice_context(SliceContext *sc, VVCFrameContext *fc,
> const H2645NAL *nal, const CodedBitstreamUnit *unit)
> > +{
> > +const VVCSH *sh = &sc->sh;
> > +const H266RawSlice *slice   = (const H266RawSlice *)unit->content;
>
> Please no pointless casts. Also, why is there unnecessary whitespace in
> front of '='?
>
Fixed here and in several other places.
The whitespace aligns all the '=' signs in one column.


> > +int nb_eps  = sh->r->num_entry_points + 1;
> > +int ctu_addr= 0;
> > +GetBitContext gb;
> > +
> > +if (sc->nb_eps != nb_eps) {
> > +eps_free(sc);
> > +sc->eps = av_calloc(nb_eps, sizeof(*sc->eps));
> > +if (!sc->eps)
> > +return AVERROR(ENOMEM);
>
> In case of error, sc->eps is NULL, yet sc->nb_eps may be != 0. Stuff
> like this can (and does) lead to crashes.
>
added "slice->nb_eps = 0;" to eps_free

>
> > +static int vvc_ref_frame(VVCFrameContext *fc, VVCFrame *dst, VVCFrame
> *src)
>
> src should be const.
>
done

>
> > +
> > +static av_cold int frame_context_init(VVCFrameContext *fc,
> AVCodecContext *avctx)
> > +{
> > +
> > +fc->avctx = av_memdup(avctx, sizeof(*avctx));
>
> When I read this, I presumed you are using multiple AVCodecContexts to
> store the ever changing state of the AVCodecContext fields similarly to
> update_context_from_thread() in pthread_frame.c. But it seems you don't.
> These contexts are only used as a) logcontexts (where the actual
> user-facing AVCodecContext should be used, so that the user can make
> sense of the logmessages!), b) in ff_thread_get_buffer() and c) in
> export_frame_params() where only some basic fields
> (dimension-related+pix_fmt) is set. Presumably c) is done for b).
>
I remember that if I did not use a local AVCodecContext, it would trigger an
assert when the resolution changed.

>
> But the user is allowed to change the provided callbacks in the master
> context at any time. E.g. the call to ff_thread_get_buffer() in
> vvc_refs.c currently uses the VVCFrameContext and therefore uses the
> get_buffer2 callback in place now (during av_memdup()). This is wrong.
>
This will not happen. av_memdup only happens in vvc_decode_init.
Nobody will call ff_thread_get_buffer at this time

>
> I think you can just remove VVCFrameContext.avctx and use the
> user-facing AVCodecContext if you set the AVFrame properties that are
> normally derived from the AVCodecContext directly on the AVFrame before
> ff_thread_get_buffer().

Could you explain more about how to create a user-facing  AVCodecContext?

>
> > +
> > +static int decode_nal_unit(VVCContext *s, VVCFrameContext *fc, const
> H2645NAL *nal, const CodedBitstreamUnit *unit)
> > +{
> > +int  ret;
> > +
> > +s->temporal_id   = nal->tempor

Re: [FFmpeg-devel] [PATCH v6 14/14] vvcdec: add full vvc decoder

2023-12-08 Thread Andreas Rheinhardt
Nuo Mi:
> vvc decoder plug-in to avcodec.
> split frames into slices/tiles and send them to vvc_thread for further 
> decoding
> reorder and wait for the frame decoding to be done and output the frame
> 
> Features:
> + Support I, P, B frames
> + Support 8/10/12 bits, chroma 400, 420, 422, and 444 and range extension
> + Support VVC new tools like MIP, CCLM, AFFINE, GPM, DMVR, PROF, BDOF, 
> LMCS, ALF
> + 295 conformance clips passed
> - Not support RPR, IBC, PALETTE, and other minor features yet
> 
> Performance:
> C code FPS on i7-12700 (x86):
> BQTerrace_1920x1080_60_10_420_22_RA.vvc  93.0
> Chimera_8bit_1080P_1000_frames.vvc  184.3
> NovosobornayaSquare_1920x1080.bin   191.3
> RitualDance_1920x1080_60_10_420_32_LD.266   150.7
> RitualDance_1920x1080_60_10_420_37_RA.266   170.0
> Tango2_3840x2160_60_10_420_27_LD.266 33.7
> 
> C code FPS on M1 Mac Pro (ARM):
> BQTerrace_1920x1080_60_10_420_22_RA.vvc 58.7
> Chimera_8bit_1080P_1000_frames.vvc  153.3
> NovosobornayaSquare_1920x1080.bin   150.3
> RitualDance_1920x1080_60_10_420_32_LD.266   105.0
> RitualDance_1920x1080_60_10_420_37_RA.266   133.0
> Tango2_3840x2160_60_10_420_27_LD.266 21.7
> 
> Asm optimizations are still a work in progress. Please check
> https://github.com/ffvvc/FFmpeg/wiki#performance-data for the latest
> 
> Contributors(based on code merge order):
> Nuo Mi 
> Xu Mu 
> frankplow 
> Shaun Loo 
> ---
>  libavcodec/vvc/vvcdec.c | 1007 +++
>  1 file changed, 1007 insertions(+)
> 
> diff --git a/libavcodec/vvc/vvcdec.c b/libavcodec/vvc/vvcdec.c
> index 3c591ce875..e40eb7339f 100644
> --- a/libavcodec/vvc/vvcdec.c
> +++ b/libavcodec/vvc/vvcdec.c
> @@ -21,28 +21,1035 @@
>   * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 
> USA
>   */
>  #include "libavcodec/codec_internal.h"
> +#include "libavcodec/decode.h"
>  #include "libavcodec/profiles.h"
> +#include "libavcodec/refstruct.h"
> +#include "libavutil/cpu.h"
>  
>  #include "vvcdec.h"
> +#include "vvc_ctu.h"
> +#include "vvc_data.h"
> +#include "vvc_refs.h"
> +#include "vvc_thread.h"
> +
> +static int vvc_frame_start(VVCContext *s, VVCFrameContext *fc, SliceContext 
> *sc)
> +{
> +const VVCPH *ph = &fc->ps.ph;
> +const H266RawSliceHeader *rsh   = sc->sh.r;
> +int ret;
> +
> +// 8.3.1 Decoding process for picture order count
> +if (!s->temporal_id && !ph->r->ph_non_ref_pic_flag && !(IS_RASL(s) || 
> IS_RADL(s)))
> +s->poc_tid0 = ph->poc;
> +
> +if ((ret = ff_vvc_set_new_ref(s, fc, &fc->frame)) < 0)
> +goto fail;
> +
> +if (!IS_IDR(s))
> +ff_vvc_bump_frame(s, fc);
> +
> +av_frame_unref(fc->output_frame);
> +
> +if ((ret = ff_vvc_output_frame(s, fc, 
> fc->output_frame,rsh->sh_no_output_of_prior_pics_flag, 0)) < 0)
> +goto fail;
> +
> +if ((ret = ff_vvc_frame_rpl(s, fc, sc)) < 0)
> +goto fail;
> +
> +if ((ret = ff_vvc_frame_thread_init(fc)) < 0)
> +goto fail;
> +return 0;
> +fail:
> +if (fc->ref)
> +ff_vvc_unref_frame(fc, fc->ref, ~0);
> +fc->ref = NULL;
> +return ret;
> +}
> +
> +static void ctb_arrays_free(VVCFrameContext *fc)
> +{
> +av_freep(&fc->tab.deblock);
> +av_freep(&fc->tab.sao);
> +av_freep(&fc->tab.alf);
> +av_freep(&fc->tab.slice_idx);
> +av_freep(&fc->tab.coeffs);
> +if (fc->tab.ctus) {
> +for (int i = 0; i < fc->tab.ctu_count; i++)
> +ff_vvc_ctu_free_cus(fc->tab.ctus + i);
> +av_freep(&fc->tab.ctus);
> +}
> +ff_refstruct_pool_uninit(&fc->rpl_tab_pool);
> +}
> +
> +static int ctb_arrays_init(VVCFrameContext *fc, const int ctu_count, const 
> int ctu_size)
> +{
> +if (fc->tab.ctu_count != ctu_count || fc->tab.ctu_size != ctu_size) {
> +ctb_arrays_free(fc);
> +fc->tab.deblock = av_calloc(ctu_count, 
> sizeof(*fc->tab.deblock));
> +fc->tab.sao = av_calloc(ctu_count, sizeof(*fc->tab.sao));
> +fc->tab.alf = av_calloc(ctu_count, sizeof(*fc->tab.alf));
> +fc->tab.ctus= av_calloc(ctu_count, 
> sizeof(*fc->tab.ctus));
> +fc->tab.slice_idx   = av_malloc(ctu_count * 
> sizeof(*fc->tab.slice_idx));
> +if (!fc->tab.deblock || !fc->tab.sao || !fc->tab.alf || 
> !fc->tab.ctus || !fc->tab.slice_idx )
> +return AVERROR(ENOMEM);
> +fc->tab.coeffs = av_malloc(ctu_count * sizeof(*fc->tab.coeffs) * 
> ctu_size * VVC_MAX_SAMPLE_ARRAYS);
> +if (!fc->tab.coeffs)
> +return AVERROR(ENOMEM);
> +fc->rpl_tab_pool = ff_refstruct_pool_alloc(ctu_count * 
> sizeof(RefPicListTab), 0);
> +if (!fc->rpl_tab_pool)
> +return AVERROR(ENOMEM);
> +} else {
> +memset(fc->tab.deblock, 0, ctu_co

Re: [FFmpeg-devel] [PATCH v6 14/14] vvcdec: add full vvc decoder

2023-12-05 Thread Nuo Mi
On Tue, Dec 5, 2023 at 10:46 PM Nuo Mi  wrote:

> vvc decoder plug-in to avcodec.
> split frames into slices/tiles and send them to vvc_thread for further
> decoding
> reorder and wait for the frame decoding to be done and output the frame
>
> Features:
> + Support I, P, B frames
> + Support 8/10/12 bits, chroma 400, 420, 422, and 444 and range
> extension
> + Support VVC new tools like MIP, CCLM, AFFINE, GPM, DMVR, PROF, BDOF,
> LMCS, ALF
> + 295 conformance clips passed
> - Not support RPR, IBC, PALETTE, and other minor features yet
>
> Performance:
> C code FPS on i7-12700 (x86):
> BQTerrace_1920x1080_60_10_420_22_RA.vvc  93.0
> Chimera_8bit_1080P_1000_frames.vvc  184.3
> NovosobornayaSquare_1920x1080.bin   191.3
> RitualDance_1920x1080_60_10_420_32_LD.266   150.7
> RitualDance_1920x1080_60_10_420_37_RA.266   170.0
> Tango2_3840x2160_60_10_420_27_LD.266 33.7
>
> C code FPS on M1 Mac Pro (ARM):
> BQTerrace_1920x1080_60_10_420_22_RA.vvc 58.7
> Chimera_8bit_1080P_1000_frames.vvc  153.3
> NovosobornayaSquare_1920x1080.bin   150.3
> RitualDance_1920x1080_60_10_420_32_LD.266   105.0
> RitualDance_1920x1080_60_10_420_37_RA.266   133.0
> Tango2_3840x2160_60_10_420_27_LD.266 21.7
>
> Asm optimizations are still a work in progress. Please check
> https://github.com/ffvvc/FFmpeg/wiki#performance-data for the latest
>
> Contributors(based on code merge order):
> Nuo Mi 
> Xu Mu 
> frankplow 
> Shaun Loo 
>
> changes since v5:
Fix c header guard for "make fate-source"
___
ffmpeg-devel mailing list
ffmpeg-devel@ffmpeg.org
https://ffmpeg.org/mailman/listinfo/ffmpeg-devel

To unsubscribe, visit link above, or email
ffmpeg-devel-requ...@ffmpeg.org with subject "unsubscribe".


[FFmpeg-devel] [PATCH v6 14/14] vvcdec: add full vvc decoder

2023-12-05 Thread Nuo Mi
vvc decoder plug-in to avcodec.
split frames into slices/tiles and send them to vvc_thread for further decoding
reorder and wait for the frame decoding to be done and output the frame

Features:
+ Support I, P, B frames
+ Support 8/10/12 bits, chroma 400, 420, 422, and 444 and range extension
+ Support VVC new tools like MIP, CCLM, AFFINE, GPM, DMVR, PROF, BDOF, 
LMCS, ALF
+ 295 conformance clips passed
- Not support RPR, IBC, PALETTE, and other minor features yet

Performance:
C code FPS on i7-12700 (x86):
BQTerrace_1920x1080_60_10_420_22_RA.vvc  93.0
Chimera_8bit_1080P_1000_frames.vvc  184.3
NovosobornayaSquare_1920x1080.bin   191.3
RitualDance_1920x1080_60_10_420_32_LD.266   150.7
RitualDance_1920x1080_60_10_420_37_RA.266   170.0
Tango2_3840x2160_60_10_420_27_LD.266 33.7

C code FPS on M1 Mac Pro (ARM):
BQTerrace_1920x1080_60_10_420_22_RA.vvc 58.7
Chimera_8bit_1080P_1000_frames.vvc  153.3
NovosobornayaSquare_1920x1080.bin   150.3
RitualDance_1920x1080_60_10_420_32_LD.266   105.0
RitualDance_1920x1080_60_10_420_37_RA.266   133.0
Tango2_3840x2160_60_10_420_27_LD.266 21.7

Asm optimizations are still a work in progress. Please check
https://github.com/ffvvc/FFmpeg/wiki#performance-data for the latest

Contributors(based on code merge order):
Nuo Mi 
Xu Mu 
frankplow 
Shaun Loo 
---
 libavcodec/vvc/vvcdec.c | 1007 +++
 1 file changed, 1007 insertions(+)

diff --git a/libavcodec/vvc/vvcdec.c b/libavcodec/vvc/vvcdec.c
index 3c591ce875..e40eb7339f 100644
--- a/libavcodec/vvc/vvcdec.c
+++ b/libavcodec/vvc/vvcdec.c
@@ -21,28 +21,1035 @@
  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  */
 #include "libavcodec/codec_internal.h"
+#include "libavcodec/decode.h"
 #include "libavcodec/profiles.h"
+#include "libavcodec/refstruct.h"
+#include "libavutil/cpu.h"
 
 #include "vvcdec.h"
+#include "vvc_ctu.h"
+#include "vvc_data.h"
+#include "vvc_refs.h"
+#include "vvc_thread.h"
+
+static int vvc_frame_start(VVCContext *s, VVCFrameContext *fc, SliceContext 
*sc)
+{
+const VVCPH *ph = &fc->ps.ph;
+const H266RawSliceHeader *rsh   = sc->sh.r;
+int ret;
+
+// 8.3.1 Decoding process for picture order count
+if (!s->temporal_id && !ph->r->ph_non_ref_pic_flag && !(IS_RASL(s) || 
IS_RADL(s)))
+s->poc_tid0 = ph->poc;
+
+if ((ret = ff_vvc_set_new_ref(s, fc, &fc->frame)) < 0)
+goto fail;
+
+if (!IS_IDR(s))
+ff_vvc_bump_frame(s, fc);
+
+av_frame_unref(fc->output_frame);
+
+if ((ret = ff_vvc_output_frame(s, fc, 
fc->output_frame,rsh->sh_no_output_of_prior_pics_flag, 0)) < 0)
+goto fail;
+
+if ((ret = ff_vvc_frame_rpl(s, fc, sc)) < 0)
+goto fail;
+
+if ((ret = ff_vvc_frame_thread_init(fc)) < 0)
+goto fail;
+return 0;
+fail:
+if (fc->ref)
+ff_vvc_unref_frame(fc, fc->ref, ~0);
+fc->ref = NULL;
+return ret;
+}
+
+static void ctb_arrays_free(VVCFrameContext *fc)
+{
+av_freep(&fc->tab.deblock);
+av_freep(&fc->tab.sao);
+av_freep(&fc->tab.alf);
+av_freep(&fc->tab.slice_idx);
+av_freep(&fc->tab.coeffs);
+if (fc->tab.ctus) {
+for (int i = 0; i < fc->tab.ctu_count; i++)
+ff_vvc_ctu_free_cus(fc->tab.ctus + i);
+av_freep(&fc->tab.ctus);
+}
+ff_refstruct_pool_uninit(&fc->rpl_tab_pool);
+}
+
+static int ctb_arrays_init(VVCFrameContext *fc, const int ctu_count, const int 
ctu_size)
+{
+if (fc->tab.ctu_count != ctu_count || fc->tab.ctu_size != ctu_size) {
+ctb_arrays_free(fc);
+fc->tab.deblock = av_calloc(ctu_count, 
sizeof(*fc->tab.deblock));
+fc->tab.sao = av_calloc(ctu_count, sizeof(*fc->tab.sao));
+fc->tab.alf = av_calloc(ctu_count, sizeof(*fc->tab.alf));
+fc->tab.ctus= av_calloc(ctu_count, sizeof(*fc->tab.ctus));
+fc->tab.slice_idx   = av_malloc(ctu_count * 
sizeof(*fc->tab.slice_idx));
+if (!fc->tab.deblock || !fc->tab.sao || !fc->tab.alf || !fc->tab.ctus 
|| !fc->tab.slice_idx )
+return AVERROR(ENOMEM);
+fc->tab.coeffs = av_malloc(ctu_count * sizeof(*fc->tab.coeffs) * 
ctu_size * VVC_MAX_SAMPLE_ARRAYS);
+if (!fc->tab.coeffs)
+return AVERROR(ENOMEM);
+fc->rpl_tab_pool = ff_refstruct_pool_alloc(ctu_count * 
sizeof(RefPicListTab), 0);
+if (!fc->rpl_tab_pool)
+return AVERROR(ENOMEM);
+} else {
+memset(fc->tab.deblock, 0, ctu_count * sizeof(*fc->tab.deblock));
+memset(fc->tab.sao, 0, ctu_count * sizeof(*fc->tab.sao));
+memset(fc->tab.alf, 0, ctu_count * sizeof(*fc->tab.alf));
+for (int i = 0; i < fc->tab.ctu_count; i++)
+ff_vvc_ctu_free_cus(fc->tab.ctus + i);
+memset(fc-